| | |
| | | |
| | | |
| | | class BaseTokenizer(ABC): |
| | | def __init__(self, token_list: Union[Path, str, Iterable[str]] = None, |
| | | def __init__( |
| | | self, |
| | | token_list: Union[Path, str, Iterable[str]] = None, |
| | | unk_symbol: str = "<unk>", |
| | | **kwargs, |
| | | ): |
| | |
| | | self.token_list_repr = str(token_list) |
| | | self.token_list: List[str] = [] |
| | | |
| | | with open(token_list, 'r', encoding='utf-8') as f: |
| | | with open(token_list, "r", encoding="utf-8") as f: |
| | | self.token_list = json.load(f) |
| | | |
| | | |
| | | else: |
| | | self.token_list: List[str] = list(token_list) |
| | |
| | | |
| | | self.unk_symbol = unk_symbol |
| | | if self.unk_symbol not in self.token2id: |
| | | raise RuntimeError( |
| | | f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list" |
| | | ) |
| | | raise RuntimeError(f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list") |
| | | self.unk_id = self.token2id[self.unk_symbol] |
| | | |
| | | def encode(self, text): |