| | |
| | | from abc import ABC |
| | | from abc import abstractmethod |
| | | from typing import Iterable |
| | | from typing import List |
| | | from pathlib import Path |
| | | from typing import Dict |
| | | from typing import Iterable |
| | | from typing import List |
| | | from typing import Union |
| | | |
| | | import json |
| | | import numpy as np |
| | | from abc import ABC |
| | | from pathlib import Path |
| | | from abc import abstractmethod |
| | | from typing import Union, Iterable, List, Dict |
| | | |
| | | |
class AbsTokenizer(ABC):
    """Interface for tokenizers: conversion between text and token sequences."""

    @abstractmethod
    def text2tokens(self, line: str) -> List[str]:
        """Split one line of text into a list of tokens."""
        raise NotImplementedError

    @abstractmethod
    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join a sequence of tokens back into a single text string."""
        raise NotImplementedError
| | | |
| | | |
class BaseTokenizer(ABC):
    """Tokenizer base class backed by an explicit token vocabulary.

    The vocabulary may be supplied as:
      * a ``.txt`` path -- one token per line,
      * a ``.json`` path -- a JSON array of token strings,
      * any iterable of token strings.

    Subclasses must implement ``tokens2text``; using ``encode``/``decode``
    additionally requires subclass-provided ``text2tokens``/``tokens2ids``/
    ``ids2tokens`` conversions.
    """

    def __init__(
        self,
        token_list: Union[Path, str, Iterable[str]] = None,
        unk_symbol: str = "<unk>",
        **kwargs,
    ):
        # Kept for subclasses that map out-of-vocabulary tokens.
        self.unk_symbol = unk_symbol

        # If no vocabulary is given, no token-list attributes are set
        # (preserves the original behavior for token_list=None).
        if token_list is not None:
            # str(...) so pathlib.Path inputs work too: Path objects have no
            # .endswith(), which would raise AttributeError otherwise.
            if isinstance(token_list, (Path, str)) and str(token_list).endswith(".txt"):
                token_list = Path(token_list)
                self.token_list_repr = str(token_list)
                self.token_list: List[str] = []
                # One token per line; rstrip() drops the newline and any
                # trailing whitespace.
                with open(token_list, "r", encoding="utf-8") as f:
                    for line in f:
                        self.token_list.append(line.rstrip())
            elif isinstance(token_list, (Path, str)) and str(token_list).endswith(".json"):
                token_list = Path(token_list)
                self.token_list_repr = str(token_list)
                with open(token_list, "r", encoding="utf-8") as f:
                    self.token_list: List[str] = json.load(f)
            else:
                # Plain iterable of token strings.
                self.token_list: List[str] = list(token_list)
                # Set the repr here as well so the attribute exists on every
                # construction path.
                self.token_list_repr = str(self.token_list)

    def encode(self, line: str) -> List[int]:
        """Convert a line of text to a list of integer token ids.

        NOTE(review): reconstructed from a truncated fragment
        (``return text_ints``) as the mirror image of ``decode`` --
        confirm against callers.  Relies on subclass-provided
        ``text2tokens`` and ``tokens2ids``.
        """
        tokens = self.text2tokens(line)
        text_ints = self.tokens2ids(tokens)
        return text_ints

    def decode(self, text_ints) -> str:
        """Convert a sequence of integer token ids back to text."""
        token = self.ids2tokens(text_ints)
        text = self.tokens2text(token)
        return text

    def get_num_vocabulary_size(self) -> int:
        """Return the number of entries in the vocabulary."""
        return len(self.token_list)

    @abstractmethod
    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join a sequence of tokens back into text (subclass responsibility)."""
        raise NotImplementedError