import json
from abc import ABC, abstractmethod
from pathlib import Path
from typing import Dict, Iterable, List, Union

import numpy as np
| | | |
class AbsTokenizer(ABC):
    """Interface for tokenizers converting text lines to token sequences and back."""

    @abstractmethod
    def text2tokens(self, line: str) -> List[str]:
        """Split a single text line into a list of tokens."""
        raise NotImplementedError

    @abstractmethod
    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join a sequence of tokens back into a text string."""
        raise NotImplementedError
| | | |
| | | |
class BaseTokenizer(ABC):
    """Base tokenizer holding a token inventory.

    The inventory is given either as a path to a JSON file containing a list
    of token strings, or directly as an iterable of token strings.

    Args:
        token_list: Path (``Path`` or ``str``) to a JSON file with the token
            list, or an iterable of tokens. NOTE(review): the default of
            ``None`` is preserved from the original interface, but passing
            ``None`` raises ``TypeError`` (``list(None)``) — callers are
            expected to supply a value.
        unk_symbol: Symbol representing unknown tokens (default ``"<unk>"``).
        **kwargs: Ignored here; accepted for subclass forward-compatibility.
    """

    def __init__(
        self,
        token_list: Union[Path, str, Iterable[str]] = None,
        unk_symbol: str = "<unk>",
        **kwargs,
    ):
        if isinstance(token_list, (Path, str)):
            # Token list given as a file path: load the JSON array of tokens.
            token_list = Path(token_list)
            self.token_list_repr = str(token_list)
            self.token_list: List[str] = []
            with open(token_list, "r", encoding="utf-8") as f:
                self.token_list = json.load(f)
        else:
            # Token list given directly as an iterable of token strings.
            self.token_list: List[str] = list(token_list)
            self.token_list_repr = ""
        # Fix: the original accepted ``unk_symbol`` but never stored it,
        # leaving subclasses no way to reach the configured unknown symbol.
        self.unk_symbol = unk_symbol

    @abstractmethod
    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Join a sequence of tokens back into a text string."""
        raise NotImplementedError