| | |
| | | from typing import Iterable |
| | | from typing import List |
| | | from typing import Union |
| | | import json |
| | | |
| | | import numpy as np |
| | | |
| | |
| | | ): |
| | | |
| | | if token_list is not None: |
| | | if isinstance(token_list, (Path, str)): |
| | | if isinstance(token_list, (Path, str)) and token_list.endswith(".txt"): |
| | | token_list = Path(token_list) |
| | | self.token_list_repr = str(token_list) |
| | | self.token_list: List[str] = [] |
| | |
| | | for idx, line in enumerate(f): |
| | | line = line.rstrip() |
| | | self.token_list.append(line) |
| | | |
| | | elif isinstance(token_list, (Path, str)) and token_list.endswith(".json"): |
| | | token_list = Path(token_list) |
| | | self.token_list_repr = str(token_list) |
| | | self.token_list: List[str] = [] |
| | | |
| | | with open(token_list, 'r', encoding='utf-8') as f: |
| | | self.token_list = json.load(f) |
| | | |
| | | |
| | | else: |
| | | self.token_list: List[str] = list(token_list) |
| | | self.token_list_repr = "" |
| | |
| | | return text_ints |
| | | |
def decode(self, text_ints):
    """Convert a sequence of token ids into text.

    The ids are first mapped to tokens with ``self.ids2tokens`` and the
    resulting tokens are then passed to ``self.tokens2text``; whatever that
    call returns is the result of this method.

    Args:
        text_ints: Token ids, in the form accepted by ``self.ids2tokens``.

    Returns:
        The value produced by ``self.tokens2text`` for the decoded tokens.
    """
    # A stale early ``return self.ids2tokens(text_ints)`` previously sat
    # here, which made the tokens -> text step below unreachable.
    tokens = self.ids2tokens(text_ints)
    return self.tokens2text(tokens)
| | | |
def get_num_vocabulary_size(self) -> int:
    """Return the number of entries currently held in ``self.token_list``."""
    return len(self.token_list)