zhifu gao
2024-04-24 861147c7308b91068ffa02724fdf74ee623a909e
funasr/tokenizer/abs_tokenizer.py
@@ -17,7 +17,9 @@
class BaseTokenizer(ABC):
    def __init__(self, token_list: Union[Path, str, Iterable[str]] = None,
    def __init__(
        self,
        token_list: Union[Path, str, Iterable[str]] = None,
                 unk_symbol: str = "<unk>",
                 **kwargs,
                 ):
@@ -37,9 +39,8 @@
                self.token_list_repr = str(token_list)
                self.token_list: List[str] = []
                
                with open(token_list, 'r', encoding='utf-8') as f:
                with open(token_list, "r", encoding="utf-8") as f:
                    self.token_list = json.load(f)
            
            else:
                self.token_list: List[str] = list(token_list)
@@ -58,9 +59,7 @@
            
            self.unk_symbol = unk_symbol
            if self.unk_symbol not in self.token2id:
                raise RuntimeError(
                    f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list"
                )
                raise RuntimeError(f"Unknown symbol '{unk_symbol}' doesn't exist in the token_list")
            self.unk_id = self.token2id[self.unk_symbol]
    
    def encode(self, text):