from abc import ABC
from abc import abstractmethod
from pathlib import Path
from typing import Dict
from typing import Iterable
from typing import List
from typing import Union

import numpy as np

from funasr.tokenizer.abs_tokenizer import AbsTokenizer
from funasr.tokenizer.char_tokenizer import CharTokenizer
from funasr.tokenizer.phoneme_tokenizer import PhonemeTokenizer
from funasr.tokenizer.sentencepiece_tokenizer import SentencepiecesTokenizer
from funasr.tokenizer.word_tokenizer import WordTokenizer
| | |
def build_tokenizer(
    # NOTE(review): the signature head was reconstructed from a corrupted
    # source; every parameter below is referenced by the surviving body or
    # signature tail — confirm against the project's original module.
    token_type: str,
    bpemodel: Union[Path, str, Iterable[str]] = None,
    non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
    remove_non_linguistic_symbols: bool = False,
    space_symbol: str = "<space>",
    delimiter: str = None,
    g2p_type: str = None,
    **kwargs,
) -> "AbsTokenizer":
    """A helper function to instantiate Tokenizer.

    Args:
        token_type: One of "bpe", "word", "char", "phn"; selects which
            tokenizer class is constructed.
        bpemodel: Path to a SentencePiece model (only used for "bpe").
        non_linguistic_symbols: Symbols (or a path/iterable of symbols) to
            treat as non-linguistic.
        remove_non_linguistic_symbols: If True, strip the non-linguistic
            symbols during tokenization (unsupported for "bpe").
        space_symbol: Token used to represent a space (char/phn tokenizers).
        delimiter: Word delimiter for the word tokenizer.
        g2p_type: Grapheme-to-phoneme backend name (only used for "phn").
        **kwargs: Forwarded to the selected tokenizer's constructor.

    Returns:
        The instantiated tokenizer.

    Raises:
        RuntimeError: If remove_non_linguistic_symbols is requested with
            token_type="bpe".
        ValueError: If token_type is not one of the supported modes.
    """
    if token_type == "bpe":
        # Stripping symbols is not supported by the SentencePiece wrapper.
        if remove_non_linguistic_symbols:
            raise RuntimeError(
                "remove_non_linguistic_symbols is not implemented for token_type=bpe"
            )
        return SentencepiecesTokenizer(bpemodel, **kwargs)

    elif token_type == "word":
        if remove_non_linguistic_symbols and non_linguistic_symbols is not None:
            return WordTokenizer(
                delimiter=delimiter,
                non_linguistic_symbols=non_linguistic_symbols,
                remove_non_linguistic_symbols=True,
            )
        else:
            return WordTokenizer(delimiter=delimiter, **kwargs)

    elif token_type == "char":
        return CharTokenizer(
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
            **kwargs,
        )

    elif token_type == "phn":
        return PhonemeTokenizer(
            g2p_type=g2p_type,
            non_linguistic_symbols=non_linguistic_symbols,
            space_symbol=space_symbol,
            remove_non_linguistic_symbols=remove_non_linguistic_symbols,
            **kwargs,
        )

    else:
        raise ValueError(
            f"token_mode must be one of bpe, word, char or phn: " f"{token_type}"
        )