| | |
| | | from funasr.tokenizer.abs_tokenizer import BaseTokenizer |
| | | from funasr.register import tables |
| | | |
| | | |
| | | @tables.register("tokenizer_classes", "CharTokenizer") |
| | | class CharTokenizer(BaseTokenizer): |
| | | def __init__( |
| | |
| | | seg_dict = seg_dict if seg_dict is not None else kwargs.get("seg_dict_file", None) |
| | | if seg_dict is not None: |
| | | self.seg_dict = load_seg_dict(seg_dict) |
| | | |
| | | |
| | | def __repr__(self): |
| | | return ( |
| | |
| | | seg_dict[key] = " ".join(value) |
| | | return seg_dict |
| | | |
| | | |
| | | def seg_tokenize(txt, seg_dict): |
| | | # pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$') |
| | | pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])") |