| funasr/datasets/preprocessor.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
funasr/datasets/preprocessor.py
def seg_tokenize(txt, seg_dict):
    """Tokenize a sequence of words into subword units via a segmentation dict.

    NOTE(review): reconstructed from a collapsed diff hunk that interleaved two
    variants of this function; this is the variant in which ``pattern`` is
    actually used. Confirm against the repository's current
    funasr/datasets/preprocessor.py before merging.

    Args:
        txt: iterable of word tokens (e.g. a list of strings). Presumably the
            words of one transcript — TODO confirm against callers.
        seg_dict: mapping from word -> space-separated subword string.

    Returns:
        list[str]: the flattened subword tokens. Words present in ``seg_dict``
        are expanded to their subwords; CJK/alphanumeric words missing from
        the dict become ``"<unk>"``; all other tokens (e.g. punctuation) are
        silently dropped.
    """
    # Matches a token whose FIRST character is a CJK ideograph (U+4E00..U+9FA5),
    # an ASCII letter, or a digit; re.match anchors at the start only.
    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
    # Collect subwords in a list and join once at the end instead of
    # quadratic string concatenation.
    pieces = []
    for word in txt:
        if not pattern.match(word):
            # Token is neither CJK nor alphanumeric (punctuation etc.): skip it.
            continue
        if word in seg_dict:
            pieces.append(seg_dict[word])
        else:
            # In-vocabulary character class but unknown word: map to <unk>.
            pieces.append("<unk>")
    # Joining and re-splitting flattens multi-subword entries such as "he llo".
    return " ".join(pieces).strip().split()