funasr/datasets/large_datasets/utils/tokenize.py
def seg_tokenize(txt, seg_dict):
    """Map each word in *txt* to its segmentation from *seg_dict*.

    Each word is lower-cased before lookup. A word missing from the dict
    is segmented character-by-character when it consists solely of CJK
    ideographs and/or ASCII digits; each character is then looked up
    individually. Any other unmatched word (or character) becomes "<unk>".

    Args:
        txt: iterable of word strings (one utterance's tokens).
        seg_dict: mapping from word/character to a space-separated
            segment string (e.g. a BPE or char segmentation table).

    Returns:
        Flat list of segment tokens; empty list for empty input.
    """
    # Whole-string match of CJK Unified Ideographs plus ASCII digits.
    pattern = re.compile(r'^[\u4E00-\u9FA50-9]+$')
    # Accumulate pieces in a list instead of quadratic `out_txt += ...`;
    # dict values may contain internal spaces, so the final join+split
    # flattens them exactly as strip().split() did on the old string.
    pieces = []
    for word in txt:
        word = word.lower()
        if word in seg_dict:
            pieces.append(seg_dict[word])
        elif pattern.match(word):
            # CJK/digit-only word not in the dict: fall back to
            # per-character lookup, unknown chars become "<unk>".
            for char in word:
                pieces.append(seg_dict.get(char, "<unk>"))
        else:
            pieces.append("<unk>")
    return " ".join(pieces).split()