funasr/datasets/large_datasets/utils/tokenize.py
@@ -28,13 +28,17 @@ def tokenize(data, vocab=None, seg_dict=None,
-             punc_dict=None):
+             punc_dict=None, bpe_tokenizer=None):
     assert "text" in data
     assert isinstance(vocab, dict)
     text = data["text"]
     token = []
     vad = -2
+    if bpe_tokenizer is not None:
+        text = bpe_tokenizer.text2tokens("".join(text))
     if seg_dict is not None:
         assert isinstance(seg_dict, dict)
         txt = forward_segment("".join(text).lower(), seg_dict)
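
For context, here is a minimal, self-contained sketch of how the new `bpe_tokenizer` branch behaves. Only the `text2tokens()` call and the `tokenize` signature come from the diff; the stub tokenizer, the toy vocab, and the id-lookup at the end are hypothetical stand-ins, not FunASR's actual pipeline.

```python
class StubBPETokenizer:
    """Hypothetical stand-in for a real BPE/SentencePiece tokenizer."""

    def text2tokens(self, line):
        # A real tokenizer would apply a trained BPE model here;
        # this stub just splits on whitespace for illustration.
        return line.split()


def tokenize(data, vocab=None, seg_dict=None,
             punc_dict=None, bpe_tokenizer=None):
    assert "text" in data
    assert isinstance(vocab, dict)
    text = data["text"]
    # New branch from the diff: re-tokenize the joined text with the
    # BPE tokenizer before any dictionary-based segmentation.
    if bpe_tokenizer is not None:
        text = bpe_tokenizer.text2tokens("".join(text))
    # Simplified downstream step: map tokens to vocab ids with an
    # <unk> fallback (the real function handles seg_dict/punc_dict too).
    return [vocab.get(t, vocab["<unk>"]) for t in text]


if __name__ == "__main__":
    vocab = {"<unk>": 0, "hello": 1, "world": 2}
    data = {"text": ["hello world"]}
    print(tokenize(data, vocab=vocab, bpe_tokenizer=StubBPETokenizer()))
    # -> [1, 2]
```

Note the ordering the diff establishes: BPE re-tokenization runs before the `seg_dict` path, so dictionary segmentation (when enabled) operates on the BPE output rather than the raw text.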