funasr/datasets/large_datasets/utils/tokenize.py
@@ -46,10 +46,8 @@ text = data["text"] token = [] vad = -2 if bpe_tokenizer is not None: text = bpe_tokenizer.text2tokens("".join(text)) text = bpe_tokenizer.text2tokens(" ".join(text)) if seg_dict is not None: assert isinstance(seg_dict, dict) text = seg_tokenize(text, seg_dict) @@ -57,7 +55,7 @@ length = len(text) if 'hw_tag' in data: hotword_indxs = sample_hotword(length, **hw_config) data[hotword_indxs] = hotword_indxs data['hotword_indxs'] = hotword_indxs del data['hw_tag'] for i in range(length): x = text[i]