--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -48,7 +48,7 @@
     vad = -2
     if bpe_tokenizer is not None:
-        text = bpe_tokenizer.text2tokens("".join(text))
+        text = bpe_tokenizer.text2tokens(text)
     if seg_dict is not None:
         assert isinstance(seg_dict, dict)
@@ -57,7 +57,7 @@
     length = len(text)
     if 'hw_tag' in data:
         hotword_indxs = sample_hotword(length, **hw_config)
-        data[hotword_indxs] = hotword_indxs
+        data['hotword_indxs'] = hotword_indxs
         del data['hw_tag']
     for i in range(length):
         x = text[i]