funasr/datasets/large_datasets/utils/tokenize.py
@@ -37,7 +37,7 @@
     vad = -2
     if bpe_tokenizer is not None:
-        text = bpe_tokenizer.text2tokens(text)
+        text = bpe_tokenizer.text2tokens("".join(text))
     if seg_dict is not None:
         assert isinstance(seg_dict, dict)
@@ -47,8 +47,8 @@
     length = len(text)
     for i in range(length):
         x = text[i]
-        if i == length-1 and "punc" in data and text[i].startswith("vad:"):
-            vad = x[-1][4:]
+        if i == length-1 and "punc" in data and x.startswith("vad:"):
+            vad = x[4:]
     if len(vad) == 0:
         vad = -1
     else: