funasr/datasets/large_datasets/dataset.py
@@ -158,6 +158,7 @@
    filter_fn = partial(filter, **filter_conf)
    dataset = FilterIterDataPipe(dataset, fn=filter_fn)
    if "text" in data_names:
        vocab = {'vocab': dict, 'seg_dict': seg_dict}
        tokenize_fn = partial(tokenize, **vocab)
        dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)