funasr/datasets/large_datasets/dataset.py
@@ -158,9 +158,10 @@ filter_fn = partial(filter, **filter_conf) dataset = FilterIterDataPipe(dataset, fn=filter_fn) vocab = {'vocab': dict, 'seg_dict': seg_dict} tokenize_fn = partial(tokenize, **vocab) dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) if "text" in data_names: vocab = {'vocab': dict, 'seg_dict': seg_dict} tokenize_fn = partial(tokenize, **vocab) dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) if shuffle: buffer_conf = conf.get('shuffle_conf', {})