| | |
| | | elif data_type == "text" or data_type == "sound": |
| | | text_reader = open(data_file, "r") |
| | | reader_list.append(text_reader) |
| | | elif data_type == "none": |
| | | continue |
| | | else: |
| | | raise TypeError("Data type {} is not supported".format(data_type)) |
| | | |
| | |
| | | filter_fn = partial(filter, **filter_conf) |
| | | dataset = FilterIterDataPipe(dataset, fn=filter_fn) |
| | | |
| | | vocab = {'vocab': dict, 'seg_dict': seg_dict} |
| | | tokenize_fn = partial(tokenize, **vocab) |
| | | dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) |
| | | if "text" in data_names: |
| | | vocab = {'vocab': dict, 'seg_dict': seg_dict} |
| | | tokenize_fn = partial(tokenize, **vocab) |
| | | dataset = MapperIterDataPipe(dataset, fn=tokenize_fn) |
| | | |
| | | if shuffle: |
| | | buffer_conf = conf.get('shuffle_conf', {}) |