python/FunASR-XL.git

			@@ -28,7 +28,7 @@


			# Dataset class deriving from IterableDataset; the constructor stores
			# scp_lists, data_names and data_types on the instance.
			class AudioDataset(IterableDataset):
			def __init__(self, scp_lists, data_names, data_types, frontend_conf=None, shuffle=True, mode="train"):
			# NOTE(review): the signature below repeats the one above and appends two
			# keyword parameters, pre_hwlist (default None) and pre_prob (default 0.0).
			def __init__(self, scp_lists, data_names, data_types, frontend_conf=None, shuffle=True, mode="train", pre_hwlist=None, pre_prob=0.0):
			self.scp_lists = scp_lists
			self.data_names = data_names
			self.data_types = data_types
			@@ -40,6 +40,8 @@
			# Single-process defaults: one rank, one worker.
			# NOTE(review): presumably overwritten elsewhere for distributed / multi-worker runs - confirm.
			self.world_size = 1
			self.worker_id = 0
			self.num_workers = 1
			# Keep the pre_hwlist and pre_prob constructor arguments unchanged on the instance.
			self.pre_hwlist = pre_hwlist
			self.pre_prob = pre_prob

			# Record the given epoch number on the instance.
			def set_epoch(self, epoch):
			self.epoch = epoch
			@@ -131,6 +133,13 @@
			sample_dict["sampling_rate"] = sampling_rate
			# The "speech" entry supplies the sample key.
			if data_name == "speech":
			sample_dict["key"] = key
			# "text_hotword" items are whitespace-separated lines: the first token is
			# the key, the remaining tokens are stored under this data name.
			elif data_type == "text_hotword":
			text = item
			segs = text.strip().split()
			sample_dict[data_name] = segs[1:]
			# Only take the key from the text line when no key has been set yet.
			if "key" not in sample_dict:
			sample_dict["key"] = segs[0]
			# Tag the sample as having gone through the text_hotword branch.
			sample_dict['hw_tag'] = 1
			# Fallback for any other data type; it starts with the same whitespace split.
			else:
			text = item
			segs = text.strip().split()
			@@ -167,14 +176,39 @@
			# Read dataset options from conf, falling back to the defaults shown.
			shuffle = conf.get('shuffle', True)
			data_names = conf.get("data_names", "speech,text")
			data_types = conf.get("data_types", "kaldi_ark,text")
			# NOTE(review): `dataset` is built here without hotword arguments and is
			# assigned again further down with pre_hwlist / pre_prob.
			dataset = AudioDataset(scp_lists, data_names, data_types, frontend_conf=frontend_conf, shuffle=shuffle, mode=mode)

			# Optional hotword settings: "pre_hwlist" is treated below as a file path,
			# "pre_prob" defaults to 0 when absent.
			pre_hwfile = conf.get("pre_hwlist", None)
			pre_prob = conf.get("pre_prob", 0)

			# Hotword options with their defaults; this dict is handed to the tokenizer
			# as 'hw_config' further down.
			# NOTE(review): the exact meaning of each rate / length is defined by the
			# tokenizer, not here - confirm against tokenize().
			hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
			"double_rate": conf.get("double_rate", 0.1),
			"hotword_min_length": conf.get("hotword_min_length", 2),
			"hotword_max_length": conf.get("hotword_max_length", 8)}


			# Load the hotword list from file: one entry per line, with surrounding
			# whitespace stripped. Blank lines are kept as empty-string entries.
			# NOTE(review): open() relies on the platform default encoding - confirm
			# this is intended for the list files in use.
			if pre_hwfile is not None:
			pre_hwlist = []
			with open(pre_hwfile, 'r') as fin:
			for line in fin.readlines():
			pre_hwlist.append(line.strip())
			else:
			pre_hwlist = None
			# logging.warning("Previous hwlist: {}".format(pre_hwlist))
			# Build the dataset, passing the loaded hotword list (or None) and pre_prob.
			dataset = AudioDataset(scp_lists,
			data_names,
			data_types,
			frontend_conf=frontend_conf,
			shuffle=shuffle,
			mode=mode,
			pre_hwlist=pre_hwlist,
			pre_prob=pre_prob)

			# Wrap the dataset in a filtering pipe; the 'filter_conf' entries become
			# keyword arguments of the filter function.
			# NOTE(review): `filter` is presumably a project function imported elsewhere
			# rather than the builtin - confirm.
			filter_conf = conf.get('filter_conf', {})
			filter_fn = partial(filter, **filter_conf)
			dataset = FilterIterDataPipe(dataset, fn=filter_fn)

			# With the default comma-separated string for data_names this is a
			# substring test.
			if "text" in data_names:
			# NOTE(review): `vocab` is assigned twice; the second assignment adds
			# 'hw_config' and is the one that reaches tokenize(). `dict` here is a name
			# from the enclosing scope - presumably a vocabulary argument, confirm.
			vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer}
			vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
			# Bind the vocab entries as keyword arguments and map tokenize over the dataset.
			tokenize_fn = partial(tokenize, **vocab)
			dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)