python/FunASR-XL.git

			@@ -174,90 +174,94 @@
			def __iter__(self) -> Iterator[Tuple[Union[str, int], Dict[str, np.ndarray]]]:
			count = 0
			if len(self.path_name_type_list) != 0 and (self.path_name_type_list[0][2] == "bytes" or self.path_name_type_list[0][2] == "waveform"):
			linenum = len(self.path_name_type_list)
			data = {}
			value = self.path_name_type_list[0][0]
			uid = 'utt_id'
			name = self.path_name_type_list[0][1]
			_type = self.path_name_type_list[0][2]
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array
			for i in range(linenum):
			value = self.path_name_type_list[i][0]
			uid = 'utt_id'
			name = self.path_name_type_list[i][1]
			_type = self.path_name_type_list[i][2]
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and (name == "speech" or name == "ref_speech"):
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value
			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value

			yield uid, data

			elif len(self.path_name_type_list) != 0 and self.path_name_type_list[0][2] == "sound" and not self.path_name_type_list[0][0].lower().endswith(".scp"):
			linenum = len(self.path_name_type_list)
			data = {}
			value = self.path_name_type_list[0][0]
			uid = os.path.basename(self.path_name_type_list[0][0]).split(".")[0]
			name = self.path_name_type_list[0][1]
			_type = self.path_name_type_list[0][2]
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			_type = "pcm"
			for i in range(linenum):
			value = self.path_name_type_list[i][0]
			uid = os.path.basename(self.path_name_type_list[i][0]).split(".")[0]
			name = self.path_name_type_list[i][1]
			_type = self.path_name_type_list[i][2]
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			_type = "pcm"

			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and (name == "speech" or name == "ref_speech"):
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value
			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value

			yield uid, data