python/FunASR-XL.git

			@@ -8,13 +8,14 @@
			from typing import Iterator
			from typing import Tuple
			from typing import Union
			from typing import List

			import kaldiio
			import numpy as np
			import torch
			import torchaudio
			import soundfile
			from torch.utils.data.dataset import IterableDataset
			from typeguard import check_argument_types
			import os.path

			from funasr.datasets.dataset import ESPnetDataset
			@@ -65,8 +66,17 @@
			bytes = f.read()
			return load_bytes(bytes)

			def load_wav(input):
			    """Load an audio file as a NumPy waveform, falling back to soundfile.

			    ``torchaudio.load`` is tried first. If it raises, the same ``input`` is
			    read with ``soundfile.read`` instead.

			    Args:
			        input: Audio source handed unchanged to ``torchaudio.load`` and, on
			            failure, to ``soundfile.read``.

			    Returns:
			        A NumPy array. On the torchaudio path this is the first element of
			        the ``torchaudio.load`` result converted with ``.numpy()``. On the
			        soundfile path it is a float32 array with a leading axis of length
			        one; when soundfile returns 2-D data only column 0 is kept.

			    Raises:
			        Any exception raised by ``soundfile.read`` when the fallback also
			        fails (chained to the original torchaudio error).
			    """
			    try:
			        return torchaudio.load(input)[0].numpy()
			    except Exception:
			        # Deliberately ``Exception`` rather than a bare ``except``: a bare
			        # clause would also trap KeyboardInterrupt/SystemExit and silently
			        # retry with soundfile instead of letting the interrupt propagate.
			        waveform, _ = soundfile.read(input, dtype='float32')
			        if waveform.ndim == 2:
			            # 2-D result: keep only the first column so the return value
			            # is a single row after expand_dims below.
			            waveform = waveform[:, 0]
			        return np.expand_dims(waveform, axis=0)

			DATA_TYPES = {
			"sound": lambda x: torchaudio.load(x)[0][0].numpy(),
			"sound": load_wav,
			"pcm": load_pcm,
			"kaldi_ark": load_kaldi,
			"bytes": load_bytes,
			@@ -106,10 +116,10 @@
			] = None,
			float_dtype: str = "float32",
			fs: dict = None,
			mc: bool = False,
			int_dtype: str = "long",
			key_file: str = None,
			):
			assert check_argument_types()
			if len(path_name_type_list) == 0:
			raise ValueError(
			'1 or more elements are required for "path_name_type_list"'
			@@ -122,12 +132,13 @@
			self.int_dtype = int_dtype
			self.key_file = key_file
			self.fs = fs
			self.mc = mc

			self.debug_info = {}
			non_iterable_list = []
			self.path_name_type_list = []

			if not isinstance(path_name_type_list[0], Tuple):
			if not isinstance(path_name_type_list[0], (Tuple, List)):
			path = path_name_type_list[0]
			name = path_name_type_list[1]
			_type = path_name_type_list[2]
			@@ -174,90 +185,97 @@
			def __iter__(self) -> Iterator[Tuple[Union[str, int], Dict[str, np.ndarray]]]:
			count = 0
			if len(self.path_name_type_list) != 0 and (self.path_name_type_list[0][2] == "bytes" or self.path_name_type_list[0][2] == "waveform"):
			linenum = len(self.path_name_type_list)
			data = {}
			value = self.path_name_type_list[0][0]
			uid = 'utt_id'
			name = self.path_name_type_list[0][1]
			_type = self.path_name_type_list[0][2]
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array
			for i in range(linenum):
			value = self.path_name_type_list[i][0]
			uid = 'utt_id'
			name = self.path_name_type_list[i][1]
			_type = self.path_name_type_list[i][2]
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and (name == "speech" or name == "ref_speech"):
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value
			data[name] = array

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value

			yield uid, data

			elif len(self.path_name_type_list) != 0 and self.path_name_type_list[0][2] == "sound" and not self.path_name_type_list[0][0].lower().endswith(".scp"):
			linenum = len(self.path_name_type_list)
			data = {}
			value = self.path_name_type_list[0][0]
			uid = os.path.basename(self.path_name_type_list[0][0]).split(".")[0]
			name = self.path_name_type_list[0][1]
			_type = self.path_name_type_list[0][2]
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			_type = "pcm"

			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			for i in range(linenum):
			value = self.path_name_type_list[i][0]
			uid = os.path.basename(self.path_name_type_list[i][0]).split(".")[0]
			name = self.path_name_type_list[i][1]
			_type = self.path_name_type_list[i][2]
			if _type == "sound":
			audio_type = os.path.basename(value).lower()
			if audio_type.rfind(".pcm") >= 0:
			_type = "pcm"
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and (name == "speech" or name == "ref_speech"):
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.numpy()

			if _type == "sound":
			if self.mc:
			data[name] = array.transpose((1, 0))
			else:
			data[name] = array[0]
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value
			data[name] = array

			if self.preprocess is not None:
			data = self.preprocess(uid, data)
			for name in data:
			count += 1
			value = data[name]
			if not isinstance(value, np.ndarray):
			raise RuntimeError(
			f'All values must be converted to np.ndarray object '
			f'by preprocessing, but "{name}" is still {type(value)}.')
			# Cast to desired type
			if value.dtype.kind == 'f':
			value = value.astype(self.float_dtype)
			elif value.dtype.kind == 'i':
			value = value.astype(self.int_dtype)
			else:
			raise NotImplementedError(
			f'Not supported dtype: {value.dtype}')
			data[name] = value

			yield uid, data

			@@ -322,11 +340,8 @@
			# 2.a. Load data streamingly
			for value, (path, name, _type) in zip(values, self.path_name_type_list):
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			audio_type = os.path.basename(value).lower()
			if audio_type.rfind(".pcm") >= 0:
			_type = "pcm"
			func = DATA_TYPES[_type]
			# Load entry
			@@ -336,11 +351,16 @@
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array
			array = array.numpy()
			if _type == "sound":
			if self.mc:
			data[name] = array.transpose((1, 0))
			else:
			data[name] = array[0]
			else:
			data[name] = array
			if self.non_iterable_dataset is not None:
			# 2.b. Load data from non-iterable dataset
			_, from_non_iterable = self.non_iterable_dataset[uid]
			@@ -373,3 +393,4 @@

			if count == 0:
			raise RuntimeError("No iteration")