python/FunASR-XL.git

			@@ -8,17 +8,20 @@
			from typing import Iterator
			from typing import Tuple
			from typing import Union
			from typing import List

			import kaldiio
			import numpy as np
			import soundfile
			import torch
			import torchaudio
			from torch.utils.data.dataset import IterableDataset
			from typeguard import check_argument_types
			import os.path

			from funasr.datasets.dataset import ESPnetDataset


			# Lower-cased file extensions accepted for entries whose data type is
			# "sound". The loading code raises NotImplementedError for any other
			# extension, and switches "pcm" files to the "pcm" loader instead of
			# the generic "sound" loader.
			SUPPORT_AUDIO_TYPE_SETS = ['flac', 'mp3', 'ogg', 'opus', 'wav', 'pcm']

			def load_kaldi(input):
			retval = kaldiio.load_mat(input)
			@@ -58,9 +61,14 @@
			array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
			return array

			def load_pcm(input):
			    """Read a raw PCM file from disk and decode it with ``load_bytes``.

			    Args:
			        input: Path of the PCM file; it is opened in binary mode.

			    Returns:
			        Whatever ``load_bytes`` returns for the file's full contents.
			    """
			    # Read the whole file first; decoding only needs the in-memory buffer,
			    # so the handle can be released before ``load_bytes`` runs.
			    with open(input, "rb") as pcm_file:
			        raw = pcm_file.read()
			    return load_bytes(raw)

			DATA_TYPES = {
			"sound": lambda x: soundfile.read(x)[0],
			"sound": lambda x: torchaudio.load(x)[0][0].numpy(),
			"pcm": load_pcm,
			"kaldi_ark": load_kaldi,
			"bytes": load_bytes,
			"waveform": lambda x: x,
			@@ -98,6 +106,7 @@
			[str, Dict[str, np.ndarray]], Dict[str, np.ndarray]
			] = None,
			float_dtype: str = "float32",
			fs: dict = None,
			int_dtype: str = "long",
			key_file: str = None,
			):
			@@ -113,12 +122,13 @@
			self.float_dtype = float_dtype
			self.int_dtype = int_dtype
			self.key_file = key_file
			self.fs = fs

			self.debug_info = {}
			non_iterable_list = []
			self.path_name_type_list = []

			if not isinstance(path_name_type_list[0], Tuple):
			if not isinstance(path_name_type_list[0], (Tuple, List)):
			path = path_name_type_list[0]
			name = path_name_type_list[1]
			_type = path_name_type_list[2]
			@@ -172,6 +182,15 @@
			_type = self.path_name_type_list[0][2]
			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array

			if self.preprocess is not None:
			@@ -201,8 +220,25 @@
			uid = os.path.basename(self.path_name_type_list[0][0]).split(".")[0]
			name = self.path_name_type_list[0][1]
			_type = self.path_name_type_list[0][2]
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			_type = "pcm"

			func = DATA_TYPES[_type]
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array

			if self.preprocess is not None:
			@@ -286,9 +322,25 @@
			data = {}
			# 2.a. Load data streamingly
			for value, (path, name, _type) in zip(values, self.path_name_type_list):
			if _type == "sound":
			audio_type = os.path.basename(value).split(".")[1].lower()
			if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
			raise NotImplementedError(
			f'Not supported audio type: {audio_type}')
			if audio_type == "pcm":
			_type = "pcm"
			func = DATA_TYPES[_type]
			# Load entry
			array = func(value)
			if self.fs is not None and name == "speech":
			audio_fs = self.fs["audio_fs"]
			model_fs = self.fs["model_fs"]
			if audio_fs is not None and model_fs is not None:
			array = torch.from_numpy(array)
			array = array.unsqueeze(0)
			array = torchaudio.transforms.Resample(orig_freq=audio_fs,
			new_freq=model_fs)(array)
			array = array.squeeze(0).numpy()
			data[name] = array
			if self.non_iterable_dataset is not None:
			# 2.b. Load data from non-iterable dataset