python/FunASR-XL.git

			@@ -19,6 +19,7 @@
			import numpy as np
			import torch
			import torchaudio
			import soundfile
			import yaml
			from typeguard import check_argument_types

			@@ -259,6 +260,7 @@
			export_mode = param_dict.get("export_mode", False)
			else:
			hotword_list_or_file = None
			clas_scale = param_dict.get('clas_scale', 1.0)

			if kwargs.get("device", None) == "cpu":
			ngpu = 0
			@@ -291,6 +293,7 @@
			penalty=penalty,
			nbest=nbest,
			hotword_list_or_file=hotword_list_or_file,
			clas_scale=clas_scale,
			)

			speech2text = Speech2TextParaformer(**speech2text_kwargs)
			@@ -863,7 +866,13 @@
			raw_inputs = _load_bytes(data_path_and_name_and_type[0])
			raw_inputs = torch.tensor(raw_inputs)
			# Sound-file input: index [2] of the tuple is compared against "sound" and
			# index [0] is what gets handed to the audio loaders below.
			if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
			# NOTE(review): this unguarded load is identical to the one inside the
			# `try` below; it looks like the pre-change line of a diff hunk whose
			# +/- markers were lost -- confirm which of the two is meant to remain.
			raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
			# First attempt: load through torchaudio and keep index [0] of the first
			# returned value (presumably the first channel of the waveform -- confirm).
			try:
			raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
			# NOTE(review): a bare `except:` also traps KeyboardInterrupt/SystemExit
			# and discards the torchaudio error; consider narrowing it.
			except:
			# Fallback: read the same path with soundfile as float32; [0] keeps the
			# first element of what soundfile.read returns (presumably the samples --
			# confirm).
			raw_inputs = soundfile.read(data_path_and_name_and_type[0], dtype='float32')[0]
			# Two-dimensional data is reduced to column 0 (presumably one channel of
			# a multi-channel file -- confirm).
			if raw_inputs.ndim == 2:
			raw_inputs = raw_inputs[:, 0]
			# NOTE(review): indentation is lost in this view, so it is unclear whether
			# this conversion belongs to the fallback branch only; if it also runs
			# after a successful torchaudio load it re-wraps an existing tensor --
			# verify against the real file.
			raw_inputs = torch.tensor(raw_inputs)
			if data_path_and_name_and_type is None and raw_inputs is not None:
			if isinstance(raw_inputs, np.ndarray):
			raw_inputs = torch.tensor(raw_inputs)