python/FunASR-XL.git

			@@ -8,75 +8,78 @@
			import librosa
			import numpy as np

			from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
			OrtInferSession, TokenIDConverter, get_logger,
			read_yaml)
			from .utils.utils import (
			CharTokenizer,
			Hypothesis,
			ONNXRuntimeError,
			OrtInferSession,
			TokenIDConverter,
			get_logger,
			read_yaml,
			)
			from .utils.postprocess_utils import sentence_postprocess
			from .utils.frontend import WavFrontendOnline, SinusoidalPositionEncoderOnline

			logging = get_logger()


			class Paraformer():
			def __init__(self, model_dir: Union[str, Path] = None,
			class Paraformer:
			def __init__(
			self,
			model_dir: Union[str, Path] = None,
			batch_size: int = 1,
			chunk_size: List = [5, 10, 5],
			device_id: Union[str, int] = "-1",
			quantize: bool = False,
			intra_op_num_threads: int = 4,
			cache_dir: str = None,
			**kwargs
			**kwargs,
			):

			if not Path(model_dir).exists():
			try:
			from modelscope.hub.snapshot_download import snapshot_download
			except:
			raise "You are exporting model from modelscope, please install modelscope and try it again. To install modelscope, you could:\n" \
			"\npip3 install -U modelscope\n" \
			"For the users in China, you could install with the command:\n" \
			"\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
			raise "You are exporting model from modelscope, please install modelscope and try it again. To install modelscope, you could:\n" "\npip3 install -U modelscope\n" "For the users in China, you could install with the command:\n" "\npip3 install -U modelscope -i https://mirror.sjtu.edu.cn/pypi/web/simple"
			try:
			model_dir = snapshot_download(model_dir, cache_dir=cache_dir)
			except:
			raise "model_dir must be model_name in modelscope or local path downloaded from modelscope, but is {}".format(model_dir)
			raise "model_dir must be model_name in modelscope or local path downloaded from modelscope, but is {}".format(
			model_dir
			)

			encoder_model_file = os.path.join(model_dir, 'model.onnx')
			decoder_model_file = os.path.join(model_dir, 'decoder.onnx')
			encoder_model_file = os.path.join(model_dir, "model.onnx")
			decoder_model_file = os.path.join(model_dir, "decoder.onnx")
			if quantize:
			encoder_model_file = os.path.join(model_dir, 'model_quant.onnx')
			decoder_model_file = os.path.join(model_dir, 'decoder_quant.onnx')
			encoder_model_file = os.path.join(model_dir, "model_quant.onnx")
			decoder_model_file = os.path.join(model_dir, "decoder_quant.onnx")
			if not os.path.exists(encoder_model_file) or not os.path.exists(decoder_model_file):
			print(".onnx is not exist, begin to export onnx")
			try:
			from funasr import AutoModel
			except:
			raise "You are exporting onnx, please install funasr and try it again. To install funasr, you could:\n" \
			"\npip3 install -U funasr\n" \
			"For the users in China, you could install with the command:\n" \
			"\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"
			raise "You are exporting onnx, please install funasr and try it again. To install funasr, you could:\n" "\npip3 install -U funasr\n" "For the users in China, you could install with the command:\n" "\npip3 install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple"

			model = AutoModel(model=model_dir)
			model_dir = model.export(type="onnx", quantize=quantize, **kwargs)

			config_file = os.path.join(model_dir, 'config.yaml')
			cmvn_file = os.path.join(model_dir, 'am.mvn')
			config_file = os.path.join(model_dir, "config.yaml")
			cmvn_file = os.path.join(model_dir, "am.mvn")
			config = read_yaml(config_file)
			token_list = os.path.join(model_dir, 'tokens.json')
			with open(token_list, 'r', encoding='utf-8') as f:
			token_list = os.path.join(model_dir, "tokens.json")
			with open(token_list, "r", encoding="utf-8") as f:
			token_list = json.load(f)

			self.converter = TokenIDConverter(token_list)
			self.tokenizer = CharTokenizer()
			self.frontend = WavFrontendOnline(
			cmvn_file=cmvn_file,
			**config['frontend_conf']
			)
			self.frontend = WavFrontendOnline(cmvn_file=cmvn_file, **config["frontend_conf"])
			self.pe = SinusoidalPositionEncoderOnline()
			self.ort_encoder_infer = OrtInferSession(encoder_model_file, device_id,
			intra_op_num_threads=intra_op_num_threads)
			self.ort_decoder_infer = OrtInferSession(decoder_model_file, device_id,
			intra_op_num_threads=intra_op_num_threads)
			self.ort_encoder_infer = OrtInferSession(
			encoder_model_file, device_id, intra_op_num_threads=intra_op_num_threads
			)
			self.ort_decoder_infer = OrtInferSession(
			decoder_model_file, device_id, intra_op_num_threads=intra_op_num_threads
			)
			self.batch_size = batch_size
			self.chunk_size = chunk_size
			self.encoder_output_size = config["encoder_conf"]["output_size"]
			@@ -95,7 +98,9 @@
			cache["cif_alphas"] = np.zeros((batch_size, 1)).astype(np.float32)
			cache["chunk_size"] = self.chunk_size
			cache["last_chunk"] = False
			cache["feats"] = np.zeros((batch_size, self.chunk_size[0] + self.chunk_size[2], self.feats_dims)).astype(np.float32)
			cache["feats"] = np.zeros(
			(batch_size, self.chunk_size[0] + self.chunk_size[2], self.feats_dims)
			).astype(np.float32)
			cache["decoder_fsmn"] = []
			for i in range(self.fsmn_layer):
			fsmn_cache = np.zeros((batch_size, self.fsmn_dims, self.fsmn_lorder)).astype(np.float32)
			@@ -118,9 +123,9 @@

			def __call__(self, audio_in: np.ndarray, **kwargs):
			waveforms = np.expand_dims(audio_in, axis=0)
			param_dict = kwargs.get('param_dict', dict())
			is_final = param_dict.get('is_final', False)
			cache = param_dict.get('cache', dict())
			param_dict = kwargs.get("param_dict", dict())
			is_final = param_dict.get("is_final", False)
			cache = param_dict.get("cache", dict())
			asr_res = []

			if waveforms.shape[1] < 16 * 60 and is_final and len(cache) > 0:
			@@ -151,7 +156,10 @@

			# last chunk
			cache["last_chunk"] = True
			feats_chunk2 = self.add_overlap_chunk(feats[:, -(feats.shape[1] + self.chunk_size[2] - self.chunk_size[1]):, :], cache)
			feats_chunk2 = self.add_overlap_chunk(
			feats[:, -(feats.shape[1] + self.chunk_size[2] - self.chunk_size[1]) :, :],
			cache,
			)
			feats_len = np.array([feats_chunk2.shape[1]]).astype(np.int32)
			asr_res_chunk2 = self.infer(feats_chunk2, feats_len, cache)

			@@ -188,17 +196,18 @@
			dec_input.extend(cache["decoder_fsmn"])
			dec_output = self.ort_decoder_infer(dec_input)
			logits, sample_ids, cache["decoder_fsmn"] = dec_output[0], dec_output[1], dec_output[2:]
			cache["decoder_fsmn"] = [item[:, :, -self.fsmn_lorder:] for item in cache["decoder_fsmn"]]
			cache["decoder_fsmn"] = [
			item[:, :, -self.fsmn_lorder :] for item in cache["decoder_fsmn"]
			]

			preds = self.decode(logits, acoustic_embeds_len)
			for pred in preds:
			pred = sentence_postprocess(pred)
			asr_res.append({'preds': pred})
			asr_res.append({"preds": pred})

			return asr_res

			def load_data(self,
			wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
			def load_data(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
			def load_wav(path: str) -> np.ndarray:
			waveform, _ = librosa.load(path, sr=fs)
			return waveform
			@@ -212,11 +221,10 @@
			if isinstance(wav_content, list):
			return [load_wav(path) for path in wav_content]

			raise TypeError(
			f'The type of {wav_content} is not in [str, np.ndarray, list]')
			raise TypeError(f"The type of {wav_content} is not in [str, np.ndarray, list]")

			def extract_feat(self,
			waveforms: np.ndarray, is_final: bool = False
			def extract_feat(
			self, waveforms: np.ndarray, is_final: bool = False
			) -> Tuple[np.ndarray, np.ndarray]:
			waveforms_lens = np.zeros(waveforms.shape[0]).astype(np.int32)
			for idx, waveform in enumerate(waveforms):
			@@ -226,12 +234,12 @@
			return feats.astype(np.float32), feats_len.astype(np.int32)

			def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
			return [self.decode_one(am_score, token_num)
			for am_score, token_num in zip(am_scores, token_nums)]
			return [
			self.decode_one(am_score, token_num)
			for am_score, token_num in zip(am_scores, token_nums)
			]

			def decode_one(self,
			am_score: np.ndarray,
			valid_token_num: int) -> List[str]:
			def decode_one(self, am_score: np.ndarray, valid_token_num: int) -> List[str]:
			yseq = am_score.argmax(axis=-1)
			score = am_score.max(axis=-1)
			score = np.sum(score, axis=-1)
			@@ -317,5 +325,6 @@
			cache["cif_hidden"] = np.stack(cache_hiddens, axis=0)
			cache["cif_hidden"] = np.expand_dims(cache["cif_hidden"], axis=0)

			return np.stack(list_ls, axis=0).astype(np.float32), np.stack(token_length, axis=0).astype(np.int32)

			return np.stack(list_ls, axis=0).astype(np.float32), np.stack(token_length, axis=0).astype(
			np.int32
			)