游雁
2023-10-10 580b11b57ac4b62f7e2acda73813a4e10e8e4cd3
funasr/bin/asr_inference_launch.py
@@ -55,6 +55,7 @@
                                        distribute_spk)
from funasr.build_utils.build_model_from_file import build_model_from_file
from funasr.utils.cluster_backend import ClusterBackend
from funasr.utils.modelscope_utils import get_cache_dir
from tqdm import tqdm
def inference_asr(
@@ -498,6 +499,7 @@
):
    ncpu = kwargs.get("ncpu", 1)
    torch.set_num_threads(ncpu)
    language = kwargs.get("model_lang", None)
    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
@@ -704,10 +706,13 @@
            text, token, token_int = result[0], result[1], result[2]
            time_stamp = result[4] if len(result[4]) > 0 else None
            if language == "en-bpe":
                postprocessed_result = postprocess_utils.sentence_postprocess_sentencepiece(token)
            else:
                if use_timestamp and time_stamp is not None and len(time_stamp):
                    postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                else:
                    postprocessed_result = postprocess_utils.sentence_postprocess(token)
            text_postprocessed = ""
            time_stamp_postprocessed = ""
            text_postprocessed_punc = postprocessed_result
@@ -787,7 +792,7 @@
        time_stamp_writer: bool = True,
        punc_infer_config: Optional[str] = None,
        punc_model_file: Optional[str] = None,
        sv_model_file: Optional[str] = None,
        streaming: bool = False,
        embedding_node: str = "resnet1_dense",
        sv_threshold: float = 0.9465,
@@ -808,6 +813,9 @@
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )
    if sv_model_file is None:
        sv_model_file = "{}/damo/speech_paraformer-large-vad-punc-spk_asr_nat-zh-cn/campplus_cn_common.bin".format(get_cache_dir(None))
    if param_dict is not None:
        hotword_list_or_file = param_dict.get('hotword')
@@ -1084,7 +1092,6 @@
            logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
        torch.cuda.empty_cache()
        distribute_spk(asr_result_list[0]['sentences'], sv_output)
        return asr_result_list
    return _forward
@@ -2030,7 +2037,7 @@
        return inference_paraformer(**kwargs)
    elif mode == "paraformer_streaming":
        return inference_paraformer_online(**kwargs)
    elif mode.startswith("paraformer_vad_speaker"):
        return inference_paraformer_vad_speaker(**kwargs)
    elif mode.startswith("paraformer_vad"):
        return inference_paraformer_vad_punc(**kwargs)