python/FunASR-XL.git

			@@ -284,6 +284,7 @@
			encoder_class = tables.encoder_classes.get(encoder)
			encoder = encoder_class(**encoder_conf)
			self.encoder = encoder
			self.encoder_conf = encoder_conf

			def ResetDetection(self, cache: dict = {}):
			cache["stats"].continous_silence_frame_count = 0
			@@ -482,13 +483,17 @@

			return frame_state

			def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
			is_final: bool = False
			def forward(self, feats: torch.Tensor,
			waveform: torch.tensor,
			cache: dict = {},
			is_final: bool = False,
			**kwargs,
			):
			# if len(cache) == 0:
			# self.AllResetDetection()
			# self.waveform = waveform # compute decibel for each frame
			cache["stats"].waveform = waveform
			is_streaming_input = kwargs.get("is_streaming_input", True)
			self.ComputeDecibel(cache=cache)
			self.ComputeScores(feats, cache=cache)
			if not is_final:
			@@ -500,13 +505,32 @@
			segment_batch = []
			if len(cache["stats"].output_data_buf) > 0:
			for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
			if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
			cache["stats"].output_data_buf[
			i].contain_seg_end_point):
			continue
			segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
			if is_streaming_input: # in this case, return [beg, -1], [], [-1, end], [beg, end]
			if not cache["stats"].output_data_buf[i].contain_seg_start_point:
			continue
			if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
			continue
			start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
			if cache["stats"].output_data_buf[i].contain_seg_end_point:
			end_ms = cache["stats"].output_data_buf[i].end_ms
			cache["stats"].next_seg = True
			cache["stats"].output_data_buf_offset += 1
			else:
			end_ms = -1
			cache["stats"].next_seg = False
			segment = [start_ms, end_ms]

			else: # in this case, return [beg, end]

			if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
			cache["stats"].output_data_buf[
			i].contain_seg_end_point):
			continue
			segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
			cache["stats"].output_data_buf_offset += 1 # need update this parameter

			segment_batch.append(segment)
			cache["stats"].output_data_buf_offset += 1 # need update this parameter

			if segment_batch:
			segments.append(segment_batch)
			# if is_final:
			@@ -519,6 +543,11 @@
			cache["frontend"] = {}
			cache["prev_samples"] = torch.empty(0)
			cache["encoder"] = {}

			if kwargs.get("max_end_silence_time") is not None:
			# update the max_end_silence_time
			self.vad_opts.max_end_silence_time = kwargs.get("max_end_silence_time")

			windows_detector = WindowDetector(self.vad_opts.window_size_ms,
			self.vad_opts.sil_to_speech_time_thres,
			self.vad_opts.speech_to_sil_time_thres,
			@@ -551,7 +580,9 @@
			chunk_stride_samples = int(chunk_size * frontend.fs / 1000)

			time1 = time.perf_counter()
			cfg = {"is_final": kwargs.get("is_final", False)}
			is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)
			is_final = kwargs.get("is_final", False) if is_streaming_input else kwargs.get("is_final", True)
			cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input}
			audio_sample_list = load_audio_text_image_video(data_in,
			fs=frontend.fs,
			audio_fs=kwargs.get("fs", 16000),
			@@ -560,7 +591,7 @@
			cache=cfg,
			)
			_is_final = cfg["is_final"] # if data_in is a file or url, set is_final=True

			is_streaming_input = cfg["is_streaming_input"]
			time2 = time.perf_counter()
			meta_data["load_data"] = f"{time2 - time1:0.3f}"
			assert len(audio_sample_list) == 1, "batch_size must be set 1"
			@@ -588,7 +619,8 @@
			"feats": speech,
			"waveform": cache["frontend"]["waveforms"],
			"is_final": kwargs["is_final"],
			"cache": cache
			"cache": cache,
			"is_streaming_input": is_streaming_input
			}
			segments_i = self.forward(**batch)
			if len(segments_i) > 0:
			@@ -599,14 +631,15 @@
			self.init_cache(cache)

			ibest_writer = None
			if ibest_writer is None and kwargs.get("output_dir") is not None:
			writer = DatadirWriter(kwargs.get("output_dir"))
			ibest_writer = writer[f"{1}best_recog"]
			if kwargs.get("output_dir") is not None:
			if not hasattr(self, "writer"):
			self.writer = DatadirWriter(kwargs.get("output_dir"))
			ibest_writer = self.writer[f"{1}best_recog"]

			results = []
			result_i = {"key": key[0], "value": segments}
			if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
			result_i = json.dumps(result_i)
			# if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
			# result_i = json.dumps(result_i)

			results.append(result_i)

			@@ -615,6 +648,12 @@

			return results, meta_data

			def export(self, **kwargs):
				"""Build the export-ready form of this model.

				This instance is handed to ``export_rebuild_model`` (from the
				sibling ``export_meta`` module) as ``model``, and every keyword
				argument received here is forwarded to it unchanged.

				Returns:
					Whatever ``export_rebuild_model`` returns.
				"""
				# The helper is imported at call time, inside the method.
				from .export_meta import export_rebuild_model

				return export_rebuild_model(model=self, **kwargs)

			def DetectCommonFrames(self, cache: dict = {}) -> int:
			if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
			return 0