From 4a7a984a5f3e3f894f86ce82e76ddd13d8a42a20 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Mon, 11 Mar 2024 17:56:30 +0800
Subject: [PATCH] Dev gzf (#1465)

---
 funasr/models/fsmn_vad_streaming/model.py |  100 +++++++++++++++++++++++++++++++++++++++++++-------
 1 file changed, 86 insertions(+), 14 deletions(-)

diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py
index 76eee81..c3063b0 100644
--- a/funasr/models/fsmn_vad_streaming/model.py
+++ b/funasr/models/fsmn_vad_streaming/model.py
@@ -284,6 +284,7 @@
 		encoder_class = tables.encoder_classes.get(encoder)
 		encoder = encoder_class(**encoder_conf)
 		self.encoder = encoder
+		self.encoder_conf = encoder_conf
 	
 	def ResetDetection(self, cache: dict = {}):
 		cache["stats"].continous_silence_frame_count = 0
@@ -482,13 +483,17 @@
 		
 		return frame_state
 	
-	def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
-	            is_final: bool = False
+	def forward(self, feats: torch.Tensor,
+	            waveform: torch.tensor,
+	            cache: dict = {},
+	            is_final: bool = False,
+	            **kwargs,
 	            ):
 		# if len(cache) == 0:
 		#     self.AllResetDetection()
 		# self.waveform = waveform  # compute decibel for each frame
 		cache["stats"].waveform = waveform
+		is_streaming_input = kwargs.get("is_streaming_input", True)
 		self.ComputeDecibel(cache=cache)
 		self.ComputeScores(feats, cache=cache)
 		if not is_final:
@@ -500,13 +505,32 @@
 			segment_batch = []
 			if len(cache["stats"].output_data_buf) > 0:
 				for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
-					if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
-					cache["stats"].output_data_buf[
-						i].contain_seg_end_point):
-						continue
-					segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+					if is_streaming_input: # in this case, return [beg, -1], [], [-1, end], [beg, end]
+						if not cache["stats"].output_data_buf[i].contain_seg_start_point:
+							continue
+						if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
+							continue
+						start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
+						if cache["stats"].output_data_buf[i].contain_seg_end_point:
+							end_ms = cache["stats"].output_data_buf[i].end_ms
+							cache["stats"].next_seg = True
+							cache["stats"].output_data_buf_offset += 1
+						else:
+							end_ms = -1
+							cache["stats"].next_seg = False
+						segment = [start_ms, end_ms]
+						
+					else: # in this case, return [beg, end]
+						
+						if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
+						cache["stats"].output_data_buf[
+							i].contain_seg_end_point):
+							continue
+						segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+						cache["stats"].output_data_buf_offset += 1  # need update this parameter
+					
 					segment_batch.append(segment)
-					cache["stats"].output_data_buf_offset += 1  # need update this parameter
+					
 			if segment_batch:
 				segments.append(segment_batch)
 		# if is_final:
@@ -551,7 +575,9 @@
 		chunk_stride_samples = int(chunk_size * frontend.fs / 1000)
 		
 		time1 = time.perf_counter()
-		cfg = {"is_final": kwargs.get("is_final", False)}
+		is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)
+		is_final = kwargs.get("is_final", False) if is_streaming_input else kwargs.get("is_final", True)
+		cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input}
 		audio_sample_list = load_audio_text_image_video(data_in,
 		                                                fs=frontend.fs,
 		                                                audio_fs=kwargs.get("fs", 16000),
@@ -560,7 +586,7 @@
 		                                                cache=cfg,
 		                                                )
 		_is_final = cfg["is_final"]  # if data_in is a file or url, set is_final=True
-		
+		is_streaming_input = cfg["is_streaming_input"]
 		time2 = time.perf_counter()
 		meta_data["load_data"] = f"{time2 - time1:0.3f}"
 		assert len(audio_sample_list) == 1, "batch_size must be set 1"
@@ -588,7 +614,8 @@
 				"feats": speech,
 				"waveform": cache["frontend"]["waveforms"],
 				"is_final": kwargs["is_final"],
-				"cache": cache
+				"cache": cache,
+				"is_streaming_input": is_streaming_input
 			}
 			segments_i = self.forward(**batch)
 			if len(segments_i) > 0:
@@ -599,9 +626,10 @@
 			self.init_cache(cache)
 		
 		ibest_writer = None
-		if ibest_writer is None and kwargs.get("output_dir") is not None:
-			writer = DatadirWriter(kwargs.get("output_dir"))
-			ibest_writer = writer[f"{1}best_recog"]
+		if kwargs.get("output_dir") is not None:
+			if not hasattr(self, "writer"):
+				self.writer = DatadirWriter(kwargs.get("output_dir"))
+			ibest_writer = self.writer[f"{1}best_recog"]
 		
 		results = []
 		result_i = {"key": key[0], "value": segments}
@@ -615,6 +643,50 @@
 		
 		return results, meta_data
 	
+	def export(self, **kwargs):
+		is_onnx = kwargs.get("type", "onnx") == "onnx"
+		encoder_class = tables.encoder_classes.get(kwargs["encoder"] + "Export")
+		self.encoder = encoder_class(self.encoder, onnx=is_onnx)
+		self.forward = self._export_forward
+		
+		return self
+		
+	def _export_forward(self, feats: torch.Tensor, *args, **kwargs):
+		
+		scores, out_caches = self.encoder(feats, *args)
+		
+		return scores, out_caches
+	
+	def export_dummy_inputs(self, data_in=None, frame=30):
+		if data_in is None:
+			speech = torch.randn(1, frame, self.encoder_conf.get("input_dim"))
+		else:
+			speech = None # Undo
+		
+		cache_frames = self.encoder_conf.get("lorder") + self.encoder_conf.get("rorder") - 1
+		in_cache0 = torch.randn(1, self.encoder_conf.get("proj_dim"), cache_frames, 1)
+		in_cache1 = torch.randn(1, self.encoder_conf.get("proj_dim"), cache_frames, 1)
+		in_cache2 = torch.randn(1, self.encoder_conf.get("proj_dim"), cache_frames, 1)
+		in_cache3 = torch.randn(1, self.encoder_conf.get("proj_dim"), cache_frames, 1)
+		
+		return (speech, in_cache0, in_cache1, in_cache2, in_cache3)
+	
+	def export_input_names(self):
+		return ['speech', 'in_cache0', 'in_cache1', 'in_cache2', 'in_cache3']
+	
+	def export_output_names(self):
+		return ['logits', 'out_cache0', 'out_cache1', 'out_cache2', 'out_cache3']
+	
+	def export_dynamic_axes(self):
+		return {
+			'speech': {
+				1: 'feats_length'
+			},
+		}
+	
+	def export_name(self, ):
+		return "model.onnx"
+	
 	def DetectCommonFrames(self, cache: dict = {}) -> int:
 		if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
 			return 0

--
Gitblit v1.9.1