From 320c7ff2c2dfbce13ee01589a64b515bf2d7857b Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Fri, 29 Mar 2024 12:30:35 +0800
Subject: [PATCH] Dev gzf new (#1562)
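
Add streaming/offline-aware segment output to the FSMN VAD streaming
model:

- forward() takes an is_streaming_input kwarg. For streaming input it
  can emit partial segments ([beg, -1] and [-1, end]) as well as
  complete [beg, end] segments; for non-streaming input it emits only
  complete [beg, end] segments until is_final.
- inference() derives a default for is_streaming_input from chunk_size
  (chunks of 15000 ms or more are treated as non-streaming) and, for
  non-streaming input, defaults is_final to True.
- init_cache() accepts an optional max_end_silence_time kwarg that
  overrides vad_opts.max_end_silence_time.
- Cache the DatadirWriter on the model instead of recreating it per
  call, disable the EAS-specific JSON serialization of results, store
  encoder_conf on the model, and add an export() hook that delegates
  to export_meta.export_rebuild_model().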
---
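Reviewer note: a minimal usage sketch of the new kwargs, assuming the
funasr AutoModel front end; the model id, file name, and chunk size are
illustrative, and passing max_end_silence_time through generate()
assumes kwargs are forwarded down to init_cache().

    from funasr import AutoModel
    import soundfile

    model = AutoModel(model="fsmn-vad")  # illustrative model id

    # Non-streaming input: one pass over the whole file; only complete
    # [beg, end] segments come back.
    res = model.generate(input="speech.wav", is_streaming_input=False)

    # Streaming input: feed fixed-size chunks with a shared cache; partial
    # segments [beg, -1] / [-1, end] may come back before the final chunk.
    wav, fs = soundfile.read("speech.wav")
    chunk_ms = 200
    step = int(fs * chunk_ms / 1000)
    cache = {}
    for i in range(0, len(wav), step):
        res = model.generate(input=wav[i:i + step], cache=cache,
                             chunk_size=chunk_ms,
                             is_final=(i + step >= len(wav)),
                             is_streaming_input=True)
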
funasr/models/fsmn_vad_streaming/model.py | 71 +++++++++++++++++++++++++++--------
 1 file changed, 55 insertions(+), 16 deletions(-)
diff --git a/funasr/models/fsmn_vad_streaming/model.py b/funasr/models/fsmn_vad_streaming/model.py
index 76eee81..f1d4161 100644
--- a/funasr/models/fsmn_vad_streaming/model.py
+++ b/funasr/models/fsmn_vad_streaming/model.py
@@ -284,6 +284,7 @@
encoder_class = tables.encoder_classes.get(encoder)
encoder = encoder_class(**encoder_conf)
self.encoder = encoder
+ self.encoder_conf = encoder_conf
def ResetDetection(self, cache: dict = {}):
cache["stats"].continous_silence_frame_count = 0
@@ -482,13 +483,17 @@
return frame_state
- def forward(self, feats: torch.Tensor, waveform: torch.tensor, cache: dict = {},
- is_final: bool = False
+ def forward(self, feats: torch.Tensor,
+                waveform: torch.Tensor,
+ cache: dict = {},
+ is_final: bool = False,
+ **kwargs,
):
# if len(cache) == 0:
# self.AllResetDetection()
# self.waveform = waveform # compute decibel for each frame
cache["stats"].waveform = waveform
+ is_streaming_input = kwargs.get("is_streaming_input", True)
self.ComputeDecibel(cache=cache)
self.ComputeScores(feats, cache=cache)
if not is_final:
@@ -500,13 +505,32 @@
segment_batch = []
if len(cache["stats"].output_data_buf) > 0:
for i in range(cache["stats"].output_data_buf_offset, len(cache["stats"].output_data_buf)):
- if not is_final and (not cache["stats"].output_data_buf[i].contain_seg_start_point or not
- cache["stats"].output_data_buf[
- i].contain_seg_end_point):
- continue
- segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+ if is_streaming_input: # in this case, return [beg, -1], [], [-1, end], [beg, end]
+ if not cache["stats"].output_data_buf[i].contain_seg_start_point:
+ continue
+ if not cache["stats"].next_seg and not cache["stats"].output_data_buf[i].contain_seg_end_point:
+ continue
+ start_ms = cache["stats"].output_data_buf[i].start_ms if cache["stats"].next_seg else -1
+ if cache["stats"].output_data_buf[i].contain_seg_end_point:
+ end_ms = cache["stats"].output_data_buf[i].end_ms
+ cache["stats"].next_seg = True
+ cache["stats"].output_data_buf_offset += 1
+ else:
+ end_ms = -1
+ cache["stats"].next_seg = False
+ segment = [start_ms, end_ms]
+
+ else: # in this case, return [beg, end]
+
+                        if not is_final and (
+                                not cache["stats"].output_data_buf[i].contain_seg_start_point or
+                                not cache["stats"].output_data_buf[i].contain_seg_end_point):
+ continue
+ segment = [cache["stats"].output_data_buf[i].start_ms, cache["stats"].output_data_buf[i].end_ms]
+                        cache["stats"].output_data_buf_offset += 1  # advance past the consumed segment
+
segment_batch.append(segment)
- cache["stats"].output_data_buf_offset += 1 # need update this parameter
+
if segment_batch:
segments.append(segment_batch)
# if is_final:
@@ -519,6 +543,11 @@
cache["frontend"] = {}
cache["prev_samples"] = torch.empty(0)
cache["encoder"] = {}
+
+        if kwargs.get("max_end_silence_time") is not None:
+            # allow callers to override the configured max_end_silence_time
+            self.vad_opts.max_end_silence_time = kwargs.get("max_end_silence_time")
+
windows_detector = WindowDetector(self.vad_opts.window_size_ms,
self.vad_opts.sil_to_speech_time_thres,
self.vad_opts.speech_to_sil_time_thres,
@@ -551,7 +580,9 @@
chunk_stride_samples = int(chunk_size * frontend.fs / 1000)
time1 = time.perf_counter()
- cfg = {"is_final": kwargs.get("is_final", False)}
+        is_streaming_input = kwargs.get("is_streaming_input", False) if chunk_size >= 15000 else kwargs.get("is_streaming_input", True)  # chunks of >= 15000 ms default to non-streaming input
+        is_final = kwargs.get("is_final", False) if is_streaming_input else kwargs.get("is_final", True)  # non-streaming input defaults to is_final=True
+ cfg = {"is_final": is_final, "is_streaming_input": is_streaming_input}
audio_sample_list = load_audio_text_image_video(data_in,
fs=frontend.fs,
audio_fs=kwargs.get("fs", 16000),
@@ -560,7 +591,7 @@
cache=cfg,
)
_is_final = cfg["is_final"] # if data_in is a file or url, set is_final=True
-
+ is_streaming_input = cfg["is_streaming_input"]
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
assert len(audio_sample_list) == 1, "batch_size must be set 1"
@@ -588,7 +619,8 @@
"feats": speech,
"waveform": cache["frontend"]["waveforms"],
"is_final": kwargs["is_final"],
- "cache": cache
+ "cache": cache,
+ "is_streaming_input": is_streaming_input
}
segments_i = self.forward(**batch)
if len(segments_i) > 0:
@@ -599,14 +631,15 @@
self.init_cache(cache)
ibest_writer = None
- if ibest_writer is None and kwargs.get("output_dir") is not None:
- writer = DatadirWriter(kwargs.get("output_dir"))
- ibest_writer = writer[f"{1}best_recog"]
+ if kwargs.get("output_dir") is not None:
+ if not hasattr(self, "writer"):
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
+ ibest_writer = self.writer[f"{1}best_recog"]
results = []
result_i = {"key": key[0], "value": segments}
- if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
- result_i = json.dumps(result_i)
+ # if "MODELSCOPE_ENVIRONMENT" in os.environ and os.environ["MODELSCOPE_ENVIRONMENT"] == "eas":
+ # result_i = json.dumps(result_i)
results.append(result_i)
@@ -615,6 +648,12 @@
return results, meta_data
+ def export(self, **kwargs):
+
+ from .export_meta import export_rebuild_model
+ models = export_rebuild_model(model=self, **kwargs)
+ return models
+
def DetectCommonFrames(self, cache: dict = {}) -> int:
if cache["stats"].vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
return 0
--
Gitblit v1.9.1