From 2cdb2d654f2109ef4e648bae6f169143e267e5db Mon Sep 17 00:00:00 2001
From: zhuzizyf <42790740+zhuzizyf@users.noreply.github.com>
Date: Sat, 11 Mar 2023 14:33:14 +0800
Subject: [PATCH] Update dataset.py

---
 funasr/models/e2e_vad.py |   55 +++++++++++++++++++++----------------------------------
 1 file changed, 21 insertions(+), 34 deletions(-)

diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 8afc8db..b9be89a 100755
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -192,7 +192,7 @@
 
 
 class E2EVadModel(nn.Module):
-    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], streaming=False):
+    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any]):
         super(E2EVadModel, self).__init__()
         self.vad_opts = VADXOptions(**vad_post_args)
         self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
@@ -201,7 +201,7 @@
                                                self.vad_opts.frame_in_ms)
         self.encoder = encoder
         # init variables
-        self.is_final_send = False
+        self.is_final = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -227,12 +227,10 @@
         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
-        self.streaming = streaming
         self.ResetDetection()
 
     def AllResetDetection(self):
-        self.encoder.cache_reset()  # reset the in_cache in self.encoder for next query or next long sentence
-        self.is_final_send = False
+        self.is_final = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -284,8 +282,8 @@
                 10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
                                 0.000001))
 
-    def ComputeScores(self, feats: torch.Tensor) -> None:
-        scores = self.encoder(feats)  # return B * T * D
+    def ComputeScores(self, feats: torch.Tensor, in_cache: Dict[str, torch.Tensor]) -> None:
+        scores = self.encoder(feats, in_cache)  # return B * T * D
         assert scores.shape[1] == feats.shape[1], "The shape between feats and scores does not match"
         self.vad_opts.nn_eval_block_size = scores.shape[1]
         self.frm_cnt += scores.shape[1]  # count total frames
@@ -307,7 +305,7 @@
         expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
         if last_frm_is_end_point:
             extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
-                               self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
+                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
             expected_sample_number += int(extra_sample)
         if end_point_is_sent_end:
             expected_sample_number = max(expected_sample_number, len(self.data_buf))
@@ -444,32 +442,33 @@
 
         return frame_state
 
-    def forward(self, feats: torch.Tensor, waveform: torch.tensor, is_final_send: bool = False) -> List[List[List[int]]]:
+    def forward(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = None,
+                is_final: bool = False) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+        in_cache = {} if in_cache is None else in_cache  # new dict per call: a `dict()` default is shared by every call
         self.waveform = waveform  # compute decibel for each frame
         self.ComputeDecibel()
-        self.ComputeScores(feats)
-        if not is_final_send:
+        self.ComputeScores(feats, in_cache)
+        if not is_final:
             self.DetectCommonFrames()
         else:
-            if self.streaming:
-                self.DetectLastFrames()
-            else:
-                self.AllResetDetection()
-                self.DetectAllFrames()  # offline decode and is_final_send == True
+            self.DetectLastFrames()
         segments = []
         for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
             segment_batch = []
             if len(self.output_data_buf) > 0:
                 for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
-                    if self.output_data_buf[i].contain_seg_start_point and self.output_data_buf[
+                    if not self.output_data_buf[i].contain_seg_start_point or not self.output_data_buf[
                         i].contain_seg_end_point:
-                        segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
-                        segment_batch.append(segment)
-                        self.output_data_buf_offset += 1  # need update this parameter
+                        continue
+                    segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
+                    segment_batch.append(segment)
+                    self.output_data_buf_offset += 1  # need update this parameter
             if segment_batch:
                 segments.append(segment_batch)
-
-        return segments
+        if is_final:
+            # reset the instance's detection variables for the next query; in_cache is not cleared here (pass a new dict)
+            self.AllResetDetection()
+        return segments, in_cache
 
     def DetectCommonFrames(self) -> int:
         if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
@@ -492,18 +491,6 @@
             else:
                 self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
 
-        return 0
-
-    def DetectAllFrames(self) -> int:
-        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
-            return 0
-        if self.vad_opts.nn_eval_block_size != self.vad_opts.dcd_block_size:
-            frame_state = FrameState.kFrameStateInvalid
-            for t in range(0, self.frm_cnt):
-                frame_state = self.GetFrameState(t)
-                self.DetectOneFrame(frame_state, t, t == self.frm_cnt - 1)
-        else:
-            pass
         return 0
 
     def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:

--
Gitblit v1.9.1