From 580b11b57ac4b62f7e2acda73813a4e10e8e4cd3 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期二, 10 十月 2023 17:17:29 +0800
Subject: [PATCH] v0.8.0

---
 funasr/models/e2e_vad.py |  130 +++++++++++++++++++++++++++++++++++--------
 1 files changed, 106 insertions(+), 24 deletions(-)

diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
old mode 100755
new mode 100644
index b64c677..7c55b2e
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -5,6 +5,7 @@
 from torch import nn
 import math
 from funasr.models.encoder.fsmn_encoder import FSMN
+from funasr.models.base_model import FunASRModel
 
 
 class VadStateMachine(Enum):
@@ -35,6 +36,11 @@
 
 
 class VADXOptions:
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+    https://arxiv.org/abs/1803.05030
+    """
     def __init__(
             self,
             sample_rate: int = 16000,
@@ -99,6 +105,11 @@
 
 
 class E2EVadSpeechBufWithDoa(object):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+    https://arxiv.org/abs/1803.05030
+    """
     def __init__(self):
         self.start_ms = 0
         self.end_ms = 0
@@ -117,6 +128,11 @@
 
 
 class E2EVadFrameProb(object):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+    https://arxiv.org/abs/1803.05030
+    """
     def __init__(self):
         self.noise_prob = 0.0
         self.speech_prob = 0.0
@@ -126,6 +142,11 @@
 
 
 class WindowDetector(object):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+    https://arxiv.org/abs/1803.05030
+    """
     def __init__(self, window_size_ms: int, sil_to_speech_time: int,
                  speech_to_sil_time: int, frame_size_ms: int):
         self.window_size_ms = window_size_ms
@@ -191,8 +212,13 @@
         return int(self.frame_size_ms)
 
 
-class E2EVadModel(nn.Module):
-    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any]):
+class E2EVadModel(FunASRModel):
+    """
+    Author: Speech Lab of DAMO Academy, Alibaba Group
+    Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+    https://arxiv.org/abs/1803.05030
+    """
+    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], frontend=None):
         super(E2EVadModel, self).__init__()
         self.vad_opts = VADXOptions(**vad_post_args)
         self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
@@ -201,7 +227,6 @@
                                                self.vad_opts.frame_in_ms)
         self.encoder = encoder
         # init variables
-        self.is_final_send = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -215,6 +240,7 @@
         self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
         self.noise_average_decibel = -100.0
         self.pre_end_silence_detected = False
+        self.next_seg = True
 
         self.output_data_buf = []
         self.output_data_buf_offset = 0
@@ -227,11 +253,10 @@
         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
-        self.ResetDetection()
+        self.frontend = frontend
+        self.last_drop_frames = 0
 
     def AllResetDetection(self):
-        self.encoder.cache_reset()  # reset the in_cache in self.encoder for next query or next long sentence
-        self.is_final_send = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -245,6 +270,7 @@
         self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
         self.noise_average_decibel = -100.0
         self.pre_end_silence_detected = False
+        self.next_seg = True
 
         self.output_data_buf = []
         self.output_data_buf_offset = 0
@@ -257,7 +283,8 @@
         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
-        self.ResetDetection()
+        self.last_drop_frames = 0
+        self.windows_detector.Reset()
 
     def ResetDetection(self):
         self.continous_silence_frame_count = 0
@@ -269,6 +296,15 @@
         self.windows_detector.Reset()
         self.sil_frame = 0
         self.frame_probs = []
+
+        if self.output_data_buf:
+            assert self.output_data_buf[-1].contain_seg_end_point == True
+            drop_frames = int(self.output_data_buf[-1].end_ms / self.vad_opts.frame_in_ms)
+            real_drop_frames = drop_frames - self.last_drop_frames
+            self.last_drop_frames = drop_frames
+            self.data_buf_all = self.data_buf_all[real_drop_frames * int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
+            self.decibel = self.decibel[real_drop_frames:]
+            self.scores = self.scores[:, real_drop_frames:, :]
 
     def ComputeDecibel(self) -> None:
         frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
@@ -283,8 +319,8 @@
                 10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
                                 0.000001))
 
-    def ComputeScores(self, feats: torch.Tensor) -> None:
-        scores = self.encoder(feats)  # return B * T * D
+    def ComputeScores(self, feats: torch.Tensor, in_cache: Dict[str, torch.Tensor]) -> None:
+        scores = self.encoder(feats, in_cache).to('cpu')  # return B * T * D
         assert scores.shape[1] == feats.shape[1], "The shape between feats and scores does not match"
         self.vad_opts.nn_eval_block_size = scores.shape[1]
         self.frm_cnt += scores.shape[1]  # count total frames
@@ -297,7 +333,7 @@
         while self.data_buf_start_frame < frame_idx:
             if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                 self.data_buf_start_frame += 1
-                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
+                self.data_buf = self.data_buf_all[(self.data_buf_start_frame - self.last_drop_frames) * int(
                     self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
 
     def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
@@ -306,7 +342,7 @@
         expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
         if last_frm_is_end_point:
             extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
-                               self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
+                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
             expected_sample_number += int(extra_sample)
         if end_point_is_sent_end:
             expected_sample_number = max(expected_sample_number, len(self.data_buf))
@@ -443,11 +479,15 @@
 
         return frame_state
 
-    def forward(self, feats: torch.Tensor, waveform: torch.tensor, is_final_send: bool = False) -> List[List[List[int]]]:
+    def forward(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
+                is_final: bool = False
+                ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+        if not in_cache:
+            self.AllResetDetection()
         self.waveform = waveform  # compute decibel for each frame
         self.ComputeDecibel()
-        self.ComputeScores(feats)
-        if not is_final_send:
+        self.ComputeScores(feats, in_cache)
+        if not is_final:
             self.DetectCommonFrames()
         else:
             self.DetectLastFrames()
@@ -456,23 +496,65 @@
             segment_batch = []
             if len(self.output_data_buf) > 0:
                 for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
-                    if self.output_data_buf[i].contain_seg_start_point and self.output_data_buf[
-                        i].contain_seg_end_point:
-                        segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
-                        segment_batch.append(segment)
-                        self.output_data_buf_offset += 1  # need update this parameter
+                    if not is_final and (not self.output_data_buf[i].contain_seg_start_point or not self.output_data_buf[
+                        i].contain_seg_end_point):
+                        continue
+                    segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
+                    segment_batch.append(segment)
+                    self.output_data_buf_offset += 1  # need update this parameter
             if segment_batch:
                 segments.append(segment_batch)
-        if is_final_send:
-            self.AllResetDetection() 
-        return segments
+        if is_final:
+            # reset class variables and clear the dict for the next query
+            self.AllResetDetection()
+        return segments, in_cache
+
+    def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
+                       is_final: bool = False, max_end_sil: int = 800
+                       ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+        if not in_cache:
+            self.AllResetDetection()
+        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
+        self.waveform = waveform  # compute decibel for each frame
+
+        self.ComputeScores(feats, in_cache)
+        self.ComputeDecibel()
+        if not is_final:
+            self.DetectCommonFrames()
+        else:
+            self.DetectLastFrames()
+        segments = []
+        for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
+            segment_batch = []
+            if len(self.output_data_buf) > 0:
+                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
+                    if not self.output_data_buf[i].contain_seg_start_point:
+                        continue
+                    if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
+                        continue
+                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
+                    if self.output_data_buf[i].contain_seg_end_point:
+                        end_ms = self.output_data_buf[i].end_ms
+                        self.next_seg = True
+                        self.output_data_buf_offset += 1
+                    else:
+                        end_ms = -1
+                        self.next_seg = False
+                    segment = [start_ms, end_ms]
+                    segment_batch.append(segment)
+            if segment_batch:
+                segments.append(segment_batch)
+        if is_final:
+            # reset class variables and clear the dict for the next query
+            self.AllResetDetection()
+        return segments, in_cache
 
     def DetectCommonFrames(self) -> int:
         if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
             return 0
         for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
             frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i - self.last_drop_frames)
             self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
 
         return 0
@@ -482,7 +564,7 @@
             return 0
         for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
             frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i - self.last_drop_frames)
             if i != 0:
                 self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
             else:

--
Gitblit v1.9.1