From 37c9e0eacb6c7a04f827e9d327fce890b61a8040 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Wed, 29 Mar 2023 13:30:52 +0800
Subject: [PATCH] Merge pull request #304 from alibaba-damo-academy/dev_lhn

---
 funasr/bin/asr_inference_paraformer_streaming.py |   87 ++++++++++++++++++++++++++++---------------
 1 file changed, 57 insertions(+), 30 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 907f190..df590df 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -42,6 +42,7 @@
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
 from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+np.set_printoptions(threshold=np.inf)
 
 class Speech2Text:
     """Speech2Text class
@@ -203,7 +204,6 @@
         # Input as audio signal
         if isinstance(speech, np.ndarray):
             speech = torch.tensor(speech)
-
         if self.frontend is not None:
             feats, feats_len = self.frontend.forward(speech, speech_lengths)
             feats = to_device(feats, device=self.device)
@@ -213,13 +213,16 @@
             feats = speech
             feats_len = speech_lengths
         lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+        feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+        feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+        feats_len = torch.tensor([feats_len])
         batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
 
         # a. To device
         batch = to_device(batch, device=self.device)
 
         # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode_chunk(**batch)
+        enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
         if isinstance(enc, tuple):
             enc = enc[0]
         # assert len(enc) == 1, len(enc)
@@ -592,7 +595,6 @@
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, np.ndarray):
                 raw_inputs = torch.tensor(raw_inputs)
-
         is_final = False
         if param_dict is not None and "cache" in param_dict:
             cache = param_dict["cache"]
@@ -605,62 +607,87 @@
         asr_result = ""
         wait = True
         if len(cache) == 0:
-            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
             cache_de = {"decode_fsmn": None}
             cache["decoder"] = cache_de
             cache["first_chunk"] = True
             cache["speech"] = []
-            cache["chunk_index"] = 0
-            cache["speech_chunk"] = []
+            cache["accum_speech"] = 0
 
         if raw_inputs is not None:
             if len(cache["speech"]) == 0:
                 cache["speech"] = raw_inputs
             else:
                 cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
-            if len(cache["speech_chunk"]) == 0:
-                cache["speech_chunk"] = raw_inputs
-            else:
-                cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
-            while len(cache["speech_chunk"]) >= 960:
+            cache["accum_speech"] += len(raw_inputs)
+            while cache["accum_speech"] >= 960:
                 if cache["first_chunk"]:
-                    if len(cache["speech_chunk"]) >= 14400:
-                        speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
-                        speech_length = torch.tensor([14400])
+                    if cache["accum_speech"] >= 14400:
+                        speech = torch.unsqueeze(cache["speech"], axis=0)
+                        speech_length = torch.tensor([len(cache["speech"])])
+                        cache["encoder"]["pad_left"] = 5 
+                        cache["encoder"]["pad_right"] = 5 
+                        cache["encoder"]["stride"] = 10
+                        cache["encoder"]["left"] = 5
+                        cache["encoder"]["right"] = 0
                         results = speech2text(cache, speech, speech_length)
-                        cache["speech_chunk"]= cache["speech_chunk"][4800:]
+                        cache["accum_speech"] -= 4800
                         cache["first_chunk"] = False
                         cache["encoder"]["start_idx"] = -5
+                        cache["encoder"]["is_final"] = False
                         wait = False
                     else:
                         if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
+                            cache["encoder"]["stride"] = len(cache["speech"]) // 960
+                            cache["encoder"]["pad_left"] = 0
                             cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech_chunk"])])
+                            speech = torch.unsqueeze(cache["speech"], axis=0)
+                            speech_length = torch.tensor([len(cache["speech"])])
                             results = speech2text(cache, speech, speech_length)
-                            cache["speech_chunk"] = []
+                            cache["accum_speech"] = 0
                             wait = False
                         else:
                             break
                 else:
-                    if len(cache["speech_chunk"]) >= 19200:
+                    if cache["accum_speech"] >= 19200:
                         cache["encoder"]["start_idx"] += 10
+                        cache["encoder"]["stride"] = 10
                         cache["encoder"]["pad_left"] = 5
-                        speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
-                        speech_length = torch.tensor([19200])
+                        cache["encoder"]["pad_right"] = 5
+                        cache["encoder"]["left"] = 0
+                        cache["encoder"]["right"] = 0
+                        speech = torch.unsqueeze(cache["speech"], axis=0)
+                        speech_length = torch.tensor([len(cache["speech"])])
                         results = speech2text(cache, speech, speech_length)
-                        cache["speech_chunk"] = cache["speech_chunk"][9600:]
+                        cache["accum_speech"] -= 9600
                         wait = False
                     else:
                         if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
-                            cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech_chunk"])])
-                            results = speech2text(cache, speech, speech_length)
-                            cache["speech_chunk"] = []
-                            wait = False
+                            cache["encoder"]["is_final"] = True
+                            if cache["accum_speech"] >= 14400:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = 10
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 5
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] -= 9600
+                                wait = False
+                            else:
+                                cache["encoder"]["start_idx"] += 10
+                                cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
+                                cache["encoder"]["pad_left"] = 5
+                                cache["encoder"]["pad_right"] = 0
+                                cache["encoder"]["left"] = 0
+                                cache["encoder"]["right"] = 0
+                                speech = torch.unsqueeze(cache["speech"], axis=0)
+                                speech_length = torch.tensor([len(cache["speech"])])
+                                results = speech2text(cache, speech, speech_length)
+                                cache["accum_speech"] = 0
+                                wait = False
                         else:
                             break
                 

--
Gitblit v1.9.1