From 5589b4a6177a0ca6836c304a152befe87f0ddd96 Mon Sep 17 00:00:00 2001
From: haoneng.lhn <haoneng.lhn@alibaba-inc.com>
Date: 星期五, 14 四月 2023 11:47:28 +0800
Subject: [PATCH] support wav_file input

---
 funasr/bin/asr_inference_paraformer_streaming.py |   17 +++++++++++------
 funasr/models/predictor/cif.py                   |   12 +++++++++---
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 66dec39..3520297 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -19,6 +19,7 @@
 
 import numpy as np
 import torch
+import torchaudio
 from typeguard import check_argument_types
 
 from funasr.fileio.datadir_writer import DatadirWriter
@@ -607,17 +608,21 @@
     ):
 
         # 3. Build data-iterator
-        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
-            raw_inputs = _load_bytes(data_path_and_name_and_type[0])
-            raw_inputs = torch.tensor(raw_inputs)
-        if data_path_and_name_and_type is None and raw_inputs is not None:
-            if isinstance(raw_inputs, np.ndarray):
-                raw_inputs = torch.tensor(raw_inputs)
         is_final = False
         if param_dict is not None and "cache" in param_dict:
             cache = param_dict["cache"]
         if param_dict is not None and "is_final" in param_dict:
             is_final = param_dict["is_final"]
+
+        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
+            raw_inputs = _load_bytes(data_path_and_name_and_type[0])
+            raw_inputs = torch.tensor(raw_inputs)
+        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
+            raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
+            is_final = True
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, np.ndarray):
+                raw_inputs = torch.tensor(raw_inputs)
         # 7 .Start for-loop
         # FIXME(kamo): The output format should be discussed about
         asr_result_list = []
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index e80a915..a5273f8 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -234,6 +234,7 @@
         last_fire_place = len_time - 1
         last_fire_remainds = 0.0
         pre_alphas_length = 0
+        last_fire = False
  
         mask_chunk_peak_predictor = None
         if cache is not None:
@@ -251,10 +252,15 @@
             if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
                 last_fire_place = len_time - 1 - i
                 last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
+                last_fire = True
                 break
-        last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
-        cache["cif_hidden"] = hidden[:, last_fire_place:, :]
-        cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
+        if last_fire:
+           last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
+           cache["cif_hidden"] = hidden[:, last_fire_place:, :]
+           cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
+        else:
+           cache["cif_hidden"] = hidden
+           cache["cif_alphas"] = alphas
         token_num_int = token_num.floor().type(torch.int32).item()
         return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
 

--
Gitblit v1.9.1