From 0eacba96a12d5c0dea89c4533ca68b40decd8e9f Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: Thu, 06 Apr 2023 10:53:13 +0800
Subject: [PATCH] Merge branch 'main' into main
---
funasr/bin/asr_inference_paraformer_streaming.py | 105 +++++++++++++++++++++++++++++++++++++---------------
1 file changed, 75 insertions(+), 30 deletions(-)
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 907f190..66dec39 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -42,6 +42,7 @@
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+np.set_printoptions(threshold=np.inf)
class Speech2Text:
"""Speech2Text class
@@ -203,7 +204,6 @@
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
-
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
@@ -213,13 +213,16 @@
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+ feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+ feats_len = torch.tensor([feats_len])
batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
- enc, enc_len = self.asr_model.encode_chunk(**batch)
+ enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@@ -578,7 +581,22 @@
speech2text = Speech2TextExport(**speech2text_kwargs)
else:
speech2text = Speech2Text(**speech2text_kwargs)
+
+ def _load_bytes(input):
+ middle_data = np.frombuffer(input, dtype=np.int16)
+ middle_data = np.asarray(middle_data)
+ if middle_data.dtype.kind not in 'iu':
+ raise TypeError("'middle_data' must be an array of integers")
+ dtype = np.dtype('float32')
+ if dtype.kind != 'f':
+ raise TypeError("'dtype' must be a floating point type")
+ i = np.iinfo(middle_data.dtype)
+ abs_max = 2 ** (i.bits - 1)
+ offset = i.min + abs_max
+ array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
+ return array
+
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@@ -589,10 +607,12 @@
):
# 3. Build data-iterator
+ if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
+ raw_inputs = _load_bytes(data_path_and_name_and_type[0])
+ raw_inputs = torch.tensor(raw_inputs)
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, np.ndarray):
raw_inputs = torch.tensor(raw_inputs)
-
is_final = False
if param_dict is not None and "cache" in param_dict:
cache = param_dict["cache"]
@@ -605,62 +625,87 @@
asr_result = ""
wait = True
if len(cache) == 0:
- cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+ cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
cache["first_chunk"] = True
cache["speech"] = []
- cache["chunk_index"] = 0
- cache["speech_chunk"] = []
+ cache["accum_speech"] = 0
if raw_inputs is not None:
if len(cache["speech"]) == 0:
cache["speech"] = raw_inputs
else:
cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
- if len(cache["speech_chunk"]) == 0:
- cache["speech_chunk"] = raw_inputs
- else:
- cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
- while len(cache["speech_chunk"]) >= 960:
+ cache["accum_speech"] += len(raw_inputs)
+ while cache["accum_speech"] >= 960:
if cache["first_chunk"]:
- if len(cache["speech_chunk"]) >= 14400:
- speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
- speech_length = torch.tensor([14400])
+ if cache["accum_speech"] >= 14400:
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["stride"] = 10
+ cache["encoder"]["left"] = 5
+ cache["encoder"]["right"] = 0
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"]= cache["speech_chunk"][4800:]
+ cache["accum_speech"] -= 4800
cache["first_chunk"] = False
cache["encoder"]["start_idx"] = -5
+ cache["encoder"]["is_final"] = False
wait = False
else:
if is_final:
- cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
+ cache["encoder"]["stride"] = len(cache["speech"]) // 960
+ cache["encoder"]["pad_left"] = 0
cache["encoder"]["pad_right"] = 0
- speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
- speech_length = torch.tensor([len(cache["speech_chunk"])])
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = []
+ cache["accum_speech"] = 0
wait = False
else:
break
else:
- if len(cache["speech_chunk"]) >= 19200:
+ if cache["accum_speech"] >= 19200:
cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
- speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
- speech_length = torch.tensor([19200])
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = 0
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = cache["speech_chunk"][9600:]
+ cache["accum_speech"] -= 9600
wait = False
else:
if is_final:
- cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
- cache["encoder"]["pad_right"] = 0
- speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
- speech_length = torch.tensor([len(cache["speech_chunk"])])
- results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = []
- wait = False
+ cache["encoder"]["is_final"] = True
+ if cache["accum_speech"] >= 14400:
+ cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = 10
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ results = speech2text(cache, speech, speech_length)
+ cache["accum_speech"] -= 9600
+ wait = False
+ else:
+ cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 0
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = 0
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ results = speech2text(cache, speech, speech_length)
+ cache["accum_speech"] = 0
+ wait = False
else:
break
--
Gitblit v1.9.1