From 3cd3473bf7a3b41484baa86d9092248d78e7af39 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 21 四月 2023 17:17:37 +0800
Subject: [PATCH] docs
---
funasr/bin/asr_inference_paraformer_streaming.py | 119 +++++++++++++++++++++++++++++++++++++++++++----------------
1 files changed, 86 insertions(+), 33 deletions(-)
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 907f190..821f694 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -19,6 +19,7 @@
import numpy as np
import torch
+import torchaudio
from typeguard import check_argument_types
from funasr.fileio.datadir_writer import DatadirWriter
@@ -42,6 +43,7 @@
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+np.set_printoptions(threshold=np.inf)
class Speech2Text:
"""Speech2Text class
@@ -203,7 +205,6 @@
# Input as audio signal
if isinstance(speech, np.ndarray):
speech = torch.tensor(speech)
-
if self.frontend is not None:
feats, feats_len = self.frontend.forward(speech, speech_lengths)
feats = to_device(feats, device=self.device)
@@ -213,13 +214,16 @@
feats = speech
feats_len = speech_lengths
lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
+ feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
+ feats_len = torch.tensor([feats_len])
batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
# a. To device
batch = to_device(batch, device=self.device)
# b. Forward Encoder
- enc, enc_len = self.asr_model.encode_chunk(**batch)
+ enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
if isinstance(enc, tuple):
enc = enc[0]
# assert len(enc) == 1, len(enc)
@@ -532,6 +536,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
@@ -578,7 +584,22 @@
speech2text = Speech2TextExport(**speech2text_kwargs)
else:
speech2text = Speech2Text(**speech2text_kwargs)
+
+ def _load_bytes(input):
+ # Decode a raw buffer of 16-bit integer samples into a float32 array scaled by 1/32768.
+ # `input` is anything np.frombuffer accepts; the caller in this file passes
+ # data_path_and_name_and_type[0] when the type field equals "bytes".
+ # NOTE(review): np.frombuffer raises ValueError when the buffer length is not a multiple
+ # of the int16 item size -- confirm callers always send whole samples.
+ middle_data = np.frombuffer(input, dtype=np.int16)
+ # frombuffer already returned an ndarray, so asarray hands back that same array.
+ middle_data = np.asarray(middle_data)
+ # Sanity check on the source dtype; it cannot fail here because the dtype was fixed to int16 above.
+ if middle_data.dtype.kind not in 'iu':
+ raise TypeError("'middle_data' must be an array of integers")
+ dtype = np.dtype('float32')
+ # Likewise always passes: the target dtype is hard-coded to float32 on the previous line.
+ if dtype.kind != 'f':
+ raise TypeError("'dtype' must be a floating point type")
+ # For int16: bits == 16 and min == -32768, so abs_max == 32768 and offset == 0.
+ i = np.iinfo(middle_data.dtype)
+ abs_max = 2 ** (i.bits - 1)
+ offset = i.min + abs_max
+ # Scale the samples into [-1.0, 1.0) as float32.
+ # NOTE(review): the outer np.frombuffer re-reads an array that is already float32, which
+ # looks redundant -- confirm before simplifying.
+ array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
+ return array
+
def _forward(
data_path_and_name_and_type,
raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@@ -589,15 +610,22 @@
):
# 3. Build data-iterator
- if data_path_and_name_and_type is None and raw_inputs is not None:
- if isinstance(raw_inputs, np.ndarray):
- raw_inputs = torch.tensor(raw_inputs)
-
is_final = False
+ cache = {}
if param_dict is not None and "cache" in param_dict:
cache = param_dict["cache"]
if param_dict is not None and "is_final" in param_dict:
is_final = param_dict["is_final"]
+
+ if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
+ raw_inputs = _load_bytes(data_path_and_name_and_type[0])
+ raw_inputs = torch.tensor(raw_inputs)
+ if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
+ raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
+ is_final = True
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, np.ndarray):
+ raw_inputs = torch.tensor(raw_inputs)
# 7 .Start for-loop
# FIXME(kamo): The output format should be discussed about
asr_result_list = []
@@ -605,62 +633,87 @@
asr_result = ""
wait = True
if len(cache) == 0:
- cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+ cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
cache_de = {"decode_fsmn": None}
cache["decoder"] = cache_de
cache["first_chunk"] = True
cache["speech"] = []
- cache["chunk_index"] = 0
- cache["speech_chunk"] = []
+ cache["accum_speech"] = 0
if raw_inputs is not None:
if len(cache["speech"]) == 0:
cache["speech"] = raw_inputs
else:
cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
- if len(cache["speech_chunk"]) == 0:
- cache["speech_chunk"] = raw_inputs
- else:
- cache["speech_chunk"] = torch.cat([cache["speech_chunk"], raw_inputs], dim=0)
- while len(cache["speech_chunk"]) >= 960:
+ cache["accum_speech"] += len(raw_inputs)
+ while cache["accum_speech"] >= 960:
if cache["first_chunk"]:
- if len(cache["speech_chunk"]) >= 14400:
- speech = torch.unsqueeze(cache["speech_chunk"][0:14400], axis=0)
- speech_length = torch.tensor([14400])
+ if cache["accum_speech"] >= 14400:
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["stride"] = 10
+ cache["encoder"]["left"] = 5
+ cache["encoder"]["right"] = 0
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"]= cache["speech_chunk"][4800:]
+ cache["accum_speech"] -= 4800
cache["first_chunk"] = False
cache["encoder"]["start_idx"] = -5
+ cache["encoder"]["is_final"] = False
wait = False
else:
if is_final:
- cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
+ cache["encoder"]["stride"] = len(cache["speech"]) // 960
+ cache["encoder"]["pad_left"] = 0
cache["encoder"]["pad_right"] = 0
- speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
- speech_length = torch.tensor([len(cache["speech_chunk"])])
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = []
+ cache["accum_speech"] = 0
wait = False
else:
break
else:
- if len(cache["speech_chunk"]) >= 19200:
+ if cache["accum_speech"] >= 19200:
cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = 10
cache["encoder"]["pad_left"] = 5
- speech = torch.unsqueeze(cache["speech_chunk"][:19200], axis=0)
- speech_length = torch.tensor([19200])
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = 0
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = cache["speech_chunk"][9600:]
+ cache["accum_speech"] -= 9600
wait = False
else:
if is_final:
- cache["encoder"]["stride"] = len(cache["speech_chunk"]) // 960
- cache["encoder"]["pad_right"] = 0
- speech = torch.unsqueeze(cache["speech_chunk"], axis=0)
- speech_length = torch.tensor([len(cache["speech_chunk"])])
- results = speech2text(cache, speech, speech_length)
- cache["speech_chunk"] = []
- wait = False
+ cache["encoder"]["is_final"] = True
+ if cache["accum_speech"] >= 14400:
+ cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = 10
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 5
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ results = speech2text(cache, speech, speech_length)
+ cache["accum_speech"] -= 9600
+ wait = False
+ else:
+ cache["encoder"]["start_idx"] += 10
+ cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
+ cache["encoder"]["pad_left"] = 5
+ cache["encoder"]["pad_right"] = 0
+ cache["encoder"]["left"] = 0
+ cache["encoder"]["right"] = 0
+ speech = torch.unsqueeze(cache["speech"], axis=0)
+ speech_length = torch.tensor([len(cache["speech"])])
+ results = speech2text(cache, speech, speech_length)
+ cache["accum_speech"] = 0
+ wait = False
else:
break
--
Gitblit v1.9.1