From 7c5fdf30f428e22fd0fdb98055834e0d2616d308 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Wed, 29 Mar 2023 00:27:11 +0800
Subject: [PATCH] export
---
funasr/runtime/python/onnxruntime/setup.py | 2
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py | 0
funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py | 134 +++
funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py | 607 ++++++++++++++
funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt | 17
funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt | 1
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py | 59 +
funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO | 80 +
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py | 257 +++++
funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt | 1
funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py | 2
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py | 0
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py | 240 +++++
funasr/runtime/python/onnxruntime/demo_vad.py | 12
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py | 607 ++++++++++++++
funasr/runtime/python/onnxruntime/demo.py | 2
funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg | 0
funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt | 7
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py | 187 ++++
funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py | 1
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py | 166 +++
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py | 3
funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py | 191 ++++
funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg | 0
24 files changed, 2572 insertions(+), 4 deletions(-)
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
new file mode 100644
index 0000000..4750479
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py
@@ -0,0 +1,3 @@
+# -*- encoding: utf-8 -*-
+from .paraformer_bin import Paraformer
+from .vad_bin import Fsmn_vad
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
new file mode 100644
index 0000000..cbdb8d9
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py
@@ -0,0 +1,187 @@
+# -*- encoding: utf-8 -*-
+
+import os.path
+from pathlib import Path
+from typing import List, Union, Tuple
+
+import copy
+import librosa
+import numpy as np
+
+from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
+ OrtInferSession, TokenIDConverter, get_logger,
+ read_yaml)
+from .utils.postprocess_utils import sentence_postprocess
+from .utils.frontend import WavFrontend
+from .utils.timestamp_utils import time_stamp_lfr6_onnx
+
+logging = get_logger()
+
+
+class Paraformer():
+ def __init__(self, model_dir: Union[str, Path] = None,
+ batch_size: int = 1,
+ device_id: Union[str, int] = "-1",
+ plot_timestamp_to: str = "",
+ pred_bias: int = 1,
+ quantize: bool = False,
+ intra_op_num_threads: int = 4,
+ ):
+
+ if not Path(model_dir).exists():
+ raise FileNotFoundError(f'{model_dir} does not exist.')
+
+ model_file = os.path.join(model_dir, 'model.onnx')
+ if quantize:
+ model_file = os.path.join(model_dir, 'model_quant.onnx')
+ config_file = os.path.join(model_dir, 'config.yaml')
+ cmvn_file = os.path.join(model_dir, 'am.mvn')
+ config = read_yaml(config_file)
+
+ self.converter = TokenIDConverter(config['token_list'])
+ self.tokenizer = CharTokenizer()
+ self.frontend = WavFrontend(
+ cmvn_file=cmvn_file,
+ **config['frontend_conf']
+ )
+ self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
+ self.batch_size = batch_size
+ self.plot_timestamp_to = plot_timestamp_to
+ self.pred_bias = pred_bias
+
+ def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
+ waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
+ waveform_nums = len(waveform_list)
+ asr_res = []
+ for beg_idx in range(0, waveform_nums, self.batch_size):
+
+ end_idx = min(waveform_nums, beg_idx + self.batch_size)
+ feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
+ try:
+ outputs = self.infer(feats, feats_len)
+ am_scores, valid_token_lens = outputs[0], outputs[1]
+ if len(outputs) == 4:
+ # for BiCifParaformer Inference
+ us_alphas, us_peaks = outputs[2], outputs[3]
+ else:
+ us_alphas, us_peaks = None, None
+ except ONNXRuntimeError:
+ #logging.warning(traceback.format_exc())
+ logging.warning("input wav is silence or noise")
+ preds = ['']
+ else:
+ preds = self.decode(am_scores, valid_token_lens)
+ if us_peaks is None:
+ for pred in preds:
+ pred = sentence_postprocess(pred)
+ asr_res.append({'preds': pred})
+ else:
+ for pred, us_peaks_ in zip(preds, us_peaks):
+ raw_tokens = pred
+ timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
+ text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
+ # logging.warning(timestamp)
+ if len(self.plot_timestamp_to):
+ self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
+ asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
+ return asr_res
+
+ def plot_wave_timestamp(self, wav, text_timestamp, dest):
+ # TODO: Plot the wav and timestamp results with matplotlib
+ import matplotlib
+ matplotlib.use('Agg')
+ matplotlib.rc("font", family='Alibaba PuHuiTi') # set it to a font that your system supports
+ import matplotlib.pyplot as plt
+ fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
+ ax2 = ax1.twinx()
+ ax2.set_ylim([0, 2.0])
+ # plot waveform
+ ax1.set_ylim([-0.3, 0.3])
+ time = np.arange(wav.shape[0]) / 16000
+ ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
+ # plot lines and text
+ for (char, start, end) in text_timestamp:
+ ax1.vlines(start, -0.3, 0.3, ls='--')
+ ax1.vlines(end, -0.3, 0.3, ls='--')
+ x_adj = 0.045 if char != '<sil>' else 0.12
+ ax1.text((start + end) * 0.5 - x_adj, 0, char)
+ # plt.legend()
+ plotname = "{}/timestamp.png".format(dest)
+ plt.savefig(plotname, bbox_inches='tight')
+
+ def load_data(self,
+ wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
+ def load_wav(path: str) -> np.ndarray:
+ waveform, _ = librosa.load(path, sr=fs)
+ return waveform
+
+ if isinstance(wav_content, np.ndarray):
+ return [wav_content]
+
+ if isinstance(wav_content, str):
+ return [load_wav(wav_content)]
+
+ if isinstance(wav_content, list):
+ return [load_wav(path) for path in wav_content]
+
+ raise TypeError(
+ f'The type of {wav_content} is not in [str, np.ndarray, list]')
+
+ def extract_feat(self,
+ waveform_list: List[np.ndarray]
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ feats, feats_len = [], []
+ for waveform in waveform_list:
+ speech, _ = self.frontend.fbank(waveform)
+ feat, feat_len = self.frontend.lfr_cmvn(speech)
+ feats.append(feat)
+ feats_len.append(feat_len)
+
+ feats = self.pad_feats(feats, np.max(feats_len))
+ feats_len = np.array(feats_len).astype(np.int32)
+ return feats, feats_len
+
+ @staticmethod
+ def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+ def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+ pad_width = ((0, max_feat_len - cur_len), (0, 0))
+ return np.pad(feat, pad_width, 'constant', constant_values=0)
+
+ feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+ feats = np.array(feat_res).astype(np.float32)
+ return feats
+
+ def infer(self, feats: np.ndarray,
+ feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ outputs = self.ort_infer([feats, feats_len])
+ return outputs
+
+    def decode(self, am_scores: np.ndarray, token_nums: np.ndarray) -> List[str]:
+ return [self.decode_one(am_score, token_num)
+ for am_score, token_num in zip(am_scores, token_nums)]
+
+ def decode_one(self,
+ am_score: np.ndarray,
+ valid_token_num: int) -> List[str]:
+ yseq = am_score.argmax(axis=-1)
+ score = am_score.max(axis=-1)
+ score = np.sum(score, axis=-1)
+
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ # asr_model.sos:1 asr_model.eos:2
+ yseq = np.array([1] + yseq.tolist() + [2])
+ hyp = Hypothesis(yseq=yseq, score=score)
+
+ # remove sos/eos and get results
+ last_pos = -1
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove blank symbol id, which is assumed to be 0
+ token_int = list(filter(lambda x: x not in (0, 2), token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+ token = token[:valid_token_num-self.pred_bias]
+ # texts = sentence_postprocess(token)
+ return token
+
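For reference, a minimal usage sketch of the Paraformer wrapper added above. The model directory path is hypothetical; per the constructor, it must contain model.onnx (or model_quant.onnx when quantize=True), config.yaml, and am.mvn:

    from funasr_onnx import Paraformer

    model_dir = "/path/to/paraformer-onnx"  # hypothetical export directory
    model = Paraformer(model_dir, batch_size=1, quantize=False)

    # accepts a wav path, a 1-D numpy waveform, or a list of wav paths
    result = model("asr_example.wav")
    print(result[0]['preds'])  # post-processed text (plus timestamps for BiCif models)
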
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
new file mode 100644
index 0000000..8eed22f
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py
@@ -0,0 +1,607 @@
+from enum import Enum
+from typing import List, Tuple, Dict, Any
+
+import math
+import numpy as np
+
+class VadStateMachine(Enum):
+ kVadInStateStartPointNotDetected = 1
+ kVadInStateInSpeechSegment = 2
+ kVadInStateEndPointDetected = 3
+
+
+class FrameState(Enum):
+ kFrameStateInvalid = -1
+ kFrameStateSpeech = 1
+ kFrameStateSil = 0
+
+
+# final voice/unvoice state per frame
+class AudioChangeState(Enum):
+ kChangeStateSpeech2Speech = 0
+ kChangeStateSpeech2Sil = 1
+ kChangeStateSil2Sil = 2
+ kChangeStateSil2Speech = 3
+ kChangeStateNoBegin = 4
+ kChangeStateInvalid = 5
+
+
+class VadDetectMode(Enum):
+ kVadSingleUtteranceDetectMode = 0
+ kVadMutipleUtteranceDetectMode = 1
+
+
+class VADXOptions:
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
+ snr_mode: int = 0,
+ max_end_silence_time: int = 800,
+ max_start_silence_time: int = 3000,
+ do_start_point_detection: bool = True,
+ do_end_point_detection: bool = True,
+ window_size_ms: int = 200,
+ sil_to_speech_time_thres: int = 150,
+ speech_to_sil_time_thres: int = 150,
+ speech_2_noise_ratio: float = 1.0,
+ do_extend: int = 1,
+ lookback_time_start_point: int = 200,
+ lookahead_time_end_point: int = 100,
+ max_single_segment_time: int = 60000,
+ nn_eval_block_size: int = 8,
+ dcd_block_size: int = 4,
+            snr_thres: float = -100.0,
+            noise_frame_num_used_for_snr: int = 100,
+            decibel_thres: float = -100.0,
+ speech_noise_thres: float = 0.6,
+ fe_prior_thres: float = 1e-4,
+ silence_pdf_num: int = 1,
+ sil_pdf_ids: List[int] = [0],
+ speech_noise_thresh_low: float = -0.1,
+ speech_noise_thresh_high: float = 0.3,
+ output_frame_probs: bool = False,
+ frame_in_ms: int = 10,
+ frame_length_ms: int = 25,
+ ):
+ self.sample_rate = sample_rate
+ self.detect_mode = detect_mode
+ self.snr_mode = snr_mode
+ self.max_end_silence_time = max_end_silence_time
+ self.max_start_silence_time = max_start_silence_time
+ self.do_start_point_detection = do_start_point_detection
+ self.do_end_point_detection = do_end_point_detection
+ self.window_size_ms = window_size_ms
+ self.sil_to_speech_time_thres = sil_to_speech_time_thres
+ self.speech_to_sil_time_thres = speech_to_sil_time_thres
+ self.speech_2_noise_ratio = speech_2_noise_ratio
+ self.do_extend = do_extend
+ self.lookback_time_start_point = lookback_time_start_point
+ self.lookahead_time_end_point = lookahead_time_end_point
+ self.max_single_segment_time = max_single_segment_time
+ self.nn_eval_block_size = nn_eval_block_size
+ self.dcd_block_size = dcd_block_size
+ self.snr_thres = snr_thres
+ self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
+ self.decibel_thres = decibel_thres
+ self.speech_noise_thres = speech_noise_thres
+ self.fe_prior_thres = fe_prior_thres
+ self.silence_pdf_num = silence_pdf_num
+ self.sil_pdf_ids = sil_pdf_ids
+ self.speech_noise_thresh_low = speech_noise_thresh_low
+ self.speech_noise_thresh_high = speech_noise_thresh_high
+ self.output_frame_probs = output_frame_probs
+ self.frame_in_ms = frame_in_ms
+ self.frame_length_ms = frame_length_ms
+
+
+class E2EVadSpeechBufWithDoa(object):
+ def __init__(self):
+ self.start_ms = 0
+ self.end_ms = 0
+ self.buffer = []
+ self.contain_seg_start_point = False
+ self.contain_seg_end_point = False
+ self.doa = 0
+
+ def Reset(self):
+ self.start_ms = 0
+ self.end_ms = 0
+ self.buffer = []
+ self.contain_seg_start_point = False
+ self.contain_seg_end_point = False
+ self.doa = 0
+
+
+class E2EVadFrameProb(object):
+ def __init__(self):
+ self.noise_prob = 0.0
+ self.speech_prob = 0.0
+ self.score = 0.0
+ self.frame_id = 0
+ self.frm_state = 0
+
+
+class WindowDetector(object):
+ def __init__(self, window_size_ms: int, sil_to_speech_time: int,
+ speech_to_sil_time: int, frame_size_ms: int):
+ self.window_size_ms = window_size_ms
+ self.sil_to_speech_time = sil_to_speech_time
+ self.speech_to_sil_time = speech_to_sil_time
+ self.frame_size_ms = frame_size_ms
+
+ self.win_size_frame = int(window_size_ms / frame_size_ms)
+ self.win_sum = 0
+        self.win_state = [0] * self.win_size_frame  # initialize the sliding-window states
+
+ self.cur_win_pos = 0
+ self.pre_frame_state = FrameState.kFrameStateSil
+ self.cur_frame_state = FrameState.kFrameStateSil
+ self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
+ self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
+
+ self.voice_last_frame_count = 0
+ self.noise_last_frame_count = 0
+ self.hydre_frame_count = 0
+
+ def Reset(self) -> None:
+ self.cur_win_pos = 0
+ self.win_sum = 0
+ self.win_state = [0] * self.win_size_frame
+ self.pre_frame_state = FrameState.kFrameStateSil
+ self.cur_frame_state = FrameState.kFrameStateSil
+ self.voice_last_frame_count = 0
+ self.noise_last_frame_count = 0
+ self.hydre_frame_count = 0
+
+ def GetWinSize(self) -> int:
+ return int(self.win_size_frame)
+
+ def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
+        # map the enum frame state to a 0/1 speech vote for the sliding window
+ if frameState == FrameState.kFrameStateSpeech:
+ cur_frame_state = 1
+ elif frameState == FrameState.kFrameStateSil:
+ cur_frame_state = 0
+ else:
+ return AudioChangeState.kChangeStateInvalid
+ self.win_sum -= self.win_state[self.cur_win_pos]
+ self.win_sum += cur_frame_state
+ self.win_state[self.cur_win_pos] = cur_frame_state
+ self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
+
+ if self.pre_frame_state == FrameState.kFrameStateSil and self.win_sum >= self.sil_to_speech_frmcnt_thres:
+ self.pre_frame_state = FrameState.kFrameStateSpeech
+ return AudioChangeState.kChangeStateSil2Speech
+
+ if self.pre_frame_state == FrameState.kFrameStateSpeech and self.win_sum <= self.speech_to_sil_frmcnt_thres:
+ self.pre_frame_state = FrameState.kFrameStateSil
+ return AudioChangeState.kChangeStateSpeech2Sil
+
+ if self.pre_frame_state == FrameState.kFrameStateSil:
+ return AudioChangeState.kChangeStateSil2Sil
+ if self.pre_frame_state == FrameState.kFrameStateSpeech:
+ return AudioChangeState.kChangeStateSpeech2Speech
+ return AudioChangeState.kChangeStateInvalid
+
+ def FrameSizeMs(self) -> int:
+ return int(self.frame_size_ms)
+
+
+class E2EVadModel():
+ def __init__(self, vad_post_args: Dict[str, Any]):
+ super(E2EVadModel, self).__init__()
+ self.vad_opts = VADXOptions(**vad_post_args)
+ self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
+ self.vad_opts.sil_to_speech_time_thres,
+ self.vad_opts.speech_to_sil_time_thres,
+ self.vad_opts.frame_in_ms)
+ # self.encoder = encoder
+ # init variables
+ self.is_final = False
+ self.data_buf_start_frame = 0
+ self.frm_cnt = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.continous_silence_frame_count = 0
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.number_end_time_detected = 0
+ self.sil_frame = 0
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
+ self.noise_average_decibel = -100.0
+ self.pre_end_silence_detected = False
+ self.next_seg = True
+
+ self.output_data_buf = []
+ self.output_data_buf_offset = 0
+ self.frame_probs = []
+ self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
+ self.scores = None
+ self.max_time_out = False
+ self.decibel = []
+ self.data_buf = None
+ self.data_buf_all = None
+ self.waveform = None
+ self.ResetDetection()
+
+ def AllResetDetection(self):
+ self.is_final = False
+ self.data_buf_start_frame = 0
+ self.frm_cnt = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.continous_silence_frame_count = 0
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.number_end_time_detected = 0
+ self.sil_frame = 0
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
+ self.noise_average_decibel = -100.0
+ self.pre_end_silence_detected = False
+ self.next_seg = True
+
+ self.output_data_buf = []
+ self.output_data_buf_offset = 0
+ self.frame_probs = []
+ self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
+ self.scores = None
+ self.max_time_out = False
+ self.decibel = []
+ self.data_buf = None
+ self.data_buf_all = None
+ self.waveform = None
+ self.ResetDetection()
+
+ def ResetDetection(self):
+ self.continous_silence_frame_count = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.windows_detector.Reset()
+ self.sil_frame = 0
+ self.frame_probs = []
+
+ def ComputeDecibel(self) -> None:
+ frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
+ frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
+ if self.data_buf_all is None:
+ self.data_buf_all = self.waveform[0] # self.data_buf is pointed to self.waveform[0]
+ self.data_buf = self.data_buf_all
+ else:
+ self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
+ for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
+ self.decibel.append(
+                10 * math.log10(np.square(self.waveform[0][offset: offset + frame_sample_length]).sum() + \
+                                0.000001))
+
+ def ComputeScores(self, scores: np.ndarray) -> None:
+ # scores = self.encoder(feats, in_cache) # return B * T * D
+ self.vad_opts.nn_eval_block_size = scores.shape[1]
+ self.frm_cnt += scores.shape[1] # count total frames
+ if self.scores is None:
+ self.scores = scores # the first calculation
+ else:
+ self.scores = np.concatenate((self.scores, scores), axis=1)
+
+ def PopDataBufTillFrame(self, frame_idx: int) -> None: # need check again
+ while self.data_buf_start_frame < frame_idx:
+ if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
+ self.data_buf_start_frame += 1
+ self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
+ self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
+
+ def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
+ last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
+ self.PopDataBufTillFrame(start_frm)
+ expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
+ if last_frm_is_end_point:
+ extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
+ self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
+ expected_sample_number += int(extra_sample)
+ if end_point_is_sent_end:
+ expected_sample_number = max(expected_sample_number, len(self.data_buf))
+ if len(self.data_buf) < expected_sample_number:
+ print('error in calling pop data_buf\n')
+
+ if len(self.output_data_buf) == 0 or first_frm_is_start_point:
+ self.output_data_buf.append(E2EVadSpeechBufWithDoa())
+ self.output_data_buf[-1].Reset()
+ self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
+ self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
+ self.output_data_buf[-1].doa = 0
+ cur_seg = self.output_data_buf[-1]
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
+ print('warning\n')
+        out_pos = len(cur_seg.buffer)  # nothing is copied into cur_seg.buffer here yet
+ data_to_pop = 0
+ if end_point_is_sent_end:
+ data_to_pop = expected_sample_number
+ else:
+ data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
+ if data_to_pop > len(self.data_buf):
+ print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
+ data_to_pop = len(self.data_buf)
+ expected_sample_number = len(self.data_buf)
+
+ cur_seg.doa = 0
+ for sample_cpy_out in range(0, data_to_pop):
+ # cur_seg.buffer[out_pos ++] = data_buf_.back();
+ out_pos += 1
+ for sample_cpy_out in range(data_to_pop, expected_sample_number):
+ # cur_seg.buffer[out_pos++] = data_buf_.back()
+ out_pos += 1
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
+ print('Something wrong with the VAD algorithm\n')
+ self.data_buf_start_frame += frm_cnt
+ cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
+ if first_frm_is_start_point:
+ cur_seg.contain_seg_start_point = True
+ if last_frm_is_end_point:
+ cur_seg.contain_seg_end_point = True
+
+ def OnSilenceDetected(self, valid_frame: int):
+ self.lastest_confirmed_silence_frame = valid_frame
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ self.PopDataBufTillFrame(valid_frame)
+ # silence_detected_callback_
+ # pass
+
+ def OnVoiceDetected(self, valid_frame: int) -> None:
+ self.latest_confirmed_speech_frame = valid_frame
+ self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
+
+ def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
+ if self.vad_opts.do_start_point_detection:
+ pass
+ if self.confirmed_start_frame != -1:
+ print('not reset vad properly\n')
+ else:
+ self.confirmed_start_frame = start_frame
+
+ if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
+
+ def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
+ for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
+ self.OnVoiceDetected(t)
+ if self.vad_opts.do_end_point_detection:
+ pass
+ if self.confirmed_end_frame != -1:
+ print('not reset vad properly\n')
+ else:
+ self.confirmed_end_frame = end_frame
+ if not fake_result:
+ self.sil_frame = 0
+ self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
+ self.number_end_time_detected += 1
+
+ def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
+ if is_final_frame:
+ self.OnVoiceEnd(cur_frm_idx, False, True)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+
+ def GetLatency(self) -> int:
+ return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
+
+ def LatencyFrmNumAtStartPoint(self) -> int:
+ vad_latency = self.windows_detector.GetWinSize()
+ if self.vad_opts.do_extend:
+ vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
+ return vad_latency
+
+ def GetFrameState(self, t: int) -> FrameState:
+ frame_state = FrameState.kFrameStateInvalid
+ cur_decibel = self.decibel[t]
+ cur_snr = cur_decibel - self.noise_average_decibel
+ # for each frame, calc log posterior probability of each state
+ if cur_decibel < self.vad_opts.decibel_thres:
+ frame_state = FrameState.kFrameStateSil
+ self.DetectOneFrame(frame_state, t, False)
+ return frame_state
+
+ sum_score = 0.0
+ noise_prob = 0.0
+ assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
+ if len(self.sil_pdf_ids) > 0:
+            assert len(self.scores) == 1  # only batch_size = 1 is supported
+ sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
+ sum_score = sum(sil_pdf_scores)
+ noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
+ total_score = 1.0
+ sum_score = total_score - sum_score
+ speech_prob = math.log(sum_score)
+ if self.vad_opts.output_frame_probs:
+ frame_prob = E2EVadFrameProb()
+ frame_prob.noise_prob = noise_prob
+ frame_prob.speech_prob = speech_prob
+ frame_prob.score = sum_score
+ frame_prob.frame_id = t
+ self.frame_probs.append(frame_prob)
+ if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
+ if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
+ frame_state = FrameState.kFrameStateSpeech
+ else:
+ frame_state = FrameState.kFrameStateSil
+ else:
+ frame_state = FrameState.kFrameStateSil
+ if self.noise_average_decibel < -99.9:
+ self.noise_average_decibel = cur_decibel
+ else:
+ self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
+ self.vad_opts.noise_frame_num_used_for_snr
+ - 1)) / self.vad_opts.noise_frame_num_used_for_snr
+
+ return frame_state
+
+
+ def __call__(self, score: np.ndarray, waveform: np.ndarray,
+ is_final: bool = False, max_end_sil: int = 800
+ ):
+ self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
+ self.waveform = waveform # compute decibel for each frame
+ self.ComputeDecibel()
+ self.ComputeScores(score)
+ if not is_final:
+ self.DetectCommonFrames()
+ else:
+ self.DetectLastFrames()
+ segments = []
+ for batch_num in range(0, score.shape[0]): # only support batch_size = 1 now
+ segment_batch = []
+ if len(self.output_data_buf) > 0:
+ for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
+ if not self.output_data_buf[i].contain_seg_start_point:
+ continue
+ if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
+ continue
+ start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
+ if self.output_data_buf[i].contain_seg_end_point:
+ end_ms = self.output_data_buf[i].end_ms
+ self.next_seg = True
+ self.output_data_buf_offset += 1
+ else:
+ end_ms = -1
+ self.next_seg = False
+ segment = [start_ms, end_ms]
+ segment_batch.append(segment)
+ if segment_batch:
+ segments.append(segment_batch)
+ if is_final:
+ # reset class variables and clear the dict for the next query
+ self.AllResetDetection()
+ return segments
+
+ def DetectCommonFrames(self) -> int:
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+ return 0
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+ frame_state = FrameState.kFrameStateInvalid
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+
+ return 0
+
+ def DetectLastFrames(self) -> int:
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+ return 0
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+ frame_state = FrameState.kFrameStateInvalid
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+ if i != 0:
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+ else:
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
+
+ return 0
+
+ def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
+ tmp_cur_frm_state = FrameState.kFrameStateInvalid
+ if cur_frm_state == FrameState.kFrameStateSpeech:
+ if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
+ tmp_cur_frm_state = FrameState.kFrameStateSpeech
+ else:
+ tmp_cur_frm_state = FrameState.kFrameStateSil
+ elif cur_frm_state == FrameState.kFrameStateSil:
+ tmp_cur_frm_state = FrameState.kFrameStateSil
+ state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
+ frm_shift_in_ms = self.vad_opts.frame_in_ms
+ if AudioChangeState.kChangeStateSil2Speech == state_change:
+ silence_frame_count = self.continous_silence_frame_count
+ self.continous_silence_frame_count = 0
+ self.pre_end_silence_detected = False
+ start_frame = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
+ self.OnVoiceStart(start_frame)
+ self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
+ for t in range(start_frame + 1, cur_frm_idx + 1):
+ self.OnVoiceDetected(t)
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
+ self.OnVoiceDetected(t)
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
+ self.continous_silence_frame_count = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ pass
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
+ self.continous_silence_frame_count = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.max_time_out = True
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSil2Sil == state_change:
+ self.continous_silence_frame_count += 1
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ # silence timeout, return zero length decision
+ if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
+ self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
+ or (is_final_frame and self.number_end_time_detected == 0):
+ for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
+ self.OnSilenceDetected(t)
+ self.OnVoiceStart(0, True)
+                    self.OnVoiceEnd(0, True, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ else:
+ if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
+ self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
+ lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
+ if self.vad_opts.do_extend:
+ lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
+ lookback_frame -= 1
+ lookback_frame = max(0, lookback_frame)
+ self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif self.vad_opts.do_extend and not is_final_frame:
+ if self.continous_silence_frame_count <= int(
+ self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
+ self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
+ self.ResetDetection()
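A minimal sketch of driving E2EVadModel directly with synthetic inputs (in the package, the Fsmn_vad wrapper feeds it real FSMN scores; the option values and pdf dimension below are assumptions). The waveform is shaped (1, samples) and the scores (1, frames, pdf_num), with the frame count matched to the 25 ms window / 10 ms shift used in ComputeDecibel:

    import numpy as np
    from funasr_onnx.utils.e2e_vad import E2EVadModel

    vad = E2EVadModel({"sample_rate": 16000, "max_end_silence_time": 800})
    waveform = np.random.randn(1, 16000).astype(np.float32)  # 1 s of audio
    # 98 frames of fake posteriors: floor((16000 - 400) / 160) + 1 decibel frames
    scores = np.random.uniform(1e-3, 1 - 1e-3, (1, 98, 248)).astype(np.float32)

    segments = vad(scores, waveform, is_final=True, max_end_sil=800)
    print(segments)  # [[[start_ms, end_ms], ...]] per batch, or [] if nothing fired
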
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
new file mode 100644
index 0000000..11a8644
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py
@@ -0,0 +1,191 @@
+# -*- encoding: utf-8 -*-
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+from typeguard import check_argument_types
+import kaldi_native_fbank as knf
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class WavFrontend():
+ """Conventional frontend structure for ASR.
+ """
+
+ def __init__(
+ self,
+ cmvn_file: str = None,
+ fs: int = 16000,
+ window: str = 'hamming',
+ n_mels: int = 80,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ lfr_m: int = 1,
+ lfr_n: int = 1,
+ dither: float = 1.0,
+ **kwargs,
+ ) -> None:
+ check_argument_types()
+
+ opts = knf.FbankOptions()
+ opts.frame_opts.samp_freq = fs
+ opts.frame_opts.dither = dither
+ opts.frame_opts.window_type = window
+ opts.frame_opts.frame_shift_ms = float(frame_shift)
+ opts.frame_opts.frame_length_ms = float(frame_length)
+ opts.mel_opts.num_bins = n_mels
+ opts.energy_floor = 0
+ opts.frame_opts.snip_edges = True
+ opts.mel_opts.debug_mel = False
+ self.opts = opts
+
+ self.lfr_m = lfr_m
+ self.lfr_n = lfr_n
+ self.cmvn_file = cmvn_file
+
+ if self.cmvn_file:
+ self.cmvn = self.load_cmvn()
+ self.fbank_fn = None
+ self.fbank_beg_idx = 0
+ self.reset_status()
+
+ def fbank(self,
+ waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ waveform = waveform * (1 << 15)
+ self.fbank_fn = knf.OnlineFbank(self.opts)
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+ frames = self.fbank_fn.num_frames_ready
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
+ for i in range(frames):
+ mat[i, :] = self.fbank_fn.get_frame(i)
+ feat = mat.astype(np.float32)
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
+ return feat, feat_len
+
+ def fbank_online(self,
+ waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ waveform = waveform * (1 << 15)
+ # self.fbank_fn = knf.OnlineFbank(self.opts)
+ self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+ frames = self.fbank_fn.num_frames_ready
+ mat = np.empty([frames, self.opts.mel_opts.num_bins])
+ for i in range(self.fbank_beg_idx, frames):
+ mat[i, :] = self.fbank_fn.get_frame(i)
+ # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
+ feat = mat.astype(np.float32)
+ feat_len = np.array(mat.shape[0]).astype(np.int32)
+ return feat, feat_len
+
+ def reset_status(self):
+ self.fbank_fn = knf.OnlineFbank(self.opts)
+ self.fbank_beg_idx = 0
+
+ def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+ if self.lfr_m != 1 or self.lfr_n != 1:
+ feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+
+ if self.cmvn_file:
+ feat = self.apply_cmvn(feat)
+
+ feat_len = np.array(feat.shape[0]).astype(np.int32)
+ return feat, feat_len
+
+ @staticmethod
+ def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
+ LFR_inputs = []
+
+ T = inputs.shape[0]
+ T_lfr = int(np.ceil(T / lfr_n))
+ left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
+ inputs = np.vstack((left_padding, inputs))
+ T = T + (lfr_m - 1) // 2
+ for i in range(T_lfr):
+ if lfr_m <= T - i * lfr_n:
+ LFR_inputs.append(
+ (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
+ else:
+ # process last LFR frame
+ num_padding = lfr_m - (T - i * lfr_n)
+ frame = inputs[i * lfr_n:].reshape(-1)
+ for _ in range(num_padding):
+ frame = np.hstack((frame, inputs[-1]))
+
+ LFR_inputs.append(frame)
+ LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
+ return LFR_outputs
+
+ def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+ """
+ Apply CMVN with mvn data
+ """
+ frame, dim = inputs.shape
+ means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
+ vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
+ inputs = (inputs + means) * vars
+ return inputs
+
+ def load_cmvn(self,) -> np.ndarray:
+ with open(self.cmvn_file, 'r', encoding='utf-8') as f:
+ lines = f.readlines()
+
+ means_list = []
+ vars_list = []
+ for i in range(len(lines)):
+ line_item = lines[i].split()
+ if line_item[0] == '<AddShift>':
+ line_item = lines[i + 1].split()
+ if line_item[0] == '<LearnRateCoef>':
+ add_shift_line = line_item[3:(len(line_item) - 1)]
+ means_list = list(add_shift_line)
+ continue
+ elif line_item[0] == '<Rescale>':
+ line_item = lines[i + 1].split()
+ if line_item[0] == '<LearnRateCoef>':
+ rescale_line = line_item[3:(len(line_item) - 1)]
+ vars_list = list(rescale_line)
+ continue
+
+ means = np.array(means_list).astype(np.float64)
+ vars = np.array(vars_list).astype(np.float64)
+ cmvn = np.array([means, vars])
+ return cmvn
+
+def load_bytes(input):
+ middle_data = np.frombuffer(input, dtype=np.int16)
+ middle_data = np.asarray(middle_data)
+ if middle_data.dtype.kind not in 'iu':
+ raise TypeError("'middle_data' must be an array of integers")
+ dtype = np.dtype('float32')
+ if dtype.kind != 'f':
+ raise TypeError("'dtype' must be a floating point type")
+
+ i = np.iinfo(middle_data.dtype)
+ abs_max = 2 ** (i.bits - 1)
+ offset = i.min + abs_max
+    array = ((middle_data.astype(dtype) - offset) / abs_max).astype(np.float32)
+ return array
+
+
+def test():
+ path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
+ import librosa
+ cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
+ config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
+ from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
+ config = read_yaml(config_file)
+ waveform, _ = librosa.load(path, sr=None)
+ frontend = WavFrontend(
+ cmvn_file=cmvn_file,
+ **config['frontend_conf'],
+ )
+    speech, _ = frontend.fbank_online(waveform)  # input: 1-D numpy waveform of shape (samples,)
+    feat, feat_len = frontend.lfr_cmvn(speech)  # output: 2-D np.float32 of shape (frames, 450); use torch.from_numpy() and add a batch dim for (1, frames, 450)
+
+ frontend.reset_status() # clear cache
+ return feat, feat_len
+
+if __name__ == '__main__':
+ test()
\ No newline at end of file
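To illustrate the low frame rate (LFR) stacking implemented by apply_lfr above: with lfr_m=7, lfr_n=6 (the usual Paraformer setting), each output frame concatenates 7 consecutive fbank frames and the frame rate drops by a factor of 6. A quick shape check, assuming the module's dependencies (typeguard, kaldi_native_fbank) are installed:

    import numpy as np
    from funasr_onnx.utils.frontend import WavFrontend

    feats = np.random.randn(100, 80).astype(np.float32)  # (frames, n_mels)
    lfr = WavFrontend.apply_lfr(feats, lfr_m=7, lfr_n=6)
    print(lfr.shape)  # (17, 560): ceil(100 / 6) frames, each 7 * 80 dims
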
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
new file mode 100644
index 0000000..575fb90
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py
@@ -0,0 +1,240 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import string
+import logging
+from typing import Any, List, Union
+
+
+def isChinese(ch: str):
+ if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+ return True
+ return False
+
+
+def isAllChinese(word: Union[List[Any], str]):
+ word_lists = []
+ for i in word:
+ cur = i.replace(' ', '')
+ cur = cur.replace('</s>', '')
+ cur = cur.replace('<s>', '')
+ word_lists.append(cur)
+
+ if len(word_lists) == 0:
+ return False
+
+ for ch in word_lists:
+ if isChinese(ch) is False:
+ return False
+ return True
+
+
+def isAllAlpha(word: Union[List[Any], str]):
+ word_lists = []
+ for i in word:
+ cur = i.replace(' ', '')
+ cur = cur.replace('</s>', '')
+ cur = cur.replace('<s>', '')
+ word_lists.append(cur)
+
+ if len(word_lists) == 0:
+ return False
+
+ for ch in word_lists:
+ if ch.isalpha() is False and ch != "'":
+ return False
+ elif ch.isalpha() is True and isChinese(ch) is True:
+ return False
+
+ return True
+
+
+# def abbr_dispose(words: List[Any]) -> List[Any]:
+def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
+ words_size = len(words)
+ word_lists = []
+ abbr_begin = []
+ abbr_end = []
+ last_num = -1
+ ts_lists = []
+ ts_nums = []
+ ts_index = 0
+ for num in range(words_size):
+ if num <= last_num:
+ continue
+
+ if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
+            if (num + 1 < words_size
+                    and words[num + 1] == ' '
+                    and num + 2 < words_size
+                    and len(words[num + 2]) == 1
+                    and words[num + 2].encode('utf-8').isalpha()):
+ # found the begin of abbr
+ abbr_begin.append(num)
+ num += 2
+ abbr_end.append(num)
+ # to find the end of abbr
+ while True:
+ num += 1
+ if num < words_size and words[num] == ' ':
+ num += 1
+                        if (num < words_size
+                                and len(words[num]) == 1
+                                and words[num].encode('utf-8').isalpha()):
+ abbr_end.pop()
+ abbr_end.append(num)
+ last_num = num
+ else:
+ break
+ else:
+ break
+
+    for num in range(words_size):
+        # every position gets the current timestamp index;
+        # only non-space tokens advance it
+        ts_nums.append(ts_index)
+        if words[num] != ' ':
+            ts_index += 1
+ last_num = -1
+ for num in range(words_size):
+ if num <= last_num:
+ continue
+
+ if num in abbr_begin:
+ if time_stamp is not None:
+ begin = time_stamp[ts_nums[num]][0]
+ word_lists.append(words[num].upper())
+ num += 1
+ while num < words_size:
+ if num in abbr_end:
+ word_lists.append(words[num].upper())
+ last_num = num
+ break
+ else:
+ if words[num].encode('utf-8').isalpha():
+ word_lists.append(words[num].upper())
+ num += 1
+ if time_stamp is not None:
+ end = time_stamp[ts_nums[num]][1]
+ ts_lists.append([begin, end])
+ else:
+ word_lists.append(words[num])
+ if time_stamp is not None and words[num] != ' ':
+ begin = time_stamp[ts_nums[num]][0]
+ end = time_stamp[ts_nums[num]][1]
+ ts_lists.append([begin, end])
+ begin = end
+
+ if time_stamp is not None:
+ return word_lists, ts_lists
+ else:
+ return word_lists
+
+
+def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
+ middle_lists = []
+ word_lists = []
+ word_item = ''
+ ts_lists = []
+
+ # wash words lists
+ for i in words:
+ word = ''
+ if isinstance(i, str):
+ word = i
+ else:
+ word = i.decode('utf-8')
+
+ if word in ['<s>', '</s>', '<unk>']:
+ continue
+ else:
+ middle_lists.append(word)
+
+ # all chinese characters
+ if isAllChinese(middle_lists):
+ for i, ch in enumerate(middle_lists):
+ word_lists.append(ch.replace(' ', ''))
+ if time_stamp is not None:
+ ts_lists = time_stamp
+
+ # all alpha characters
+ elif isAllAlpha(middle_lists):
+ ts_flag = True
+ for i, ch in enumerate(middle_lists):
+ if ts_flag and time_stamp is not None:
+ begin = time_stamp[i][0]
+ end = time_stamp[i][1]
+ word = ''
+ if '@@' in ch:
+ word = ch.replace('@@', '')
+ word_item += word
+ if time_stamp is not None:
+ ts_flag = False
+ end = time_stamp[i][1]
+ else:
+ word_item += ch
+ word_lists.append(word_item)
+ word_lists.append(' ')
+ word_item = ''
+ if time_stamp is not None:
+ ts_flag = True
+ end = time_stamp[i][1]
+ ts_lists.append([begin, end])
+ begin = end
+
+ # mix characters
+ else:
+ alpha_blank = False
+ ts_flag = True
+ begin = -1
+ end = -1
+ for i, ch in enumerate(middle_lists):
+ if ts_flag and time_stamp is not None:
+ begin = time_stamp[i][0]
+ end = time_stamp[i][1]
+ word = ''
+ if isAllChinese(ch):
+ if alpha_blank is True:
+ word_lists.pop()
+ word_lists.append(ch)
+ alpha_blank = False
+ if time_stamp is not None:
+ ts_flag = True
+ ts_lists.append([begin, end])
+ begin = end
+ elif '@@' in ch:
+ word = ch.replace('@@', '')
+ word_item += word
+ alpha_blank = False
+ if time_stamp is not None:
+ ts_flag = False
+ end = time_stamp[i][1]
+ elif isAllAlpha(ch):
+ word_item += ch
+ word_lists.append(word_item)
+ word_lists.append(' ')
+ word_item = ''
+ alpha_blank = True
+ if time_stamp is not None:
+ ts_flag = True
+ end = time_stamp[i][1]
+ ts_lists.append([begin, end])
+ begin = end
+ else:
+ raise ValueError('invalid character: {}'.format(ch))
+
+ if time_stamp is not None:
+ word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
+ real_word_lists = []
+ for ch in word_lists:
+ if ch != ' ':
+ real_word_lists.append(ch)
+ sentence = ' '.join(real_word_lists).strip()
+ return sentence, ts_lists, real_word_lists
+ else:
+ word_lists = abbr_dispose(word_lists)
+ real_word_lists = []
+ for ch in word_lists:
+ if ch != ' ':
+ real_word_lists.append(ch)
+ sentence = ''.join(word_lists).strip()
+ return sentence, real_word_lists
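A small illustration of sentence_postprocess on a mixed token list: '@@'-suffixed BPE pieces are merged into whole words, Chinese characters are joined without spaces, and special symbols are dropped (the token sequence is made up):

    from funasr_onnx.utils.postprocess_utils import sentence_postprocess

    tokens = ['今', '天', 'mo@@', 'ni@@', 'tor', '正', '常', '</s>']
    sentence, word_lists = sentence_postprocess(tokens)
    print(sentence)    # 今天monitor正常
    print(word_lists)  # ['今', '天', 'monitor', '正', '常']
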
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
new file mode 100644
index 0000000..3a01812
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py
@@ -0,0 +1,59 @@
+import numpy as np
+
+
+def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
+ if not len(char_list):
+ return []
+ START_END_THRESHOLD = 5
+ MAX_TOKEN_DURATION = 30
+ TIME_RATE = 10.0 * 6 / 1000 / 3 # 3 times upsampled
+ cif_peak = us_cif_peak.reshape(-1)
+ num_frames = cif_peak.shape[-1]
+ if char_list[-1] == '</s>':
+ char_list = char_list[:-1]
+ # char_list = [i for i in text]
+ timestamp_list = []
+ new_char_list = []
+ # for bicif model trained with large data, cif2 actually fires when a character starts
+ # so treat the frames between two peaks as the duration of the former token
+ fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset # np format
+ num_peak = len(fire_place)
+ assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
+ # begin silence
+ if fire_place[0] > START_END_THRESHOLD:
+ # char_list.insert(0, '<sil>')
+ timestamp_list.append([0.0, fire_place[0]*TIME_RATE])
+ new_char_list.append('<sil>')
+ # tokens timestamp
+ for i in range(len(fire_place)-1):
+ new_char_list.append(char_list[i])
+ if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
+ timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
+ else:
+            # split an over-long duration into the token plus a silence covering the zero-weight frames
+ _split = fire_place[i] + MAX_TOKEN_DURATION
+ timestamp_list.append([fire_place[i]*TIME_RATE, _split*TIME_RATE])
+ timestamp_list.append([_split*TIME_RATE, fire_place[i+1]*TIME_RATE])
+ new_char_list.append('<sil>')
+ # tail token and end silence
+ if num_frames - fire_place[-1] > START_END_THRESHOLD:
+ _end = (num_frames + fire_place[-1]) / 2
+ timestamp_list[-1][1] = _end*TIME_RATE
+ timestamp_list.append([_end*TIME_RATE, num_frames*TIME_RATE])
+ new_char_list.append("<sil>")
+ else:
+ timestamp_list[-1][1] = num_frames*TIME_RATE
+ if begin_time: # add offset time in model with vad
+ for i in range(len(timestamp_list)):
+ timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
+ timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
+ assert len(new_char_list) == len(timestamp_list)
+ res_str = ""
+ for char, timestamp in zip(new_char_list, timestamp_list):
+ res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
+ res = []
+ for char, timestamp in zip(new_char_list, timestamp_list):
+ if char != '<sil>':
+ res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
+ return res_str, res
+
\ No newline at end of file
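A toy check of the peak-to-timestamp logic above. Each CIF peak value of 1.0 marks a boundary, and the assert requires the number of peaks to equal len(char_list) + 1, so two tokens need three peaks (all values synthetic):

    import numpy as np
    from funasr_onnx.utils.timestamp_utils import time_stamp_lfr6_onnx

    us_peaks = np.zeros(60, dtype=np.float32)
    us_peaks[[10, 25, 40]] = 1.0  # three peaks -> two tokens plus head/tail silence
    res_str, res = time_stamp_lfr6_onnx(us_peaks, ['你', '好'])
    print(res_str)  # "char start end;" triples in seconds, including <sil> entries
    print(res)      # [[start_ms, end_ms], ...] for the two non-<sil> tokens
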
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
new file mode 100644
index 0000000..2edde11
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py
@@ -0,0 +1,257 @@
+# -*- encoding: utf-8 -*-
+
+import functools
+import logging
+import pickle
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+import yaml
+from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+ SessionOptions, get_available_providers, get_device)
+from typeguard import check_argument_types
+
+import warnings
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class TokenIDConverter():
+ def __init__(self, token_list: Union[List, str],
+ ):
+ check_argument_types()
+
+ # self.token_list = self.load_token(token_path)
+ self.token_list = token_list
+ self.unk_symbol = token_list[-1]
+
+ # @staticmethod
+ # def load_token(file_path: Union[Path, str]) -> List:
+ # if not Path(file_path).exists():
+ # raise TokenIDConverterError(f'The {file_path} does not exist.')
+ #
+ # with open(str(file_path), 'rb') as f:
+ # token_list = pickle.load(f)
+ #
+ # if len(token_list) != len(set(token_list)):
+ # raise TokenIDConverterError('The Token exists duplicated symbol.')
+ # return token_list
+
+ def get_num_vocabulary_size(self) -> int:
+ return len(self.token_list)
+
+ def ids2tokens(self,
+ integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+ if isinstance(integers, np.ndarray) and integers.ndim != 1:
+ raise TokenIDConverterError(
+ f"Must be 1 dim ndarray, but got {integers.ndim}")
+ return [self.token_list[i] for i in integers]
+
+ def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+ token2id = {v: i for i, v in enumerate(self.token_list)}
+ if self.unk_symbol not in token2id:
+ raise TokenIDConverterError(
+ f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
+ )
+ unk_id = token2id[self.unk_symbol]
+ return [token2id.get(i, unk_id) for i in tokens]
+
+
+class CharTokenizer():
+ def __init__(
+ self,
+ symbol_value: Union[Path, str, Iterable[str]] = None,
+ space_symbol: str = "<space>",
+ remove_non_linguistic_symbols: bool = False,
+ ):
+ check_argument_types()
+
+ self.space_symbol = space_symbol
+ self.non_linguistic_symbols = self.load_symbols(symbol_value)
+ self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
+
+ @staticmethod
+ def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
+ if value is None:
+ return set()
+
+        if not isinstance(value, (Path, str)):
+            return set(value)
+
+ file_path = Path(value)
+ if not file_path.exists():
+ logging.warning("%s doesn't exist.", file_path)
+ return set()
+
+ with file_path.open("r", encoding="utf-8") as f:
+ return set(line.rstrip() for line in f)
+
+ def text2tokens(self, line: Union[str, list]) -> List[str]:
+ tokens = []
+ while len(line) != 0:
+ for w in self.non_linguistic_symbols:
+ if line.startswith(w):
+ if not self.remove_non_linguistic_symbols:
+ tokens.append(line[: len(w)])
+ line = line[len(w):]
+ break
+ else:
+ t = line[0]
+ if t == " ":
+ t = "<space>"
+ tokens.append(t)
+ line = line[1:]
+ return tokens
+
+ def tokens2text(self, tokens: Iterable[str]) -> str:
+ tokens = [t if t != self.space_symbol else " " for t in tokens]
+ return "".join(tokens)
+
+ def __repr__(self):
+ return (
+ f"{self.__class__.__name__}("
+            f'space_symbol="{self.space_symbol}", '
+ f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
+ f")"
+ )
+
+
+
+class Hypothesis(NamedTuple):
+ """Hypothesis data type."""
+
+ yseq: np.ndarray
+ score: Union[float, np.ndarray] = 0
+ scores: Dict[str, Union[float, np.ndarray]] = dict()
+ states: Dict[str, Any] = dict()
+
+ def asdict(self) -> dict:
+ """Convert data to JSON-friendly dict."""
+ return self._replace(
+ yseq=self.yseq.tolist(),
+ score=float(self.score),
+ scores={k: float(v) for k, v in self.scores.items()},
+ )._asdict()
+
+
+class TokenIDConverterError(Exception):
+ pass
+
+
+class ONNXRuntimeError(Exception):
+ pass
+
+
+class OrtInferSession():
+ def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
+ device_id = str(device_id)
+ sess_opt = SessionOptions()
+ sess_opt.intra_op_num_threads = intra_op_num_threads
+ sess_opt.log_severity_level = 4
+ sess_opt.enable_cpu_mem_arena = False
+ sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+ cuda_ep = 'CUDAExecutionProvider'
+ cuda_provider_options = {
+ "device_id": device_id,
+ "arena_extend_strategy": "kNextPowerOfTwo",
+ "cudnn_conv_algo_search": "EXHAUSTIVE",
+ "do_copy_in_default_stream": "true",
+ }
+ cpu_ep = 'CPUExecutionProvider'
+ cpu_provider_options = {
+ "arena_extend_strategy": "kSameAsRequested",
+ }
+
+ EP_list = []
+ if device_id != "-1" and get_device() == 'GPU' \
+ and cuda_ep in get_available_providers():
+ EP_list = [(cuda_ep, cuda_provider_options)]
+ EP_list.append((cpu_ep, cpu_provider_options))
+
+ self._verify_model(model_file)
+ self.session = InferenceSession(model_file,
+ sess_options=sess_opt,
+ providers=EP_list)
+
+ if device_id != "-1" and cuda_ep not in self.session.get_providers():
+            warnings.warn(f'{cuda_ep} is not available in the current environment, so inference falls back to {cpu_ep}.\n'
+                          'Please ensure the installed onnxruntime-gpu version matches your CUDA and cuDNN versions; '
+                          'the supported combinations are listed on the official site: '
+ 'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
+ RuntimeWarning)
+
+ def __call__(self,
+                 input_content: List[np.ndarray]) -> np.ndarray:
+ input_dict = dict(zip(self.get_input_names(), input_content))
+ try:
+ return self.session.run(None, input_dict)
+ except Exception as e:
+            raise ONNXRuntimeError('ONNXRuntime inference failed.') from e
+
+ def get_input_names(self, ):
+ return [v.name for v in self.session.get_inputs()]
+
+ def get_output_names(self,):
+ return [v.name for v in self.session.get_outputs()]
+
+ def get_character_list(self, key: str = 'character'):
+ return self.meta_dict[key].splitlines()
+
+ def have_key(self, key: str = 'character') -> bool:
+ self.meta_dict = self.session.get_modelmeta().custom_metadata_map
+ if key in self.meta_dict.keys():
+ return True
+ return False
+
+ @staticmethod
+ def _verify_model(model_path):
+ model_path = Path(model_path)
+ if not model_path.exists():
+            raise FileNotFoundError(f'{model_path} does not exist.')
+ if not model_path.is_file():
+ raise FileExistsError(f'{model_path} is not a file.')
+
+
+def read_yaml(yaml_path: Union[str, Path]) -> Dict:
+ if not Path(yaml_path).exists():
+        raise FileNotFoundError(f'The {yaml_path} does not exist.')
+
+ with open(str(yaml_path), 'rb') as f:
+ data = yaml.load(f, Loader=yaml.Loader)
+ return data
+
+
+@functools.lru_cache()
+def get_logger(name='rapid_paraformer'):
+ """Initialize and get a logger by name.
+ If the logger has not been initialized, this method will initialize the
+ logger by adding one or two handlers, otherwise the initialized logger will
+ be directly returned. During initialization, a StreamHandler will always be
+ added.
+ Args:
+ name (str): Logger name.
+ Returns:
+ logging.Logger: The expected logger.
+ """
+ logger = logging.getLogger(name)
+ if name in logger_initialized:
+ return logger
+
+ for logger_name in logger_initialized:
+ if name.startswith(logger_name):
+ return logger
+
+ formatter = logging.Formatter(
+ '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
+ datefmt="%Y/%m/%d %H:%M:%S")
+
+ sh = logging.StreamHandler()
+ sh.setFormatter(formatter)
+ logger.addHandler(sh)
+ logger_initialized[name] = True
+ logger.propagate = False
+ return logger
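OrtInferSession above picks the CUDA execution provider only when a GPU device id is given and onnxruntime reports it available, otherwise it runs on CPU, and __call__ zips the input arrays to the model's input names in order. A minimal sketch with a hypothetical model path:

    import numpy as np
    from funasr_onnx.utils.utils import OrtInferSession

    sess = OrtInferSession('model.onnx', device_id=-1)  # -1 keeps inference on CPU
    print(sess.get_input_names(), sess.get_output_names())

    feats = np.random.randn(1, 10, 560).astype(np.float32)  # shape depends on the model
    outputs = sess([feats])  # positional inputs, zipped to the input names
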
diff --git a/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
new file mode 100644
index 0000000..58913bb
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py
@@ -0,0 +1,166 @@
+# -*- encoding: utf-8 -*-
+
+import os.path
+from pathlib import Path
+from typing import List, Union, Tuple
+
+import copy
+import librosa
+import numpy as np
+
+from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
+ OrtInferSession, TokenIDConverter, get_logger,
+ read_yaml)
+from .utils.postprocess_utils import sentence_postprocess
+from .utils.frontend import WavFrontend
+from .utils.timestamp_utils import time_stamp_lfr6_onnx
+from .utils.e2e_vad import E2EVadModel
+
+logging = get_logger()
+
+
+class Fsmn_vad():
+ def __init__(self, model_dir: Union[str, Path] = None,
+ batch_size: int = 1,
+ device_id: Union[str, int] = "-1",
+ quantize: bool = False,
+ intra_op_num_threads: int = 4,
+ max_end_sil: int = 800,
+ ):
+
+ if not Path(model_dir).exists():
+ raise FileNotFoundError(f'{model_dir} does not exist.')
+
+ model_file = os.path.join(model_dir, 'model.onnx')
+ if quantize:
+ model_file = os.path.join(model_dir, 'model_quant.onnx')
+ config_file = os.path.join(model_dir, 'vad.yaml')
+ cmvn_file = os.path.join(model_dir, 'vad.mvn')
+ config = read_yaml(config_file)
+
+ self.frontend = WavFrontend(
+ cmvn_file=cmvn_file,
+ **config['frontend_conf']
+ )
+ self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
+ self.batch_size = batch_size
+        self.vad_scorer = E2EVadModel(config["vad_post_conf"])
+ self.max_end_sil = max_end_sil
+
+    def prepare_cache(self, in_cache: list = None):
+        # avoid a mutable default argument; an empty or None cache triggers initialization
+        if in_cache:
+            return in_cache
+        in_cache = []
+
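+        # the FSMN-VAD model expects 4 cache tensors of shape (1, 128, 19, 1);
+        # they are randomly initialized here to seed the first chunk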
+ for i in range(4):
+ cache = np.random.rand(1, 128, 19, 1).astype(np.float32)
+ in_cache.append(cache)
+ return in_cache
+
+
+ def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
+ waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
+ waveform_nums = len(waveform_list)
+        is_final = kwargs.get('is_final', False)
+
+ asr_res = []
+ for beg_idx in range(0, waveform_nums, self.batch_size):
+
+ end_idx = min(waveform_nums, beg_idx + self.batch_size)
+ waveform = waveform_list[beg_idx:end_idx]
+ feats, feats_len = self.extract_feat(waveform)
+ param_dict = kwargs.get('param_dict', dict())
+ in_cache = param_dict.get('cache', list())
+ in_cache = self.prepare_cache(in_cache)
+ try:
+                inputs = [feats]
+                inputs.extend(in_cache)
+                scores, out_caches = self.infer(inputs)
+                param_dict['cache'] = out_caches
+                segments = self.vad_scorer(scores, waveform[0][None, :], is_final=is_final, max_end_sil=self.max_end_sil)
+
+ except ONNXRuntimeError:
+ # logging.warning(traceback.format_exc())
+ logging.warning("input wav is silence or noise")
+                segments = []
+ asr_res.append(segments)
+
+ return asr_res
+
+ def load_data(self,
+ wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
+ def load_wav(path: str) -> np.ndarray:
+ waveform, _ = librosa.load(path, sr=fs)
+ return waveform
+
+ if isinstance(wav_content, np.ndarray):
+ return [wav_content]
+
+ if isinstance(wav_content, str):
+ return [load_wav(wav_content)]
+
+ if isinstance(wav_content, list):
+ return [load_wav(path) for path in wav_content]
+
+ raise TypeError(
+ f'The type of {wav_content} is not in [str, np.ndarray, list]')
+
+ def extract_feat(self,
+ waveform_list: List[np.ndarray]
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ feats, feats_len = [], []
+ for waveform in waveform_list:
+ speech, _ = self.frontend.fbank(waveform)
+ feat, feat_len = self.frontend.lfr_cmvn(speech)
+ feats.append(feat)
+ feats_len.append(feat_len)
+
+ feats = self.pad_feats(feats, np.max(feats_len))
+ feats_len = np.array(feats_len).astype(np.int32)
+ return feats, feats_len
+
+ @staticmethod
+ def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+ def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+ pad_width = ((0, max_feat_len - cur_len), (0, 0))
+ return np.pad(feat, pad_width, 'constant', constant_values=0)
+
+ feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+ feats = np.array(feat_res).astype(np.float32)
+ return feats
+
+    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:
+        outputs = self.ort_infer(feats)
+        scores, out_caches = outputs[0], outputs[1:]
+        return scores, out_caches
+
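+    # NOTE: decode/decode_one below are leftovers copied from paraformer_bin;
+    # they reference self.converter and self.pred_bias, which Fsmn_vad does not
+    # define, and are not called anywhere in this class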
+ def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
+ return [self.decode_one(am_score, token_num)
+ for am_score, token_num in zip(am_scores, token_nums)]
+
+ def decode_one(self,
+ am_score: np.ndarray,
+ valid_token_num: int) -> List[str]:
+ yseq = am_score.argmax(axis=-1)
+ score = am_score.max(axis=-1)
+ score = np.sum(score, axis=-1)
+
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ # asr_model.sos:1 asr_model.eos:2
+ yseq = np.array([1] + yseq.tolist() + [2])
+ hyp = Hypothesis(yseq=yseq, score=score)
+
+ # remove sos/eos and get results
+ last_pos = -1
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove blank symbol id, which is assumed to be 0
+ token_int = list(filter(lambda x: x not in (0, 2), token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+ token = token[:valid_token_num - self.pred_bias]
+ # texts = sentence_postprocess(token)
+ return token
diff --git a/funasr/runtime/python/onnxruntime/demo.py b/funasr/runtime/python/onnxruntime/demo.py
index 248d2e1..48d54e9 100644
--- a/funasr/runtime/python/onnxruntime/demo.py
+++ b/funasr/runtime/python/onnxruntime/demo.py
@@ -1,8 +1,6 @@
from funasr_onnx import Paraformer
-#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"
# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0
diff --git a/funasr/runtime/python/onnxruntime/demo_vad.py b/funasr/runtime/python/onnxruntime/demo_vad.py
new file mode 100644
index 0000000..ae033cc
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/demo_vad.py
@@ -0,0 +1,12 @@
+
+from funasr_onnx import Fsmn_vad
+
+
+model_dir = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+
+model = Fsmn_vad(model_dir)
+
+wav_path = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+
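+# result holds the detected speech segments as [start_ms, end_ms] pairs (in ms)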
+result = model(wav_path)
+print(result)
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg
new file mode 100644
index 0000000..b24107b
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg
Binary files differ
diff --git a/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg
new file mode 100644
index 0000000..a7ccaf5
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg
Binary files differ
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO
new file mode 100644
index 0000000..94d2cb8
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO
@@ -0,0 +1,80 @@
+Metadata-Version: 2.1
+Name: funasr-onnx
+Version: 0.0.3
+Summary: FunASR: A Fundamental End-to-End Speech Recognition Toolkit
+Home-page: https://github.com/alibaba-damo-academy/FunASR.git
+Author: Speech Lab, Alibaba Group, China
+Author-email: funasr@list.alibaba-inc.com
+License: MIT
+Keywords: funasr,asr
+Platform: Any
+Classifier: Programming Language :: Python :: 3.6
+Classifier: Programming Language :: Python :: 3.7
+Classifier: Programming Language :: Python :: 3.8
+Classifier: Programming Language :: Python :: 3.9
+Classifier: Programming Language :: Python :: 3.10
+Description-Content-Type: text/markdown
+
+## Using funasr with ONNXRuntime
+
+
+### Introduction
+- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
+
+
+### Steps:
+1. Export the model.
+   - Command: (`Tip`: torch >= 1.11.0 is required.)
+
+       For more details, refer to the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export).
+
+   - e.g., export the model from ModelScope:
+ ```shell
+ python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
+ ```
+   - e.g., export the model from a local path; the model file name must be `model.pb`:
+ ```shell
+ python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
+ ```
+
+
+2. Install `funasr_onnx`.
+
+Install from pip:
+```shell
+pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple
+```
+
+or install from source:
+
+```shell
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+cd funasr/runtime/python/funasr_onnx
+python setup.py build
+python setup.py install
+```
+
+3. Run the demo.
+ - Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
+   - Input: wav file(s); supported input types: `str`, `np.ndarray`, `List[str]`
+   - Output: `List[str]`: the recognition results.
+ - Example:
+ ```python
+ from funasr_onnx import Paraformer
+
+ model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ model = Paraformer(model_dir, batch_size=1)
+
+ wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+ result = model(wav_path)
+ print(result)
+ ```
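+  - VAD example: this patch also exports `Fsmn_vad`; a minimal usage sketch (the paths below are placeholders, mirroring `demo_vad.py`):
+   ```python
+   from funasr_onnx import Fsmn_vad
+
+   model_dir = "/path/to/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+   model = Fsmn_vad(model_dir)
+
+   wav_path = "/path/to/vad_example.wav"
+
+   # prints the detected speech segments as [start_ms, end_ms] pairs
+   result = model(wav_path)
+   print(result)
+   ```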
+
+## Performance benchmark
+
+Please refer to the [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_onnx.md)
+
+## Acknowledgements
+1. This project is maintained by the [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We thank [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime runtime (for the Paraformer model).
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt
new file mode 100644
index 0000000..e759e27
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt
@@ -0,0 +1,17 @@
+README.md
+setup.py
+funasr_onnx/__init__.py
+funasr_onnx/paraformer_bin.py
+funasr_onnx/punc_bin.py
+funasr_onnx/vad_bin.py
+funasr_onnx.egg-info/PKG-INFO
+funasr_onnx.egg-info/SOURCES.txt
+funasr_onnx.egg-info/dependency_links.txt
+funasr_onnx.egg-info/requires.txt
+funasr_onnx.egg-info/top_level.txt
+funasr_onnx/utils/__init__.py
+funasr_onnx/utils/e2e_vad.py
+funasr_onnx/utils/frontend.py
+funasr_onnx/utils/postprocess_utils.py
+funasr_onnx/utils/timestamp_utils.py
+funasr_onnx/utils/utils.py
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt
@@ -0,0 +1 @@
+
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt
new file mode 100644
index 0000000..6fcb632
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt
@@ -0,0 +1,7 @@
+librosa
+onnxruntime>=1.7.0
+scipy
+numpy>=1.19.3
+typeguard
+kaldi-native-fbank
+PyYAML>=5.1.2
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt
new file mode 100644
index 0000000..de41eb9
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt
@@ -0,0 +1 @@
+funasr_onnx
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py b/funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
index 647f9fa..4750479 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py
@@ -1,2 +1,3 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer
+from .vad_bin import Fsmn_vad
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
new file mode 100644
index 0000000..3f6c3d1
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
@@ -0,0 +1,607 @@
+from enum import Enum
+from typing import List, Tuple, Dict, Any
+
+import math
+import numpy as np
+
+class VadStateMachine(Enum):
+ kVadInStateStartPointNotDetected = 1
+ kVadInStateInSpeechSegment = 2
+ kVadInStateEndPointDetected = 3
+
+
+class FrameState(Enum):
+ kFrameStateInvalid = -1
+ kFrameStateSpeech = 1
+ kFrameStateSil = 0
+
+
+# final voice/unvoice state per frame
+class AudioChangeState(Enum):
+ kChangeStateSpeech2Speech = 0
+ kChangeStateSpeech2Sil = 1
+ kChangeStateSil2Sil = 2
+ kChangeStateSil2Speech = 3
+ kChangeStateNoBegin = 4
+ kChangeStateInvalid = 5
+
+
+class VadDetectMode(Enum):
+ kVadSingleUtteranceDetectMode = 0
+ kVadMutipleUtteranceDetectMode = 1
+
+
+class VADXOptions:
+ def __init__(
+ self,
+ sample_rate: int = 16000,
+ detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
+ snr_mode: int = 0,
+ max_end_silence_time: int = 800,
+ max_start_silence_time: int = 3000,
+ do_start_point_detection: bool = True,
+ do_end_point_detection: bool = True,
+ window_size_ms: int = 200,
+ sil_to_speech_time_thres: int = 150,
+ speech_to_sil_time_thres: int = 150,
+ speech_2_noise_ratio: float = 1.0,
+ do_extend: int = 1,
+ lookback_time_start_point: int = 200,
+ lookahead_time_end_point: int = 100,
+ max_single_segment_time: int = 60000,
+ nn_eval_block_size: int = 8,
+ dcd_block_size: int = 4,
+        snr_thres: float = -100.0,
+        noise_frame_num_used_for_snr: int = 100,
+        decibel_thres: float = -100.0,
+ speech_noise_thres: float = 0.6,
+ fe_prior_thres: float = 1e-4,
+ silence_pdf_num: int = 1,
+ sil_pdf_ids: List[int] = [0],
+ speech_noise_thresh_low: float = -0.1,
+ speech_noise_thresh_high: float = 0.3,
+ output_frame_probs: bool = False,
+ frame_in_ms: int = 10,
+ frame_length_ms: int = 25,
+ ):
+ self.sample_rate = sample_rate
+ self.detect_mode = detect_mode
+ self.snr_mode = snr_mode
+ self.max_end_silence_time = max_end_silence_time
+ self.max_start_silence_time = max_start_silence_time
+ self.do_start_point_detection = do_start_point_detection
+ self.do_end_point_detection = do_end_point_detection
+ self.window_size_ms = window_size_ms
+ self.sil_to_speech_time_thres = sil_to_speech_time_thres
+ self.speech_to_sil_time_thres = speech_to_sil_time_thres
+ self.speech_2_noise_ratio = speech_2_noise_ratio
+ self.do_extend = do_extend
+ self.lookback_time_start_point = lookback_time_start_point
+ self.lookahead_time_end_point = lookahead_time_end_point
+ self.max_single_segment_time = max_single_segment_time
+ self.nn_eval_block_size = nn_eval_block_size
+ self.dcd_block_size = dcd_block_size
+ self.snr_thres = snr_thres
+ self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
+ self.decibel_thres = decibel_thres
+ self.speech_noise_thres = speech_noise_thres
+ self.fe_prior_thres = fe_prior_thres
+ self.silence_pdf_num = silence_pdf_num
+ self.sil_pdf_ids = sil_pdf_ids
+ self.speech_noise_thresh_low = speech_noise_thresh_low
+ self.speech_noise_thresh_high = speech_noise_thresh_high
+ self.output_frame_probs = output_frame_probs
+ self.frame_in_ms = frame_in_ms
+ self.frame_length_ms = frame_length_ms
+
+
+class E2EVadSpeechBufWithDoa(object):
+ def __init__(self):
+ self.start_ms = 0
+ self.end_ms = 0
+ self.buffer = []
+ self.contain_seg_start_point = False
+ self.contain_seg_end_point = False
+ self.doa = 0
+
+ def Reset(self):
+ self.start_ms = 0
+ self.end_ms = 0
+ self.buffer = []
+ self.contain_seg_start_point = False
+ self.contain_seg_end_point = False
+ self.doa = 0
+
+
+class E2EVadFrameProb(object):
+ def __init__(self):
+ self.noise_prob = 0.0
+ self.speech_prob = 0.0
+ self.score = 0.0
+ self.frame_id = 0
+ self.frm_state = 0
+
+
+class WindowDetector(object):
+ def __init__(self, window_size_ms: int, sil_to_speech_time: int,
+ speech_to_sil_time: int, frame_size_ms: int):
+ self.window_size_ms = window_size_ms
+ self.sil_to_speech_time = sil_to_speech_time
+ self.speech_to_sil_time = speech_to_sil_time
+ self.frame_size_ms = frame_size_ms
+
+ self.win_size_frame = int(window_size_ms / frame_size_ms)
+ self.win_sum = 0
+        self.win_state = [0] * self.win_size_frame  # initialize the sliding-window frame states
+
+ self.cur_win_pos = 0
+ self.pre_frame_state = FrameState.kFrameStateSil
+ self.cur_frame_state = FrameState.kFrameStateSil
+ self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
+ self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)
+
+ self.voice_last_frame_count = 0
+ self.noise_last_frame_count = 0
+ self.hydre_frame_count = 0
+
+ def Reset(self) -> None:
+ self.cur_win_pos = 0
+ self.win_sum = 0
+ self.win_state = [0] * self.win_size_frame
+ self.pre_frame_state = FrameState.kFrameStateSil
+ self.cur_frame_state = FrameState.kFrameStateSil
+ self.voice_last_frame_count = 0
+ self.noise_last_frame_count = 0
+ self.hydre_frame_count = 0
+
+ def GetWinSize(self) -> int:
+ return int(self.win_size_frame)
+
+ def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
+        cur_frame_state = 0  # win_state stores frame states as ints (1 = speech, 0 = sil)
+ if frameState == FrameState.kFrameStateSpeech:
+ cur_frame_state = 1
+ elif frameState == FrameState.kFrameStateSil:
+ cur_frame_state = 0
+ else:
+ return AudioChangeState.kChangeStateInvalid
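+        # sliding-window vote: replace the oldest frame state with the newest,
+        # then compare the speech-frame count in the window against the thresholds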
+ self.win_sum -= self.win_state[self.cur_win_pos]
+ self.win_sum += cur_frame_state
+ self.win_state[self.cur_win_pos] = cur_frame_state
+ self.cur_win_pos = (self.cur_win_pos + 1) % self.win_size_frame
+
+ if self.pre_frame_state == FrameState.kFrameStateSil and self.win_sum >= self.sil_to_speech_frmcnt_thres:
+ self.pre_frame_state = FrameState.kFrameStateSpeech
+ return AudioChangeState.kChangeStateSil2Speech
+
+ if self.pre_frame_state == FrameState.kFrameStateSpeech and self.win_sum <= self.speech_to_sil_frmcnt_thres:
+ self.pre_frame_state = FrameState.kFrameStateSil
+ return AudioChangeState.kChangeStateSpeech2Sil
+
+ if self.pre_frame_state == FrameState.kFrameStateSil:
+ return AudioChangeState.kChangeStateSil2Sil
+ if self.pre_frame_state == FrameState.kFrameStateSpeech:
+ return AudioChangeState.kChangeStateSpeech2Speech
+ return AudioChangeState.kChangeStateInvalid
+
+ def FrameSizeMs(self) -> int:
+ return int(self.frame_size_ms)
+
+
+class E2EVadModel():
+ def __init__(self, vad_post_args: Dict[str, Any]):
+ super(E2EVadModel, self).__init__()
+ self.vad_opts = VADXOptions(**vad_post_args)
+ self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
+ self.vad_opts.sil_to_speech_time_thres,
+ self.vad_opts.speech_to_sil_time_thres,
+ self.vad_opts.frame_in_ms)
+ # self.encoder = encoder
+ # init variables
+ self.is_final = False
+ self.data_buf_start_frame = 0
+ self.frm_cnt = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.continous_silence_frame_count = 0
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.number_end_time_detected = 0
+ self.sil_frame = 0
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
+ self.noise_average_decibel = -100.0
+ self.pre_end_silence_detected = False
+ self.next_seg = True
+
+ self.output_data_buf = []
+ self.output_data_buf_offset = 0
+ self.frame_probs = []
+ self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
+ self.scores = None
+ self.max_time_out = False
+ self.decibel = []
+ self.data_buf = None
+ self.data_buf_all = None
+ self.waveform = None
+ self.ResetDetection()
+
+ def AllResetDetection(self):
+ self.is_final = False
+ self.data_buf_start_frame = 0
+ self.frm_cnt = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.continous_silence_frame_count = 0
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.number_end_time_detected = 0
+ self.sil_frame = 0
+ self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
+ self.noise_average_decibel = -100.0
+ self.pre_end_silence_detected = False
+ self.next_seg = True
+
+ self.output_data_buf = []
+ self.output_data_buf_offset = 0
+ self.frame_probs = []
+ self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
+ self.speech_noise_thres = self.vad_opts.speech_noise_thres
+ self.scores = None
+ self.max_time_out = False
+ self.decibel = []
+ self.data_buf = None
+ self.data_buf_all = None
+ self.waveform = None
+ self.ResetDetection()
+
+ def ResetDetection(self):
+ self.continous_silence_frame_count = 0
+ self.latest_confirmed_speech_frame = 0
+ self.lastest_confirmed_silence_frame = -1
+ self.confirmed_start_frame = -1
+ self.confirmed_end_frame = -1
+ self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
+ self.windows_detector.Reset()
+ self.sil_frame = 0
+ self.frame_probs = []
+
+ def ComputeDecibel(self) -> None:
+ frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
+ frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
+ if self.data_buf_all is None:
+ self.data_buf_all = self.waveform[0] # self.data_buf is pointed to self.waveform[0]
+ self.data_buf = self.data_buf_all
+ else:
+ self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
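+        # per-frame energy in dB: 10 * log10(sum(x^2) + 1e-6), computed over
+        # frame_length_ms frames with a frame_in_ms shift (25 ms / 10 ms by default)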
+ for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
+ self.decibel.append(
+ 10 * math.log10(np.square((self.waveform[0][offset: offset + frame_sample_length])).sum() + \
+ 0.000001))
+
+ def ComputeScores(self, scores: np.ndarray) -> None:
+ # scores = self.encoder(feats, in_cache) # return B * T * D
+ self.vad_opts.nn_eval_block_size = scores.shape[1]
+ self.frm_cnt += scores.shape[1] # count total frames
+ if self.scores is None:
+ self.scores = scores # the first calculation
+ else:
+ self.scores = np.concatenate((self.scores, scores), axis=1)
+
+ def PopDataBufTillFrame(self, frame_idx: int) -> None: # need check again
+ while self.data_buf_start_frame < frame_idx:
+ if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
+ self.data_buf_start_frame += 1
+ self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
+ self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
+
+ def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
+ last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
+ self.PopDataBufTillFrame(start_frm)
+ expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
+ if last_frm_is_end_point:
+ extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
+ self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
+ expected_sample_number += int(extra_sample)
+ if end_point_is_sent_end:
+ expected_sample_number = max(expected_sample_number, len(self.data_buf))
+ if len(self.data_buf) < expected_sample_number:
+ print('error in calling pop data_buf\n')
+
+ if len(self.output_data_buf) == 0 or first_frm_is_start_point:
+ self.output_data_buf.append(E2EVadSpeechBufWithDoa())
+ self.output_data_buf[-1].Reset()
+ self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
+ self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
+ self.output_data_buf[-1].doa = 0
+ cur_seg = self.output_data_buf[-1]
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
+ print('warning\n')
+        out_pos = len(cur_seg.buffer)  # cur_seg.buffer is not actually modified here
+ data_to_pop = 0
+ if end_point_is_sent_end:
+ data_to_pop = expected_sample_number
+ else:
+ data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
+ if data_to_pop > len(self.data_buf):
+ print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
+ data_to_pop = len(self.data_buf)
+ expected_sample_number = len(self.data_buf)
+
+ cur_seg.doa = 0
+ for sample_cpy_out in range(0, data_to_pop):
+ # cur_seg.buffer[out_pos ++] = data_buf_.back();
+ out_pos += 1
+ for sample_cpy_out in range(data_to_pop, expected_sample_number):
+ # cur_seg.buffer[out_pos++] = data_buf_.back()
+ out_pos += 1
+ if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
+ print('Something wrong with the VAD algorithm\n')
+ self.data_buf_start_frame += frm_cnt
+ cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
+ if first_frm_is_start_point:
+ cur_seg.contain_seg_start_point = True
+ if last_frm_is_end_point:
+ cur_seg.contain_seg_end_point = True
+
+ def OnSilenceDetected(self, valid_frame: int):
+ self.lastest_confirmed_silence_frame = valid_frame
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ self.PopDataBufTillFrame(valid_frame)
+ # silence_detected_callback_
+ # pass
+
+ def OnVoiceDetected(self, valid_frame: int) -> None:
+ self.latest_confirmed_speech_frame = valid_frame
+ self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
+
+ def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
+ if self.vad_opts.do_start_point_detection:
+ pass
+ if self.confirmed_start_frame != -1:
+ print('not reset vad properly\n')
+ else:
+ self.confirmed_start_frame = start_frame
+
+ if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)
+
+ def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
+ for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
+ self.OnVoiceDetected(t)
+ if self.vad_opts.do_end_point_detection:
+ pass
+ if self.confirmed_end_frame != -1:
+ print('not reset vad properly\n')
+ else:
+ self.confirmed_end_frame = end_frame
+ if not fake_result:
+ self.sil_frame = 0
+ self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
+ self.number_end_time_detected += 1
+
+ def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
+ if is_final_frame:
+ self.OnVoiceEnd(cur_frm_idx, False, True)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+
+ def GetLatency(self) -> int:
+ return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)
+
+ def LatencyFrmNumAtStartPoint(self) -> int:
+ vad_latency = self.windows_detector.GetWinSize()
+ if self.vad_opts.do_extend:
+ vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
+ return vad_latency
+
+ def GetFrameState(self, t: int) -> FrameState:
+ frame_state = FrameState.kFrameStateInvalid
+ cur_decibel = self.decibel[t]
+ cur_snr = cur_decibel - self.noise_average_decibel
+ # for each frame, calc log posterior probability of each state
+ if cur_decibel < self.vad_opts.decibel_thres:
+ frame_state = FrameState.kFrameStateSil
+ self.DetectOneFrame(frame_state, t, False)
+ return frame_state
+
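+        # split the posterior mass between the silence pdfs (noise) and everything
+        # else (speech); the frame counts as speech only if the speech probability
+        # exceeds the noise probability by at least speech_noise_thres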
+ sum_score = 0.0
+ noise_prob = 0.0
+ assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
+ if len(self.sil_pdf_ids) > 0:
+            assert len(self.scores) == 1  # only batch_size = 1 is supported
+ sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
+ sum_score = sum(sil_pdf_scores)
+ noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
+ total_score = 1.0
+ sum_score = total_score - sum_score
+ speech_prob = math.log(sum_score)
+ if self.vad_opts.output_frame_probs:
+ frame_prob = E2EVadFrameProb()
+ frame_prob.noise_prob = noise_prob
+ frame_prob.speech_prob = speech_prob
+ frame_prob.score = sum_score
+ frame_prob.frame_id = t
+ self.frame_probs.append(frame_prob)
+ if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
+ if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
+ frame_state = FrameState.kFrameStateSpeech
+ else:
+ frame_state = FrameState.kFrameStateSil
+ else:
+ frame_state = FrameState.kFrameStateSil
+ if self.noise_average_decibel < -99.9:
+ self.noise_average_decibel = cur_decibel
+ else:
+ self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
+ self.vad_opts.noise_frame_num_used_for_snr
+ - 1)) / self.vad_opts.noise_frame_num_used_for_snr
+
+ return frame_state
+
+
+ def __call__(self, score: np.ndarray, waveform: np.ndarray,
+ is_final: bool = False, max_end_sil: int = 800
+ ):
+ self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
+ self.waveform = waveform # compute decibel for each frame
+ self.ComputeDecibel()
+ self.ComputeScores(score)
+ if not is_final:
+ self.DetectCommonFrames()
+ else:
+ self.DetectLastFrames()
+ segments = []
+ for batch_num in range(0, score.shape[0]): # only support batch_size = 1 now
+ segment_batch = []
+ if len(self.output_data_buf) > 0:
+ for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
+ if not self.output_data_buf[i].contain_seg_start_point:
+ continue
+ if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
+ continue
+ start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
+ if self.output_data_buf[i].contain_seg_end_point:
+ end_ms = self.output_data_buf[i].end_ms
+ self.next_seg = True
+ self.output_data_buf_offset += 1
+ else:
+ end_ms = -1
+ self.next_seg = False
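+                    # -1 marks an open boundary in streaming mode: start_ms == -1 means
+                    # the segment started in an earlier chunk, end_ms == -1 means the
+                    # end point has not been detected yet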
+ segment = [start_ms, end_ms]
+ segment_batch.append(segment)
+ if segment_batch:
+ segments.append(segment_batch)
+ if is_final:
+ # reset class variables and clear the dict for the next query
+ self.AllResetDetection()
+ return segments
+
+ def DetectCommonFrames(self) -> int:
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+ return 0
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+ frame_state = FrameState.kFrameStateInvalid
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+
+ return 0
+
+ def DetectLastFrames(self) -> int:
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+ return 0
+ for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+ frame_state = FrameState.kFrameStateInvalid
+ frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+ if i != 0:
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+ else:
+ self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
+
+ return 0
+
+ def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
+ tmp_cur_frm_state = FrameState.kFrameStateInvalid
+ if cur_frm_state == FrameState.kFrameStateSpeech:
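+            # math.fabs(1.0) is a constant, so with the default fe_prior_thres this
+            # branch is effectively always taken; it appears to be a placeholder for
+            # a frame-level prior check kept from the original port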
+ if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
+ tmp_cur_frm_state = FrameState.kFrameStateSpeech
+ else:
+ tmp_cur_frm_state = FrameState.kFrameStateSil
+ elif cur_frm_state == FrameState.kFrameStateSil:
+ tmp_cur_frm_state = FrameState.kFrameStateSil
+ state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
+ frm_shift_in_ms = self.vad_opts.frame_in_ms
+ if AudioChangeState.kChangeStateSil2Speech == state_change:
+ silence_frame_count = self.continous_silence_frame_count
+ self.continous_silence_frame_count = 0
+ self.pre_end_silence_detected = False
+ start_frame = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
+ self.OnVoiceStart(start_frame)
+ self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
+ for t in range(start_frame + 1, cur_frm_idx + 1):
+ self.OnVoiceDetected(t)
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
+ self.OnVoiceDetected(t)
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
+ self.continous_silence_frame_count = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ pass
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
+ self.continous_silence_frame_count = 0
+ if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.max_time_out = True
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif not is_final_frame:
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+ elif AudioChangeState.kChangeStateSil2Sil == state_change:
+ self.continous_silence_frame_count += 1
+ if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
+ # silence timeout, return zero length decision
+ if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
+ self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
+ or (is_final_frame and self.number_end_time_detected == 0):
+ for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
+ self.OnSilenceDetected(t)
+ self.OnVoiceStart(0, True)
+                    self.OnVoiceEnd(0, True, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ else:
+ if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
+ self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
+ elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
+ if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
+ lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
+ if self.vad_opts.do_extend:
+ lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
+ lookback_frame -= 1
+ lookback_frame = max(0, lookback_frame)
+ self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif cur_frm_idx - self.confirmed_start_frame + 1 > \
+ self.vad_opts.max_single_segment_time / frm_shift_in_ms:
+ self.OnVoiceEnd(cur_frm_idx, False, False)
+ self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
+ elif self.vad_opts.do_extend and not is_final_frame:
+ if self.continous_silence_frame_count <= int(
+ self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
+ self.OnVoiceDetected(cur_frm_idx)
+ else:
+ self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
+ else:
+ pass
+
+ if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
+ self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
+ self.ResetDetection()
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
index 2edde11..fccd5a0 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py
@@ -188,7 +188,7 @@
input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
input_dict = dict(zip(self.get_input_names(), input_content))
try:
- return self.session.run(None, input_dict)
+ return self.session.run(self.get_output_names(), input_dict)
except Exception as e:
raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py b/funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py
new file mode 100644
index 0000000..533b4b7
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py
@@ -0,0 +1,134 @@
+# -*- encoding: utf-8 -*-
+
+import os.path
+from pathlib import Path
+from typing import List, Union, Tuple
+
+import copy
+import librosa
+import numpy as np
+
+from .utils.utils import (ONNXRuntimeError,
+ OrtInferSession, get_logger,
+ read_yaml)
+from .utils.frontend import WavFrontend
+from .utils.e2e_vad import E2EVadModel
+
+logging = get_logger()
+
+
+class Fsmn_vad():
+ def __init__(self, model_dir: Union[str, Path] = None,
+ batch_size: int = 1,
+ device_id: Union[str, int] = "-1",
+ quantize: bool = False,
+ intra_op_num_threads: int = 4,
+ max_end_sil: int = 800,
+ ):
+
+ if not Path(model_dir).exists():
+ raise FileNotFoundError(f'{model_dir} does not exist.')
+
+ model_file = os.path.join(model_dir, 'model.onnx')
+ if quantize:
+ model_file = os.path.join(model_dir, 'model_quant.onnx')
+ config_file = os.path.join(model_dir, 'vad.yaml')
+ cmvn_file = os.path.join(model_dir, 'vad.mvn')
+ config = read_yaml(config_file)
+
+ self.frontend = WavFrontend(
+ cmvn_file=cmvn_file,
+ **config['frontend_conf']
+ )
+ self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
+ self.batch_size = batch_size
+ self.vad_scorer = E2EVadModel(config["vad_post_conf"])
+ self.max_end_sil = max_end_sil
+
+    def prepare_cache(self, in_cache: list = None):
+        # avoid a mutable default argument; an empty or None cache triggers initialization
+        if in_cache:
+            return in_cache
+        in_cache = []
+
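+        # the FSMN-VAD model expects 4 cache tensors of shape (1, 128, 19, 1);
+        # they are randomly initialized here to seed the first chunk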
+ for i in range(4):
+ cache = np.random.rand(1, 128, 19, 1).astype(np.float32)
+ in_cache.append(cache)
+ return in_cache
+
+
+ def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
+ waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
+ waveform_nums = len(waveform_list)
+        is_final = kwargs.get('is_final', False)
+
+ asr_res = []
+ for beg_idx in range(0, waveform_nums, self.batch_size):
+
+ end_idx = min(waveform_nums, beg_idx + self.batch_size)
+ waveform = waveform_list[beg_idx:end_idx]
+ feats, feats_len = self.extract_feat(waveform)
+ param_dict = kwargs.get('param_dict', dict())
+ in_cache = param_dict.get('cache', list())
+ in_cache = self.prepare_cache(in_cache)
+ try:
+ inputs = [feats]
+ inputs.extend(in_cache)
+ scores, out_caches = self.infer(inputs)
+ param_dict['cache'] = out_caches
+ segments = self.vad_scorer(scores, waveform[0][None, :], is_final=is_final, max_end_sil=self.max_end_sil)
+
+ except ONNXRuntimeError:
+ # logging.warning(traceback.format_exc())
+ logging.warning("input wav is silence or noise")
+                segments = []
+ asr_res.append(segments)
+
+ return asr_res
+
+ def load_data(self,
+ wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
+ def load_wav(path: str) -> np.ndarray:
+ waveform, _ = librosa.load(path, sr=fs)
+ return waveform
+
+ if isinstance(wav_content, np.ndarray):
+ return [wav_content]
+
+ if isinstance(wav_content, str):
+ return [load_wav(wav_content)]
+
+ if isinstance(wav_content, list):
+ return [load_wav(path) for path in wav_content]
+
+ raise TypeError(
+ f'The type of {wav_content} is not in [str, np.ndarray, list]')
+
+ def extract_feat(self,
+ waveform_list: List[np.ndarray]
+ ) -> Tuple[np.ndarray, np.ndarray]:
+ feats, feats_len = [], []
+ for waveform in waveform_list:
+ speech, _ = self.frontend.fbank(waveform)
+ feat, feat_len = self.frontend.lfr_cmvn(speech)
+ feats.append(feat)
+ feats_len.append(feat_len)
+
+ feats = self.pad_feats(feats, np.max(feats_len))
+ feats_len = np.array(feats_len).astype(np.int32)
+ return feats, feats_len
+
+ @staticmethod
+ def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+ def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+ pad_width = ((0, max_feat_len - cur_len), (0, 0))
+ return np.pad(feat, pad_width, 'constant', constant_values=0)
+
+ feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+ feats = np.array(feat_res).astype(np.float32)
+ return feats
+
+ def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:
+
+ outputs = self.ort_infer(feats)
+ scores, out_caches = outputs[0], outputs[1:]
+ return scores, out_caches
+
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index 3b9ed3b..1a8ed7b 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -13,7 +13,7 @@
MODULE_NAME = 'funasr_onnx'
-VERSION_NUM = '0.0.2'
+VERSION_NUM = '0.0.3'
setuptools.setup(
name=MODULE_NAME,
--
Gitblit v1.9.1