python/FunASR-XL.git

parent: 5e59904f | 补丁 | 提交 | ignore whitespace

游雁

2023-03-29 7c5fdf30f428e22fd0fdb98055834e0d2616d308

export

4个文件已修改

20个文件已添加

	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py	187 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py	607 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py	191 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py	240 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py	59 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py	257 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py	166 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/demo.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/demo_vad.py	12 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg	补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO	80 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt	17 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt	7 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py	607 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py	134 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/setup.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/__init__.py

New file
@@ -0,0 +1,3 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer
from .vad_bin import Fsmn_vad

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/paraformer_bin.py

New file
@@ -0,0 +1,187 @@
# -*- encoding: utf-8 -*-

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                          OrtInferSession, TokenIDConverter, get_logger,
                          read_yaml)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx

logging = get_logger()


class Paraformer():
    """Paraformer speech recognizer executed through ONNX Runtime.

    The constructor reads ``model.onnx`` (or ``model_quant.onnx``),
    ``config.yaml`` and ``am.mvn`` from ``model_dir``; the instance is then
    called with audio to obtain recognition results.
    """

    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 pred_bias: int = 1,
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 ):
        """Load the model, tokenizer and feature frontend.

        Args:
            model_dir: Directory holding the model, ``config.yaml`` and
                ``am.mvn``.
            batch_size: Number of waveforms sent to the model per call.
            device_id: Forwarded unchanged to ``OrtInferSession``.
            plot_timestamp_to: When non-empty and the model returns
                timestamp peaks, ``<plot_timestamp_to>/timestamp.png`` is
                written for each decoded utterance.
            pred_bias: Subtracted from the model's valid-token count when
                trimming decoded tokens (see ``decode_one``).
            quantize: Load ``model_quant.onnx`` instead of ``model.onnx``.
            intra_op_num_threads: Forwarded to ``OrtInferSession``.

        Raises:
            FileNotFoundError: If ``model_dir`` does not exist.
        """
        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)

        self.converter = TokenIDConverter(config['token_list'])
        self.tokenizer = CharTokenizer()
        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        self.pred_bias = pred_bias

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        """Recognize one or more utterances.

        Args:
            wav_content: A waveform array, a path, or a list of paths/arrays.
            **kwargs: Accepted for call compatibility; not used here.

        Returns:
            One dict per input waveform, in input order. Each dict has a
            ``'preds'`` entry; when the model also returns timestamp peaks
            the dict additionally carries ``'timestamp'`` and
            ``'raw_tokens'``. A batch on which inference raises
            ``ONNXRuntimeError`` contributes ``{'preds': ''}`` for each of
            its waveforms so the result list stays aligned with the input.
        """
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
            try:
                outputs = self.infer(feats, feats_len)
            except ONNXRuntimeError:
                #logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                # Keep one (empty) result per waveform; previously the empty
                # prediction was assigned and then silently discarded.
                asr_res.extend({'preds': ''} for _ in range(beg_idx, end_idx))
                continue

            am_scores, valid_token_lens = outputs[0], outputs[1]
            # Four outputs means the model also returned (alphas, peaks),
            # i.e. BiCifParaformer inference; only the peaks are used.
            us_peaks = outputs[3] if len(outputs) == 4 else None

            preds = self.decode(am_scores, valid_token_lens)
            if us_peaks is None:
                for pred in preds:
                    asr_res.append({'preds': sentence_postprocess(pred)})
                continue

            for offset, (raw_tokens, us_peaks_) in enumerate(zip(preds, us_peaks)):
                timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                if self.plot_timestamp_to:
                    # Plot the waveform that belongs to this prediction, not
                    # always the first waveform of the whole request.
                    self.plot_wave_timestamp(waveform_list[beg_idx + offset], timestamp,
                                             self.plot_timestamp_to)
                asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
        """Save a waveform plot annotated with token boundaries.

        Args:
            wav: 1-D waveform array; the time axis assumes 16 kHz.
            text_timestamp: Iterable of ``(char, start, end)`` triples.
            dest: Directory in which ``timestamp.png`` is written.
        """
        import matplotlib
        matplotlib.use('Agg')
        matplotlib.rc("font", family='Alibaba PuHuiTi')  # set it to a font that your system supports
        import matplotlib.pyplot as plt
        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
        try:
            ax2 = ax1.twinx()
            ax2.set_ylim([0, 2.0])
            # plot waveform
            ax1.set_ylim([-0.3, 0.3])
            time = np.arange(wav.shape[0]) / 16000
            ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
            # plot lines and text
            for (char, start, end) in text_timestamp:
                ax1.vlines(start, -0.3, 0.3, ls='--')
                ax1.vlines(end, -0.3, 0.3, ls='--')
                x_adj = 0.045 if char != '<sil>' else 0.12
                ax1.text((start + end) * 0.5 - x_adj, 0, char)
            plotname = "{}/timestamp.png".format(dest)
            plt.savefig(plotname, bbox_inches='tight')
        finally:
            # Figures stay registered with pyplot until closed; release it so
            # repeated calls do not accumulate open figures.
            plt.close(fig)

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        """Normalize the supported input kinds to a list of waveforms.

        Args:
            wav_content: A waveform array (returned as-is), a path, or a
                list whose items are paths or waveform arrays.
            fs: Sample rate passed to ``librosa.load`` for path inputs.

        Raises:
            TypeError: If ``wav_content`` is not a str, ndarray or list.
        """
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            # Arrays inside a list are passed through; everything else is
            # treated as a path, as before.
            return [item if isinstance(item, np.ndarray) else load_wav(item)
                    for item in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute frontend features for a batch and pad them to equal length.

        Returns:
            ``(feats, feats_len)``: the stacked float32 features, zero padded
            along the first axis of each item, and the int32 lengths
            reported by the frontend.
        """
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        """Zero-pad every 2-D feature to ``max_feat_len`` rows and stack as float32."""
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Run the ONNX session on ``[feats, feats_len]`` and return its outputs."""
        outputs = self.ort_infer([feats, feats_len])
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: np.ndarray) -> List[str]:
        """Decode every item of a batch with ``decode_one``.

        ``am_scores`` and ``token_nums`` are iterated in lockstep, one entry
        per utterance.
        """
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        """Greedy-decode a single utterance into tokens.

        Takes the arg-max id per step, drops ids 0 and 2, converts the ids to
        tokens and keeps the first ``valid_token_num - pred_bias`` of them.
        """
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num-self.pred_bias]
        return token


 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/punc_bin.py


 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/__init__.py


 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/e2e_vad.py

New file
@@ -0,0 +1,607 @@
from enum import Enum
from typing import List, Tuple, Dict, Any

import math
import numpy as np

class VadStateMachine(Enum):
    """Top-level detection states held in ``E2EVadModel.vad_state_machine``."""
    # Initial state; also the state restored by E2EVadModel.ResetDetection().
    kVadInStateStartPointNotDetected = 1
    # Entered after a silence-to-speech change while no start point was set.
    kVadInStateInSpeechSegment = 2
    # Once in this state, DetectCommonFrames/DetectLastFrames return at once.
    kVadInStateEndPointDetected = 3


class FrameState(Enum):
    """Per-frame classification fed to ``WindowDetector.DetectOneFrame``."""
    # Any state other than speech/silence; WindowDetector rejects it.
    kFrameStateInvalid = -1
    # Counted as 1 in the WindowDetector sliding window.
    kFrameStateSpeech = 1
    # Counted as 0 in the WindowDetector sliding window.
    kFrameStateSil = 0


# final voice/unvoice state per frame
class AudioChangeState(Enum):
    """Smoothed transition reported by ``WindowDetector.DetectOneFrame``."""
    kChangeStateSpeech2Speech = 0  # previous state speech, still speech
    kChangeStateSpeech2Sil = 1     # previous state speech, now silence
    kChangeStateSil2Sil = 2        # previous state silence, still silence
    kChangeStateSil2Speech = 3     # previous state silence, now speech
    kChangeStateNoBegin = 4        # not produced by WindowDetector in this file section
    kChangeStateInvalid = 5        # returned for an invalid input frame state


class VadDetectMode(Enum):
    """Detection modes; ``VADXOptions.detect_mode`` stores the integer ``.value``."""
    kVadSingleUtteranceDetectMode = 0
    # Default for VADXOptions.detect_mode. The member name keeps its original
    # spelling ("Mutiple") because callers refer to it by name.
    kVadMutipleUtteranceDetectMode = 1


class VADXOptions:
    """Plain container for the VAD post-processing options.

    Every constructor argument is stored under an attribute of the same
    name. ``E2EVadModel`` builds an instance from a keyword dictionary and
    reads the attributes directly.
    """

    def __init__(
            self,
            sample_rate: int = 16000,
            detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
            snr_mode: int = 0,
            max_end_silence_time: int = 800,
            max_start_silence_time: int = 3000,
            do_start_point_detection: bool = True,
            do_end_point_detection: bool = True,
            window_size_ms: int = 200,
            sil_to_speech_time_thres: int = 150,
            speech_to_sil_time_thres: int = 150,
            speech_2_noise_ratio: float = 1.0,
            do_extend: int = 1,
            lookback_time_start_point: int = 200,
            lookahead_time_end_point: int = 100,
            max_single_segment_time: int = 60000,
            nn_eval_block_size: int = 8,
            dcd_block_size: int = 4,
            snr_thres: float = -100.0,
            noise_frame_num_used_for_snr: int = 100,
            decibel_thres: float = -100.0,
            speech_noise_thres: float = 0.6,
            fe_prior_thres: float = 1e-4,
            silence_pdf_num: int = 1,
            sil_pdf_ids: List[int] = [0],
            speech_noise_thresh_low: float = -0.1,
            speech_noise_thresh_high: float = 0.3,
            output_frame_probs: bool = False,
            frame_in_ms: int = 10,
            frame_length_ms: int = 25,
    ):
        """Store the options.

        ``sil_pdf_ids`` is copied into a new list, so neither the shared
        default list nor a list owned by the caller is aliased by the
        instance.
        """
        self.sample_rate = sample_rate
        self.detect_mode = detect_mode
        self.snr_mode = snr_mode
        self.max_end_silence_time = max_end_silence_time
        self.max_start_silence_time = max_start_silence_time
        self.do_start_point_detection = do_start_point_detection
        self.do_end_point_detection = do_end_point_detection
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time_thres = sil_to_speech_time_thres
        self.speech_to_sil_time_thres = speech_to_sil_time_thres
        self.speech_2_noise_ratio = speech_2_noise_ratio
        self.do_extend = do_extend
        self.lookback_time_start_point = lookback_time_start_point
        self.lookahead_time_end_point = lookahead_time_end_point
        self.max_single_segment_time = max_single_segment_time
        self.nn_eval_block_size = nn_eval_block_size
        self.dcd_block_size = dcd_block_size
        self.snr_thres = snr_thres
        self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
        self.decibel_thres = decibel_thres
        self.speech_noise_thres = speech_noise_thres
        self.fe_prior_thres = fe_prior_thres
        self.silence_pdf_num = silence_pdf_num
        # Copy: the default argument is a single list object created once at
        # definition time; storing it directly would let a mutation through
        # one instance leak into every later instance.
        self.sil_pdf_ids = list(sil_pdf_ids)
        self.speech_noise_thresh_low = speech_noise_thresh_low
        self.speech_noise_thresh_high = speech_noise_thresh_high
        self.output_frame_probs = output_frame_probs
        self.frame_in_ms = frame_in_ms
        self.frame_length_ms = frame_length_ms


class E2EVadSpeechBufWithDoa(object):
    """Record describing one output segment of the VAD.

    Holds the segment boundaries in milliseconds, a sample buffer, flags
    telling whether the segment's start/end point has been seen, and a
    ``doa`` field.
    """

    def __init__(self):
        # A new record starts in exactly the state Reset() produces.
        self.Reset()

    def Reset(self):
        """Put every field back to its initial value."""
        self.start_ms = self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = self.contain_seg_end_point = False
        self.doa = 0


class E2EVadFrameProb(object):
    """Per-frame record of the noise/speech log scores kept for inspection."""

    def __init__(self):
        # Float-valued fields.
        self.noise_prob = self.speech_prob = self.score = 0.0
        # Integer-valued fields.
        self.frame_id = self.frm_state = 0


class WindowDetector(object):
    """Sliding-window smoother that turns per-frame states into transitions.

    The window is a ring buffer of 0/1 votes (1 = speech). A change from
    silence to speech is reported once the number of speech votes in the
    window reaches ``sil_to_speech_frmcnt_thres``; a change back to silence
    is reported once it drops to ``speech_to_sil_frmcnt_thres`` or below.
    """

    def __init__(self, window_size_ms: int, sil_to_speech_time: int,
                 speech_to_sil_time: int, frame_size_ms: int):
        # Raw configuration, kept as given.
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time = sil_to_speech_time
        self.speech_to_sil_time = speech_to_sil_time
        self.frame_size_ms = frame_size_ms

        # The same quantities expressed in frames.
        self.win_size_frame = int(window_size_ms / frame_size_ms)
        self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
        self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)

        # Mutable state starts out exactly as Reset() leaves it.
        self.Reset()

    def Reset(self) -> None:
        """Clear the window and return to the silence state."""
        self.win_state = [0] * self.win_size_frame  # empty window
        self.win_sum = 0
        self.cur_win_pos = 0
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def GetWinSize(self) -> int:
        """Return the window length in frames."""
        return int(self.win_size_frame)

    def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
        """Push one frame into the window and report the resulting transition.

        Args:
            frameState: Classification of the incoming frame.
            frame_count: Not used by this method.

        Returns:
            ``kChangeStateInvalid`` when ``frameState`` is neither speech nor
            silence (the window is left untouched); otherwise one of the
            four speech/silence transition values.
        """
        if frameState == FrameState.kFrameStateSpeech:
            vote = 1
        elif frameState == FrameState.kFrameStateSil:
            vote = 0
        else:
            return AudioChangeState.kChangeStateInvalid

        # Overwrite the oldest vote and keep the running sum in step.
        slot = self.cur_win_pos
        self.win_sum += vote - self.win_state[slot]
        self.win_state[slot] = vote
        self.cur_win_pos = (slot + 1) % self.win_size_frame

        if self.pre_frame_state == FrameState.kFrameStateSil:
            if self.win_sum >= self.sil_to_speech_frmcnt_thres:
                self.pre_frame_state = FrameState.kFrameStateSpeech
                return AudioChangeState.kChangeStateSil2Speech
            return AudioChangeState.kChangeStateSil2Sil

        if self.pre_frame_state == FrameState.kFrameStateSpeech:
            if self.win_sum <= self.speech_to_sil_frmcnt_thres:
                self.pre_frame_state = FrameState.kFrameStateSil
                return AudioChangeState.kChangeStateSpeech2Sil
            return AudioChangeState.kChangeStateSpeech2Speech

        return AudioChangeState.kChangeStateInvalid

    def FrameSizeMs(self) -> int:
        """Return the frame size in milliseconds."""
        return int(self.frame_size_ms)


class E2EVadModel():
    def __init__(self, vad_post_args: Dict[str, Any]):
        super(E2EVadModel, self).__init__()
        self.vad_opts = VADXOptions(**vad_post_args)
        self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
                                               self.vad_opts.sil_to_speech_time_thres,
                                               self.vad_opts.speech_to_sil_time_thres,
                                               self.vad_opts.frame_in_ms)
        # self.encoder = encoder
        # init variables
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.max_time_out = False
        self.decibel = []
        self.data_buf = None
        self.data_buf_all = None
        self.waveform = None
        self.ResetDetection()

    def AllResetDetection(self):
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.max_time_out = False
        self.decibel = []
        self.data_buf = None
        self.data_buf_all = None
        self.waveform = None
        self.ResetDetection()

    def ResetDetection(self):
        self.continous_silence_frame_count = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.windows_detector.Reset()
        self.sil_frame = 0
        self.frame_probs = []

    def ComputeDecibel(self) -> None:
        frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
        frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if self.data_buf_all is None:
            self.data_buf_all = self.waveform[0]  # self.data_buf is pointed to self.waveform[0]
            self.data_buf = self.data_buf_all
        else:
            self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
        for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
            self.decibel.append(
                10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
                                0.000001))

    def ComputeScores(self, scores: np.ndarray) -> None:
        # scores = self.encoder(feats, in_cache)  # return B * T * D
        self.vad_opts.nn_eval_block_size = scores.shape[1]
        self.frm_cnt += scores.shape[1]  # count total frames
        if self.scores is None:
            self.scores = scores  # the first calculation
        else:
            self.scores = np.concatenate((self.scores, scores), axis=1)

    def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
        while self.data_buf_start_frame < frame_idx:
            if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                self.data_buf_start_frame += 1
                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]

    def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
                           last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
        self.PopDataBufTillFrame(start_frm)
        expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
        if last_frm_is_end_point:
            extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
            expected_sample_number += int(extra_sample)
        if end_point_is_sent_end:
            expected_sample_number = max(expected_sample_number, len(self.data_buf))
        if len(self.data_buf) < expected_sample_number:
            print('error in calling pop data_buf\n')

        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
            self.output_data_buf[-1].Reset()
            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
            self.output_data_buf[-1].doa = 0
        cur_seg = self.output_data_buf[-1]
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('warning\n')
        out_pos = len(cur_seg.buffer)  # cur_seg.buff现在没做任何操作
        data_to_pop = 0
        if end_point_is_sent_end:
            data_to_pop = expected_sample_number
        else:
            data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if data_to_pop > len(self.data_buf):
            print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
            data_to_pop = len(self.data_buf)
            expected_sample_number = len(self.data_buf)

        cur_seg.doa = 0
        for sample_cpy_out in range(0, data_to_pop):
            # cur_seg.buffer[out_pos ++] = data_buf_.back();
            out_pos += 1
        for sample_cpy_out in range(data_to_pop, expected_sample_number):
            # cur_seg.buffer[out_pos++] = data_buf_.back()
            out_pos += 1
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('Something wrong with the VAD algorithm\n')
        self.data_buf_start_frame += frm_cnt
        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
        if first_frm_is_start_point:
            cur_seg.contain_seg_start_point = True
        if last_frm_is_end_point:
            cur_seg.contain_seg_end_point = True

    def OnSilenceDetected(self, valid_frame: int):
        self.lastest_confirmed_silence_frame = valid_frame
        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataBufTillFrame(valid_frame)
        # silence_detected_callback_
        # pass

    def OnVoiceDetected(self, valid_frame: int) -> None:
        self.latest_confirmed_speech_frame = valid_frame
        self.PopDataToOutputBuf(valid_frame, 1, False, False, False)

    def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
        if self.vad_opts.do_start_point_detection:
            pass
        if self.confirmed_start_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_start_frame = start_frame

        if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)

    def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
            self.OnVoiceDetected(t)
        if self.vad_opts.do_end_point_detection:
            pass
        if self.confirmed_end_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_end_frame = end_frame
        if not fake_result:
            self.sil_frame = 0
            self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
        self.number_end_time_detected += 1

    def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
        if is_final_frame:
            self.OnVoiceEnd(cur_frm_idx, False, True)
            self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected

    def GetLatency(self) -> int:
        return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)

    def LatencyFrmNumAtStartPoint(self) -> int:
        vad_latency = self.windows_detector.GetWinSize()
        if self.vad_opts.do_extend:
            vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
        return vad_latency

    def GetFrameState(self, t: int) -> FrameState:
        frame_state = FrameState.kFrameStateInvalid
        cur_decibel = self.decibel[t]
        cur_snr = cur_decibel - self.noise_average_decibel
        # for each frame, calc log posterior probability of each state
        if cur_decibel < self.vad_opts.decibel_thres:
            frame_state = FrameState.kFrameStateSil
            self.DetectOneFrame(frame_state, t, False)
            return frame_state

        sum_score = 0.0
        noise_prob = 0.0
        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
        if len(self.sil_pdf_ids) > 0:
            assert len(self.scores) == 1  # 只支持batch_size = 1的测试
            sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
            sum_score = sum(sil_pdf_scores)
            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
            total_score = 1.0
            sum_score = total_score - sum_score
        speech_prob = math.log(sum_score)
        if self.vad_opts.output_frame_probs:
            frame_prob = E2EVadFrameProb()
            frame_prob.noise_prob = noise_prob
            frame_prob.speech_prob = speech_prob
            frame_prob.score = sum_score
            frame_prob.frame_id = t
            self.frame_probs.append(frame_prob)
        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
            if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
                frame_state = FrameState.kFrameStateSpeech
            else:
                frame_state = FrameState.kFrameStateSil
        else:
            frame_state = FrameState.kFrameStateSil
            if self.noise_average_decibel < -99.9:
                self.noise_average_decibel = cur_decibel
            else:
                self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
                        self.vad_opts.noise_frame_num_used_for_snr
                        - 1)) / self.vad_opts.noise_frame_num_used_for_snr

        return frame_state
     

    def __call__(self, score: np.ndarray, waveform: np.ndarray,
                is_final: bool = False, max_end_sil: int = 800
                ):
        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
        self.waveform = waveform  # compute decibel for each frame
        self.ComputeDecibel()
        self.ComputeScores(score)
        if not is_final:
            self.DetectCommonFrames()
        else:
            self.DetectLastFrames()
        segments = []
        for batch_num in range(0, score.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
            if len(self.output_data_buf) > 0:
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if not self.output_data_buf[i].contain_seg_start_point:
                        continue
                    if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
                        continue
                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
                    if self.output_data_buf[i].contain_seg_end_point:
                        end_ms = self.output_data_buf[i].end_ms
                        self.next_seg = True
                        self.output_data_buf_offset += 1
                    else:
                        end_ms = -1
                        self.next_seg = False
                    segment = [start_ms, end_ms]
                    segment_batch.append(segment)
            if segment_batch:
                segments.append(segment_batch)
        if is_final:
            # reset class variables and clear the dict for the next query
            self.AllResetDetection()
        return segments

    def DetectCommonFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)

        return 0

    def DetectLastFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            if i != 0:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
            else:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)

        return 0

    def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
        """Advance the VAD state machine by one frame.

        The frame state is first passed through ``self.windows_detector``,
        which returns one of four ``AudioChangeState`` transitions.  Depending
        on that transition and on ``self.vad_state_machine`` the method calls
        ``OnVoiceStart`` / ``OnVoiceDetected`` / ``OnSilenceDetected`` /
        ``OnVoiceEnd`` / ``MaybeOnVoiceEndIfLastFrame`` and may move the state
        machine to ``kVadInStateInSpeechSegment`` or
        ``kVadInStateEndPointDetected``.

        Args:
            cur_frm_state: State of the frame as produced by ``GetFrameState``.
            cur_frm_idx: Index of the frame being processed.
            is_final_frame: True when this is the last frame of the input.
        """
        tmp_cur_frm_state = FrameState.kFrameStateInvalid
        if cur_frm_state == FrameState.kFrameStateSpeech:
            # math.fabs(1.0) is the constant 1.0, so a speech frame is kept as
            # speech only while fe_prior_thres is below 1.0.
            if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
                tmp_cur_frm_state = FrameState.kFrameStateSpeech
            else:
                tmp_cur_frm_state = FrameState.kFrameStateSil
        elif cur_frm_state == FrameState.kFrameStateSil:
            tmp_cur_frm_state = FrameState.kFrameStateSil
        state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
        frm_shift_in_ms = self.vad_opts.frame_in_ms
        if AudioChangeState.kChangeStateSil2Speech == state_change:
            # NOTE(review): silence_frame_count is assigned but never read in
            # this method.
            silence_frame_count = self.continous_silence_frame_count
            self.continous_silence_frame_count = 0
            self.pre_end_silence_detected = False
            start_frame = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                # Back-date the start by the start-point latency, but never
                # before the first frame still held in the data buffer.
                start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
                self.OnVoiceStart(start_frame)
                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
                for t in range(start_frame + 1, cur_frm_idx + 1):
                    self.OnVoiceDetected(t)
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Report the frames between the last confirmed speech frame
                # and the current one.
                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
                    self.OnVoiceDetected(t)
                # Segment longer than max_single_segment_time: force an end.
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                pass
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Segment longer than max_single_segment_time: force an end.
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Only this branch records the forced cut in max_time_out.
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.max_time_out = True
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSil2Sil == state_change:
            self.continous_silence_frame_count += 1
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                # silence timeout, return zero length decision
                # Taken either in single-utterance mode after more than
                # max_start_silence_time of silence, or on the final frame
                # when no end point has been detected yet.
                if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
                        self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
                        or (is_final_frame and self.number_end_time_detected == 0):
                    for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
                        self.OnSilenceDetected(t)
                    self.OnVoiceStart(0, True)
                    self.OnVoiceEnd(0, True, False);
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                else:
                    # Confirm silence with the same latency that is applied
                    # to start points.
                    if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
                        self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Enough trailing silence: close the segment, placing its end
                # lookback_frame frames before the current frame.
                if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
                    lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
                    if self.vad_opts.do_extend:
                        # Keep lookahead_time_end_point (plus one frame) of
                        # the trailing silence inside the segment.
                        lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
                        lookback_frame -= 1
                        lookback_frame = max(0, lookback_frame)
                    self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif self.vad_opts.do_extend and not is_final_frame:
                    # Short silences are still reported as voice while within
                    # the look-ahead window.
                    if self.continous_silence_frame_count <= int(
                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
                        self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass

        # In multiple-utterance mode, re-arm detection as soon as an end
        # point has been reached.
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
                self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
            self.ResetDetection()

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/frontend.py

New file
@@ -0,0 +1,191 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
from typeguard import check_argument_types
import kaldi_native_fbank as knf

# Directory that contains this module file.
root_dir = Path(__file__).resolve().parent

# NOTE(review): not referenced by any code visible in this module.
logger_initialized = {}


class WavFrontend():
    """Conventional frontend structure for ASR.

    Computes filterbank features with ``kaldi_native_fbank`` and optionally
    applies LFR frame stacking (``apply_lfr``) and the shift/scale
    normalisation loaded from a CMVN file (``apply_cmvn``).
    """

    def __init__(
            self,
            cmvn_file: str = None,
            fs: int = 16000,
            window: str = 'hamming',
            n_mels: int = 80,
            frame_length: int = 25,
            frame_shift: int = 10,
            lfr_m: int = 1,
            lfr_n: int = 1,
            dither: float = 1.0,
            **kwargs,
    ) -> None:
        """Configure the fbank extractor and load the CMVN statistics.

        Args:
            cmvn_file: Path of the CMVN file; normalisation is skipped when
                it is empty or ``None``.
            fs: Sampling frequency passed to the fbank options.
            window: Window type passed to the fbank options.
            n_mels: Number of mel bins.
            frame_length: Frame length in milliseconds.
            frame_shift: Frame shift in milliseconds.
            lfr_m: Number of consecutive frames stacked by ``apply_lfr``.
            lfr_n: Step, in frames, between two stacked outputs.
            dither: Dither value passed to the fbank options.
            **kwargs: Accepted and ignored.
        """
        check_argument_types()

        opts = knf.FbankOptions()
        opts.frame_opts.samp_freq = fs
        opts.frame_opts.dither = dither
        opts.frame_opts.window_type = window
        opts.frame_opts.frame_shift_ms = float(frame_shift)
        opts.frame_opts.frame_length_ms = float(frame_length)
        opts.mel_opts.num_bins = n_mels
        opts.energy_floor = 0
        opts.frame_opts.snip_edges = True
        opts.mel_opts.debug_mel = False
        self.opts = opts

        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.cmvn_file = cmvn_file

        # Always define the attribute so that it can be inspected safely even
        # when no CMVN file was configured.
        self.cmvn = self.load_cmvn() if self.cmvn_file else None
        self.fbank_fn = None
        self.fbank_beg_idx = 0
        self.reset_status()

    def fbank(self,
              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Compute fbank features for a whole waveform.

        A fresh ``OnlineFbank`` is created on every call, replacing the one
        held in ``self.fbank_fn``.

        Returns:
            ``(feat, feat_len)``: a float32 array with one row per frame and
            ``num_bins`` columns, and the frame count as a 0-d int32 array.
        """
        # Scale by 2**15; presumably the input is a float waveform in
        # [-1, 1) and the extractor expects int16-range values - TODO confirm.
        waveform = waveform * (1 << 15)
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        for i in range(frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def fbank_online(self,
              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Feed a waveform to the persistent extractor and return its frames.

        Unlike ``fbank`` the ``OnlineFbank`` instance in ``self.fbank_fn`` is
        reused, so frames accumulate across calls until ``reset_status`` is
        called.

        Returns:
            ``(feat, feat_len)`` with the same layout as ``fbank``.
        """
        waveform = waveform * (1 << 15)
        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
        frames = self.fbank_fn.num_frames_ready
        mat = np.empty([frames, self.opts.mel_opts.num_bins])
        # NOTE(review): fbank_beg_idx is never advanced, so every call copies
        # all frames from index 0.  If it is advanced in the future, rows
        # below fbank_beg_idx would be left uninitialised by np.empty.
        for i in range(self.fbank_beg_idx, frames):
            mat[i, :] = self.fbank_fn.get_frame(i)
        feat = mat.astype(np.float32)
        feat_len = np.array(mat.shape[0]).astype(np.int32)
        return feat, feat_len

    def reset_status(self):
        """Drop the cached extractor state and start from frame 0 again."""
        self.fbank_fn = knf.OnlineFbank(self.opts)
        self.fbank_beg_idx = 0

    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Apply LFR stacking and CMVN, each only when configured.

        Returns:
            ``(feat, feat_len)`` where ``feat_len`` is the number of rows of
            the processed features as a 0-d int32 array.
        """
        if self.lfr_m != 1 or self.lfr_n != 1:
            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

        if self.cmvn_file:
            feat = self.apply_cmvn(feat)

        feat_len = np.array(feat.shape[0]).astype(np.int32)
        return feat, feat_len

    @staticmethod
    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
        """Stack ``lfr_m`` consecutive frames every ``lfr_n`` frames.

        The first frame is repeated ``(lfr_m - 1) // 2`` times on the left
        and the last frame is repeated on the right as often as needed to
        complete the final stacked frame.

        Returns:
            A float32 array with ``ceil(T / lfr_n)`` rows, ``T`` being the
            number of input frames.
        """
        LFR_inputs = []

        T = inputs.shape[0]
        T_lfr = int(np.ceil(T / lfr_n))
        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
        inputs = np.vstack((left_padding, inputs))
        T = T + (lfr_m - 1) // 2
        for i in range(T_lfr):
            if lfr_m <= T - i * lfr_n:
                LFR_inputs.append(
                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
            else:
                # process last LFR frame
                num_padding = lfr_m - (T - i * lfr_n)
                frame = inputs[i * lfr_n:].reshape(-1)
                for _ in range(num_padding):
                    frame = np.hstack((frame, inputs[-1]))

                LFR_inputs.append(frame)
        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
        return LFR_outputs

    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
        """
        Apply CMVN with mvn data

        Row 0 of ``self.cmvn`` is added to every frame and the sum is
        multiplied by row 1; only the first ``dim`` columns are used.
        """
        _, dim = inputs.shape
        shift = self.cmvn[0:1, :dim]
        scale = self.cmvn[1:2, :dim]
        # Broadcasting over the frame axis gives the same result as tiling.
        return (inputs + shift) * scale

    def load_cmvn(self,) -> np.ndarray:
        """Parse ``self.cmvn_file`` into a ``(2, dim)`` float64 array.

        The values are taken from the ``<LearnRateCoef>`` line that directly
        follows an ``<AddShift>`` tag (row 0) and a ``<Rescale>`` tag (row 1);
        the first three fields and the last field of that line are dropped.
        Blank lines and a tag on the very last line are skipped instead of
        raising ``IndexError``.
        """
        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
            lines = f.readlines()

        means_list = []
        vars_list = []
        for i, line in enumerate(lines):
            line_item = line.split()
            if not line_item:
                continue
            tag = line_item[0]
            if tag not in ('<AddShift>', '<Rescale>'):
                continue
            if i + 1 >= len(lines):
                continue
            next_item = lines[i + 1].split()
            if not next_item or next_item[0] != '<LearnRateCoef>':
                continue
            values = list(next_item[3:(len(next_item) - 1)])
            if tag == '<AddShift>':
                means_list = values
            else:
                vars_list = values

        means = np.array(means_list).astype(np.float64)
        scales = np.array(vars_list).astype(np.float64)
        cmvn = np.array([means, scales])
        return cmvn

def load_bytes(input):
    """Decode raw 16-bit PCM bytes into a float32 array scaled by 1/32768.

    Args:
        input: Bytes-like object holding native-endian int16 samples; its
            length must be a multiple of two or ``np.frombuffer`` raises
            ``ValueError``.

    Returns:
        A 1-D float32 array with one value per sample, in ``[-1.0, 1.0)``.
    """
    samples = np.frombuffer(input, dtype=np.int16)
    # The sample type is fixed to int16, so the zero offset is 0 and the
    # scale is 2**15; the former dtype checks could never fail.
    full_scale = np.float32(2 ** (np.iinfo(np.int16).bits - 1))
    return samples.astype(np.float32) / full_scale


def test():
    """Manual smoke test: run the frontend on a hard-coded example file.

    Reads the frontend configuration and CMVN file from fixed paths, loads
    the example audio with librosa and returns the processed features
    together with their length.
    """
    import librosa
    from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml

    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"

    config = read_yaml(config_file)
    waveform, _ = librosa.load(path, sr=None)
    frontend = WavFrontend(cmvn_file=cmvn_file, **config['frontend_conf'])

    # fbank_online takes the raw samples and returns per-frame features.
    speech, _ = frontend.fbank_online(waveform)
    # lfr_cmvn returns float32 features and their frame count.
    feat, feat_len = frontend.lfr_cmvn(speech)

    frontend.reset_status()  # clear cache
    return feat, feat_len

# Run the manual smoke test when this module is executed as a script.
if __name__ == '__main__':
    test()

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/postprocess_utils.py

New file
@@ -0,0 +1,240 @@
# Copyright (c) Alibaba, Inc. and its affiliates.

import string
import logging
from typing import Any, List, Union


def isChinese(ch: str):
    """Return True when ``ch`` sorts inside U+4E00..U+9FFF or inside '0'..'9'.

    The test is a plain string comparison, so ASCII digits count as
    "Chinese" here and a longer string is judged by its ordering as a whole.
    """
    in_cjk_range = '\u4e00' <= ch <= '\u9fff'
    in_digit_range = '\u0030' <= ch <= '\u0039'
    return in_cjk_range or in_digit_range


def isAllChinese(word: Union[List[Any], str]):
    """Return True when every element of ``word`` passes ``isChinese``.

    Spaces and the markers ``</s>`` / ``<s>`` are removed from each element
    before testing.  An empty ``word`` yields False.
    """
    cleaned = [
        item.replace(' ', '').replace('</s>', '').replace('<s>', '')
        for item in word
    ]
    if not cleaned:
        return False
    return all(isChinese(item) for item in cleaned)


def isAllAlpha(word: Union[List[Any], str]):
    """Return True when every element is alphabetic and not Chinese.

    Spaces and the markers ``</s>`` / ``<s>`` are removed from each element
    before testing.  A lone apostrophe is accepted as well.  An empty
    ``word`` yields False.
    """
    cleaned = [
        item.replace(' ', '').replace('</s>', '').replace('<s>', '')
        for item in word
    ]
    if not cleaned:
        return False

    for item in cleaned:
        if item.isalpha():
            # str.isalpha() is also true for CJK characters; rule those out.
            if isChinese(item):
                return False
        elif item != "'":
            return False
    return True


# def abbr_dispose(words: List[Any]) -> List[Any]:
def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
    """Upper-case spelled-out abbreviations and merge their timestamps.

    An abbreviation is a run of single ASCII letters separated by single
    ``' '`` tokens (``a``, ``' '``, ``b``, ...).  Each letter of such a run is
    emitted upper-cased and the separating spaces are dropped; every other
    token is copied unchanged.

    Args:
        words: Token list in which words are separated by ``' '`` tokens.
        time_stamp: Optional ``[begin, end]`` pairs, one per non-space token.

    Returns:
        The new token list, or ``(tokens, timestamps)`` when ``time_stamp``
        is given.  A whole abbreviation run contributes a single pair that
        spans from its first letter to its last.
    """
    def is_letter(token) -> bool:
        return len(token) == 1 and token.encode('utf-8').isalpha()

    total = len(words)

    # Pass 1: find where every abbreviation run begins and ends.
    run_starts = set()
    run_ends = set()
    pos = 0
    while pos < total:
        opens_run = (
            is_letter(words[pos])
            and pos + 2 < total
            and words[pos + 1] == ' '
            and is_letter(words[pos + 2])
        )
        if not opens_run:
            pos += 1
            continue
        run_starts.add(pos)
        tail = pos + 2
        # Extend the run while "<space> <letter>" keeps following.
        while tail + 2 < total and words[tail + 1] == ' ' and is_letter(words[tail + 2]):
            tail += 2
        run_ends.add(tail)
        pos = tail + 1

    # Timestamp index for every position; a space shares the index of the
    # token that follows it.
    ts_index_of = []
    next_ts = 0
    for token in words:
        ts_index_of.append(next_ts)
        if token != ' ':
            next_ts += 1

    # Pass 2: build the output.
    merged_words = []
    merged_ts = []
    pos = 0
    while pos < total:
        token = words[pos]
        if pos not in run_starts:
            merged_words.append(token)
            if time_stamp is not None and token != ' ':
                span = time_stamp[ts_index_of[pos]]
                merged_ts.append([span[0], span[1]])
            pos += 1
            continue

        merged_words.append(token.upper())
        scan = pos + 1
        while scan < total:
            if scan in run_ends:
                merged_words.append(words[scan].upper())
                break
            if words[scan].encode('utf-8').isalpha():
                merged_words.append(words[scan].upper())
            scan += 1
        if time_stamp is not None:
            run_begin = time_stamp[ts_index_of[pos]][0]
            run_end = time_stamp[ts_index_of[scan]][1]
            merged_ts.append([run_begin, run_end])
        pos = scan + 1

    if time_stamp is None:
        return merged_words
    return merged_words, merged_ts


def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
    """Turn a decoded token sequence into a sentence string.

    Tokens may be ``str`` or UTF-8 ``bytes``; ``<s>``, ``</s>`` and ``<unk>``
    are dropped.  Tokens containing ``@@`` are word pieces: the marker is
    removed and the piece is accumulated until a token without ``@@``
    completes the word.  Completed alphabetic words are followed by a
    ``' '`` token; abbreviation runs are upper-cased by ``abbr_dispose``.

    Args:
        words: Decoded tokens.
        time_stamp: Optional ``[begin, end]`` pairs indexed like the tokens
            that remain after the special symbols are dropped.

    Returns:
        ``(sentence, ts_lists, real_word_lists)`` when ``time_stamp`` is
        given, otherwise ``(sentence, real_word_lists)``.  With timestamps
        the sentence joins the non-space words with single spaces; without
        them it is the plain concatenation of all tokens, stripped.

    Raises:
        ValueError: In the mixed-content branch, for a token that is neither
            Chinese, a word piece, nor alphabetic.
    """
    middle_lists = []
    word_lists = []
    word_item = ''
    ts_lists = []

    # wash words lists
    for i in words:
        word = ''
        if isinstance(i, str):
            word = i
        else:
            word = i.decode('utf-8')

        if word in ['<s>', '</s>', '<unk>']:
            continue
        else:
            middle_lists.append(word)

    # all chinese characters
    if isAllChinese(middle_lists):
        for i, ch in enumerate(middle_lists):
            word_lists.append(ch.replace(' ', ''))
        if time_stamp is not None:
            ts_lists = time_stamp

    # all alpha characters
    elif isAllAlpha(middle_lists):
        # ts_flag is True when the next token starts a new word, i.e. when
        # `begin` must be refreshed from the current token.
        ts_flag = True
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if '@@' in ch:
                # Word piece: keep accumulating and keep the earlier `begin`.
                word = ch.replace('@@', '')
                word_item += word
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            else:
                # Last piece of a word: emit it followed by a separator.
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1]
                    ts_lists.append([begin, end])
                    begin = end

    # mix characters
    else:
        # alpha_blank is True when the last emitted token is the ' ' that
        # follows an alphabetic word.
        alpha_blank = False
        ts_flag = True
        begin = -1
        end = -1
        for i, ch in enumerate(middle_lists):
            if ts_flag and time_stamp is not None:
                begin = time_stamp[i][0]
                end = time_stamp[i][1]
            word = ''
            if isAllChinese(ch):
                # No separator between an alphabetic word and a following
                # Chinese token: remove the pending ' '.
                if alpha_blank is True:
                    word_lists.pop()
                word_lists.append(ch)
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = True
                    ts_lists.append([begin, end])
                    begin = end
            elif '@@' in ch:
                word = ch.replace('@@', '')
                word_item += word
                alpha_blank = False
                if time_stamp is not None:
                    ts_flag = False
                    end = time_stamp[i][1]
            elif isAllAlpha(ch):
                word_item += ch
                word_lists.append(word_item)
                word_lists.append(' ')
                word_item = ''
                alpha_blank = True
                if time_stamp is not None:
                    ts_flag = True
                    end = time_stamp[i][1] 
                    ts_lists.append([begin, end])
                    begin = end
            else:
                raise ValueError('invalid character: {}'.format(ch))

    if time_stamp is not None: 
        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        sentence = ' '.join(real_word_lists).strip()
        return sentence, ts_lists, real_word_lists
    else:
        word_lists = abbr_dispose(word_lists)
        real_word_lists = []
        for ch in word_lists:
            if ch != ' ':
                real_word_lists.append(ch)
        # Unlike the timestamp branch, the separators are kept as they are.
        sentence = ''.join(word_lists).strip()
        return sentence, real_word_lists

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/timestamp_utils.py

New file
@@ -0,0 +1,59 @@
import numpy as np


def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
    """Derive per-token timestamps from the up-sampled CIF peak sequence.

    Every position whose weight exceeds ``1.0 - 1e-4`` is a "fire"; the span
    between two consecutive fires is the duration of the earlier token.
    ``<sil>`` entries are inserted for a long leading gap, for tokens longer
    than ``MAX_TOKEN_DURATION`` frames and for a long trailing gap.

    Args:
        us_cif_peak: Array of peak weights; it is flattened before use.
        char_list: Tokens of the hypothesis.  A trailing ``'</s>'`` is
            ignored; the caller's list is not modified.
        begin_time: Offset in milliseconds added to every timestamp.
        total_offset: Offset, in frames, added to every fire position.

    Returns:
        ``(res_str, res)``: ``res_str`` is ``"token begin end;"`` repeated
        for every entry (seconds, ``<sil>`` included) and ``res`` holds the
        ``[begin_ms, end_ms]`` integer pairs of the non-silence tokens.
        For an empty ``char_list`` the result is ``("", [])`` so that callers
        can always unpack two values.

    Raises:
        AssertionError: If the number of fires is not ``len(char_list) + 1``.
    """
    if not len(char_list):
        return "", []
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  #  3 times upsampled
    cif_peak = us_cif_peak.reshape(-1)
    num_frames = cif_peak.shape[-1]
    if char_list[-1] == '</s>':
        char_list = char_list[:-1]
    timestamp_list = []
    new_char_list = []
    # for bicif model trained with large data, cif2 actually fires when a character starts
    # so treat the frames between two peaks as the duration of the former token
    fire_place = np.where(cif_peak > 1.0 - 1e-4)[0] + total_offset  # np format
    num_peak = len(fire_place)
    assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
    # begin silence
    if fire_place[0] > START_END_THRESHOLD:
        timestamp_list.append([0.0, fire_place[0] * TIME_RATE])
        new_char_list.append('<sil>')
    # tokens timestamp
    last_token = num_peak - 2
    for i in range(num_peak - 1):
        new_char_list.append(char_list[i])
        token_begin = fire_place[i]
        token_end = fire_place[i + 1]
        if i == last_token or MAX_TOKEN_DURATION < 0 or token_end - token_begin < MAX_TOKEN_DURATION:
            timestamp_list.append([token_begin * TIME_RATE, token_end * TIME_RATE])
        else:
            # cut the duration to token and sil of the 0-weight frames last long
            _split = token_begin + MAX_TOKEN_DURATION
            timestamp_list.append([token_begin * TIME_RATE, _split * TIME_RATE])
            timestamp_list.append([_split * TIME_RATE, token_end * TIME_RATE])
            new_char_list.append('<sil>')
    # tail token and end silence
    if num_frames - fire_place[-1] > START_END_THRESHOLD:
        # Split the trailing gap evenly between the last token and silence.
        _end = (num_frames + fire_place[-1]) / 2
        timestamp_list[-1][1] = _end * TIME_RATE
        timestamp_list.append([_end * TIME_RATE, num_frames * TIME_RATE])
        new_char_list.append("<sil>")
    else:
        timestamp_list[-1][1] = num_frames * TIME_RATE
    if begin_time:  # add offset time in model with vad
        shift = begin_time / 1000.0
        for span in timestamp_list:
            span[0] = span[0] + shift
            span[1] = span[1] + shift
    assert len(new_char_list) == len(timestamp_list)
    res_str = "".join(
        "{} {} {};".format(char, timestamp[0], timestamp[1])
        for char, timestamp in zip(new_char_list, timestamp_list)
    )
    res = [
        [int(timestamp[0] * 1000), int(timestamp[1] * 1000)]
        for char, timestamp in zip(new_char_list, timestamp_list)
        if char != '<sil>'
    ]
    return res_str, res
    

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/utils/utils.py

New file
@@ -0,0 +1,257 @@
# -*- encoding: utf-8 -*-

import functools
import logging
import pickle
from pathlib import Path
from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

import numpy as np
import yaml
from onnxruntime import (GraphOptimizationLevel, InferenceSession,
                         SessionOptions, get_available_providers, get_device)
from typeguard import check_argument_types

import warnings

# Directory that contains this module file.
root_dir = Path(__file__).resolve().parent

# Names of the loggers that get_logger() has already configured.
logger_initialized = {}


class TokenIDConverter():
    """Convert between token strings and their positions in a token list.

    The last entry of the token list is used as the unknown symbol.
    """

    def __init__(self, token_list: Union[List, str],
                 ):
        check_argument_types()

        self.token_list = token_list
        self.unk_symbol = token_list[-1]

    def get_num_vocabulary_size(self) -> int:
        """Return the number of entries in the token list."""
        return len(self.token_list)

    def ids2tokens(self,
                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
        """Map every id to its token.

        Raises:
            TokenIDConverterError: If ``integers`` is an ndarray that is not
                one-dimensional.
        """
        if isinstance(integers, np.ndarray) and integers.ndim != 1:
            raise TokenIDConverterError(
                f"Must be 1 dim ndarray, but got {integers.ndim}")
        vocab = self.token_list
        return [vocab[idx] for idx in integers]

    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
        """Map every token to its id; unknown tokens get the unknown id.

        Raises:
            TokenIDConverterError: If the unknown symbol is missing from the
                token list.
        """
        lookup = {}
        for idx, token in enumerate(self.token_list):
            lookup[token] = idx
        if self.unk_symbol not in lookup:
            raise TokenIDConverterError(
                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
            )
        fallback = lookup[self.unk_symbol]
        return [lookup.get(token, fallback) for token in tokens]


class CharTokenizer():
    """Character-level tokenizer with optional multi-character symbols.

    Args:
        symbol_value: Non-linguistic symbols, given either as an iterable of
            strings or as the path of a text file with one symbol per line.
        space_symbol: Token that ``tokens2text`` turns back into a space.
        remove_non_linguistic_symbols: When true, ``text2tokens`` drops the
            non-linguistic symbols instead of emitting them as tokens.
    """

    def __init__(
        self,
        symbol_value: Union[Path, str, Iterable[str]] = None,
        space_symbol: str = "<space>",
        remove_non_linguistic_symbols: bool = False,
    ):
        check_argument_types()

        self.space_symbol = space_symbol
        self.non_linguistic_symbols = self.load_symbols(symbol_value)
        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

    @staticmethod
    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
        """Return the symbol set described by ``value``.

        ``None`` gives an empty set.  A ``str`` or ``Path`` is read as a file
        with one symbol per line; a missing file is logged and gives an empty
        set.  Any other iterable is converted to a set directly.
        """
        if value is None:
            return set()

        # isinstance() cannot be used with the subscripted generic
        # Iterable[str] (it raises TypeError), so tell the two input kinds
        # apart by checking for a path-like value instead.
        if not isinstance(value, (str, Path)):
            return set(value)

        file_path = Path(value)
        if not file_path.exists():
            logging.warning("%s doesn't exist.", file_path)
            return set()

        with file_path.open("r", encoding="utf-8") as f:
            return set(line.rstrip() for line in f)

    def text2tokens(self, line: Union[str, list]) -> List[str]:
        """Split ``line`` into characters and non-linguistic symbols.

        A space is emitted as the literal token ``"<space>"``.
        """
        # NOTE(review): the space token is hard-coded here while tokens2text
        # uses self.space_symbol; the two only round-trip with the default.
        tokens = []
        while len(line) != 0:
            for w in self.non_linguistic_symbols:
                # An empty symbol (e.g. from a blank line in the symbol file)
                # matches everywhere without consuming input; skip it so the
                # loop always makes progress.
                if w and line.startswith(w):
                    if not self.remove_non_linguistic_symbols:
                        tokens.append(line[: len(w)])
                    line = line[len(w):]
                    break
            else:
                t = line[0]
                if t == " ":
                    t = "<space>"
                tokens.append(t)
                line = line[1:]
        return tokens

    def tokens2text(self, tokens: Iterable[str]) -> str:
        """Concatenate ``tokens``, replacing the space symbol by ``' '``."""
        tokens = [t if t != self.space_symbol else " " for t in tokens]
        return "".join(tokens)

    def __repr__(self):
        return (
            f"{self.__class__.__name__}("
            f'space_symbol="{self.space_symbol}", '
            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
            f")"
        )



class Hypothesis(NamedTuple):
    """Hypothesis data type."""

    # Token id sequence.
    yseq: np.ndarray
    # Total score of the hypothesis.
    score: Union[float, np.ndarray] = 0
    # Named partial scores.  The default dict object is shared by every
    # instance that does not pass its own.
    scores: Dict[str, Union[float, np.ndarray]] = {}
    # Named states.  The default dict object is shared as well.
    states: Dict[str, Any] = {}

    def asdict(self) -> dict:
        """Convert data to JSON-friendly dict."""
        plain_yseq = self.yseq.tolist()
        plain_score = float(self.score)
        plain_scores = {name: float(value) for name, value in self.scores.items()}
        converted = self._replace(
            yseq=plain_yseq,
            score=plain_score,
            scores=plain_scores,
        )
        return converted._asdict()


class TokenIDConverterError(Exception):
    """Raised by TokenIDConverter for invalid ids or a missing unknown symbol."""


class ONNXRuntimeError(Exception):
    """Raised by OrtInferSession when running the ONNX session fails."""


class OrtInferSession():
    """Thin wrapper around an onnxruntime ``InferenceSession``.

    Args:
        model_file: Path of the ONNX model; it must exist and be a file.
        device_id: CUDA device id; ``-1`` (int or str) selects the CPU.
        intra_op_num_threads: Value for ``SessionOptions.intra_op_num_threads``.

    Raises:
        FileNotFoundError: If ``model_file`` does not exist.
        FileExistsError: If ``model_file`` exists but is not a regular file.
    """

    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
        device_id = str(device_id)
        sess_opt = SessionOptions()
        sess_opt.intra_op_num_threads = intra_op_num_threads
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

        cuda_ep = 'CUDAExecutionProvider'
        cuda_provider_options = {
            "device_id": device_id,
            "arena_extend_strategy": "kNextPowerOfTwo",
            "cudnn_conv_algo_search": "EXHAUSTIVE",
            "do_copy_in_default_stream": "true",
        }
        cpu_ep = 'CPUExecutionProvider'
        cpu_provider_options = {
            "arena_extend_strategy": "kSameAsRequested",
        }

        # CUDA is requested first when a device id was given and the
        # provider is available; the CPU provider is always appended.
        EP_list = []
        if device_id != "-1" and get_device() == 'GPU' \
                and cuda_ep in get_available_providers():
            EP_list = [(cuda_ep, cuda_provider_options)]
        EP_list.append((cpu_ep, cpu_provider_options))

        self._verify_model(model_file)
        self.session = InferenceSession(model_file,
                                        sess_options=sess_opt,
                                        providers=EP_list)

        if device_id != "-1" and cuda_ep not in self.session.get_providers():
            warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
                          'you can check their relations from the offical web site: '
                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
                          RuntimeWarning)

    def __call__(self,
                 input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
        """Run the model on ``input_content``.

        The inputs are matched by position with the model's input names and
        the result of ``session.run`` (all outputs) is returned.

        Raises:
            ONNXRuntimeError: If the session fails; the original exception is
                chained as the cause.
        """
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(None, input_dict)
        except Exception as e:
            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e

    def get_input_names(self, ):
        """Return the names of the model inputs, in model order."""
        return [v.name for v in self.session.get_inputs()]

    def get_output_names(self,):
        """Return the names of the model outputs, in model order."""
        return [v.name for v in self.session.get_outputs()]

    def _load_meta(self):
        """Read the custom metadata map and cache it on ``self.meta_dict``."""
        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
        return self.meta_dict

    def get_character_list(self, key: str = 'character'):
        """Return the metadata entry ``key`` split into lines.

        The metadata is loaded on demand, so ``have_key`` does not need to
        be called first.

        Raises:
            KeyError: If ``key`` is missing from the metadata.
        """
        meta = getattr(self, 'meta_dict', None)
        if meta is None:
            meta = self._load_meta()
        return meta[key].splitlines()

    def have_key(self, key: str = 'character') -> bool:
        """Reload the model metadata and report whether it contains ``key``."""
        return key in self._load_meta().keys()

    @staticmethod
    def _verify_model(model_path):
        """Raise unless ``model_path`` is an existing regular file."""
        model_path = Path(model_path)
        if not model_path.exists():
            raise FileNotFoundError(f'{model_path} does not exists.')
        if not model_path.is_file():
            raise FileExistsError(f'{model_path} is not a file.')


def read_yaml(yaml_path: Union[str, Path]) -> Dict:
    """Load a YAML file and return its parsed content.

    Raises:
        FileExistsError: If ``yaml_path`` does not exist.
    """
    yaml_file = Path(yaml_path)
    if not yaml_file.exists():
        raise FileExistsError(f'The {yaml_path} does not exist.')

    # NOTE(review): yaml.Loader can construct arbitrary Python objects; only
    # use this on trusted configuration files.
    with open(str(yaml_path), 'rb') as stream:
        return yaml.load(stream, Loader=yaml.Loader)


@functools.lru_cache()
def get_logger(name='rapdi_paraformer'):
    """Return the logger called ``name``, configuring it on first use.

    A logger counts as already configured when ``name`` is registered in
    ``logger_initialized`` or when ``name`` starts with a registered name; in
    that case it is returned untouched.  Otherwise a ``StreamHandler`` with a
    timestamped format is attached, propagation to ancestor loggers is
    switched off, and ``name`` is registered.

    Args:
        name (str): Logger name.
    Returns:
        logging.Logger: The expected logger.
    """
    logger = logging.getLogger(name)

    already_configured = name in logger_initialized or any(
        name.startswith(known_name) for known_name in logger_initialized)
    if already_configured:
        return logger

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(logging.Formatter(
        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
        datefmt="%Y/%m/%d %H:%M:%S"))
    logger.addHandler(stream_handler)
    logger_initialized[name] = True
    logger.propagate = False
    return logger

 funasr/runtime/python/onnxruntime/build/lib/funasr_onnx/vad_bin.py

New file
@@ -0,0 +1,166 @@
# -*- encoding: utf-8 -*-

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
                          OrtInferSession, TokenIDConverter, get_logger,
                          read_yaml)
from .utils.postprocess_utils import sentence_postprocess
from .utils.frontend import WavFrontend
from .utils.timestamp_utils import time_stamp_lfr6_onnx
from .utils.e2e_vad import E2EVadModel

# NOTE: ``logging`` here is the logger object returned by get_logger(), not the
# standard-library ``logging`` module (which this file does not import).
logging = get_logger()


class Fsmn_vad():
    """Voice-activity detection driven by an exported FSMN-VAD ONNX model.

    The constructor loads ``model.onnx`` (or ``model_quant.onnx``),
    ``vad.yaml`` and ``vad.mvn`` from ``model_dir``.  Calling the object
    extracts features for each waveform, runs them together with the model
    caches through ONNXRuntime and passes the scores to ``E2EVadModel``.
    """

    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 max_end_sil: int = 800,
                 ):
        """Load the model, the frontend and the VAD post-processor.

        Args:
            model_dir: directory holding the exported model files.
            batch_size: number of waveforms processed per inference call.
            device_id: forwarded to ``OrtInferSession``.
            quantize: load ``model_quant.onnx`` instead of ``model.onnx``.
            intra_op_num_threads: forwarded to ``OrtInferSession``.
            max_end_sil: forwarded to the scorer as ``max_end_sil`` on
                every call.

        Raises:
            FileNotFoundError: if ``model_dir`` does not exist.
        """
        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'vad.yaml')
        cmvn_file = os.path.join(model_dir, 'vad.mvn')
        config = read_yaml(config_file)

        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        # NOTE(review): E2EVadModel.__init__ (utils/e2e_vad.py) takes a single
        # ``vad_post_args`` dict, while this expands every top-level key of
        # vad.yaml (including 'frontend_conf') as keyword arguments.  This
        # looks like it should pass only the VAD post-processing section of
        # the config -- confirm the section name against the exported vad.yaml.
        self.vad_scorer = E2EVadModel(**config)
        self.max_end_sil = max_end_sil

    def prepare_cache(self, in_cache: list = None):
        """Return the model cache inputs, creating the initial ones if needed.

        Args:
            in_cache: caches from a previous call.  A non-empty list is
                returned untouched; an empty list is filled in place; ``None``
                (the default) yields a brand-new list on every call.

        Returns:
            list of cache arrays to feed to the model.
        """
        if in_cache is None:
            # A fresh list per call: a mutable default would be shared by
            # every call (and every instance) of this method.
            in_cache = []
        if len(in_cache) > 0:
            return in_cache

        # Four float32 caches of shape (1, 128, 19, 1) -- presumably one per
        # FSMN layer of the exported model; TODO confirm against the model.
        # They start at zero so that inference is deterministic (random
        # initial caches make the scores differ from run to run).
        for _ in range(4):
            in_cache.append(np.zeros((1, 128, 19, 1), dtype=np.float32))
        return in_cache

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        """Run VAD on one or several waveforms.

        Args:
            wav_content: a wav path, a waveform array, or a list of wav paths.
            **kwargs:
                is_final (bool): forwarded to the scorer; defaults to False.
                    The legacy key ``kwargs`` is still honoured as a fallback.
                param_dict (dict): optional state holder; its ``'cache'``
                    entry is read before and updated after each inference.

        Returns:
            A list with one scorer result per batch; ``''`` is stored for a
            batch whose inference raised ``ONNXRuntimeError``.
        """
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        # The flag used to be read only from the key 'kwargs', so callers
        # passing ``is_final=True`` were silently ignored.
        is_final = kwargs.get('is_final', kwargs.get('kwargs', False))

        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            waveform = waveform_list[beg_idx:end_idx]
            feats, feats_len = self.extract_feat(waveform)
            param_dict = kwargs.get('param_dict', dict())
            in_cache = param_dict.get('cache', list())
            in_cache = self.prepare_cache(in_cache)
            try:
                outputs = self.infer(feats, *in_cache)
                # First output holds the scores; the remaining outputs are
                # kept as the caches for the next call.
                # NOTE(review): assumes the model emits scores followed by
                # one output cache per input cache -- confirm with the export.
                scores, out_caches = outputs[0], list(outputs[1:])
                param_dict['cache'] = out_caches
                # The scorer indexes ``waveform.shape``, so it needs an
                # ndarray rather than a Python list of arrays.
                segments = self.vad_scorer(scores, np.array(waveform), is_final=is_final,
                                           max_end_sil=self.max_end_sil)
            except ONNXRuntimeError:
                logging.warning("input wav is silence or noise")
                segments = ''
            asr_res.append(segments)

        return asr_res

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        """Turn the accepted input kinds into a list of waveforms.

        Args:
            wav_content: ndarray (used as-is), wav path, or list of wav paths.
            fs: sample rate passed to ``librosa.load`` as ``sr``.

        Raises:
            TypeError: for any other input type.
        """
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute frontend features for each waveform and pad them to one batch.

        Returns:
            ``(feats, feats_len)``: the zero-padded float32 feature batch and
            the int32 array of per-waveform feature lengths.
        """
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        """Zero-pad every 2-D feature matrix along axis 0 to ``max_feat_len`` and stack them."""
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: np.ndarray,
              feats_len: np.ndarray, *extra_inputs: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
        """Feed the inputs, in the given order, to the ONNX session.

        ``__call__`` passes the features followed by every cache array, so the
        method accepts any number of additional positional inputs after the
        first two; all of them are forwarded unchanged.

        Returns:
            The list of outputs returned by the session wrapper.
        """
        outputs = self.ort_infer([feats, feats_len, *extra_inputs])
        return outputs

    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
        """Decode each score matrix with :meth:`decode_one`.

        NOTE(review): together with ``decode_one`` this relies on
        ``self.converter`` and ``self.pred_bias``, which ``__init__`` never
        sets, and nothing in this class calls it -- looks like leftover ASR
        code; confirm before use.
        """
        return [self.decode_one(am_score, token_num)
                for am_score, token_num in zip(am_scores, token_nums)]

    def decode_one(self,
                   am_score: np.ndarray,
                   valid_token_num: int) -> List[str]:
        """Greedy-decode one score matrix into tokens (see the note on :meth:`decode`)."""
        yseq = am_score.argmax(axis=-1)
        score = am_score.max(axis=-1)
        score = np.sum(score, axis=-1)

        # pad with mask tokens to ensure compatibility with sos/eos tokens
        # asr_model.sos:1  asr_model.eos:2
        yseq = np.array([1] + yseq.tolist() + [2])
        hyp = Hypothesis(yseq=yseq, score=score)

        # remove sos/eos and get results
        last_pos = -1
        token_int = hyp.yseq[1:last_pos].tolist()

        # remove blank symbol id, which is assumed to be 0
        token_int = list(filter(lambda x: x not in (0, 2), token_int))

        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num - self.pred_bias]
        return token

 funasr/runtime/python/onnxruntime/demo.py

@@ -1,8 +1,6 @@

from funasr_onnx import Paraformer

#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"

# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0

 funasr/runtime/python/onnxruntime/demo_vad.py

New file
@@ -0,0 +1,12 @@

from funasr_onnx import Fsmn_vad


# Directory of the exported VAD model.  This is a machine-specific path;
# point it at your own export before running the demo.
model_dir = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch"

# Build the detector with its default settings.
model = Fsmn_vad(model_dir)

# Audio file to analyse (also a machine-specific path).
wav_path = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"

# Run detection on the file and print whatever the detector returns.
result = model(wav_path)
print(result)

 funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.2-py3.8.egg

Binary files differ

 funasr/runtime/python/onnxruntime/dist/funasr_onnx-0.0.3-py3.8.egg

Binary files differ

 funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/PKG-INFO

New file
@@ -0,0 +1,80 @@
Metadata-Version: 2.1
Name: funasr-onnx
Version: 0.0.3
Summary: FunASR: A Fundamental End-to-End Speech Recognition Toolkit
Home-page: https://github.com/alibaba-damo-academy/FunASR.git
Author: Speech Lab, Alibaba Group, China
Author-email: funasr@list.alibaba-inc.com
License: MIT
Keywords: funasr,asr
Platform: Any
Classifier: Programming Language :: Python :: 3.6
Classifier: Programming Language :: Python :: 3.7
Classifier: Programming Language :: Python :: 3.8
Classifier: Programming Language :: Python :: 3.9
Classifier: Programming Language :: Python :: 3.10
Description-Content-Type: text/markdown

## Using funasr with ONNXRuntime


### Introduction
- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).


### Steps:
1. Export the model.
   - Command: (`Tips`: torch >= 1.11.0 is required.)

       For more details, refer to the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export).

       - `e.g.`, Export model from modelscope
         ```shell
         python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
         ```
       - `e.g.`, Export model from local path, the model's name must be `model.pb`.
         ```shell
         python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
         ```


2. Install the `funasr_onnx`

install from pip
```shell
pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple
```

or install from source code

```shell
git clone https://github.com/alibaba/FunASR.git && cd FunASR
cd funasr/runtime/python/funasr_onnx
python setup.py build
python setup.py install
```

3. Run the demo.
   - Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
   - Input: wav format file; supported input types: `str, np.ndarray, List[str]`
   - Output: `List[str]`: recognition result.
   - Example:
        ```python
        from funasr_onnx import Paraformer

        model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
        model = Paraformer(model_dir, batch_size=1)

        wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']

        result = model(wav_path)
        print(result)
        ```

## Performance benchmark

Please refer to the [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_onnx.md).

## Acknowledge
1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model).

 funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/SOURCES.txt

New file
@@ -0,0 +1,17 @@
README.md
setup.py
funasr_onnx/__init__.py
funasr_onnx/paraformer_bin.py
funasr_onnx/punc_bin.py
funasr_onnx/vad_bin.py
funasr_onnx.egg-info/PKG-INFO
funasr_onnx.egg-info/SOURCES.txt
funasr_onnx.egg-info/dependency_links.txt
funasr_onnx.egg-info/requires.txt
funasr_onnx.egg-info/top_level.txt
funasr_onnx/utils/__init__.py
funasr_onnx/utils/e2e_vad.py
funasr_onnx/utils/frontend.py
funasr_onnx/utils/postprocess_utils.py
funasr_onnx/utils/timestamp_utils.py
funasr_onnx/utils/utils.py

 funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/dependency_links.txt

New file
@@ -0,0 +1 @@


 funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/requires.txt

New file
@@ -0,0 +1,7 @@
librosa
onnxruntime>=1.7.0
scipy
numpy>=1.19.3
typeguard
kaldi-native-fbank
PyYAML>=5.1.2

 funasr/runtime/python/onnxruntime/funasr_onnx.egg-info/top_level.txt

New file
@@ -0,0 +1 @@
funasr_onnx

 funasr/runtime/python/onnxruntime/funasr_onnx/__init__.py

@@ -1,2 +1,3 @@
# -*- encoding: utf-8 -*-
from .paraformer_bin import Paraformer
from .vad_bin import Fsmn_vad

 funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py

New file
@@ -0,0 +1,607 @@
from enum import Enum
from typing import List, Tuple, Dict, Any

import math
import numpy as np

class VadStateMachine(Enum):
    """States stored in ``E2EVadModel.vad_state_machine``."""
    # State set by E2EVadModel.ResetDetection().
    kVadInStateStartPointNotDetected = 1
    kVadInStateInSpeechSegment = 2
    # State set by E2EVadModel.MaybeOnVoiceEndIfLastFrame() on the final frame.
    kVadInStateEndPointDetected = 3


class FrameState(Enum):
    """Per-frame decision: the input of ``WindowDetector.DetectOneFrame`` and
    the value returned by ``E2EVadModel.GetFrameState``."""
    kFrameStateInvalid = -1
    kFrameStateSpeech = 1
    kFrameStateSil = 0


# final voice/unvoice state per frame
class AudioChangeState(Enum):
    """Transition reported by ``WindowDetector.DetectOneFrame`` for each frame."""
    kChangeStateSpeech2Speech = 0
    kChangeStateSpeech2Sil = 1
    kChangeStateSil2Sil = 2
    kChangeStateSil2Speech = 3
    kChangeStateNoBegin = 4
    # Returned when the frame state is neither speech nor silence.
    kChangeStateInvalid = 5


class VadDetectMode(Enum):
    """Detection modes; ``VADXOptions.detect_mode`` stores the integer ``value``."""
    kVadSingleUtteranceDetectMode = 0
    kVadMutipleUtteranceDetectMode = 1


class VADXOptions:
    """Plain container for the VAD post-processing options.

    Every constructor argument is stored unchanged on the attribute of the
    same name.  Values with an ``_ms`` / ``_time`` suffix are combined with
    ``frame_in_ms`` by the consumers in this module, i.e. they are
    expressed in milliseconds.
    """

    def __init__(
            self,
            sample_rate: int = 16000,
            detect_mode: int = VadDetectMode.kVadMutipleUtteranceDetectMode.value,
            snr_mode: int = 0,
            max_end_silence_time: int = 800,
            max_start_silence_time: int = 3000,
            do_start_point_detection: bool = True,
            do_end_point_detection: bool = True,
            window_size_ms: int = 200,
            sil_to_speech_time_thres: int = 150,
            speech_to_sil_time_thres: int = 150,
            speech_2_noise_ratio: float = 1.0,
            do_extend: int = 1,
            lookback_time_start_point: int = 200,
            lookahead_time_end_point: int = 100,
            max_single_segment_time: int = 60000,
            nn_eval_block_size: int = 8,
            dcd_block_size: int = 4,
            snr_thres: float = -100.0,
            noise_frame_num_used_for_snr: int = 100,
            decibel_thres: float = -100.0,
            speech_noise_thres: float = 0.6,
            fe_prior_thres: float = 1e-4,
            silence_pdf_num: int = 1,
            sil_pdf_ids: List[int] = None,
            speech_noise_thresh_low: float = -0.1,
            speech_noise_thresh_high: float = 0.3,
            output_frame_probs: bool = False,
            frame_in_ms: int = 10,
            frame_length_ms: int = 25,
    ):
        """Store the options.

        Args:
            sil_pdf_ids: indices of the silence entries in the score vector.
                ``None`` (the default) means ``[0]``; a new list is created
                for every instance so that instances never share it.
            (all other arguments are stored verbatim under the same name.)
        """
        self.sample_rate = sample_rate
        self.detect_mode = detect_mode
        self.snr_mode = snr_mode
        self.max_end_silence_time = max_end_silence_time
        self.max_start_silence_time = max_start_silence_time
        self.do_start_point_detection = do_start_point_detection
        self.do_end_point_detection = do_end_point_detection
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time_thres = sil_to_speech_time_thres
        self.speech_to_sil_time_thres = speech_to_sil_time_thres
        self.speech_2_noise_ratio = speech_2_noise_ratio
        self.do_extend = do_extend
        self.lookback_time_start_point = lookback_time_start_point
        self.lookahead_time_end_point = lookahead_time_end_point
        self.max_single_segment_time = max_single_segment_time
        self.nn_eval_block_size = nn_eval_block_size
        self.dcd_block_size = dcd_block_size
        self.snr_thres = snr_thres
        self.noise_frame_num_used_for_snr = noise_frame_num_used_for_snr
        self.decibel_thres = decibel_thres
        self.speech_noise_thres = speech_noise_thres
        self.fe_prior_thres = fe_prior_thres
        self.silence_pdf_num = silence_pdf_num
        # A literal ``[0]`` default would be one list object shared by every
        # instance; build it per instance instead.
        self.sil_pdf_ids = [0] if sil_pdf_ids is None else sil_pdf_ids
        self.speech_noise_thresh_low = speech_noise_thresh_low
        self.speech_noise_thresh_high = speech_noise_thresh_high
        self.output_frame_probs = output_frame_probs
        self.frame_in_ms = frame_in_ms
        self.frame_length_ms = frame_length_ms


class E2EVadSpeechBufWithDoa(object):
    """One output segment: start/end time in ms, a sample buffer, boundary flags and ``doa``."""

    def __init__(self):
        # A new segment starts in exactly the state that Reset() produces.
        self.Reset()

    def Reset(self):
        """Put every field back to its initial value (``buffer`` becomes a new empty list)."""
        self.start_ms = self.end_ms = 0
        self.buffer = []
        self.contain_seg_start_point = self.contain_seg_end_point = False
        self.doa = 0


class E2EVadFrameProb(object):
    """Per-frame record of noise/speech probabilities, score, frame id and frame state."""

    def __init__(self):
        self.noise_prob = self.speech_prob = self.score = 0.0
        self.frame_id = self.frm_state = 0


class WindowDetector(object):
    """Smooths per-frame speech/silence decisions with a circular window.

    The window keeps the last ``win_size_frame`` decisions (1 = speech,
    0 = silence) together with their running sum; a state change is reported
    when that sum crosses the configured frame-count thresholds.
    """

    def __init__(self, window_size_ms: int, sil_to_speech_time: int,
                 speech_to_sil_time: int, frame_size_ms: int):
        self.window_size_ms = window_size_ms
        self.sil_to_speech_time = sil_to_speech_time
        self.speech_to_sil_time = speech_to_sil_time
        self.frame_size_ms = frame_size_ms

        # Durations expressed as a number of frames.
        self.win_size_frame = int(window_size_ms / frame_size_ms)
        self.sil_to_speech_frmcnt_thres = int(sil_to_speech_time / frame_size_ms)
        self.speech_to_sil_frmcnt_thres = int(speech_to_sil_time / frame_size_ms)

        # All remaining fields start in the state produced by Reset().
        self.Reset()

    def Reset(self) -> None:
        """Empty the window and go back to the silence state."""
        self.cur_win_pos = 0
        self.win_sum = 0
        self.win_state = [0] * self.win_size_frame  # initialise the window
        self.pre_frame_state = FrameState.kFrameStateSil
        self.cur_frame_state = FrameState.kFrameStateSil
        self.voice_last_frame_count = 0
        self.noise_last_frame_count = 0
        self.hydre_frame_count = 0

    def GetWinSize(self) -> int:
        """Return the window length in frames."""
        return int(self.win_size_frame)

    def DetectOneFrame(self, frameState: FrameState, frame_count: int) -> AudioChangeState:
        """Push one frame decision into the window and report the transition.

        ``frame_count`` is accepted but not used.  A ``frameState`` that is
        neither speech nor silence yields ``kChangeStateInvalid`` and leaves
        the window untouched.
        """
        if frameState == FrameState.kFrameStateSpeech:
            vote = 1
        elif frameState == FrameState.kFrameStateSil:
            vote = 0
        else:
            return AudioChangeState.kChangeStateInvalid

        # Overwrite the oldest slot and keep the running sum in step.
        slot = self.cur_win_pos
        self.win_sum += vote - self.win_state[slot]
        self.win_state[slot] = vote
        self.cur_win_pos = (slot + 1) % self.win_size_frame

        if self.pre_frame_state == FrameState.kFrameStateSil:
            if self.win_sum >= self.sil_to_speech_frmcnt_thres:
                self.pre_frame_state = FrameState.kFrameStateSpeech
                return AudioChangeState.kChangeStateSil2Speech
            return AudioChangeState.kChangeStateSil2Sil

        if self.pre_frame_state == FrameState.kFrameStateSpeech:
            if self.win_sum <= self.speech_to_sil_frmcnt_thres:
                self.pre_frame_state = FrameState.kFrameStateSil
                return AudioChangeState.kChangeStateSpeech2Sil
            return AudioChangeState.kChangeStateSpeech2Speech

        return AudioChangeState.kChangeStateInvalid

    def FrameSizeMs(self) -> int:
        """Return the frame size in milliseconds."""
        return int(self.frame_size_ms)


class E2EVadModel():
    def __init__(self, vad_post_args: Dict[str, Any]):
        super(E2EVadModel, self).__init__()
        self.vad_opts = VADXOptions(**vad_post_args)
        self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
                                               self.vad_opts.sil_to_speech_time_thres,
                                               self.vad_opts.speech_to_sil_time_thres,
                                               self.vad_opts.frame_in_ms)
        # self.encoder = encoder
        # init variables
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.max_time_out = False
        self.decibel = []
        self.data_buf = None
        self.data_buf_all = None
        self.waveform = None
        self.ResetDetection()

    def AllResetDetection(self):
        self.is_final = False
        self.data_buf_start_frame = 0
        self.frm_cnt = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.continous_silence_frame_count = 0
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.number_end_time_detected = 0
        self.sil_frame = 0
        self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
        self.noise_average_decibel = -100.0
        self.pre_end_silence_detected = False
        self.next_seg = True

        self.output_data_buf = []
        self.output_data_buf_offset = 0
        self.frame_probs = []
        self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
        self.speech_noise_thres = self.vad_opts.speech_noise_thres
        self.scores = None
        self.max_time_out = False
        self.decibel = []
        self.data_buf = None
        self.data_buf_all = None
        self.waveform = None
        self.ResetDetection()

    def ResetDetection(self):
        self.continous_silence_frame_count = 0
        self.latest_confirmed_speech_frame = 0
        self.lastest_confirmed_silence_frame = -1
        self.confirmed_start_frame = -1
        self.confirmed_end_frame = -1
        self.vad_state_machine = VadStateMachine.kVadInStateStartPointNotDetected
        self.windows_detector.Reset()
        self.sil_frame = 0
        self.frame_probs = []

    def ComputeDecibel(self) -> None:
        frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
        frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if self.data_buf_all is None:
            self.data_buf_all = self.waveform[0]  # self.data_buf is pointed to self.waveform[0]
            self.data_buf = self.data_buf_all
        else:
            self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
        for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
            self.decibel.append(
                10 * math.log10(np.square((self.waveform[0][offset: offset + frame_sample_length])).sum() + \
                                0.000001))

    def ComputeScores(self, scores: np.ndarray) -> None:
        # scores = self.encoder(feats, in_cache)  # return B * T * D
        self.vad_opts.nn_eval_block_size = scores.shape[1]
        self.frm_cnt += scores.shape[1]  # count total frames
        if self.scores is None:
            self.scores = scores  # the first calculation
        else:
            self.scores = np.concatenate((self.scores, scores), axis=1)

    def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
        while self.data_buf_start_frame < frame_idx:
            if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                self.data_buf_start_frame += 1
                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]

    def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
                           last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
        self.PopDataBufTillFrame(start_frm)
        expected_sample_number = int(frm_cnt * self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000)
        if last_frm_is_end_point:
            extra_sample = max(0, int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000 - \
                                      self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
            expected_sample_number += int(extra_sample)
        if end_point_is_sent_end:
            expected_sample_number = max(expected_sample_number, len(self.data_buf))
        if len(self.data_buf) < expected_sample_number:
            print('error in calling pop data_buf\n')

        if len(self.output_data_buf) == 0 or first_frm_is_start_point:
            self.output_data_buf.append(E2EVadSpeechBufWithDoa())
            self.output_data_buf[-1].Reset()
            self.output_data_buf[-1].start_ms = start_frm * self.vad_opts.frame_in_ms
            self.output_data_buf[-1].end_ms = self.output_data_buf[-1].start_ms
            self.output_data_buf[-1].doa = 0
        cur_seg = self.output_data_buf[-1]
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('warning\n')
        out_pos = len(cur_seg.buffer)  # cur_seg.buff现在没做任何操作
        data_to_pop = 0
        if end_point_is_sent_end:
            data_to_pop = expected_sample_number
        else:
            data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
        if data_to_pop > len(self.data_buf):
            print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
            data_to_pop = len(self.data_buf)
            expected_sample_number = len(self.data_buf)

        cur_seg.doa = 0
        for sample_cpy_out in range(0, data_to_pop):
            # cur_seg.buffer[out_pos ++] = data_buf_.back();
            out_pos += 1
        for sample_cpy_out in range(data_to_pop, expected_sample_number):
            # cur_seg.buffer[out_pos++] = data_buf_.back()
            out_pos += 1
        if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
            print('Something wrong with the VAD algorithm\n')
        self.data_buf_start_frame += frm_cnt
        cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
        if first_frm_is_start_point:
            cur_seg.contain_seg_start_point = True
        if last_frm_is_end_point:
            cur_seg.contain_seg_end_point = True

    def OnSilenceDetected(self, valid_frame: int):
        self.lastest_confirmed_silence_frame = valid_frame
        if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataBufTillFrame(valid_frame)
        # silence_detected_callback_
        # pass

    def OnVoiceDetected(self, valid_frame: int) -> None:
        self.latest_confirmed_speech_frame = valid_frame
        self.PopDataToOutputBuf(valid_frame, 1, False, False, False)

    def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
        if self.vad_opts.do_start_point_detection:
            pass
        if self.confirmed_start_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_start_frame = start_frame

        if not fake_result and self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
            self.PopDataToOutputBuf(self.confirmed_start_frame, 1, True, False, False)

    def OnVoiceEnd(self, end_frame: int, fake_result: bool, is_last_frame: bool) -> None:
        for t in range(self.latest_confirmed_speech_frame + 1, end_frame):
            self.OnVoiceDetected(t)
        if self.vad_opts.do_end_point_detection:
            pass
        if self.confirmed_end_frame != -1:
            print('not reset vad properly\n')
        else:
            self.confirmed_end_frame = end_frame
        if not fake_result:
            self.sil_frame = 0
            self.PopDataToOutputBuf(self.confirmed_end_frame, 1, False, True, is_last_frame)
        self.number_end_time_detected += 1

    def MaybeOnVoiceEndIfLastFrame(self, is_final_frame: bool, cur_frm_idx: int) -> None:
        if is_final_frame:
            self.OnVoiceEnd(cur_frm_idx, False, True)
            self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected

    def GetLatency(self) -> int:
        return int(self.LatencyFrmNumAtStartPoint() * self.vad_opts.frame_in_ms)

    def LatencyFrmNumAtStartPoint(self) -> int:
        vad_latency = self.windows_detector.GetWinSize()
        if self.vad_opts.do_extend:
            vad_latency += int(self.vad_opts.lookback_time_start_point / self.vad_opts.frame_in_ms)
        return vad_latency

    def GetFrameState(self, t: int) -> FrameState:
        """Classify frame ``t`` as speech or silence.

        Uses the frame energy ``self.decibel[t]`` and the scores
        ``self.scores[0][t]``.  Frames classified as silence by the score
        comparison also update the running noise level
        ``self.noise_average_decibel``.

        Returns:
            ``FrameState.kFrameStateSpeech`` or ``FrameState.kFrameStateSil``.
        """
        frame_state = FrameState.kFrameStateInvalid
        cur_decibel = self.decibel[t]
        cur_snr = cur_decibel - self.noise_average_decibel
        # for each frame, calc log posterior probability of each state
        if cur_decibel < self.vad_opts.decibel_thres:
            # Frame is below the energy threshold: silence, scores are not consulted.
            frame_state = FrameState.kFrameStateSil
            # NOTE(review): DetectCommonFrames() calls DetectOneFrame() again
            # with the state returned here, so such frames appear to be
            # processed twice -- confirm this is intended.
            self.DetectOneFrame(frame_state, t, False)
            return frame_state

        sum_score = 0.0
        noise_prob = 0.0
        assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
        if len(self.sil_pdf_ids) > 0:
            assert len(self.scores) == 1  # only batch_size = 1 is supported
            sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
            sum_score = sum(sil_pdf_scores)
            # NOTE(review): math.log raises ValueError when its argument is 0,
            # i.e. when the silence scores sum to exactly 0 or 1 (or when
            # sil_pdf_ids is empty, for the speech_prob line below).
            noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
            total_score = 1.0
            # From here on sum_score is the complement of the silence score sum.
            sum_score = total_score - sum_score
        speech_prob = math.log(sum_score)
        if self.vad_opts.output_frame_probs:
            # Keep the per-frame values for later inspection.
            frame_prob = E2EVadFrameProb()
            frame_prob.noise_prob = noise_prob
            frame_prob.speech_prob = speech_prob
            frame_prob.score = sum_score
            frame_prob.frame_id = t
            self.frame_probs.append(frame_prob)
        if math.exp(speech_prob) >= math.exp(noise_prob) + self.speech_noise_thres:
            # The scores favour speech; the SNR and energy thresholds must agree as well.
            if cur_snr >= self.vad_opts.snr_thres and cur_decibel >= self.vad_opts.decibel_thres:
                frame_state = FrameState.kFrameStateSpeech
            else:
                frame_state = FrameState.kFrameStateSil
        else:
            frame_state = FrameState.kFrameStateSil
            if self.noise_average_decibel < -99.9:
                # Noise level still at its initial value (-100.0): take this frame's energy.
                self.noise_average_decibel = cur_decibel
            else:
                # Running average weighted over noise_frame_num_used_for_snr frames.
                self.noise_average_decibel = (cur_decibel + self.noise_average_decibel * (
                        self.vad_opts.noise_frame_num_used_for_snr
                        - 1)) / self.vad_opts.noise_frame_num_used_for_snr

        return frame_state
     

    def __call__(self, score: np.ndarray, waveform: np.ndarray,
                is_final: bool = False, max_end_sil: int = 800
                ):
        """Process one chunk of scores and audio and return the segments found.

        Args:
            score: scores of the chunk; axis 0 is iterated as the batch and
                axis 1 is counted as frames.
            waveform: audio of the chunk; ``waveform[0]`` and
                ``waveform.shape[1]`` are read by ``ComputeDecibel``.
            is_final: run the last-frame detection for this chunk and reset
                all state afterwards.
            max_end_sil: used with ``speech_to_sil_time_thres`` to recompute
                ``max_end_sil_frame_cnt_thresh`` for this call.

        Returns:
            A list that holds, for each batch row with at least one segment,
            a list of ``[start_ms, end_ms]`` pairs.  ``-1`` is used for an
            end that has not been reached yet, and for the start of a segment
            whose start was already reported by an earlier call.
        """
        self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
        self.waveform = waveform  # compute decibel for each frame
        self.ComputeDecibel()
        self.ComputeScores(score)
        if not is_final:
            self.DetectCommonFrames()
        else:
            self.DetectLastFrames()
        segments = []
        for batch_num in range(0, score.shape[0]):  # only support batch_size = 1 now
            segment_batch = []
            if len(self.output_data_buf) > 0:
                # Segments before output_data_buf_offset were already reported in full.
                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
                    if not self.output_data_buf[i].contain_seg_start_point:
                        continue
                    # Start already reported and the end is still unknown: nothing new to emit.
                    if not self.next_seg and not self.output_data_buf[i].contain_seg_end_point:
                        continue
                    start_ms = self.output_data_buf[i].start_ms if self.next_seg else -1
                    if self.output_data_buf[i].contain_seg_end_point:
                        end_ms = self.output_data_buf[i].end_ms
                        self.next_seg = True
                        self.output_data_buf_offset += 1
                    else:
                        end_ms = -1
                        self.next_seg = False
                    segment = [start_ms, end_ms]
                    segment_batch.append(segment)
            if segment_batch:
                segments.append(segment_batch)
        if is_final:
            # reset class variables and clear the dict for the next query
            self.AllResetDetection()
        return segments

    def DetectCommonFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)

        return 0

    def DetectLastFrames(self) -> int:
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
            return 0
        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
            frame_state = FrameState.kFrameStateInvalid
            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
            if i != 0:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
            else:
                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)

        return 0

    def DetectOneFrame(self, cur_frm_state: FrameState, cur_frm_idx: int, is_final_frame: bool) -> None:
        """Advance the VAD state machine by one frame.

        The frame state is first normalised, then handed to
        ``self.windows_detector.DetectOneFrame`` which returns an
        ``AudioChangeState``. Depending on that transition and on
        ``self.vad_state_machine`` the method fires ``OnVoiceStart``,
        ``OnVoiceDetected``, ``OnSilenceDetected``, ``OnVoiceEnd`` or
        ``MaybeOnVoiceEndIfLastFrame``. If the machine ends up in the
        end-point-detected state while in multiple-utterance mode,
        ``ResetDetection`` is called.

        Args:
            cur_frm_state: Classification of the frame (speech / silence).
            cur_frm_idx: Index of the frame being processed.
            is_final_frame: True when this is the last frame of the input.
        """
        # Normalise the incoming state: anything that is neither speech nor
        # silence stays kFrameStateInvalid.
        tmp_cur_frm_state = FrameState.kFrameStateInvalid
        if cur_frm_state == FrameState.kFrameStateSpeech:
            # NOTE(review): the left-hand side is the constant 1.0, so the
            # result depends only on fe_prior_thres — confirm this is intended.
            if math.fabs(1.0) > self.vad_opts.fe_prior_thres:
                tmp_cur_frm_state = FrameState.kFrameStateSpeech
            else:
                tmp_cur_frm_state = FrameState.kFrameStateSil
        elif cur_frm_state == FrameState.kFrameStateSil:
            tmp_cur_frm_state = FrameState.kFrameStateSil
        state_change = self.windows_detector.DetectOneFrame(tmp_cur_frm_state, cur_frm_idx)
        frm_shift_in_ms = self.vad_opts.frame_in_ms
        if AudioChangeState.kChangeStateSil2Speech == state_change:
            # silence_frame_count is assigned but not used below.
            silence_frame_count = self.continous_silence_frame_count
            self.continous_silence_frame_count = 0
            self.pre_end_silence_detected = False
            start_frame = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                # First speech of a segment: back-date the start by the
                # start-point latency, but not before the buffered data.
                start_frame = max(self.data_buf_start_frame, cur_frm_idx - self.LatencyFrmNumAtStartPoint())
                self.OnVoiceStart(start_frame)
                self.vad_state_machine = VadStateMachine.kVadInStateInSpeechSegment
                for t in range(start_frame + 1, cur_frm_idx + 1):
                    self.OnVoiceDetected(t)
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Already inside a segment: mark the frames since the last
                # confirmed speech frame as voice.
                for t in range(self.latest_confirmed_speech_frame + 1, cur_frm_idx):
                    self.OnVoiceDetected(t)
                # Close the segment once it exceeds max_single_segment_time.
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Sil == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                pass
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Same length check / continue / final-frame handling as above.
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSpeech2Speech == state_change:
            self.continous_silence_frame_count = 0
            if self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                if cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    # Only this branch records the timeout in max_time_out.
                    self.max_time_out = True
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif not is_final_frame:
                    self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass
        elif AudioChangeState.kChangeStateSil2Sil == state_change:
            self.continous_silence_frame_count += 1
            if self.vad_state_machine == VadStateMachine.kVadInStateStartPointNotDetected:
                # silence timeout, return zero length decision
                # Taken either when single-utterance mode has waited longer
                # than max_start_silence_time, or on the final frame when no
                # end point has been counted yet.
                if ((self.vad_opts.detect_mode == VadDetectMode.kVadSingleUtteranceDetectMode.value) and (
                        self.continous_silence_frame_count * frm_shift_in_ms > self.vad_opts.max_start_silence_time)) \
                        or (is_final_frame and self.number_end_time_detected == 0):
                    for t in range(self.lastest_confirmed_silence_frame + 1, cur_frm_idx):
                        self.OnSilenceDetected(t)
                    self.OnVoiceStart(0, True)
                    self.OnVoiceEnd(0, True, False);
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                else:
                    # Confirm silence with the start-point latency as delay.
                    if cur_frm_idx >= self.LatencyFrmNumAtStartPoint():
                        self.OnSilenceDetected(cur_frm_idx - self.LatencyFrmNumAtStartPoint())
            elif self.vad_state_machine == VadStateMachine.kVadInStateInSpeechSegment:
                # Enough continuous silence: end the segment, placing the end
                # lookback_frame frames before the current one.
                if self.continous_silence_frame_count * frm_shift_in_ms >= self.max_end_sil_frame_cnt_thresh:
                    lookback_frame = int(self.max_end_sil_frame_cnt_thresh / frm_shift_in_ms)
                    if self.vad_opts.do_extend:
                        # With do_extend the lookback is shortened by the
                        # end-point look-ahead (plus one frame), floored at 0.
                        lookback_frame -= int(self.vad_opts.lookahead_time_end_point / frm_shift_in_ms)
                        lookback_frame -= 1
                        lookback_frame = max(0, lookback_frame)
                    self.OnVoiceEnd(cur_frm_idx - lookback_frame, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif cur_frm_idx - self.confirmed_start_frame + 1 > \
                        self.vad_opts.max_single_segment_time / frm_shift_in_ms:
                    self.OnVoiceEnd(cur_frm_idx, False, False)
                    self.vad_state_machine = VadStateMachine.kVadInStateEndPointDetected
                elif self.vad_opts.do_extend and not is_final_frame:
                    # Silence frames within the look-ahead window are still
                    # reported as voice.
                    if self.continous_silence_frame_count <= int(
                            self.vad_opts.lookahead_time_end_point / frm_shift_in_ms):
                        self.OnVoiceDetected(cur_frm_idx)
                else:
                    self.MaybeOnVoiceEndIfLastFrame(is_final_frame, cur_frm_idx)
            else:
                pass

        # In multiple-utterance mode a detected end point resets detection.
        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected and \
                self.vad_opts.detect_mode == VadDetectMode.kVadMutipleUtteranceDetectMode.value:
            self.ResetDetection()

 funasr/runtime/python/onnxruntime/funasr_onnx/utils/utils.py

@@ -188,7 +188,7 @@
                 input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
        input_dict = dict(zip(self.get_input_names(), input_content))
        try:
            return self.session.run(None, input_dict)
            return self.session.run(self.get_output_names(), input_dict)
        except Exception as e:
            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e


 funasr/runtime/python/onnxruntime/funasr_onnx/vad_bin.py

New file
@@ -0,0 +1,134 @@
# -*- encoding: utf-8 -*-

import os.path
from pathlib import Path
from typing import List, Union, Tuple

import copy
import librosa
import numpy as np

from .utils.utils import (ONNXRuntimeError,
                          OrtInferSession, get_logger,
                          read_yaml)
from .utils.frontend import WavFrontend
from .utils.e2e_vad import E2EVadModel

logging = get_logger()


class Fsmn_vad():
    """FSMN voice-activity detector running on ONNX Runtime.

    ``model_dir`` must contain ``model.onnx`` (or ``model_quant.onnx`` when
    ``quantize`` is set), ``vad.yaml`` and ``vad.mvn``. Calling the instance
    with audio returns, per batch, the segments produced by ``E2EVadModel``.
    """

    def __init__(self, model_dir: Union[str, Path] = None,
                 batch_size: int = 1,
                 device_id: Union[str, int] = "-1",
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 max_end_sil: int = 800,
                 ):
        """Load the model, its config and the feature frontend.

        Args:
            model_dir: Directory holding the ONNX model, ``vad.yaml`` and
                ``vad.mvn``.
            batch_size: Number of waveforms processed per inference call.
            device_id: Forwarded to ``OrtInferSession``.
            quantize: Load ``model_quant.onnx`` instead of ``model.onnx``.
            intra_op_num_threads: Forwarded to ``OrtInferSession``.
            max_end_sil: Passed to the VAD post-processor on every call.

        Raises:
            FileNotFoundError: If ``model_dir`` does not exist.
        """
        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'vad.yaml')
        cmvn_file = os.path.join(model_dir, 'vad.mvn')
        config = read_yaml(config_file)

        self.frontend = WavFrontend(
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.vad_scorer = E2EVadModel(config["vad_post_conf"])
        self.max_end_sil = max_end_sil

    def prepare_cache(self, in_cache: list = None):
        """Return the model cache list, creating the initial one when empty.

        A non-empty ``in_cache`` is returned untouched. An empty list is
        filled in place with four zero-valued ``float32`` arrays of shape
        ``(1, 128, 19, 1)``; ``None`` yields a fresh list, so no state is
        shared between calls.
        """
        if in_cache is None:
            in_cache = []
        if len(in_cache) > 0:
            return in_cache

        # Start from an all-zero cache so that results are reproducible.
        for _ in range(4):
            in_cache.append(np.zeros((1, 128, 19, 1), dtype=np.float32))
        return in_cache

    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
        """Run VAD on the given audio.

        Args:
            wav_content: A wav path, a list of wav paths or a waveform array.
            **kwargs: ``is_final`` (bool) marks the last chunk of the input;
                ``param_dict`` may carry a ``'cache'`` entry which is read
                before and overwritten after inference.

        Returns:
            One entry per processed batch: the segments returned by the VAD
            post-processor, or ``''`` when inference raised
            ``ONNXRuntimeError``.
        """
        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
        waveform_nums = len(waveform_list)
        # 'kwargs' was the key read originally; it is still honoured as a
        # fallback so existing callers keep working.
        is_final = kwargs.get('is_final', kwargs.get('kwargs', False))

        asr_res = []
        for beg_idx in range(0, waveform_nums, self.batch_size):
            end_idx = min(waveform_nums, beg_idx + self.batch_size)
            waveform = waveform_list[beg_idx:end_idx]
            feats, feats_len = self.extract_feat(waveform)
            param_dict = kwargs.get('param_dict', dict())
            in_cache = param_dict.get('cache', list())
            in_cache = self.prepare_cache(in_cache)
            try:
                inputs = [feats]
                inputs.extend(in_cache)
                scores, out_caches = self.infer(inputs)
                param_dict['cache'] = out_caches
                # Only the first waveform of the batch is handed to the scorer.
                segments = self.vad_scorer(scores, waveform[0][None, :], is_final=is_final, max_end_sil=self.max_end_sil)
            except ONNXRuntimeError:
                # logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                segments = ''
            asr_res.append(segments)

        return asr_res

    def load_data(self,
                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
        """Turn ``wav_content`` into a list of waveforms.

        Arrays are wrapped as-is; paths are loaded with ``librosa.load`` at
        sample rate ``fs``.

        Raises:
            TypeError: If ``wav_content`` is not a str, ndarray or list.
        """
        def load_wav(path: str) -> np.ndarray:
            waveform, _ = librosa.load(path, sr=fs)
            return waveform

        if isinstance(wav_content, np.ndarray):
            return [wav_content]

        if isinstance(wav_content, str):
            return [load_wav(wav_content)]

        if isinstance(wav_content, list):
            return [load_wav(path) for path in wav_content]

        raise TypeError(
            f'The type of {wav_content} is not in [str, np.ndarray, list]')

    def extract_feat(self,
                     waveform_list: List[np.ndarray]
                     ) -> Tuple[np.ndarray, np.ndarray]:
        """Compute frontend features for each waveform and pad them to one batch.

        Returns:
            ``(feats, feats_len)``: the zero-padded ``float32`` feature batch
            and the ``int32`` per-item lengths.
        """
        feats, feats_len = [], []
        for waveform in waveform_list:
            speech, _ = self.frontend.fbank(waveform)
            feat, feat_len = self.frontend.lfr_cmvn(speech)
            feats.append(feat)
            feats_len.append(feat_len)

        feats = self.pad_feats(feats, np.max(feats_len))
        feats_len = np.array(feats_len).astype(np.int32)
        return feats, feats_len

    @staticmethod
    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
        """Zero-pad every 2-D feature along axis 0 to ``max_feat_len`` and stack as ``float32``."""
        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
            pad_width = ((0, max_feat_len - cur_len), (0, 0))
            return np.pad(feat, pad_width, 'constant', constant_values=0)

        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
        feats = np.array(feat_res).astype(np.float32)
        return feats

    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:
        """Run the ONNX session; the first output is the scores, the rest are the new caches."""
        outputs = self.ort_infer(feats)
        scores, out_caches = outputs[0], outputs[1:]
        return scores, out_caches
	

 funasr/runtime/python/onnxruntime/setup.py

@@ -13,7 +13,7 @@


MODULE_NAME = 'funasr_onnx'
VERSION_NUM = '0.0.2'
VERSION_NUM = '0.0.3'

setuptools.setup(
    name=MODULE_NAME,

New file
			@@ -0,0 +1,3 @@
			# -- encoding: utf-8 --
			from .paraformer_bin import Paraformer
			from .vad_bin import Fsmn_vad

New file
			@@ -0,0 +1,187 @@
			# -- encoding: utf-8 --

			import os.path
			from pathlib import Path
			from typing import List, Union, Tuple

			import copy
			import librosa
			import numpy as np

			from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
			OrtInferSession, TokenIDConverter, get_logger,
			read_yaml)
			from .utils.postprocess_utils import sentence_postprocess
			from .utils.frontend import WavFrontend
			from .utils.timestamp_utils import time_stamp_lfr6_onnx

			logging = get_logger()


			class Paraformer():
			    """Paraformer speech recogniser running on ONNX Runtime.

			    ``model_dir`` must contain ``model.onnx`` (or ``model_quant.onnx``),
			    ``config.yaml`` and ``am.mvn``. Calling the instance with audio
			    returns a list of result dicts.
			    """

			    def __init__(self, model_dir: Union[str, Path] = None,
			                 batch_size: int = 1,
			                 device_id: Union[str, int] = "-1",
			                 plot_timestamp_to: str = "",
			                 pred_bias: int = 1,
			                 quantize: bool = False,
			                 intra_op_num_threads: int = 4,
			                 ):
			        """Load the model, token list and feature frontend.

			        Args:
			            model_dir: Directory with the ONNX model, ``config.yaml`` and
			                ``am.mvn``.
			            batch_size: Number of waveforms per inference call.
			            device_id: Forwarded to ``OrtInferSession``.
			            plot_timestamp_to: Directory for timestamp plots; empty
			                disables plotting.
			            pred_bias: Subtracted from the valid token count when
			                trimming decoded tokens.
			            quantize: Load ``model_quant.onnx`` instead of ``model.onnx``.
			            intra_op_num_threads: Forwarded to ``OrtInferSession``.

			        Raises:
			            FileNotFoundError: If ``model_dir`` does not exist.
			        """
			        if not Path(model_dir).exists():
			            raise FileNotFoundError(f'{model_dir} does not exist.')

			        model_file = os.path.join(model_dir, 'model.onnx')
			        if quantize:
			            model_file = os.path.join(model_dir, 'model_quant.onnx')
			        config_file = os.path.join(model_dir, 'config.yaml')
			        cmvn_file = os.path.join(model_dir, 'am.mvn')
			        config = read_yaml(config_file)

			        self.converter = TokenIDConverter(config['token_list'])
			        self.tokenizer = CharTokenizer()
			        self.frontend = WavFrontend(
			            cmvn_file=cmvn_file,
			            **config['frontend_conf']
			        )
			        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
			        self.batch_size = batch_size
			        self.plot_timestamp_to = plot_timestamp_to
			        self.pred_bias = pred_bias

			    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
			        """Recognise the given audio.

			        Args:
			            wav_content: A wav path, a list of wav paths or a waveform array.

			        Returns:
			            A list of dicts. Without timestamp outputs each dict has
			            ``'preds'``; with four model outputs each dict has
			            ``'preds'``, ``'timestamp'`` and ``'raw_tokens'``. Batches
			            whose inference raised ``ONNXRuntimeError`` contribute no
			            entries.
			        """
			        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
			        waveform_nums = len(waveform_list)
			        asr_res = []
			        for beg_idx in range(0, waveform_nums, self.batch_size):
			            end_idx = min(waveform_nums, beg_idx + self.batch_size)
			            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
			            try:
			                outputs = self.infer(feats, feats_len)
			                am_scores, valid_token_lens = outputs[0], outputs[1]
			                # for BiCifParaformer Inference (four outputs)
			                us_peaks = outputs[3] if len(outputs) == 4 else None
			            except ONNXRuntimeError:
			                # logging.warning(traceback.format_exc())
			                logging.warning("input wav is silence or noise")
			                # NOTE(review): a failed batch adds nothing to asr_res, so
			                # results can be shorter than the input list.
			                continue

			            preds = self.decode(am_scores, valid_token_lens)
			            if us_peaks is None:
			                for pred in preds:
			                    pred = sentence_postprocess(pred)
			                    asr_res.append({'preds': pred})
			                continue

			            for offset, (pred, us_peaks_) in enumerate(zip(preds, us_peaks)):
			                raw_tokens = pred
			                timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
			                text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
			                # logging.warning(timestamp)
			                if len(self.plot_timestamp_to):
			                    # Plot the waveform that belongs to this prediction.
			                    self.plot_wave_timestamp(waveform_list[beg_idx + offset], timestamp, self.plot_timestamp_to)
			                asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
			        return asr_res

			    def plot_wave_timestamp(self, wav, text_timestamp, dest):
			        """Save a waveform plot annotated with token timestamps to ``dest/timestamp.png``."""
			        # TODO: Plot the wav and timestamp results with matplotlib
			        import matplotlib
			        matplotlib.use('Agg')
			        matplotlib.rc("font", family='Alibaba PuHuiTi')  # set it to a font that your system supports
			        import matplotlib.pyplot as plt
			        fig, ax1 = plt.subplots(figsize=(11, 3.5), dpi=320)
			        try:
			            ax2 = ax1.twinx()
			            ax2.set_ylim([0, 2.0])
			            # plot waveform
			            ax1.set_ylim([-0.3, 0.3])
			            # NOTE(review): 16000 is hard-coded here — confirm it matches
			            # the frontend sample rate.
			            time = np.arange(wav.shape[0]) / 16000
			            ax1.plot(time, wav/wav.max()*0.3, color='gray', alpha=0.4)
			            # plot lines and text
			            for (char, start, end) in text_timestamp:
			                ax1.vlines(start, -0.3, 0.3, ls='--')
			                ax1.vlines(end, -0.3, 0.3, ls='--')
			                x_adj = 0.045 if char != '<sil>' else 0.12
			                ax1.text((start + end) * 0.5 - x_adj, 0, char)
			            # plt.legend()
			            plotname = "{}/timestamp.png".format(dest)
			            plt.savefig(plotname, bbox_inches='tight')
			        finally:
			            # Release the figure so repeated calls do not accumulate figures.
			            plt.close(fig)

			    def load_data(self,
			                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
			        """Turn ``wav_content`` into a list of waveforms.

			        Arrays are wrapped as-is; paths are loaded with ``librosa.load``
			        at sample rate ``fs``.

			        Raises:
			            TypeError: If ``wav_content`` is not a str, ndarray or list.
			        """
			        def load_wav(path: str) -> np.ndarray:
			            waveform, _ = librosa.load(path, sr=fs)
			            return waveform

			        if isinstance(wav_content, np.ndarray):
			            return [wav_content]

			        if isinstance(wav_content, str):
			            return [load_wav(wav_content)]

			        if isinstance(wav_content, list):
			            return [load_wav(path) for path in wav_content]

			        raise TypeError(
			            f'The type of {wav_content} is not in [str, np.ndarray, list]')

			    def extract_feat(self,
			                     waveform_list: List[np.ndarray]
			                     ) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute frontend features per waveform and pad them into one batch.

			        Returns:
			            ``(feats, feats_len)``: the zero-padded ``float32`` batch and
			            the ``int32`` per-item lengths.
			        """
			        feats, feats_len = [], []
			        for waveform in waveform_list:
			            speech, _ = self.frontend.fbank(waveform)
			            feat, feat_len = self.frontend.lfr_cmvn(speech)
			            feats.append(feat)
			            feats_len.append(feat_len)

			        feats = self.pad_feats(feats, np.max(feats_len))
			        feats_len = np.array(feats_len).astype(np.int32)
			        return feats, feats_len

			    @staticmethod
			    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
			        """Zero-pad every 2-D feature along axis 0 to ``max_feat_len`` and stack as ``float32``."""
			        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
			            pad_width = ((0, max_feat_len - cur_len), (0, 0))
			            return np.pad(feat, pad_width, 'constant', constant_values=0)

			        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
			        feats = np.array(feat_res).astype(np.float32)
			        return feats

			    def infer(self, feats: np.ndarray,
			              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Run the ONNX session on ``[feats, feats_len]`` and return its outputs."""
			        outputs = self.ort_infer([feats, feats_len])
			        return outputs

			    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
			        """Decode every item of the batch with ``decode_one``."""
			        return [self.decode_one(am_score, token_num)
			                for am_score, token_num in zip(am_scores, token_nums)]

			    def decode_one(self,
			                   am_score: np.ndarray,
			                   valid_token_num: int) -> List[str]:
			        """Greedy-decode one score matrix into a list of tokens.

			        Takes the arg-max id per step, drops ids 0 and 2, converts ids to
			        tokens and keeps the first ``valid_token_num - pred_bias`` tokens.
			        """
			        yseq = am_score.argmax(axis=-1)
			        score = am_score.max(axis=-1)
			        score = np.sum(score, axis=-1)

			        # pad with mask tokens to ensure compatibility with sos/eos tokens
			        # asr_model.sos:1 asr_model.eos:2
			        yseq = np.array([1] + yseq.tolist() + [2])
			        hyp = Hypothesis(yseq=yseq, score=score)

			        # remove sos/eos and get results
			        last_pos = -1
			        token_int = hyp.yseq[1:last_pos].tolist()

			        # remove blank symbol id, which is assumed to be 0
			        token_int = list(filter(lambda x: x not in (0, 2), token_int))

			        # Change integer-ids to tokens
			        token = self.converter.ids2tokens(token_int)
			        token = token[:valid_token_num-self.pred_bias]
			        # texts = sentence_postprocess(token)
			        return token

New file
			@@ -0,0 +1,191 @@
			# -- encoding: utf-8 --
			from pathlib import Path
			from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

			import numpy as np
			from typeguard import check_argument_types
			import kaldi_native_fbank as knf

			root_dir = Path(__file__).resolve().parent

			logger_initialized = {}


			class WavFrontend():
			    """Conventional frontend structure for ASR.

			    Computes fbank features with ``kaldi_native_fbank`` and optionally
			    applies low-frame-rate stacking (LFR) and CMVN.
			    """

			    def __init__(
			            self,
			            cmvn_file: str = None,
			            fs: int = 16000,
			            window: str = 'hamming',
			            n_mels: int = 80,
			            frame_length: int = 25,
			            frame_shift: int = 10,
			            lfr_m: int = 1,
			            lfr_n: int = 1,
			            dither: float = 1.0,
			            **kwargs,
			    ) -> None:
			        """Configure the fbank options and load CMVN statistics if given.

			        Args:
			            cmvn_file: Path of the CMVN file; CMVN is skipped when empty.
			            fs: Sample frequency written to the fbank options.
			            window: Window type written to the fbank options.
			            n_mels: Number of mel bins.
			            frame_length: Frame length in ms.
			            frame_shift: Frame shift in ms.
			            lfr_m: Number of frames stacked by LFR.
			            lfr_n: Frame step used by LFR.
			            dither: Dither value written to the fbank options.
			            **kwargs: Accepted and ignored.
			        """
			        check_argument_types()

			        opts = knf.FbankOptions()
			        opts.frame_opts.samp_freq = fs
			        opts.frame_opts.dither = dither
			        opts.frame_opts.window_type = window
			        opts.frame_opts.frame_shift_ms = float(frame_shift)
			        opts.frame_opts.frame_length_ms = float(frame_length)
			        opts.mel_opts.num_bins = n_mels
			        opts.energy_floor = 0
			        opts.frame_opts.snip_edges = True
			        opts.mel_opts.debug_mel = False
			        self.opts = opts

			        self.lfr_m = lfr_m
			        self.lfr_n = lfr_n
			        self.cmvn_file = cmvn_file

			        if self.cmvn_file:
			            self.cmvn = self.load_cmvn()
			        self.fbank_fn = None
			        self.fbank_beg_idx = 0
			        self.reset_status()

			    def fbank(self,
			              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute fbank features with a fresh ``OnlineFbank`` instance.

			        The waveform is scaled by ``1 << 15`` before being fed in.

			        Returns:
			            ``(feat, feat_len)``: ``float32`` features and the frame count
			            as an ``int32`` array scalar.
			        """
			        waveform = waveform * (1 << 15)
			        self.fbank_fn = knf.OnlineFbank(self.opts)
			        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
			        frames = self.fbank_fn.num_frames_ready
			        mat = np.empty([frames, self.opts.mel_opts.num_bins])
			        for i in range(frames):
			            mat[i, :] = self.fbank_fn.get_frame(i)
			        feat = mat.astype(np.float32)
			        feat_len = np.array(mat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    def fbank_online(self,
			                     waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Like ``fbank`` but feeds the existing ``self.fbank_fn`` instead of a new one."""
			        waveform = waveform * (1 << 15)
			        # self.fbank_fn = knf.OnlineFbank(self.opts)
			        self.fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
			        frames = self.fbank_fn.num_frames_ready
			        mat = np.empty([frames, self.opts.mel_opts.num_bins])
			        for i in range(self.fbank_beg_idx, frames):
			            mat[i, :] = self.fbank_fn.get_frame(i)
			        # self.fbank_beg_idx += (frames-self.fbank_beg_idx)
			        feat = mat.astype(np.float32)
			        feat_len = np.array(mat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    def reset_status(self):
			        """Replace the online fbank computer with a fresh one and reset the start index."""
			        self.fbank_fn = knf.OnlineFbank(self.opts)
			        self.fbank_beg_idx = 0

			    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Apply LFR (unless ``lfr_m == lfr_n == 1``) and CMVN (if a file was given).

			        Returns:
			            ``(feat, feat_len)`` with ``feat_len`` an ``int32`` array scalar.
			        """
			        if self.lfr_m != 1 or self.lfr_n != 1:
			            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)

			        if self.cmvn_file:
			            feat = self.apply_cmvn(feat)

			        feat_len = np.array(feat.shape[0]).astype(np.int32)
			        return feat, feat_len

			    @staticmethod
			    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
			        """Stack ``lfr_m`` consecutive frames every ``lfr_n`` frames.

			        The input is left-padded with ``(lfr_m - 1) // 2`` copies of the
			        first frame; a short final window is filled with copies of the
			        last frame. The result has ``ceil(T / lfr_n)`` rows.
			        """
			        LFR_inputs = []

			        T = inputs.shape[0]
			        T_lfr = int(np.ceil(T / lfr_n))
			        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
			        inputs = np.vstack((left_padding, inputs))
			        T = T + (lfr_m - 1) // 2
			        for i in range(T_lfr):
			            if lfr_m <= T - i * lfr_n:
			                LFR_inputs.append(
			                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
			            else:
			                # process last LFR frame
			                num_padding = lfr_m - (T - i * lfr_n)
			                frame = inputs[i * lfr_n:].reshape(-1)
			                for _ in range(num_padding):
			                    frame = np.hstack((frame, inputs[-1]))

			                LFR_inputs.append(frame)
			        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
			        return LFR_outputs

			    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
			        """
			        Apply CMVN with mvn data: ``(inputs + cmvn[0]) * cmvn[1]`` per frame.
			        """
			        frame, dim = inputs.shape
			        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
			        scales = np.tile(self.cmvn[1:2, :dim], (frame, 1))
			        inputs = (inputs + means) * scales
			        return inputs

			    def load_cmvn(self,) -> np.ndarray:
			        """Parse ``self.cmvn_file`` into a ``(2, dim)`` float64 array.

			        The values are taken from the ``<LearnRateCoef>`` line that
			        follows an ``<AddShift>`` line (row 0) and a ``<Rescale>`` line
			        (row 1). Blank lines and a tag on the very last line are skipped
			        instead of raising ``IndexError``.
			        """
			        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
			            lines = f.readlines()

			        means_list = []
			        vars_list = []
			        for i, line in enumerate(lines):
			            line_item = line.split()
			            if not line_item or line_item[0] not in ('<AddShift>', '<Rescale>'):
			                continue
			            if i + 1 >= len(lines):
			                continue
			            next_item = lines[i + 1].split()
			            if not next_item or next_item[0] != '<LearnRateCoef>':
			                continue
			            # Drop the three leading fields and the trailing bracket.
			            values = next_item[3:(len(next_item) - 1)]
			            if line_item[0] == '<AddShift>':
			                means_list = list(values)
			            else:
			                vars_list = list(values)

			        means = np.array(means_list).astype(np.float64)
			        scales = np.array(vars_list).astype(np.float64)
			        cmvn = np.array([means, scales])
			        return cmvn

			def load_bytes(input):
			    """Convert raw 16-bit PCM bytes to a float32 array.

			    The buffer is read as ``int16`` and each sample is divided by 32768.

			    Args:
			        input: Bytes-like object holding int16 samples.

			    Returns:
			        1-D ``float32`` array with one value per sample.
			    """
			    pcm = np.frombuffer(input, dtype=np.int16)
			    info = np.iinfo(pcm.dtype)
			    abs_max = 2 ** (info.bits - 1)
			    # offset is 0 for signed types; kept so the formula stays general.
			    offset = info.min + abs_max
			    array = (pcm.astype(np.float32) - offset) / abs_max
			    return array


			def test():
			    """Manual smoke test: compute features for one example wav.

			    Relies on hard-coded local paths and is only run when this module is
			    executed as a script.
			    """
			    path = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav"
			    import librosa
			    cmvn_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn"
			    config_file = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml"
			    # NOTE(review): this import path names the rapid_paraformer package —
			    # confirm it still exists.
			    from funasr.runtime.python.onnxruntime.rapid_paraformer.utils.utils import read_yaml
			    config = read_yaml(config_file)
			    waveform, _ = librosa.load(path, sr=None)
			    frontend = WavFrontend(
			        cmvn_file=cmvn_file,
			        **config['frontend_conf'],
			    )
			    speech, _ = frontend.fbank_online(waveform)  # 1d, (sample,), numpy
			    feat, feat_len = frontend.lfr_cmvn(speech)  # 2d, (frame, 450), np.float32 -> torch, torch.from_numpy(), dtype, (1, frame, 450)

			    frontend.reset_status()  # clear cache
			    return feat, feat_len

			if __name__ == '__main__':
			    test()

New file
			@@ -0,0 +1,240 @@
			# Copyright (c) Alibaba, Inc. and its affiliates.

			import string
			import logging
			from typing import Any, List, Union


			def isChinese(ch: str):
			    """Return True if ``ch`` lies in U+4E00..U+9FFF or is an ASCII digit '0'..'9'.

			    The comparison is a plain string comparison, so it is meant for
			    single-character input.
			    """
			    return '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039'


			def isAllChinese(word: Union[List[Any], str]):
			    """Return True if every item of ``word`` passes ``isChinese``.

			    Spaces and the markers ``</s>`` / ``<s>`` are stripped from each
			    item first. An empty ``word`` yields False.
			    """
			    cleaned = [
			        item.replace(' ', '').replace('</s>', '').replace('<s>', '')
			        for item in word
			    ]
			    if len(cleaned) == 0:
			        return False

			    for ch in cleaned:
			        if isChinese(ch) is False:
			            return False
			    return True


			def isAllAlpha(word: Union[List[Any], str]):
			word_lists = []
			for i in word:
			cur = i.replace(' ', '')
			cur = cur.replace('</s>', '')
			cur = cur.replace('<s>', '')
			word_lists.append(cur)

			if len(word_lists) == 0:
			return False

			for ch in word_lists:
			if ch.isalpha() is False and ch != "'":
			return False
			elif ch.isalpha() is True and isChinese(ch) is True:
			return False

			return True


			# def abbr_dispose(words: List[Any]) -> List[Any]:
			def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
			    """Merge spelled-out abbreviations ("a b c") into upper-case letters.

			    A run of single ASCII letters separated by single ``' '`` entries is
			    treated as an abbreviation: its letters are upper-cased and the
			    separating spaces are dropped. All other entries are copied through.

			    Args:
			        words: Token list in which ``' '`` entries separate words.
			        time_stamp: Optional ``[begin, end]`` pairs, one per non-space
			            entry of ``words``.

			    Returns:
			        The processed word list, or ``(word_list, ts_list)`` when
			        ``time_stamp`` is given; an abbreviation gets one pair spanning
			        its first letter's begin to its last letter's end.
			    """
			    words_size = len(words)
			    word_lists = []
			    abbr_begin = []
			    abbr_end = []
			    last_num = -1
			    ts_lists = []
			    ts_nums = []
			    ts_index = 0

			    def is_single_letter(idx):
			        # True when idx is in range and words[idx] is one ASCII letter.
			        return (idx < words_size and len(words[idx]) == 1
			                and words[idx].encode('utf-8').isalpha())

			    # Pass 1: locate begin/end indices of every abbreviation.
			    for num in range(words_size):
			        if num <= last_num:
			            continue

			        if is_single_letter(num):
			            if num + 1 < words_size and words[num + 1] == ' ' and is_single_letter(num + 2):
			                # found the begin of abbr
			                abbr_begin.append(num)
			                num += 2
			                abbr_end.append(num)
			                # to find the end of abbr
			                while True:
			                    num += 1
			                    if num < words_size and words[num] == ' ':
			                        num += 1
			                        if is_single_letter(num):
			                            abbr_end.pop()
			                            abbr_end.append(num)
			                            last_num = num
			                        else:
			                            break
			                    else:
			                        break

			    # Pass 2: map each word index to the index of its timestamp; a space
			    # shares the index of the entry that follows it.
			    for num in range(words_size):
			        ts_nums.append(ts_index)
			        if words[num] != ' ':
			            ts_index += 1

			    # Pass 3: emit the words, collapsing the abbreviations found above.
			    begin_set = set(abbr_begin)
			    end_set = set(abbr_end)
			    last_num = -1
			    for num in range(words_size):
			        if num <= last_num:
			            continue

			        if num in begin_set:
			            if time_stamp is not None:
			                begin = time_stamp[ts_nums[num]][0]
			            word_lists.append(words[num].upper())
			            num += 1
			            while num < words_size:
			                if num in end_set:
			                    word_lists.append(words[num].upper())
			                    last_num = num
			                    break
			                else:
			                    if words[num].encode('utf-8').isalpha():
			                        word_lists.append(words[num].upper())
			                num += 1
			            if time_stamp is not None:
			                end = time_stamp[ts_nums[num]][1]
			                ts_lists.append([begin, end])
			        else:
			            word_lists.append(words[num])
			            if time_stamp is not None and words[num] != ' ':
			                begin = time_stamp[ts_nums[num]][0]
			                end = time_stamp[ts_nums[num]][1]
			                ts_lists.append([begin, end])
			                begin = end

			    if time_stamp is not None:
			        return word_lists, ts_lists
			    else:
			        return word_lists


			def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
			    """Turn decoded tokens into a sentence, optionally merging timestamps.

			    ``<s>``, ``</s>`` and ``<unk>`` are dropped. Tokens ending a BPE piece
			    with ``@@`` are joined to the following token; abbreviations are
			    collapsed via ``abbr_dispose``.

			    Args:
			        words: Tokens as ``str`` or UTF-8 ``bytes``.
			        time_stamp: Optional ``[begin, end]`` pairs aligned with the
			            tokens kept after filtering.

			    Returns:
			        ``(sentence, ts_list, word_list)`` when ``time_stamp`` is given
			        (sentence joined with spaces), otherwise ``(sentence, word_list)``.

			    Raises:
			        ValueError: In mixed text, for a token that is neither Chinese,
			            an ``@@`` piece, nor alphabetic.
			    """
			    middle_lists = []
			    word_lists = []
			    word_item = ''
			    ts_lists = []

			    # wash words lists
			    for i in words:
			        if isinstance(i, str):
			            word = i
			        else:
			            word = i.decode('utf-8')

			        if word in ['<s>', '</s>', '<unk>']:
			            continue
			        else:
			            middle_lists.append(word)

			    # all chinese characters
			    if isAllChinese(middle_lists):
			        for i, ch in enumerate(middle_lists):
			            word_lists.append(ch.replace(' ', ''))
			        if time_stamp is not None:
			            ts_lists = time_stamp

			    # all alpha characters
			    elif isAllAlpha(middle_lists):
			        # ts_flag is True when the next token starts a new word, i.e.
			        # when `begin` has to be re-read from the timestamps.
			        ts_flag = True
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if '@@' in ch:
			                word = ch.replace('@@', '')
			                word_item += word
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            else:
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end

			    # mix characters
			    else:
			        # alpha_blank is True when the last appended entry is the ' '
			        # that follows an alphabetic word.
			        alpha_blank = False
			        ts_flag = True
			        begin = -1
			        end = -1
			        for i, ch in enumerate(middle_lists):
			            if ts_flag and time_stamp is not None:
			                begin = time_stamp[i][0]
			                end = time_stamp[i][1]
			            if isAllChinese(ch):
			                if alpha_blank is True:
			                    # Drop the blank between an alphabetic word and a
			                    # Chinese character.
			                    word_lists.pop()
			                word_lists.append(ch)
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = True
			                    ts_lists.append([begin, end])
			                    begin = end
			            elif '@@' in ch:
			                word = ch.replace('@@', '')
			                word_item += word
			                alpha_blank = False
			                if time_stamp is not None:
			                    ts_flag = False
			                    end = time_stamp[i][1]
			            elif isAllAlpha(ch):
			                word_item += ch
			                word_lists.append(word_item)
			                word_lists.append(' ')
			                word_item = ''
			                alpha_blank = True
			                if time_stamp is not None:
			                    ts_flag = True
			                    end = time_stamp[i][1]
			                    ts_lists.append([begin, end])
			                    begin = end
			            else:
			                raise ValueError('invalid character: {}'.format(ch))

			    if time_stamp is not None:
			        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
			        real_word_lists = []
			        for ch in word_lists:
			            if ch != ' ':
			                real_word_lists.append(ch)
			        sentence = ' '.join(real_word_lists).strip()
			        return sentence, ts_lists, real_word_lists
			    else:
			        word_lists = abbr_dispose(word_lists)
			        real_word_lists = []
			        for ch in word_lists:
			            if ch != ' ':
			                real_word_lists.append(ch)
			        sentence = ''.join(word_lists).strip()
			        return sentence, real_word_lists

New file
			@@ -0,0 +1,59 @@
			import numpy as np


			def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
			    """Derive per-token timestamps from upsampled CIF peak weights.

			    Args:
			        us_cif_peak: numpy array of peak weights; it is flattened, and every
			            frame whose weight exceeds ``1.0 - 1e-4`` counts as a fire.
			        char_list: Tokens to align; a trailing ``'</s>'`` is ignored.
			        begin_time: Offset in milliseconds added to every timestamp
			            (skipped when falsy).
			        total_offset: Frame offset added to every fire position.

			    Returns:
			        ``[]`` when ``char_list`` is empty. Otherwise ``(res_str, res)``:
			        ``res_str`` is ``"<token> <begin> <end>;"`` concatenated for every
			        token including inserted ``<sil>`` entries (times in seconds), and
			        ``res`` is a list of ``[begin_ms, end_ms]`` integer pairs for the
			        non-``<sil>`` tokens.

			    Raises:
			        AssertionError: If the number of fires is not ``len(char_list) + 1``.
			    """
			    if not len(char_list):
			        # NOTE: this early return is a bare list, not the 2-tuple returned below.
			        return []
			    START_END_THRESHOLD = 5
			    MAX_TOKEN_DURATION = 30
			    TIME_RATE = 10.0 * 6 / 1000 / 3  # 3 times upsampled
			    cif_peak = us_cif_peak.reshape(-1)
			    num_frames = cif_peak.shape[-1]
			    if char_list[-1] == '</s>':
			        char_list = char_list[:-1]
			    timestamp_list = []
			    new_char_list = []
			    # for bicif model trained with large data, cif2 actually fires when a character starts
			    # so treat the frames between two peaks as the duration of the former token
			    fire_place = np.where(cif_peak > 1.0 - 1e-4)[0] + total_offset  # np format
			    num_peak = len(fire_place)
			    assert num_peak == len(char_list) + 1  # number of peaks is supposed to be number of tokens + 1
			    # begin silence
			    if fire_place[0] > START_END_THRESHOLD:
			        timestamp_list.append([0.0, fire_place[0] * TIME_RATE])
			        new_char_list.append('<sil>')
			    # tokens timestamp
			    last_token = num_peak - 2
			    for i in range(num_peak - 1):
			        new_char_list.append(char_list[i])
			        gap = fire_place[i + 1] - fire_place[i]
			        if i == last_token or MAX_TOKEN_DURATION < 0 or gap < MAX_TOKEN_DURATION:
			            timestamp_list.append([fire_place[i] * TIME_RATE, fire_place[i + 1] * TIME_RATE])
			        else:
			            # cut the duration to token and sil of the 0-weight frames last long
			            _split = fire_place[i] + MAX_TOKEN_DURATION
			            timestamp_list.append([fire_place[i] * TIME_RATE, _split * TIME_RATE])
			            timestamp_list.append([_split * TIME_RATE, fire_place[i + 1] * TIME_RATE])
			            new_char_list.append('<sil>')
			    # tail token and end silence
			    if num_frames - fire_place[-1] > START_END_THRESHOLD:
			        _end = (num_frames + fire_place[-1]) / 2
			        timestamp_list[-1][1] = _end * TIME_RATE
			        timestamp_list.append([_end * TIME_RATE, num_frames * TIME_RATE])
			        new_char_list.append("<sil>")
			    else:
			        timestamp_list[-1][1] = num_frames * TIME_RATE
			    if begin_time:  # add offset time in model with vad
			        shift = begin_time / 1000.0
			        for stamp in timestamp_list:
			            stamp[0] = stamp[0] + shift
			            stamp[1] = stamp[1] + shift
			    assert len(new_char_list) == len(timestamp_list)
			    res_str = "".join(
			        "{} {} {};".format(char, timestamp[0], timestamp[1])
			        for char, timestamp in zip(new_char_list, timestamp_list)
			    )
			    res = [
			        [int(timestamp[0] * 1000), int(timestamp[1] * 1000)]
			        for char, timestamp in zip(new_char_list, timestamp_list)
			        if char != '<sil>'
			    ]
			    return res_str, res

New file
			@@ -0,0 +1,257 @@
			# -*- encoding: utf-8 -*-

			import functools
			import logging
			import pickle
			from pathlib import Path
			from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union

			import numpy as np
			import yaml
			from onnxruntime import (GraphOptimizationLevel, InferenceSession,
			SessionOptions, get_available_providers, get_device)
			from typeguard import check_argument_types

			import warnings

			root_dir = Path(__file__).resolve().parent

			logger_initialized = {}


			class TokenIDConverter():
			    """Map between integer ids and tokens using an in-memory token list.

			    The id of a token is its index in ``token_list``; the last entry of the
			    list is used as the unknown symbol.
			    """

			    def __init__(self, token_list: Union[List, str],
			                 ):
			        check_argument_types()

			        self.token_list = token_list
			        self.unk_symbol = token_list[-1]

			    def get_num_vocabulary_size(self) -> int:
			        """Return the number of entries in the token list."""
			        return len(self.token_list)

			    def ids2tokens(self,
			                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
			        """Convert ids to tokens.

			        Raises:
			            TokenIDConverterError: If ``integers`` is an ndarray that is not
			                one-dimensional.
			        """
			        if isinstance(integers, np.ndarray) and integers.ndim != 1:
			            raise TokenIDConverterError(
			                f"Must be 1 dim ndarray, but got {integers.ndim}")
			        return [self.token_list[i] for i in integers]

			    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
			        """Convert tokens to ids; tokens not in the list map to the unknown id.

			        Raises:
			            TokenIDConverterError: If the unknown symbol is absent from the
			                token list.
			        """
			        # The lookup table is rebuilt on every call so it always reflects the
			        # current contents of self.token_list.
			        token2id = {v: i for i, v in enumerate(self.token_list)}
			        if self.unk_symbol not in token2id:
			            raise TokenIDConverterError(
			                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
			            )
			        unk_id = token2id[self.unk_symbol]
			        return [token2id.get(i, unk_id) for i in tokens]


			class CharTokenizer():
			    """Character-level tokenizer with optional non-linguistic symbols.

			    Args:
			        symbol_value: ``None``, a path (``str``/``Path``) to a file listing one
			            symbol per line, or an iterable of symbol strings.
			        space_symbol: Token that stands for a blank character.
			        remove_non_linguistic_symbols: When true, matched non-linguistic
			            symbols are dropped from the output of ``text2tokens``.
			    """

			    def __init__(
			        self,
			        symbol_value: Union[Path, str, Iterable[str]] = None,
			        space_symbol: str = "<space>",
			        remove_non_linguistic_symbols: bool = False,
			    ):
			        check_argument_types()

			        self.space_symbol = space_symbol
			        self.non_linguistic_symbols = self.load_symbols(symbol_value)
			        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols

			    @staticmethod
			    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
			        """Return the set of non-linguistic symbols described by ``value``.

			        A missing file is reported with a warning and yields an empty set.
			        """
			        if value is None:
			            return set()

			        # isinstance() cannot be used with a subscripted generic such as
			        # Iterable[str] (it raises TypeError), and a plain Iterable check would
			        # also match str paths. Treat str/Path as a file, anything else as an
			        # iterable of symbols.
			        if not isinstance(value, (str, Path)):
			            return set(value)

			        file_path = Path(value)
			        if not file_path.exists():
			            logging.warning("%s doesn't exist.", file_path)
			            return set()

			        with file_path.open("r", encoding="utf-8") as f:
			            return set(line.rstrip() for line in f)

			    def text2tokens(self, line: Union[str, list]) -> List[str]:
			        """Split ``line`` into characters, keeping known symbols whole."""
			        tokens = []
			        while len(line) != 0:
			            for w in self.non_linguistic_symbols:
			                if line.startswith(w):
			                    if not self.remove_non_linguistic_symbols:
			                        tokens.append(line[: len(w)])
			                    line = line[len(w):]
			                    break
			            else:
			                # No symbol matched at this position: consume one character.
			                t = line[0]
			                if t == " ":
			                    # Use the configured symbol so tokens2text() can invert it.
			                    t = self.space_symbol
			                tokens.append(t)
			                line = line[1:]
			        return tokens

			    def tokens2text(self, tokens: Iterable[str]) -> str:
			        """Join tokens into text, mapping the space symbol back to a blank."""
			        tokens = [t if t != self.space_symbol else " " for t in tokens]
			        return "".join(tokens)

			    def __repr__(self):
			        return (
			            f"{self.__class__.__name__}("
			            f'space_symbol="{self.space_symbol}", '
			            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
			            f")"
			        )



			class Hypothesis(NamedTuple):
			    """Hypothesis data type."""

			    # Token id sequence of the hypothesis.
			    yseq: np.ndarray
			    # Total score.
			    score: Union[float, np.ndarray] = 0
			    # Named partial scores. NOTE: the default dict object is shared by every
			    # instance that does not pass its own, so it must not be mutated in place.
			    scores: Dict[str, Union[float, np.ndarray]] = dict()
			    # Named states; same shared-default caveat as ``scores``.
			    states: Dict[str, Any] = dict()

			    def asdict(self) -> dict:
			        """Convert data to JSON-friendly dict."""
			        return self._replace(
			            yseq=self.yseq.tolist(),
			            score=float(self.score),
			            scores={k: float(v) for k, v in self.scores.items()},
			        )._asdict()


			class TokenIDConverterError(Exception):
			    """Raised by TokenIDConverter on invalid ids or a missing unknown symbol."""


			class ONNXRuntimeError(Exception):
			    """Raised when an ONNX Runtime inference call fails."""


			class OrtInferSession():
			    """Thin wrapper around an onnxruntime ``InferenceSession``.

			    Args:
			        model_file: Path to the ONNX model file.
			        device_id: GPU id; ``-1`` (int or str) selects CPU only.
			        intra_op_num_threads: Value for ``SessionOptions.intra_op_num_threads``.

			    Raises:
			        FileNotFoundError: If ``model_file`` does not exist.
			        FileExistsError: If ``model_file`` exists but is not a regular file.
			    """

			    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
			        device_id = str(device_id)
			        sess_opt = SessionOptions()
			        sess_opt.intra_op_num_threads = intra_op_num_threads
			        sess_opt.log_severity_level = 4
			        sess_opt.enable_cpu_mem_arena = False
			        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

			        cuda_ep = 'CUDAExecutionProvider'
			        cuda_provider_options = {
			            "device_id": device_id,
			            "arena_extend_strategy": "kNextPowerOfTwo",
			            "cudnn_conv_algo_search": "EXHAUSTIVE",
			            "do_copy_in_default_stream": "true",
			        }
			        cpu_ep = 'CPUExecutionProvider'
			        cpu_provider_options = {
			            "arena_extend_strategy": "kSameAsRequested",
			        }

			        # CUDA is requested first when a GPU id is given and available; the CPU
			        # provider is always appended as a fallback.
			        EP_list = []
			        if device_id != "-1" and get_device() == 'GPU' \
			                and cuda_ep in get_available_providers():
			            EP_list = [(cuda_ep, cuda_provider_options)]
			        EP_list.append((cpu_ep, cpu_provider_options))

			        self._verify_model(model_file)
			        self.session = InferenceSession(model_file,
			                                        sess_options=sess_opt,
			                                        providers=EP_list)

			        if device_id != "-1" and cuda_ep not in self.session.get_providers():
			            warnings.warn(f'{cuda_ep} is not avaiable for current env, the inference part is automatically shifted to be executed under {cpu_ep}.\n'
			                          'Please ensure the installed onnxruntime-gpu version matches your cuda and cudnn version, '
			                          'you can check their relations from the offical web site: '
			                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
			                          RuntimeWarning)

			    def __call__(self,
			                 input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
			        """Run the model; inputs are paired with input names in order.

			        Raises:
			            ONNXRuntimeError: If the underlying ``session.run`` call fails.
			        """
			        input_dict = dict(zip(self.get_input_names(), input_content))
			        try:
			            return self.session.run(None, input_dict)
			        except Exception as e:
			            raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e

			    def get_input_names(self, ):
			        """Return the names of the model inputs."""
			        return [v.name for v in self.session.get_inputs()]

			    def get_output_names(self,):
			        """Return the names of the model outputs."""
			        return [v.name for v in self.session.get_outputs()]

			    def _load_meta(self):
			        """Read the model's custom metadata map and store it on ``meta_dict``."""
			        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
			        return self.meta_dict

			    def get_character_list(self, key: str = 'character'):
			        """Return the metadata entry ``key`` split into lines.

			        The metadata is loaded on demand, so this works without a prior
			        ``have_key`` call. Raises ``KeyError`` when ``key`` is absent.
			        """
			        meta = getattr(self, 'meta_dict', None)
			        if meta is None:
			            meta = self._load_meta()
			        return meta[key].splitlines()

			    def have_key(self, key: str = 'character') -> bool:
			        """Return whether the model's custom metadata contains ``key``."""
			        return key in self._load_meta().keys()

			    @staticmethod
			    def _verify_model(model_path):
			        """Check that ``model_path`` exists and is a regular file."""
			        model_path = Path(model_path)
			        if not model_path.exists():
			            raise FileNotFoundError(f'{model_path} does not exists.')
			        if not model_path.is_file():
			            raise FileExistsError(f'{model_path} is not a file.')


			def read_yaml(yaml_path: Union[str, Path]) -> Dict:
			    """Load a YAML file and return its parsed content.

			    Raises:
			        FileExistsError: If ``yaml_path`` does not exist (the exception type is
			            kept as-is for callers that already catch it).
			    """
			    if not Path(yaml_path).exists():
			        raise FileExistsError(f'The {yaml_path} does not exist.')

			    with open(str(yaml_path), 'rb') as f:
			        # SECURITY NOTE: yaml.Loader can construct arbitrary Python objects.
			        # Only use this on trusted config files; consider yaml.SafeLoader if
			        # the configs never rely on Python-specific tags.
			        data = yaml.load(f, Loader=yaml.Loader)
			    return data


			@functools.lru_cache()
			def get_logger(name='rapdi_paraformer'):
			    """Initialize and get a logger by name.

			    If the logger (or a logger whose name is a prefix of ``name``) has already
			    been initialized by this function, the logger is returned unchanged.
			    Otherwise a single StreamHandler with a timestamped format is attached,
			    propagation to ancestor loggers is disabled, and the name is recorded in
			    the module-level ``logger_initialized`` registry.

			    Args:
			        name (str): Logger name.
			    Returns:
			        logging.Logger: The expected logger.
			    """
			    logger = logging.getLogger(name)
			    if name in logger_initialized:
			        return logger

			    # A logger whose name extends an initialized one is left untouched.
			    if any(name.startswith(logger_name) for logger_name in logger_initialized):
			        return logger

			    formatter = logging.Formatter(
			        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
			        datefmt="%Y/%m/%d %H:%M:%S")

			    sh = logging.StreamHandler()
			    sh.setFormatter(formatter)
			    logger.addHandler(sh)
			    logger_initialized[name] = True
			    logger.propagate = False
			    return logger

New file
			@@ -0,0 +1,166 @@
			# -*- encoding: utf-8 -*-

			import os.path
			from pathlib import Path
			from typing import List, Union, Tuple

			import copy
			import librosa
			import numpy as np

			from .utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
			OrtInferSession, TokenIDConverter, get_logger,
			read_yaml)
			from .utils.postprocess_utils import sentence_postprocess
			from .utils.frontend import WavFrontend
			from .utils.timestamp_utils import time_stamp_lfr6_onnx
			from .utils.e2e_vad import E2EVadModel

			logging = get_logger()


			class Fsmn_vad():
			    """ONNX Runtime wrapper for an FSMN voice-activity-detection model.

			    Args:
			        model_dir: Directory holding ``model.onnx`` (or ``model_quant.onnx``),
			            ``vad.yaml`` and ``vad.mvn``.
			        batch_size: Number of waveforms processed per loop iteration.
			        device_id: GPU id, ``"-1"`` for CPU.
			        quantize: Load ``model_quant.onnx`` instead of ``model.onnx``.
			        intra_op_num_threads: Forwarded to ``OrtInferSession``.
			        max_end_sil: Forwarded to the VAD scorer on every call.

			    Raises:
			        FileNotFoundError: If ``model_dir`` does not exist.
			    """

			    def __init__(self, model_dir: Union[str, Path] = None,
			                 batch_size: int = 1,
			                 device_id: Union[str, int] = "-1",
			                 quantize: bool = False,
			                 intra_op_num_threads: int = 4,
			                 max_end_sil: int = 800,
			                 ):

			        if not Path(model_dir).exists():
			            raise FileNotFoundError(f'{model_dir} does not exist.')

			        model_file = os.path.join(model_dir, 'model.onnx')
			        if quantize:
			            model_file = os.path.join(model_dir, 'model_quant.onnx')
			        config_file = os.path.join(model_dir, 'vad.yaml')
			        cmvn_file = os.path.join(model_dir, 'vad.mvn')
			        config = read_yaml(config_file)

			        self.frontend = WavFrontend(
			            cmvn_file=cmvn_file,
			            **config['frontend_conf']
			        )
			        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
			        self.batch_size = batch_size
			        self.vad_scorer = E2EVadModel(**config)
			        self.max_end_sil = max_end_sil

			    def prepare_cache(self, in_cache: list = None):
			        """Return ``in_cache`` if non-empty, else fill it with four zero caches.

			        A fresh list is created when no list is passed, so state is never
			        shared between calls through a mutable default argument. A passed-in
			        empty list is filled in place.
			        """
			        if in_cache is None:
			            in_cache = []
			        if len(in_cache) > 0:
			            return in_cache

			        for _ in range(4):
			            # Zero initial state keeps the first inference deterministic.
			            in_cache.append(np.zeros((1, 128, 19, 1), dtype=np.float32))
			        return in_cache

			    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
			        """Run VAD on the given audio and return one result per batch.

			        Keyword Args:
			            is_final: Forwarded to the scorer (default ``False``). The legacy
			                key ``kwargs`` is still honoured for backward compatibility.
			            param_dict: Optional dict; its ``cache`` entry is used as the input
			                cache and overwritten with the output caches.
			        """
			        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
			        waveform_nums = len(waveform_list)
			        is_final = kwargs.get('is_final', kwargs.get('kwargs', False))

			        asr_res = []
			        for beg_idx in range(0, waveform_nums, self.batch_size):

			            end_idx = min(waveform_nums, beg_idx + self.batch_size)
			            waveform = waveform_list[beg_idx:end_idx]
			            feats, feats_len = self.extract_feat(waveform)
			            param_dict = kwargs.get('param_dict', dict())
			            in_cache = param_dict.get('cache', list())
			            in_cache = self.prepare_cache(in_cache)
			            try:
			                # The model takes the features followed by the cache tensors;
			                # the first output is the score, the rest are updated caches.
			                outputs = self.ort_infer([feats, *in_cache])
			                scores, out_caches = outputs[0], outputs[1:]
			                param_dict['cache'] = out_caches
			                # NOTE(review): the scorer's expected waveform layout is not
			                # visible here -- confirm against E2EVadModel.
			                segments = self.vad_scorer(scores, waveform, is_final=is_final, max_end_sil=self.max_end_sil)

			            except ONNXRuntimeError:
			                # logging.warning(traceback.format_exc())
			                logging.warning("input wav is silence or noise")
			                segments = ''
			            asr_res.append(segments)

			        return asr_res

			    def load_data(self,
			                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
			        """Return a list of waveforms for an array, a path or a list of paths.

			        Raises:
			            TypeError: If ``wav_content`` is none of the supported types.
			        """
			        def load_wav(path: str) -> np.ndarray:
			            waveform, _ = librosa.load(path, sr=fs)
			            return waveform

			        if isinstance(wav_content, np.ndarray):
			            return [wav_content]

			        if isinstance(wav_content, str):
			            return [load_wav(wav_content)]

			        if isinstance(wav_content, list):
			            return [load_wav(path) for path in wav_content]

			        raise TypeError(
			            f'The type of {wav_content} is not in [str, np.ndarray, list]')

			    def extract_feat(self,
			                     waveform_list: List[np.ndarray]
			                     ) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute frontend features per waveform and pad them to equal length."""
			        feats, feats_len = [], []
			        for waveform in waveform_list:
			            speech, _ = self.frontend.fbank(waveform)
			            feat, feat_len = self.frontend.lfr_cmvn(speech)
			            feats.append(feat)
			            feats_len.append(feat_len)

			        feats = self.pad_feats(feats, np.max(feats_len))
			        feats_len = np.array(feats_len).astype(np.int32)
			        return feats, feats_len

			    @staticmethod
			    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
			        """Zero-pad each 2-D feature along axis 0 and stack as float32."""
			        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
			            pad_width = ((0, max_feat_len - cur_len), (0, 0))
			            return np.pad(feat, pad_width, 'constant', constant_values=0)

			        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
			        feats = np.array(feat_res).astype(np.float32)
			        return feats

			    def infer(self, feats: np.ndarray,
			              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
			        """Run the session on ``[feats, feats_len]`` and return its raw outputs."""
			        outputs = self.ort_infer([feats, feats_len])
			        return outputs

			    # NOTE(review): decode/decode_one read self.converter and self.pred_bias,
			    # which this class never sets -- they look like leftovers from the ASR
			    # wrapper and will fail if called. Kept unchanged for interface stability.
			    def decode(self, am_scores: np.ndarray, token_nums: int) -> List[str]:
			        return [self.decode_one(am_score, token_num)
			                for am_score, token_num in zip(am_scores, token_nums)]

			    def decode_one(self,
			                   am_score: np.ndarray,
			                   valid_token_num: int) -> List[str]:
			        yseq = am_score.argmax(axis=-1)
			        score = am_score.max(axis=-1)
			        score = np.sum(score, axis=-1)

			        # pad with mask tokens to ensure compatibility with sos/eos tokens
			        # asr_model.sos:1  asr_model.eos:2
			        yseq = np.array([1] + yseq.tolist() + [2])
			        hyp = Hypothesis(yseq=yseq, score=score)

			        # remove sos/eos and get results
			        last_pos = -1
			        token_int = hyp.yseq[1:last_pos].tolist()

			        # remove blank symbol id, which is assumed to be 0
			        token_int = list(filter(lambda x: x not in (0, 2), token_int))

			        # Change integer-ids to tokens
			        token = self.converter.ids2tokens(token_int)
			        token = token[:valid_token_num - self.pred_bias]
			        return token

			@@ -1,8 +1,6 @@

			from funasr_onnx import Paraformer

			#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
			#model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
			model_dir = "/Users/shixian/code/funasr/export/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch"

			# if you use paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch, you should set pred_bias=0

New file
			@@ -0,0 +1,12 @@

			from funasr_onnx import Fsmn_vad

			# Demo: run the FSMN VAD model on the example recording shipped with it.
			# Both paths are machine-specific; point them at your local export.
			model_dir = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch"
			wav_path = "/Users/zhifu/Downloads/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"

			model = Fsmn_vad(model_dir)

			# Calling the model returns a list with one entry per processed batch.
			result = model(wav_path)
			print(result)

New file
			@@ -0,0 +1,80 @@
			Metadata-Version: 2.1
			Name: funasr-onnx
			Version: 0.0.3
			Summary: FunASR: A Fundamental End-to-End Speech Recognition Toolkit
			Home-page: https://github.com/alibaba-damo-academy/FunASR.git
			Author: Speech Lab, Alibaba Group, China
			Author-email: funasr@list.alibaba-inc.com
			License: MIT
			Keywords: funasr,asr
			Platform: Any
			Classifier: Programming Language :: Python :: 3.6
			Classifier: Programming Language :: Python :: 3.7
			Classifier: Programming Language :: Python :: 3.8
			Classifier: Programming Language :: Python :: 3.9
			Classifier: Programming Language :: Python :: 3.10
			Description-Content-Type: text/markdown

			## Using funasr with ONNXRuntime


			### Introduction
			- Model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).


			### Steps:
			1. Export the model.
			- Command: (`Tips`: torch >= 1.11.0 is required.)

			For more details, refer to the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export).

			- `e.g.`, Export model from modelscope
			```shell
			python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
			```
			- `e.g.`, Export model from local path; the model's name must be `model.pb`.
			```shell
			python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
			```


			2. Install the `funasr_onnx`

			install from pip
			```shell
			pip install --upgrade funasr_onnx -i https://pypi.Python.org/simple
			```

			or install from source code

			```shell
			git clone https://github.com/alibaba/FunASR.git && cd FunASR
			cd funasr/runtime/python/funasr_onnx
			python setup.py build
			python setup.py install
			```

			3. Run the demo.
			- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
			- Input: wav format file, supported input types: `str, np.ndarray, List[str]`
			- Output: `List[str]`: recognition result.
			- Example:
			```python
			from funasr_onnx import Paraformer

			model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
			model = Paraformer(model_dir, batch_size=1)

			wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']

			result = model(wav_path)
			print(result)
			```

			## Performance benchmark

			Please refer to the [benchmark](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/python/benchmark_onnx.md).

			## Acknowledge
			1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
			2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model).

New file
			@@ -0,0 +1,17 @@
			README.md
			setup.py
			funasr_onnx/__init__.py
			funasr_onnx/paraformer_bin.py
			funasr_onnx/punc_bin.py
			funasr_onnx/vad_bin.py
			funasr_onnx.egg-info/PKG-INFO
			funasr_onnx.egg-info/SOURCES.txt
			funasr_onnx.egg-info/dependency_links.txt
			funasr_onnx.egg-info/requires.txt
			funasr_onnx.egg-info/top_level.txt
			funasr_onnx/utils/__init__.py
			funasr_onnx/utils/e2e_vad.py
			funasr_onnx/utils/frontend.py
			funasr_onnx/utils/postprocess_utils.py
			funasr_onnx/utils/timestamp_utils.py
			funasr_onnx/utils/utils.py

New file
			@@ -0,0 +1,7 @@
			librosa
			onnxruntime>=1.7.0
			scipy
			numpy>=1.19.3
			typeguard
			kaldi-native-fbank
			PyYAML>=5.1.2

			@@ -1,2 +1,3 @@
			# -*- encoding: utf-8 -*-
			from .paraformer_bin import Paraformer
			from .vad_bin import Fsmn_vad

			@@ -188,7 +188,7 @@
			input_content: List[Union[np.ndarray, np.ndarray]]) -> np.ndarray:
			input_dict = dict(zip(self.get_input_names(), input_content))
			try:
			return self.session.run(None, input_dict)
			return self.session.run(self.get_output_names(), input_dict)
			except Exception as e:
			raise ONNXRuntimeError('ONNXRuntime inferece failed.') from e

New file
			@@ -0,0 +1,134 @@
			# -*- encoding: utf-8 -*-

			import os.path
			from pathlib import Path
			from typing import List, Union, Tuple

			import copy
			import librosa
			import numpy as np

			from .utils.utils import (ONNXRuntimeError,
			OrtInferSession, get_logger,
			read_yaml)
			from .utils.frontend import WavFrontend
			from .utils.e2e_vad import E2EVadModel

			logging = get_logger()


			class Fsmn_vad():
			    """ONNX Runtime wrapper for an FSMN voice-activity-detection model.

			    Args:
			        model_dir: Directory holding ``model.onnx`` (or ``model_quant.onnx``),
			            ``vad.yaml`` and ``vad.mvn``.
			        batch_size: Number of waveforms processed per loop iteration.
			        device_id: GPU id, ``"-1"`` for CPU.
			        quantize: Load ``model_quant.onnx`` instead of ``model.onnx``.
			        intra_op_num_threads: Forwarded to ``OrtInferSession``.
			        max_end_sil: Forwarded to the VAD scorer on every call.

			    Raises:
			        FileNotFoundError: If ``model_dir`` does not exist.
			    """

			    def __init__(self, model_dir: Union[str, Path] = None,
			                 batch_size: int = 1,
			                 device_id: Union[str, int] = "-1",
			                 quantize: bool = False,
			                 intra_op_num_threads: int = 4,
			                 max_end_sil: int = 800,
			                 ):

			        if not Path(model_dir).exists():
			            raise FileNotFoundError(f'{model_dir} does not exist.')

			        model_file = os.path.join(model_dir, 'model.onnx')
			        if quantize:
			            model_file = os.path.join(model_dir, 'model_quant.onnx')
			        config_file = os.path.join(model_dir, 'vad.yaml')
			        cmvn_file = os.path.join(model_dir, 'vad.mvn')
			        config = read_yaml(config_file)

			        self.frontend = WavFrontend(
			            cmvn_file=cmvn_file,
			            **config['frontend_conf']
			        )
			        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
			        self.batch_size = batch_size
			        self.vad_scorer = E2EVadModel(config["vad_post_conf"])
			        self.max_end_sil = max_end_sil

			    def prepare_cache(self, in_cache: list = None):
			        """Return ``in_cache`` if non-empty, else fill it with four zero caches.

			        A fresh list is created when no list is passed, so state is never
			        shared between calls through a mutable default argument. A passed-in
			        empty list is filled in place.
			        """
			        if in_cache is None:
			            in_cache = []
			        if len(in_cache) > 0:
			            return in_cache

			        for _ in range(4):
			            # Zero initial state keeps the first inference deterministic.
			            in_cache.append(np.zeros((1, 128, 19, 1), dtype=np.float32))
			        return in_cache

			    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], **kwargs) -> List:
			        """Run VAD on the given audio and return one result per batch.

			        Keyword Args:
			            is_final: Forwarded to the scorer (default ``False``). The legacy
			                key ``kwargs`` is still honoured for backward compatibility.
			            param_dict: Optional dict; its ``cache`` entry is used as the input
			                cache and overwritten with the output caches.
			        """
			        waveform_list = self.load_data(wav_content, self.frontend.opts.frame_opts.samp_freq)
			        waveform_nums = len(waveform_list)
			        is_final = kwargs.get('is_final', kwargs.get('kwargs', False))

			        asr_res = []
			        for beg_idx in range(0, waveform_nums, self.batch_size):

			            end_idx = min(waveform_nums, beg_idx + self.batch_size)
			            waveform = waveform_list[beg_idx:end_idx]
			            feats, feats_len = self.extract_feat(waveform)
			            param_dict = kwargs.get('param_dict', dict())
			            in_cache = param_dict.get('cache', list())
			            in_cache = self.prepare_cache(in_cache)
			            try:
			                # Model inputs: the features followed by the cache tensors.
			                inputs = [feats]
			                inputs.extend(in_cache)
			                scores, out_caches = self.infer(inputs)
			                param_dict['cache'] = out_caches
			                # Only the first waveform of the batch is handed to the scorer,
			                # with a leading axis added.
			                segments = self.vad_scorer(scores, waveform[0][None, :], is_final=is_final, max_end_sil=self.max_end_sil)

			            except ONNXRuntimeError:
			                # logging.warning(traceback.format_exc())
			                logging.warning("input wav is silence or noise")
			                segments = ''
			            asr_res.append(segments)

			        return asr_res

			    def load_data(self,
			                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
			        """Return a list of waveforms for an array, a path or a list of paths.

			        Raises:
			            TypeError: If ``wav_content`` is none of the supported types.
			        """
			        def load_wav(path: str) -> np.ndarray:
			            waveform, _ = librosa.load(path, sr=fs)
			            return waveform

			        if isinstance(wav_content, np.ndarray):
			            return [wav_content]

			        if isinstance(wav_content, str):
			            return [load_wav(wav_content)]

			        if isinstance(wav_content, list):
			            return [load_wav(path) for path in wav_content]

			        raise TypeError(
			            f'The type of {wav_content} is not in [str, np.ndarray, list]')

			    def extract_feat(self,
			                     waveform_list: List[np.ndarray]
			                     ) -> Tuple[np.ndarray, np.ndarray]:
			        """Compute frontend features per waveform and pad them to equal length."""
			        feats, feats_len = [], []
			        for waveform in waveform_list:
			            speech, _ = self.frontend.fbank(waveform)
			            feat, feat_len = self.frontend.lfr_cmvn(speech)
			            feats.append(feat)
			            feats_len.append(feat_len)

			        feats = self.pad_feats(feats, np.max(feats_len))
			        feats_len = np.array(feats_len).astype(np.int32)
			        return feats, feats_len

			    @staticmethod
			    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
			        """Zero-pad each 2-D feature along axis 0 and stack as float32."""
			        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
			            pad_width = ((0, max_feat_len - cur_len), (0, 0))
			            return np.pad(feat, pad_width, 'constant', constant_values=0)

			        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
			        feats = np.array(feat_res).astype(np.float32)
			        return feats

			    def infer(self, feats: List) -> Tuple[np.ndarray, np.ndarray]:
			        """Run the session; return the first output and the remaining outputs."""
			        outputs = self.ort_infer(feats)
			        scores, out_caches = outputs[0], outputs[1:]
			        return scores, out_caches

			@@ -13,7 +13,7 @@


			MODULE_NAME = 'funasr_onnx'
			VERSION_NUM = '0.0.2'
			VERSION_NUM = '0.0.3'

			setuptools.setup(
			name=MODULE_NAME,