Merge pull request #506 from alibaba-damo-academy/main
update dev_lyh
| | |
| | | |—— Train_Ali_far |
| | | |—— Train_Ali_near |
| | | ``` |
| | | There are 18 stages in `run.sh`: |
| | | There are 16 stages in `run.sh`: |
| | | ```shell |
| | | stage 1 - 5: Data preparation and processing. |
| | | stage 6: Generate speaker profiles (Stage 6 takes a lot of time). |
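| | | # e.g., to run only the data preparation stages, set stage=1 and stop_stage=5 in the variables below |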
| | |
| | | ngpu=4 |
| | | device="0,1,2,3" |
| | | |
| | | stage=12 |
| | | stop_stage=13 |
| | | stage=1 |
| | | stop_stage=16 |
| | | |
| | | |
| | | train_set=Train_Ali_far |
| | |
| | | asr_config=conf/train_asr_conformer.yaml |
| | | sa_asr_config=conf/train_sa_asr_conformer.yaml |
| | | inference_config=conf/decode_asr_rnn.yaml |
| | | infer_with_pretrained_model=true |
| | | infer_with_pretrained_model=false |
| | | download_sa_asr_model="damo/speech_saasr_asr-zh-cn-16k-alimeeting" |
| | | |
| | | lm_config=conf/train_lm_transformer.yaml |
| | |
| | | |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch') |
| | | |
| | | rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') |
| | | model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', |
| | | batch_size=64, |
| | | ) |
| | | audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' |
| | | rec_result = inference_pipeline(audio_in=audio_in) |
| | | print(rec_result) |
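| | | # The pipeline result is a dict; the transcript text can be read from its 'text' field |
| | | # ('text' key is an assumption based on typical ModelScope ASR pipeline output): |
| | | print(rec_result['text']) |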
| | |
| | | from modelscope.utils.constant import Tasks |
| | | |
| | | if __name__ == '__main__': |
| | | audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav' |
| | | audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav' |
| | | output_dir = None |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch', |
| | | vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch', |
| | | punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch', |
| | | output_dir=output_dir |
| | | output_dir=output_dir, |
| | | batch_size=64, |
| | | ) |
| | | rec_result = inference_pipeline(audio_in=audio_in) |
| | | print(rec_result) |
| | |
| | | ../../TEMPLATE/README.md |
| | | ../TEMPLATE/README.md |
| | |
| | | ../../TEMPLATE/infer.py |
| | | ../TEMPLATE/infer.py |
| | |
| | | ../../TEMPLATE/infer.sh |
| | | ../TEMPLATE/infer.sh |
| | |
| | | elif mode == "uniasr": |
| | | from funasr.bin.asr_inference_uniasr import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "uniasr_vad": |
| | | from funasr.bin.asr_inference_uniasr_vad import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "paraformer": |
| | | from funasr.bin.asr_inference_paraformer import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "paraformer_streaming": |
| | | from funasr.bin.asr_inference_paraformer_streaming import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "paraformer_vad": |
| | | from funasr.bin.asr_inference_paraformer_vad import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "paraformer_punc": |
| | | logging.info("Unknown decoding mode: {}".format(mode)) |
| | | return None |
| | | elif mode == "paraformer_vad_punc": |
| | | from funasr.bin.asr_inference_paraformer_vad_punc import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode == "vad": |
| | | from funasr.bin.vad_inference import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | | elif mode.startswith("paraformer_vad"): |
| | | from funasr.bin.asr_inference_paraformer import inference_modelscope_vad_punc |
| | | return inference_modelscope_vad_punc(**kwargs) |
| | | elif mode == "mfcca": |
| | | from funasr.bin.asr_inference_mfcca import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | |
| | | from funasr.bin.asr_inference_uniasr import inference |
| | | return inference(**kwargs) |
| | | elif mode == "paraformer": |
| | | from funasr.bin.asr_inference_paraformer import inference |
| | | return inference(**kwargs) |
| | | elif mode == "paraformer_vad_punc": |
| | | from funasr.bin.asr_inference_paraformer_vad_punc import inference |
| | | return inference(**kwargs) |
| | | elif mode == "vad": |
| | | from funasr.bin.vad_inference import inference |
| | | return inference(**kwargs) |
| | | from funasr.bin.asr_inference_paraformer import inference_modelscope |
| | | inference_pipeline = inference_modelscope(**kwargs) |
| | | return inference_pipeline(kwargs["data_path_and_name_and_type"], hotword=kwargs.get("hotword", None)) |
| | | elif mode.startswith("paraformer_vad"): |
| | | from funasr.bin.asr_inference_paraformer import inference_modelscope_vad_punc |
| | | inference_pipeline = inference_modelscope_vad_punc(**kwargs) |
| | | return inference_pipeline(kwargs["data_path_and_name_and_type"], hotword=kwargs.get("hotword", None)) |
| | | elif mode == "mfcca": |
| | | from funasr.bin.asr_inference_mfcca import inference_modelscope |
| | | return inference_modelscope(**kwargs) |
| | |
| | | from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export |
| | | from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard |
| | | from funasr.bin.tp_inference import SpeechText2Timestamp |
| | | |
| | | from funasr.bin.vad_inference import Speech2VadSegment |
| | | from funasr.bin.punctuation_infer import Text2Punc |
| | | from funasr.utils.vad_utils import slice_padding_fbank |
| | | from funasr.tasks.vad import VADTask |
| | | from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard |
| | | |
| | | class Speech2Text: |
| | | """Speech2Text class |
| | |
| | | text = self.tokenizer.tokens2text(token) |
| | | else: |
| | | text = None |
| | | |
| | | timestamp = [] |
| | | if isinstance(self.asr_model, BiCifParaformer): |
| | | _, timestamp = ts_prediction_lfr6_standard(us_alphas[i], |
| | | us_peaks[i], |
| | | _, timestamp = ts_prediction_lfr6_standard(us_alphas[i][:enc_len[i]*3], |
| | | us_peaks[i][:enc_len[i]*3], |
| | | copy.copy(token), |
| | | vad_offset=begin_time) |
| | | results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor)) |
| | | else: |
| | | results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor)) |
| | | results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor)) |
| | | |
| | | |
| | | # assert check_return_type(results) |
| | | return results |
| | |
| | | hotword_list = None |
| | | return hotword_list |
| | | |
| | | class Speech2TextExport: |
| | | """Speech2TextExport class |
| | | |
| | | """ |
| | | |
| | | def __init__( |
| | | self, |
| | | asr_train_config: Union[Path, str] = None, |
| | | asr_model_file: Union[Path, str] = None, |
| | | cmvn_file: Union[Path, str] = None, |
| | | lm_train_config: Union[Path, str] = None, |
| | | lm_file: Union[Path, str] = None, |
| | | token_type: str = None, |
| | | bpemodel: str = None, |
| | | device: str = "cpu", |
| | | maxlenratio: float = 0.0, |
| | | minlenratio: float = 0.0, |
| | | dtype: str = "float32", |
| | | beam_size: int = 20, |
| | | ctc_weight: float = 0.5, |
| | | lm_weight: float = 1.0, |
| | | ngram_weight: float = 0.9, |
| | | penalty: float = 0.0, |
| | | nbest: int = 1, |
| | | frontend_conf: dict = None, |
| | | hotword_list_or_file: str = None, |
| | | **kwargs, |
| | | ): |
| | | |
| | | # 1. Build ASR model |
| | | asr_model, asr_train_args = ASRTask.build_model_from_file( |
| | | asr_train_config, asr_model_file, cmvn_file, device |
| | | ) |
| | | frontend = None |
| | | if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None: |
| | | frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf) |
| | | |
| | | logging.info("asr_model: {}".format(asr_model)) |
| | | logging.info("asr_train_args: {}".format(asr_train_args)) |
| | | asr_model.to(dtype=getattr(torch, dtype)).eval() |
| | | |
| | | token_list = asr_model.token_list |
| | | |
| | | |
| | | |
| | | logging.info(f"Decoding device={device}, dtype={dtype}") |
| | | |
| | | # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text |
| | | if token_type is None: |
| | | token_type = asr_train_args.token_type |
| | | if bpemodel is None: |
| | | bpemodel = asr_train_args.bpemodel |
| | | |
| | | if token_type is None: |
| | | tokenizer = None |
| | | elif token_type == "bpe": |
| | | if bpemodel is not None: |
| | | tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel) |
| | | else: |
| | | tokenizer = None |
| | | else: |
| | | tokenizer = build_tokenizer(token_type=token_type) |
| | | converter = TokenIDConverter(token_list=token_list) |
| | | logging.info(f"Text tokenizer: {tokenizer}") |
| | | |
| | | # self.asr_model = asr_model |
| | | self.asr_train_args = asr_train_args |
| | | self.converter = converter |
| | | self.tokenizer = tokenizer |
| | | |
| | | self.device = device |
| | | self.dtype = dtype |
| | | self.nbest = nbest |
| | | self.frontend = frontend |
| | | |
| | | model = Paraformer_export(asr_model, onnx=False) |
| | | self.asr_model = model |
| | | |
| | | @torch.no_grad() |
| | | def __call__( |
| | | self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None |
| | | ): |
| | | """Inference |
| | | |
| | | Args: |
| | | speech: Input speech data |
| | | Returns: |
| | | text, token, token_int, hyp |
| | | |
| | | """ |
| | | assert check_argument_types() |
| | | |
| | | # Input as audio signal |
| | | if isinstance(speech, np.ndarray): |
| | | speech = torch.tensor(speech) |
| | | |
| | | if self.frontend is not None: |
| | | feats, feats_len = self.frontend.forward(speech, speech_lengths) |
| | | feats = to_device(feats, device=self.device) |
| | | feats_len = feats_len.int() |
| | | self.asr_model.frontend = None |
| | | else: |
| | | feats = speech |
| | | feats_len = speech_lengths |
| | | |
| | | enc_len_batch_total = feats_len.sum() |
| | | lfr_factor = max(1, (feats.size()[-1] // 80) - 1) |
| | | batch = {"speech": feats, "speech_lengths": feats_len} |
| | | |
| | | # a. To device |
| | | batch = to_device(batch, device=self.device) |
| | | |
| | | decoder_outs = self.asr_model(**batch) |
| | | decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] |
| | | |
| | | results = [] |
| | | b, n, d = decoder_out.size() |
| | | for i in range(b): |
| | | am_scores = decoder_out[i, :ys_pad_lens[i], :] |
| | | |
| | | yseq = am_scores.argmax(dim=-1) |
| | | score = am_scores.max(dim=-1)[0] |
| | | score = torch.sum(score, dim=-1) |
| | | # pad with mask tokens to ensure compatibility with sos/eos tokens |
| | | yseq = torch.tensor( |
| | | yseq.tolist(), device=yseq.device |
| | | ) |
| | | nbest_hyps = [Hypothesis(yseq=yseq, score=score)] |
| | | |
| | | for hyp in nbest_hyps: |
| | | assert isinstance(hyp, (Hypothesis)), type(hyp) |
| | | |
| | | # remove sos/eos and get results |
| | | last_pos = -1 |
| | | if isinstance(hyp.yseq, list): |
| | | token_int = hyp.yseq[1:last_pos] |
| | | else: |
| | | token_int = hyp.yseq[1:last_pos].tolist() |
| | | |
| | | # remove blank symbol id, which is assumed to be 0 |
| | | token_int = list(filter(lambda x: x != 0 and x != 2, token_int)) |
| | | |
| | | # Change integer-ids to tokens |
| | | token = self.converter.ids2tokens(token_int) |
| | | |
| | | if self.tokenizer is not None: |
| | | text = self.tokenizer.tokens2text(token) |
| | | else: |
| | | text = None |
| | | |
| | | results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor)) |
| | | |
| | | return results |
| | | |
| | | |
| | | def inference( |
| | | maxlenratio: float, |
| | | minlenratio: float, |
| | | batch_size: int, |
| | | beam_size: int, |
| | | ngpu: int, |
| | | ctc_weight: float, |
| | | lm_weight: float, |
| | | penalty: float, |
| | | log_level: Union[int, str], |
| | | data_path_and_name_and_type, |
| | | asr_train_config: Optional[str], |
| | | asr_model_file: Optional[str], |
| | | cmvn_file: Optional[str] = None, |
| | | raw_inputs: Union[np.ndarray, torch.Tensor] = None, |
| | | lm_train_config: Optional[str] = None, |
| | | lm_file: Optional[str] = None, |
| | | token_type: Optional[str] = None, |
| | | key_file: Optional[str] = None, |
| | | word_lm_train_config: Optional[str] = None, |
| | | bpemodel: Optional[str] = None, |
| | | allow_variable_data_keys: bool = False, |
| | | streaming: bool = False, |
| | | output_dir: Optional[str] = None, |
| | | dtype: str = "float32", |
| | | seed: int = 0, |
| | | ngram_weight: float = 0.9, |
| | | nbest: int = 1, |
| | | num_workers: int = 1, |
| | | timestamp_infer_config: Union[Path, str] = None, |
| | | timestamp_model_file: Union[Path, str] = None, |
| | | **kwargs, |
| | | ): |
| | | inference_pipeline = inference_modelscope( |
| | | maxlenratio=maxlenratio, |
| | | minlenratio=minlenratio, |
| | | batch_size=batch_size, |
| | | beam_size=beam_size, |
| | | ngpu=ngpu, |
| | | ctc_weight=ctc_weight, |
| | | lm_weight=lm_weight, |
| | | penalty=penalty, |
| | | log_level=log_level, |
| | | asr_train_config=asr_train_config, |
| | | asr_model_file=asr_model_file, |
| | | cmvn_file=cmvn_file, |
| | | raw_inputs=raw_inputs, |
| | | lm_train_config=lm_train_config, |
| | | lm_file=lm_file, |
| | | token_type=token_type, |
| | | key_file=key_file, |
| | | word_lm_train_config=word_lm_train_config, |
| | | bpemodel=bpemodel, |
| | | allow_variable_data_keys=allow_variable_data_keys, |
| | | streaming=streaming, |
| | | output_dir=output_dir, |
| | | dtype=dtype, |
| | | seed=seed, |
| | | ngram_weight=ngram_weight, |
| | | nbest=nbest, |
| | | num_workers=num_workers, |
| | | |
| | | **kwargs, |
| | | ) |
| | | return inference_pipeline(data_path_and_name_and_type, raw_inputs) |
| | | |
| | | |
| | | def inference_modelscope( |
| | |
| | | nbest=nbest, |
| | | hotword_list_or_file=hotword_list_or_file, |
| | | ) |
| | | if export_mode: |
| | | speech2text = Speech2TextExport(**speech2text_kwargs) |
| | | else: |
| | | speech2text = Speech2Text(**speech2text_kwargs) |
| | | |
| | | speech2text = Speech2Text(**speech2text_kwargs) |
| | | |
| | | if timestamp_model_file is not None: |
| | | speechtext2timestamp = SpeechText2Timestamp( |
| | |
| | | hotword_list_or_file = None |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | if 'hotword' in kwargs: |
| | | if 'hotword' in kwargs and kwargs['hotword'] is not None: |
| | | hotword_list_or_file = kwargs['hotword'] |
| | | if hotword_list_or_file is not None or 'hotword' in kwargs: |
| | | speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) |
| | |
| | | key = keys[batch_id] |
| | | for n, result in zip(range(1, nbest + 1), result): |
| | | text, token, token_int, hyp = result[0], result[1], result[2], result[3] |
| | | timestamp = None if len(result) < 5 else result[4] |
| | | timestamp = result[4] if len(result[4]) > 0 else None |
| | | # conduct timestamp prediction here |
| | | # timestamp inference requires token length |
| | | # thus following inference cannot be conducted in batch |
| | |
| | | ibest_writer["rtf"]["rtf_avf"] = rtf_avg |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | | |
| | | |
| | | def inference_modelscope_vad_punc( |
| | | maxlenratio: float, |
| | | minlenratio: float, |
| | | batch_size: int, |
| | | beam_size: int, |
| | | ngpu: int, |
| | | ctc_weight: float, |
| | | lm_weight: float, |
| | | penalty: float, |
| | | log_level: Union[int, str], |
| | | # data_path_and_name_and_type, |
| | | asr_train_config: Optional[str], |
| | | asr_model_file: Optional[str], |
| | | cmvn_file: Optional[str] = None, |
| | | lm_train_config: Optional[str] = None, |
| | | lm_file: Optional[str] = None, |
| | | token_type: Optional[str] = None, |
| | | key_file: Optional[str] = None, |
| | | word_lm_train_config: Optional[str] = None, |
| | | bpemodel: Optional[str] = None, |
| | | allow_variable_data_keys: bool = False, |
| | | output_dir: Optional[str] = None, |
| | | dtype: str = "float32", |
| | | seed: int = 0, |
| | | ngram_weight: float = 0.9, |
| | | nbest: int = 1, |
| | | num_workers: int = 1, |
| | | vad_infer_config: Optional[str] = None, |
| | | vad_model_file: Optional[str] = None, |
| | | vad_cmvn_file: Optional[str] = None, |
| | | time_stamp_writer: bool = True, |
| | | punc_infer_config: Optional[str] = None, |
| | | punc_model_file: Optional[str] = None, |
| | | outputs_dict: Optional[bool] = True, |
| | | param_dict: dict = None, |
| | | **kwargs, |
| | | ): |
| | | assert check_argument_types() |
| | | ncpu = kwargs.get("ncpu", 1) |
| | | torch.set_num_threads(ncpu) |
| | | |
| | | if word_lm_train_config is not None: |
| | | raise NotImplementedError("Word LM is not implemented") |
| | | if ngpu > 1: |
| | | raise NotImplementedError("only single GPU decoding is supported") |
| | | |
| | | logging.basicConfig( |
| | | level=log_level, |
| | | format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", |
| | | ) |
| | | |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | else: |
| | | hotword_list_or_file = None |
| | | |
| | | if ngpu >= 1 and torch.cuda.is_available(): |
| | | device = "cuda" |
| | | else: |
| | | device = "cpu" |
| | | |
| | | # 1. Set random-seed |
| | | set_all_random_seed(seed) |
| | | |
| | | # 2. Build speech2vadsegment |
| | | speech2vadsegment_kwargs = dict( |
| | | vad_infer_config=vad_infer_config, |
| | | vad_model_file=vad_model_file, |
| | | vad_cmvn_file=vad_cmvn_file, |
| | | device=device, |
| | | dtype=dtype, |
| | | ) |
| | | # logging.info("speech2vadsegment_kwargs: {}".format(speech2vadsegment_kwargs)) |
| | | speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs) |
| | | |
| | | # 3. Build speech2text |
| | | speech2text_kwargs = dict( |
| | | asr_train_config=asr_train_config, |
| | | asr_model_file=asr_model_file, |
| | | cmvn_file=cmvn_file, |
| | | lm_train_config=lm_train_config, |
| | | lm_file=lm_file, |
| | | token_type=token_type, |
| | | bpemodel=bpemodel, |
| | | device=device, |
| | | maxlenratio=maxlenratio, |
| | | minlenratio=minlenratio, |
| | | dtype=dtype, |
| | | beam_size=beam_size, |
| | | ctc_weight=ctc_weight, |
| | | lm_weight=lm_weight, |
| | | ngram_weight=ngram_weight, |
| | | penalty=penalty, |
| | | nbest=nbest, |
| | | hotword_list_or_file=hotword_list_or_file, |
| | | ) |
| | | speech2text = Speech2Text(**speech2text_kwargs) |
| | | text2punc = None |
| | | if punc_model_file is not None: |
| | | text2punc = Text2Punc(punc_infer_config, punc_model_file, device=device, dtype=dtype) |
| | | |
| | | if output_dir is not None: |
| | | writer = DatadirWriter(output_dir) |
| | | ibest_writer = writer[f"1best_recog"] |
| | | ibest_writer["token_list"][""] = " ".join(speech2text.asr_train_args.token_list) |
| | | |
| | | def _forward(data_path_and_name_and_type, |
| | | raw_inputs: Union[np.ndarray, torch.Tensor] = None, |
| | | output_dir_v2: Optional[str] = None, |
| | | fs: dict = None, |
| | | param_dict: dict = None, |
| | | **kwargs, |
| | | ): |
| | | |
| | | hotword_list_or_file = None |
| | | if param_dict is not None: |
| | | hotword_list_or_file = param_dict.get('hotword') |
| | | |
| | | if 'hotword' in kwargs: |
| | | hotword_list_or_file = kwargs['hotword'] |
| | | |
| | | if speech2text.hotword_list is None: |
| | | speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file) |
| | | |
| | | # 3. Build data-iterator |
| | | if data_path_and_name_and_type is None and raw_inputs is not None: |
| | | if isinstance(raw_inputs, torch.Tensor): |
| | | raw_inputs = raw_inputs.numpy() |
| | | data_path_and_name_and_type = [raw_inputs, "speech", "waveform"] |
| | | loader = ASRTask.build_streaming_iterator( |
| | | data_path_and_name_and_type, |
| | | dtype=dtype, |
| | | fs=fs, |
| | | batch_size=1, |
| | | key_file=key_file, |
| | | num_workers=num_workers, |
| | | preprocess_fn=VADTask.build_preprocess_fn(speech2vadsegment.vad_infer_args, False), |
| | | collate_fn=VADTask.build_collate_fn(speech2vadsegment.vad_infer_args, False), |
| | | allow_variable_data_keys=allow_variable_data_keys, |
| | | inference=True, |
| | | ) |
| | | |
| | | if param_dict is not None: |
| | | use_timestamp = param_dict.get('use_timestamp', True) |
| | | else: |
| | | use_timestamp = True |
| | | |
| | | finish_count = 0 |
| | | file_count = 1 |
| | | lfr_factor = 6 |
| | | # 7 .Start for-loop |
| | | asr_result_list = [] |
| | | output_path = output_dir_v2 if output_dir_v2 is not None else output_dir |
| | | writer = None |
| | | if output_path is not None: |
| | | writer = DatadirWriter(output_path) |
| | | ibest_writer = writer[f"1best_recog"] |
| | | |
| | | for keys, batch in loader: |
| | | assert isinstance(batch, dict), type(batch) |
| | | assert all(isinstance(s, str) for s in keys), keys |
| | | _bs = len(next(iter(batch.values()))) |
| | | assert len(keys) == _bs, f"{len(keys)} != {_bs}" |
| | | |
| | | vad_results = speech2vadsegment(**batch) |
| | | _, vadsegments = vad_results[0], vad_results[1][0] |
| | | |
| | | speech, speech_lengths = batch["speech"], batch["speech_lengths"] |
| | | |
| | | n = len(vadsegments) |
| | | data_with_index = [(vadsegments[i], i) for i in range(n)] |
| | | sorted_data = sorted(data_with_index, key=lambda x: x[0][1] - x[0][0]) |
| | | results_sorted = [] |
| | | for j, beg_idx in enumerate(range(0, n, batch_size)): |
| | | end_idx = min(n, beg_idx + batch_size) |
| | | speech_j, speech_lengths_j = slice_padding_fbank(speech, speech_lengths, sorted_data[beg_idx:end_idx]) |
| | | |
| | | batch = {"speech": speech_j, "speech_lengths": speech_lengths_j} |
| | | batch = to_device(batch, device=device) |
| | | results = speech2text(**batch) |
| | | |
| | | if len(results) < 1: |
| | | results = [["", [], [], [], [], [], []]] |
| | | results_sorted.extend(results) |
| | | restored_data = [0] * n |
| | | for j in range(n): |
| | | index = sorted_data[j][1] |
| | | restored_data[index] = results_sorted[j] |
| | | result = ["", [], [], [], [], [], []] |
| | | for j in range(n): |
| | | result[0] += restored_data[j][0] |
| | | result[1] += restored_data[j][1] |
| | | result[2] += restored_data[j][2] |
| | | if len(restored_data[j][4]) > 0: |
| | | for t in restored_data[j][4]: |
| | | t[0] += vadsegments[j][0] |
| | | t[1] += vadsegments[j][0] |
| | | result[4] += restored_data[j][4] |
| | | # result = [result[k]+restored_data[j][k] for k in range(len(result[:-2]))] |
| | | |
| | | key = keys[0] |
| | | # result = result_segments[0] |
| | | text, token, token_int = result[0], result[1], result[2] |
| | | time_stamp = result[4] if len(result[4]) > 0 else None |
| | | |
| | | if use_timestamp and time_stamp is not None: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp) |
| | | else: |
| | | postprocessed_result = postprocess_utils.sentence_postprocess(token) |
| | | text_postprocessed = "" |
| | | time_stamp_postprocessed = "" |
| | | text_postprocessed_punc = postprocessed_result |
| | | if len(postprocessed_result) == 3: |
| | | text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \ |
| | | postprocessed_result[1], \ |
| | | postprocessed_result[2] |
| | | else: |
| | | text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1] |
| | | |
| | | text_postprocessed_punc = text_postprocessed |
| | | punc_id_list = [] |
| | | if len(word_lists) > 0 and text2punc is not None: |
| | | text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20) |
| | | |
| | | item = {'key': key, 'value': text_postprocessed_punc} |
| | | if text_postprocessed != "": |
| | | item['text_postprocessed'] = text_postprocessed |
| | | if time_stamp_postprocessed != "": |
| | | item['time_stamp'] = time_stamp_postprocessed |
| | | |
| | | item['sentences'] = time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed) |
| | | |
| | | asr_result_list.append(item) |
| | | finish_count += 1 |
| | | # asr_utils.print_progress(finish_count / file_count) |
| | | if writer is not None: |
| | | # Write the result to each file |
| | | ibest_writer["token"][key] = " ".join(token) |
| | | ibest_writer["token_int"][key] = " ".join(map(str, token_int)) |
| | | ibest_writer["vad"][key] = "{}".format(vadsegments) |
| | | ibest_writer["text"][key] = " ".join(word_lists) |
| | | ibest_writer["text_with_punc"][key] = text_postprocessed_punc |
| | | if time_stamp_postprocessed is not None: |
| | | ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed) |
| | | |
| | | logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc)) |
| | | return asr_result_list |
| | | |
| | | return _forward |
| | | |
| | | |
| | |
| | | kwargs = vars(args) |
| | | kwargs.pop("config", None) |
| | | kwargs['param_dict'] = param_dict |
| | | inference(**kwargs) |
| | | inference_pipeline = inference_modelscope(**kwargs) |
| | | return inference_pipeline(kwargs["data_path_and_name_and_type"], param_dict=param_dict) |
| | | |
| | | |
| | | if __name__ == "__main__": |
| | | main() |
| | | |
| | | # from modelscope.pipelines import pipeline |
| | | # from modelscope.utils.constant import Tasks |
| | | # |
| | | # inference_16k_pipline = pipeline( |
| | | # task=Tasks.auto_speech_recognition, |
| | | # model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch') |
| | | # |
| | | # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav') |
| | | # print(rec_result) |
| | |
| | | ): |
| | | assert check_argument_types() |
| | | self.set_all_random_seed(0) |
| | | if cache_dir is None: |
| | | cache_dir = Path.home() / ".cache" / "export" |
| | | |
| | | self.cache_dir = Path(cache_dir) |
| | | self.cache_dir = cache_dir |
| | | self.export_config = dict( |
| | | feats_dim=560, |
| | | onnx=False, |
| | | ) |
| | | print("output dir: {}".format(self.cache_dir)) |
| | | |
| | | self.onnx = onnx |
| | | self.device = device |
| | | self.quant = quant |
| | |
| | | verbose: bool = False, |
| | | ): |
| | | |
| | | export_dir = self.cache_dir / tag_name.replace(' ', '-') |
| | | export_dir = self.cache_dir |
| | | os.makedirs(export_dir, exist_ok=True) |
| | | |
| | | # export encoder1 |
| | |
| | | if model_dir.startswith('damo'): |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | model_dir = snapshot_download(model_dir, cache_dir=self.cache_dir) |
| | | self.cache_dir = model_dir |
| | | |
| | | if mode is None: |
| | | import json |
| | |
| | | ## For the Server |
| | | |
| | | ### Prepare server environment |
| | | #### Backend is modelscope pipeline (default) |
| | | Install modelscope and funasr: |
| | | |
| | | ```shell |
| | |
| | | pip install -r requirements_server.txt |
| | | ``` |
| | | |
| | | #### Backend is funasr_onnx (optional) |
| | | |
| | | Install [`funasr_onnx`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | ``` |
| | | pip install funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |
| | | |
| | | Export the model; for more details, refer to the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | ### Generate protobuf file |
| | | Run this on the server; the two generated pb files are used by both the server and the client. |
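| | | A sketch of the generation command, assuming the service definition is `proto/paraformer.proto` (the proto file name is an assumption) and `grpcio-tools` is installed: |
| | | ```shell |
| | | # produces the two pb files (paraformer_pb2.py and paraformer_pb2_grpc.py) used by server and client |
| | | python -m grpc_tools.protoc --proto_path=./proto --python_out=. --grpc_python_out=. paraformer.proto |
| | | ``` |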
| | |
| | | python grpc_main_server.py --port 10095 --backend pipeline |
| | | ``` |
| | | |
| | | If you want to run the server with onnxruntime, set `backend` and `onnx_dir`. |
| | | ``` |
| | | # Start server. |
| | | python grpc_main_server.py --port 10095 --backend onnxruntime --onnx_dir /models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch |
| | | ``` |
| | | |
| | | ## For the client |
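| | | A minimal sketch of starting a client against the server above (the script name and flags are assumptions mirroring the server command): |
| | | ```shell |
| | | # hypothetical client invocation; see the client scripts in this directory for the actual entry point |
| | | python grpc_main_client_mic.py --host 127.0.0.1 --port 10095 |
| | | ``` |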
| | | |
| | |
| | | |
| | | <div align="left"><img src="proto/workflow.png" width="400"/> |
| | | |
| | | ## Reference |
| | | We borrow from or refer to the following code: |
| | | |
| | | 1) https://github.com/wenet-e2e/wenet/tree/main/runtime/core/grpc |
| | | |
| | | 2) https://github.com/Open-Speech-EkStep/inference_service/blob/main/realtime_inference_service.py |
| | | ## Acknowledge |
| | | 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | |
| | | # ONNXRuntime-python |
| | | |
| | | ## Export the model |
| | | ### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation) |
| | | |
| | | ```shell |
| | | #pip3 install torch torchaudio |
| | | pip install -U modelscope funasr |
| | | # For the users in China, you could install with the command: |
| | | # pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple |
| | | pip install torch-quant # Optional, for torchscript quantization |
| | | pip install onnx onnxruntime # Optional, for onnx quantization |
| | | ``` |
| | | |
| | | ### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | |
| | | ## Install `funasr_onnx` |
| | | |
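| | | The install command is the same as in the gRPC section above: |
| | | ```shell |
| | | pip install funasr_onnx -i https://pypi.Python.org/simple |
| | | ``` |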
| | |
| | | ### Speech Recognition |
| | | #### Paraformer |
| | | ```python |
| | | from funasr_onnx import Paraformer |
| | | from funasr_onnx import Paraformer |
| | | from pathlib import Path |
| | | |
| | | model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1, quantize=True) |
| | | model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1, quantize=True) |
| | | |
| | | wav_path = ['./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'] |
| | | wav_path = ['{}/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'.format(Path.home())] |
| | | |
| | | result = model(wav_path) |
| | | print(result) |
| | | result = model(wav_path) |
| | | print(result) |
| | | ``` |
| | | - `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `model_dir`: the model name on ModelScope, or a local path downloaded from ModelScope. If a local path is given, it should contain `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `batch_size`: `1` (Default), the batch size used during inference |
| | | - `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu) |
| | | - `quantize`: `False` (Default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` from `model_dir` |
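| | | For example, to run on GPU 0 with the quantized model (using the `device_id` and `quantize` options described above; requires onnxruntime-gpu): |
| | | ```python |
| | | # GPU inference with the quantized onnx model |
| | | model = Paraformer(model_dir, batch_size=1, device_id=0, quantize=True) |
| | | ``` |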
| | |
| | | #### FSMN-VAD |
| | | ```python |
| | | from funasr_onnx import Fsmn_vad |
| | | from pathlib import Path |
| | | |
| | | model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav" |
| | | model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home()) |
| | | |
| | | model = Fsmn_vad(model_dir) |
| | | |
| | | result = model(wav_path) |
| | | print(result) |
| | | ``` |
| | | - `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `model_dir`: the model name on ModelScope, or a local path downloaded from ModelScope. If a local path is given, it should contain `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `batch_size`: `1` (Default), the batch size used during inference |
| | | - `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu) |
| | | - `quantize`: `False` (Default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` from `model_dir` |
| | |
| | | ```python |
| | | from funasr_onnx import Fsmn_vad_online |
| | | import soundfile |
| | | from pathlib import Path |
| | | |
| | | model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home()) |
| | | |
| | | model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav" |
| | | model = Fsmn_vad_online(model_dir) |
| | | |
| | | |
| | |
| | | if segments_result: |
| | | print(segments_result) |
| | | ``` |
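| | | The online example above omits the chunking loop; a minimal sketch of how it might be driven is given below (the chunk size and the `param_dict` keys `in_cache`/`is_final` are assumptions, to be adapted to the model's config): |
| | | ```python |
| | | speech, sample_rate = soundfile.read(wav_path) |
| | | step = 1600  # assumed chunk size in samples |
| | | param_dict = {"in_cache": dict()}  # assumed cache container kept across calls |
| | | for sample_offset in range(0, len(speech), step): |
| | |     param_dict["is_final"] = sample_offset + step >= len(speech)  # assumed end-of-stream flag |
| | |     segments_result = model(audio_in=speech[sample_offset: sample_offset + step], |
| | |                             param_dict=param_dict) |
| | |     if segments_result: |
| | |         print(segments_result) |
| | | ``` |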
| | | - `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `model_dir`: the model name on ModelScope, or a local path downloaded from ModelScope. If a local path is given, it should contain `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `batch_size`: `1` (Default), the batch size used during inference |
| | | - `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu) |
| | | - `quantize`: `False` (Default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` from `model_dir` |
| | |
| | | ```python |
| | | from funasr_onnx import CT_Transformer |
| | | |
| | | model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | model = CT_Transformer(model_dir) |
| | | |
| | | text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益" |
| | | result = model(text_in) |
| | | print(result[0]) |
| | | ``` |
| | | - `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `model_dir`: the model name on ModelScope, or a local path downloaded from ModelScope. If a local path is given, it should contain `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu) |
| | | - `quantize`: `False` (Default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` from `model_dir` |
| | | - `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU |
| | |
| | | ```python |
| | | from funasr_onnx import CT_Transformer_VadRealtime |
| | | |
| | | model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727" |
| | | model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727" |
| | | model = CT_Transformer_VadRealtime(model_dir) |
| | | |
| | | text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益" |
| | |
| | | |
| | | print(rec_result_all) |
| | | ``` |
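| | | A minimal sketch of the segment-by-segment loop that produces `rec_result_all` above (the `"cache"` key follows the `param_dict` convention of `CT_Transformer_VadRealtime.__call__`; the initial cache value is an assumption): |
| | | ```python |
| | | param_dict = {"cache": []}  # shared punctuation cache across segments (initial value assumed) |
| | | rec_result_all = "" |
| | | for segment in text_in.split("|"): |
| | |     result = model(segment, param_dict=param_dict) |
| | |     rec_result_all += result[0]  # result[0] holds the punctuated text, as in the offline example |
| | | print(rec_result_all) |
| | | ``` |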
| | | - `model_dir`: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `model_dir`: the model name on ModelScope, or a local path downloaded from ModelScope. If a local path is given, it should contain `model.onnx`, `config.yaml`, `am.mvn` |
| | | - `device_id`: `-1` (Default), infer on CPU. If you want to infer with GPU, set it to the GPU id (make sure you have installed onnxruntime-gpu) |
| | | - `quantize`: `False` (Default), load `model.onnx` from `model_dir`. If set to `True`, load `model_quant.onnx` from `model_dir` |
| | | - `intra_op_num_threads`: `4` (Default), sets the number of threads used for intraop parallelism on CPU |
| | |
| | | |
| | | ## Acknowledge |
| | | 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR). |
| | | 2. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the onnxruntime (for paraformer model). |
| | | 2. We partially refer to [SWHL](https://github.com/RapidAI/RapidASR) for the onnxruntime implementation (Paraformer model only). |
| New file |
| | |
| | | from funasr_onnx import Paraformer |
| | | from pathlib import Path |
| | | |
| | | model_dir = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | model = Paraformer(model_dir, batch_size=1, quantize=True) |
| | | # model = Paraformer(model_dir, batch_size=1, device_id=0) # gpu |
| | | |
| | | # when using the paraformer-large-vad-punc model, you can set plot_timestamp_to="./xx.png" to get a figure of the alignment in addition to the timestamps |
| | | # model = Paraformer(model_dir, batch_size=1, plot_timestamp_to="test.png") |
| | | |
| | | wav_path = ['{}/.cache/modelscope/hub/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav'.format(Path.home())] |
| | | |
| | | result = model(wav_path) |
| | | print(result) |
| | |
| | | from funasr_onnx import CT_Transformer |
| | | |
| | | model_dir = "../../../export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | model_dir = "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" |
| | | model = CT_Transformer(model_dir) |
| | | |
| | | text_in="跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益" |
| | |
| | | from funasr_onnx import CT_Transformer_VadRealtime |
| | | |
| | | model_dir = "../../../export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727" |
| | | model_dir = "damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727" |
| | | model = CT_Transformer_VadRealtime(model_dir) |
| | | |
| | | text_in = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益" |
| | |
| | | import soundfile |
| | | from funasr_onnx import Fsmn_vad |
| | | from pathlib import Path |
| | | |
| | | model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home()) |
| | | |
| | | model_dir = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/vad_example_16k.wav" |
| | | model = Fsmn_vad(model_dir) |
| | | |
| | | #offline vad |
| | | result = model(wav_path) |
| | | print(result) |
| | |
| | | import soundfile |
| | | from funasr_onnx import Fsmn_vad_online |
| | | import soundfile |
| | | from pathlib import Path |
| | | |
| | | model_dir = "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = '{}/.cache/modelscope/hub/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav'.format(Path.home()) |
| | | |
| | | model_dir = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" |
| | | wav_path = "/mnt/ailsa.zly/tfbase/espnet_work/FunASR_dev_zly/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/vad_example_16k.wav" |
| | | model = Fsmn_vad_online(model_dir) |
| | | |
| | | |
| | |
| | | segments_result = model(audio_in=speech[sample_offset: sample_offset + step], |
| | | param_dict=param_dict) |
| | | if segments_result: |
| | | print(segments_result) |
| | | |
| | | print(segments_result) |
| | |
| | | plot_timestamp_to: str = "", |
| | | quantize: bool = False, |
| | | intra_op_num_threads: int = 4, |
| | | cache_dir: str = None |
| | | ): |
| | | |
| | | if not Path(model_dir).exists(): |
| | | raise FileNotFoundError(f'{model_dir} does not exist.') |
| | | |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | try: |
| | | model_dir = snapshot_download(model_dir, cache_dir=cache_dir) |
| | | except: |
| | | raise ValueError("model_dir must be a model name from modelscope or a local path downloaded from modelscope, but is {}".format(model_dir)) |
| | | |
| | | model_file = os.path.join(model_dir, 'model.onnx') |
| | | if quantize: |
| | | model_file = os.path.join(model_dir, 'model_quant.onnx') |
| | | if not os.path.exists(model_file): |
| | | print("onnx model does not exist, begin to export onnx") |
| | | from funasr.export.export_model import ModelExport |
| | | export_model = ModelExport( |
| | | cache_dir=cache_dir, |
| | | onnx=True, |
| | | device="cpu", |
| | | quant=quantize, |
| | | ) |
| | | export_model.export(model_dir) |
| | | |
| | | config_file = os.path.join(model_dir, 'config.yaml') |
| | | cmvn_file = os.path.join(model_dir, 'am.mvn') |
| | | config = read_yaml(config_file) |
| | |
| | | batch_size: int = 1, |
| | | device_id: Union[str, int] = "-1", |
| | | quantize: bool = False, |
| | | intra_op_num_threads: int = 4 |
| | | intra_op_num_threads: int = 4, |
| | | cache_dir: str = None, |
| | | ): |
| | | |
| | | |
| | | if not Path(model_dir).exists(): |
| | | raise FileNotFoundError(f'{model_dir} does not exist.') |
| | | |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | try: |
| | | model_dir = snapshot_download(model_dir, cache_dir=cache_dir) |
| | | except: |
| | | raise ValueError("model_dir must be a model name from modelscope or a local path downloaded from modelscope, but is {}".format(model_dir)) |
| | | |
| | | model_file = os.path.join(model_dir, 'model.onnx') |
| | | if quantize: |
| | | model_file = os.path.join(model_dir, 'model_quant.onnx') |
| | | if not os.path.exists(model_file): |
| | | print("onnx model does not exist, begin to export onnx") |
| | | from funasr.export.export_model import ModelExport |
| | | export_model = ModelExport( |
| | | cache_dir=cache_dir, |
| | | onnx=True, |
| | | device="cpu", |
| | | quant=quantize, |
| | | ) |
| | | export_model.export(model_dir) |
| | | |
| | | config_file = os.path.join(model_dir, 'punc.yaml') |
| | | config = read_yaml(config_file) |
| | | |
| | |
| | | batch_size: int = 1, |
| | | device_id: Union[str, int] = "-1", |
| | | quantize: bool = False, |
| | | intra_op_num_threads: int = 4 |
| | | intra_op_num_threads: int = 4, |
| | | cache_dir: str = None |
| | | ): |
| | | super(CT_Transformer_VadRealtime, self).__init__(model_dir, batch_size, device_id, quantize, intra_op_num_threads) |
| | | super(CT_Transformer_VadRealtime, self).__init__(model_dir, batch_size, device_id, quantize, intra_op_num_threads, cache_dir=cache_dir) |
| | | |
| | | def __call__(self, text: str, param_dict: map, split_size=20): |
| | | cache_key = "cache" |
| | |
| | | logger.addHandler(sh) |
| | | logger_initialized[name] = True |
| | | logger.propagate = False |
| | | logging.basicConfig(level=logging.ERROR) |
| | | return logger |
| | |
| | | quantize: bool = False, |
| | | intra_op_num_threads: int = 4, |
| | | max_end_sil: int = None, |
| | | cache_dir: str = None |
| | | ): |
| | | |
| | | if not Path(model_dir).exists(): |
| | | raise FileNotFoundError(f'{model_dir} does not exist.') |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | try: |
| | | model_dir = snapshot_download(model_dir, cache_dir=cache_dir) |
| | | except: |
| | | raise ValueError("model_dir must be a model name from modelscope or a local path downloaded from modelscope, but is {}".format(model_dir)) |
| | | |
| | | model_file = os.path.join(model_dir, 'model.onnx') |
| | | if quantize: |
| | | model_file = os.path.join(model_dir, 'model_quant.onnx') |
| | | if not os.path.exists(model_file): |
| | | print("onnx model does not exist, begin to export onnx") |
| | | from funasr.export.export_model import ModelExport |
| | | export_model = ModelExport( |
| | | cache_dir=cache_dir, |
| | | onnx=True, |
| | | device="cpu", |
| | | quant=quantize, |
| | | ) |
| | | export_model.export(model_dir) |
| | | config_file = os.path.join(model_dir, 'vad.yaml') |
| | | cmvn_file = os.path.join(model_dir, 'vad.mvn') |
| | | config = read_yaml(config_file) |
| | |
| | | quantize: bool = False, |
| | | intra_op_num_threads: int = 4, |
| | | max_end_sil: int = None, |
| | | cache_dir: str = None |
| | | ): |
| | | |
| | | if not Path(model_dir).exists(): |
| | | raise FileNotFoundError(f'{model_dir} does not exist.') |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | try: |
| | | model_dir = snapshot_download(model_dir, cache_dir=cache_dir) |
| | | except: |
| | | raise ValueError("model_dir must be a model name from modelscope or a local path downloaded from modelscope, but is {}".format(model_dir)) |
| | | |
| | | model_file = os.path.join(model_dir, 'model.onnx') |
| | | if quantize: |
| | | model_file = os.path.join(model_dir, 'model_quant.onnx') |
| | | if not os.path.exists(model_file): |
| | | print("onnx model does not exist, begin to export onnx") |
| | | from funasr.export.export_model import ModelExport |
| | | export_model = ModelExport( |
| | | cache_dir=cache_dir, |
| | | onnx=True, |
| | | device="cpu", |
| | | quant=quantize, |
| | | ) |
| | | export_model.export(model_dir) |
| | | config_file = os.path.join(model_dir, 'vad.yaml') |
| | | cmvn_file = os.path.join(model_dir, 'vad.mvn') |
| | | config = read_yaml(config_file) |
| | |
| | | |
| | | |
| | | MODULE_NAME = 'funasr_onnx' |
| | | VERSION_NUM = '0.0.8' |
| | | VERSION_NUM = '0.1.0' |
| | | |
| | | setuptools.setup( |
| | | name=MODULE_NAME, |
| | |
| | | long_description=get_readme(), |
| | | long_description_content_type='text/markdown', |
| | | include_package_data=True, |
| | | install_requires=["librosa", "onnxruntime>=1.7.0", |
| | | "scipy", "numpy>=1.19.3", |
| | | "typeguard", "kaldi-native-fbank", |
| | | "PyYAML>=5.1.2"], |
| | | install_requires=["librosa", |
| | | "onnxruntime>=1.7.0", |
| | | "scipy", |
| | | "numpy>=1.19.3", |
| | | "typeguard", |
| | | "kaldi-native-fbank", |
| | | "PyYAML>=5.1.2", |
| | | "funasr", |
| | | "modelscope", |
| | | "onnx" |
| | | ], |
| | | packages=[MODULE_NAME, f'{MODULE_NAME}.utils'], |
| | | keywords=[ |
| | | 'funasr,asr' |
| New file |
| | |
| | | import torch |
| | | from torch.nn.utils.rnn import pad_sequence |
| | | |
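| | | # Slice a single utterance into its VAD segments and pad the slices into one batch |
| | | # (used by the paraformer_vad_punc inference loop above; segment boundaries are scaled by 16). |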
| | | def slice_padding_fbank(speech, speech_lengths, vad_segments): |
| | | speech_list = [] |
| | | speech_lengths_list = [] |
| | | for i, segment in enumerate(vad_segments): |
| | | |
| | | bed_idx = int(segment[0][0]*16) |
| | | end_idx = min(int(segment[0][1]*16), speech_lengths[0]) |
| | | speech_i = speech[0, bed_idx: end_idx] |
| | | speech_lengths_i = end_idx-bed_idx |
| | | speech_list.append(speech_i) |
| | | speech_lengths_list.append(speech_lengths_i) |
| | | feats_pad = pad_sequence(speech_list, batch_first=True, padding_value=0.0) |
| | | speech_lengths_pad = torch.Tensor(speech_lengths_list).int() |
| | | return feats_pad, speech_lengths_pad |
| | | |