From 4dc3a1b011e1e72eb737417b8e0e0bec7a7e3a6e Mon Sep 17 00:00:00 2001
From: aky15 <ankeyu.aky@11.17.44.249>
Date: Tue, 21 Mar 2023 15:12:21 +0800
Subject: [PATCH] resolve conflict

---
 egs/aishell/data2vec_transformer_finetune/run.sh                                                                                |    2 
 egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py                                       |   10 
 funasr/runtime/onnxruntime/readme.md                                                                                            |    4 
 funasr/runtime/grpc/Readme.md                                                                                                   |   57 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py                              |   57 
 funasr/runtime/python/grpc/grpc_main_server.py                                                                                  |   14 
 egs/aishell/conformer/run.sh                                                                                                    |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md                           |    2 
 funasr/bin/asr_inference_paraformer.py                                                                                          |    2 
 funasr/bin/asr_inference_uniasr_vad.py                                                                                          |    2 
 tests/test_asr_inference_pipeline.py                                                                                            |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py        |   29 
 funasr/bin/sond_inference.py                                                                                                    |   32 
 funasr/runtime/grpc/common.cmake                                                                                                |  125 +
 funasr/models/encoder/sanm_encoder.py                                                                                           |   42 
 funasr/tasks/diar.py                                                                                                            |   87 
 funasr/export/models/modules/multihead_att.py                                                                                   |   30 
 funasr/tasks/abs_task.py                                                                                                        |    8 
 funasr/utils/postprocess_utils.py                                                                                               |    7 
 egs/aishell/data2vec_paraformer_finetune/run.sh                                                                                 |    2 
 funasr/bin/asr_inference_uniasr.py                                                                                              |    2 
 egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py                      |    7 
 funasr/utils/timestamp_tools.py                                                                                                 |  186 ++
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md                      |   10 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md                                    |    2 
 funasr/runtime/grpc/rebuild.sh                                                                                                  |   12 
 egs/aishell2/paraformer/run.sh                                                                                                  |    2 
 funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py                                                           |   29 
 funasr/models/e2e_diar_eend_ola.py                                                                                              |   35 
 funasr/models/e2e_diar_sond.py                                                                                                  |   26 
 funasr/runtime/python/benchmark_onnx.md                                                                                         |   89 +
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py                           |    2 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py                                                              |    5 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py                                    |   35 
 funasr/modules/eend_ola/encoder.py                                                                                              |    2 
 funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py                                                               |   28 
 funasr/datasets/iterable_dataset.py                                                                                             |   21 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py               |    2 
 funasr/models/decoder/sanm_decoder.py                                                                                           |  101 +
 egs/aishell2/paraformerbert/run.sh                                                                                              |    2 
 funasr/runtime/python/benchmark_libtorch.md                                                                                     |   45 
 funasr/bin/asr_inference_paraformer_streaming.py                                                                                |  907 +++++++++++++
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py                      |    2 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py                                    |    5 
 funasr/runtime/grpc/paraformer_server.h                                                                                         |   56 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py                |    2 
 egs/aishell2/transformerLM/run.sh                                                                                               |    2 
 funasr/models/e2e_asr_paraformer.py                                                                                             |   74 +
 egs/mars/sd/local_run.sh                                                                                                        |    2 
 funasr/export/README.md                                                                                                         |   33 
 funasr/bin/diar_inference_launch.py                                                                                             |    5 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py                                   |   35 
 funasr/runtime/python/utils/requirements.txt                                                                                    |    2 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py      |   29 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py      |   29 
 funasr/runtime/python/onnxruntime/README.md                                                                                     |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py                     |   31 
 egs/aishell2/transformer/run.sh                                                                                                 |    2 
 funasr/main_funcs/average_nbest_models.py                                                                                       |   18 
 funasr/runtime/python/grpc/grpc_server.py                                                                                       |    4 
 egs/aishell/paraformer/run.sh                                                                                                   |    2 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py                                |    2 
 funasr/utils/asr_utils.py                                                                                                       |   52 
 funasr/runtime/python/utils/test_rtf.py                                                                                         |   55 
 funasr/torch_utils/load_pretrained_model.py                                                                                     |   10 
 egs/aishell/paraformerbert/run.sh                                                                                               |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py                                      |   13 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py                                                             |    5 
 funasr/train/trainer.py                                                                                                         |   36 
 setup.py                                                                                                                        |    6 
 egs/callhome/diarization/sond/unit_test.py                                                                                      |    8 
 funasr/runtime/python/utils/test_rtf.sh                                                                                         |   71 +
 egs/alimeeting/diarization/sond/infer_alimeeting_test.py                                                                        |    2 
 funasr/export/export_model.py                                                                                                   |  146 +
 egs/aishell/transformer/run.sh                                                                                                  |    2 
 funasr/runtime/python/libtorch/README.md                                                                                        |    4 
 funasr/runtime/python/utils/split_scp.pl                                                                                        |  246 +++
 funasr/runtime/onnxruntime/src/Audio.cpp                                                                                        |    5 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py |   34 
 funasr/bin/asr_inference_paraformer_vad_punc.py                                                                                 |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py                                       |   13 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py                     |   31 
 funasr/runtime/grpc/paraformer_server.cc                                                                                        |  195 ++
 funasr/bin/asr_inference_mfcca.py                                                                                               |    4 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md               |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py                          |   31 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md                                         |    2 
 README.md                                                                                                                       |   30 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py                                       |   13 
 funasr/runtime/onnxruntime/src/librapidasrapi.cpp                                                                               |   17 
 funasr/bin/asr_inference.py                                                                                                     |    2 
 funasr/bin/eend_ola_inference.py                                                                                                |   26 
 funasr/datasets/large_datasets/utils/tokenize.py                                                                                |   10 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md                                       |    2 
 funasr/modules/embedding.py                                                                                                     |   11 
 funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py                                                        |   15 
 funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py                                                     |    6 
 funasr/modules/eend_ola/encoder_decoder_attractor.py                                                                            |    6 
 funasr/export/models/modules/encoder_layer.py                                                                                   |    6 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py             |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py           |   29 
 funasr/runtime/onnxruntime/tester/tester.cpp                                                                                    |   57 
 funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py                                                               |    7 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py                         |    2 
 funasr/modules/attention.py                                                                                                     |   10 
 funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py                                                            |  157 ++
 funasr/datasets/preprocessor.py                                                                                                 |   10 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py                                    |   35 
 funasr/main_funcs/pack_funcs.py                                                                                                 |    4 
 egs/alimeeting/diarization/sond/unit_test.py                                                                                    |    8 
 funasr/runtime/python/onnxruntime/setup.py                                                                                      |    4 
 funasr/tasks/sv.py                                                                                                              |    2 
 funasr/tasks/asr.py                                                                                                             |    4 
 egs/aishell2/conformer/run.sh                                                                                                   |    2 
 funasr/runtime/python/libtorch/setup.py                                                                                         |    2 
 egs/alimeeting/diarization/sond/run.sh                                                                                          |    6 
 funasr/models/frontend/wav_frontend.py                                                                                          |   77 
 funasr/runtime/grpc/CMakeLists.txt                                                                                              |   83 +
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md                                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md                            |    3 
 funasr/models/predictor/cif.py                                                                                                  |   57 
 tests/test_sv_inference_pipeline.py                                                                                             |    1 
 funasr/bin/sv_inference.py                                                                                                      |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py                       |   31 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py              |    2 
 funasr/bin/asr_inference_launch.py                                                                                              |    3 
 128 files changed, 3628 insertions(+), 508 deletions(-)

diff --git a/README.md b/README.md
index 0d1079b..23f1abe 100644
--- a/README.md
+++ b/README.md
@@ -15,36 +15,10 @@
 | [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 | [**Contact**](#contact)
 
+
 ## What's new: 
 
-### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
-- We support a new feature: exporting paraformer models to [onnx and torchscript](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. Local finetuned models are also supported.
-- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python): you can deploy the runtime without modelscope or funasr. For the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, onnxruntime gives a 3x RTF speedup (0.110->0.038) on CPU, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
-- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc): you can build an ASR service with grpc by deploying the modelscope pipeline or onnxruntime.
-- We release a new model, [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports hotword customization based on incentive enhancement and improves the recall and precision of hotwords.
-- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary); the timestamp prediction accuracy is much improved, achieving an accumulated average shift (AAS) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
-- We release a new model, the [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
-- We release several new UniASR models: 
-[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
-[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), 
-[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), 
-[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), 
-[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
-- We release a new model, the [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model trained on AISHELL-2, which is used to initialize a paraformer model that is then finetuned on AISHELL-1.
-- We release a new feature: the `VAD`, `ASR` and `PUNC` models can be freely combined, whether they are models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) or local finetuned models. See the [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhancing recall and precision and fixing bad cases of missing punctuation marks.
-- Various new audio input types are now supported by the modelscope inference pipeline, including mp3, flac, ogg, opus...
-### 2023.1.16, funasr-0.1.6, modelscope-1.2.0
-- We release a new version model, [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrates the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
- [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamps together. The model can take inputs that are several hours long.
-- We release a new model, the [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
-- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which can predict punctuation for the results of ASR models. It can be freely integrated with any ASR model in the [Model Zoo](docs/modelscope_models.md).
-- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which can be finetuned on ASR and other downstream tasks.
-- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command word recognition.
-- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which can extract speaker embeddings and perform speaker verification on paired utterances. Speaker diarization support will be added in a future version.
-- We improve the modelscope pipeline to speed up inference by integrating model building into pipeline building.
-- Various new audio input types are now supported by the modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
+For release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases).
 
 ## Highlights
 - Many types of typical models are supported, e.g., [Transformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).
diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 41db45d..09ddab8 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index cada164..d033ce2 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 7ab8626..26222e6 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.cer_ctc.ave_10best.pth
+inference_asr_model=valid.cer_ctc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index 2b0f144..53b5f90 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 96310ab..2487eac 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 4c307b0..f66a338 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index bd6d81e..f9ea69a 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index 2b7d841..e1ea4fe 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index d0407d4..239a7e3 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index a5a14ec..6f2dd4d 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
index 28e3762..9e7a713 100755
--- a/egs/aishell2/transformerLM/run.sh
+++ b/egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@
 tag=exp1
 model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
 lm_exp=${exp_dir}/exp/${model_dir}
-inference_lm=valid.loss.ave.pth       # Language model path for decoding.
+inference_lm=valid.loss.ave.pb       # Language model path for decoding.
 
 stage=0
 stop_stage=3
diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
index 0988f5d..b4d534b 100644
--- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
+++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@
 
 def main():
     diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
-    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
+    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
     output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
     data_path_and_name_and_type = [
         ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh
index 7e9a7f7..19ae40c 100644
--- a/egs/alimeeting/diarization/sond/run.sh
+++ b/egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@
   echo "Downloading Pre-trained model..."
   git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
   git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
-  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
+  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
   cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
-  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
+  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
   echo "Done."
@@ -30,7 +30,7 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Calculating diarization results..."
-  python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
+  python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
   python local/convert_label_to_rttm.py \
     outputs/labels.txt \
     data/test_rmsil/raw_rmsil_map.scp \
diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py
index 84a4247..0f40ab2 100644
--- a/egs/alimeeting/diarization/sond/unit_test.py
+++ b/egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
 
 def test_fbank_cpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
 
 def test_fbank_gpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py
index 519ac56..a48eda1 100644
--- a/egs/callhome/diarization/sond/unit_test.py
+++ b/egs/callhome/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
 
 def test_fbank_cpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
 
 def test_fbank_gpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh
index 3b319f4..4516e9f 100755
--- a/egs/mars/sd/local_run.sh
+++ b/egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index 56c282c..b326067 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index e163999..2f038a8 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
index 9097e7a..16aeada 100644
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
index e714a3d..333b66a 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@
     params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./example_data/validation"
-    params["decoding_model_name"] = "valid.acc.ave.pth"
+    params["decoding_model_name"] = "valid.acc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
index dd3fb48..2fceb48 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
 from funasr.utils.compute_wer import compute_wer
 
 
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
     output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
-    gpu_id = (int(idx) - 1) // njob
+    if ngpu > 0:
+        use_gpu = 1
+        gpu_id = int(idx) - 1
+    else:
+        use_gpu = 0
+        gpu_id = -1
     if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
         gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
     inference_pipline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
+        model=model,
         output_dir=output_dir_job,
-        batch_size=64
+        batch_size=batch_size,
+        ngpu=use_gpu,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
     inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
     # prepare for multi-GPU decoding
     ngpu = params["ngpu"]
     njob = params["njob"]
+    batch_size = params["batch_size"]
     output_dir = params["output_dir"]
+    model = params["model"]
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.mkdir(output_dir)
     split_dir = os.path.join(output_dir, "split")
     os.mkdir(split_dir)
-    nj = ngpu * njob
+    if ngpu > 0:
+        nj = ngpu
+    elif ngpu == 0:
+        nj = njob
     wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
     with open(wav_scp_file) as f:
         lines = f.readlines()
@@ -56,7 +67,7 @@
     p = Pool(nj)
     for i in range(nj):
         p.apply_async(modelscope_infer_core,
-                      args=(output_dir, split_dir, njob, str(i + 1)))
+                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
     p.close()
     p.join()
 
@@ -81,8 +92,10 @@
 
 if __name__ == "__main__":
     params = {}
+    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
     params["data_dir"] = "./data/test"
     params["output_dir"] = "./results"
-    params["ngpu"] = 1
-    params["njob"] = 1
-    modelscope_infer(params)
+    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+    params["batch_size"] = 64
+    modelscope_infer(params)
\ No newline at end of file
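
Note on the recurring hunk above: each patched `infer.py` now sets `nj` to the number of GPUs when `ngpu > 0` and to `njob` CPU workers when `ngpu == 0`, with each 1-based job pinning its own `CUDA_VISIBLE_DEVICES`. A minimal standalone sketch of that dispatch logic follows; the names `run_job` and `dispatch` are illustrative, not part of the patch, and the CPU branch is simplified to hide CUDA entirely:

```python
import os
from multiprocessing import Pool


def run_job(idx, ngpu):
    # Mirrors the patched modelscope_infer_core device selection:
    # with ngpu > 0, 1-based job idx is pinned to GPU idx - 1
    # (remapped through any pre-set CUDA_VISIBLE_DEVICES);
    # with ngpu == 0, CUDA is hidden and decoding stays on CPU.
    if ngpu > 0:
        gpu_id = idx - 1
        visible = os.environ.get("CUDA_VISIBLE_DEVICES")
        if visible:
            gpu_id = visible.split(",")[gpu_id]
        os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    else:
        os.environ["CUDA_VISIBLE_DEVICES"] = "-1"
    return "job {} -> CUDA_VISIBLE_DEVICES={}".format(idx, os.environ["CUDA_VISIBLE_DEVICES"])


def dispatch(ngpu, njob):
    # nj follows the patch: one worker per GPU, or njob CPU workers when ngpu == 0.
    nj = ngpu if ngpu > 0 else njob
    with Pool(nj) as pool:
        return pool.starmap(run_job, [(i + 1, ngpu) for i in range(nj)])


if __name__ == "__main__":
    print(dispatch(ngpu=0, njob=2))
```

The same pattern repeats verbatim in the aishell2 and common-vocab8404 `infer.py` diffs below.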
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
index 6c34ed0..fafe565 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
 
 from funasr.utils.compute_wer import compute_wer
 
-
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
+
+    try:
+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+    except Exception as e:
+        raise RuntimeError("Please download the pretrained model from ModelScope first.") from e
+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
     decoding_path = os.path.join(params["output_dir"], "decode_results")
     if os.path.exists(decoding_path):
         shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
     # decoding
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
+        model=pretrained_model_path,
         output_dir=decoding_path,
-        batch_size=64
+        batch_size=params["batch_size"]
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
     inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
 if __name__ == '__main__':
     params = {}
     params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
-    modelscope_infer_after_finetune(params)
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+    params["batch_size"] = 64
+    modelscope_infer_after_finetune(params)
\ No newline at end of file
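
The patched `infer_after_finetune.py` scripts (this file and the aishell2/common variants below) replace the manual cache-path and `configuration.json` handling with ModelScope's `snapshot_download`, then drop the finetuned checkpoint over `model.pb`. A minimal sketch of that flow, with the hypothetical helper name `prepare_decoding_model`:

```python
import os
import shutil

from modelscope.hub.snapshot_download import snapshot_download


def prepare_decoding_model(model_name, output_dir, decoding_model_name):
    # Download (or reuse) the pretrained model files under output_dir, then
    # overwrite model.pb with the locally finetuned checkpoint, as the
    # patched modelscope_infer_after_finetune does.
    try:
        model_path = snapshot_download(model_name, cache_dir=output_dir)
    except Exception as e:
        raise RuntimeError("Please download the pretrained model from ModelScope first.") from e
    shutil.copy(os.path.join(output_dir, decoding_model_name),
                os.path.join(model_path, "model.pb"))
    return model_path
```

The returned `model_path` is what the patched scripts pass to `pipeline(model=...)` in place of the previous `output_dir`.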
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
index d616d3e..d70af72 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
 from funasr.utils.compute_wer import compute_wer
 
 
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
     output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
-    gpu_id = (int(idx) - 1) // njob
+    if ngpu > 0:
+        use_gpu = 1
+        gpu_id = int(idx) - 1
+    else:
+        use_gpu = 0
+        gpu_id = -1
     if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
         gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
     inference_pipline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
+        model=model,
         output_dir=output_dir_job,
-        batch_size=64
+        batch_size=batch_size,
+        ngpu=use_gpu,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
     inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
     # prepare for multi-GPU decoding
     ngpu = params["ngpu"]
     njob = params["njob"]
+    batch_size = params["batch_size"]
     output_dir = params["output_dir"]
+    model = params["model"]
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.mkdir(output_dir)
     split_dir = os.path.join(output_dir, "split")
     os.mkdir(split_dir)
-    nj = ngpu * njob
+    if ngpu > 0:
+        nj = ngpu
+    elif ngpu == 0:
+        nj = njob
     wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
     with open(wav_scp_file) as f:
         lines = f.readlines()
@@ -56,7 +67,7 @@
     p = Pool(nj)
     for i in range(nj):
         p.apply_async(modelscope_infer_core,
-                      args=(output_dir, split_dir, njob, str(i + 1)))
+                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
     p.close()
     p.join()
 
@@ -81,8 +92,10 @@
 
 if __name__ == "__main__":
     params = {}
+    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
     params["data_dir"] = "./data/test"
     params["output_dir"] = "./results"
-    params["ngpu"] = 1
-    params["njob"] = 1
-    modelscope_infer(params)
+    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+    params["batch_size"] = 64
+    modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
index 6140bb7..731cafe 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
 
 from funasr.utils.compute_wer import compute_wer
 
-
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
+
+    try:
+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+    except Exception as e:
+        raise RuntimeError("Please download the pretrained model from ModelScope first.") from e
+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
     decoding_path = os.path.join(params["output_dir"], "decode_results")
     if os.path.exists(decoding_path):
         shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
     # decoding
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
+        model=pretrained_model_path,
         output_dir=decoding_path,
-        batch_size=64
+        batch_size=params["batch_size"]
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
     inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
 if __name__ == '__main__':
     params = {}
     params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
-    modelscope_infer_after_finetune(params)
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+    params["batch_size"] = 64
+    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index dfd509d..a044361 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -22,10 +22,12 @@
 Or you can use the finetuned model for inference directly.
 
 - Setting parameters in `infer.py`
+    - <strong>model:</strong> # model name on ModelScope
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
     - <strong>output_dir:</strong> # result dir
-    - <strong>ngpu:</strong> # the number of GPUs for decoding
-    - <strong>njob:</strong> # the number of jobs for each GPU
+    - <strong>ngpu:</strong> # the number of GPUs for decoding; if `ngpu` > 0, GPU decoding is used
+    - <strong>njob:</strong> # the number of parallel jobs for CPU decoding, used when `ngpu` = 0
+    - <strong>batch_size:</strong> # batch size for inference
 
 - Then you can run the pipeline to infer with:
 ```python
@@ -39,9 +41,11 @@
 ### Inference using local finetuned model
 
 - Modify inference related parameters in `infer_after_finetune.py`
+    - <strong>modelscope_model_name:</strong> # model name on ModelScope
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+    - <strong>batch_size:</strong> # batch size for inference
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index f9f6114..795a1e7 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
 from funasr.utils.compute_wer import compute_wer
 
 
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
     output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
-    gpu_id = (int(idx) - 1) // njob
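+    # map each decoding job to a device: when ngpu > 0, the 1-based job idx is
+    # pinned to GPU idx - 1; gpu_id = -1 marks a CPU-only job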
+    if ngpu > 0:
+        use_gpu = 1
+        gpu_id = int(idx) - 1
+    else:
+        use_gpu = 0
+        gpu_id = -1
     if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
         gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
     inference_pipline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+        model=model,
         output_dir=output_dir_job,
-        batch_size=64
+        batch_size=batch_size,
+        ngpu=use_gpu,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
     inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
     # prepare for multi-GPU decoding
     ngpu = params["ngpu"]
     njob = params["njob"]
+    batch_size = params["batch_size"]
     output_dir = params["output_dir"]
+    model = params["model"]
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.mkdir(output_dir)
     split_dir = os.path.join(output_dir, "split")
     os.mkdir(split_dir)
-    nj = ngpu * njob
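+    # nj = total number of parallel decoding jobs: one per GPU, or njob CPU workers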
+    if ngpu > 0:
+        nj = ngpu
+    else:
+        nj = njob
     wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
     with open(wav_scp_file) as f:
         lines = f.readlines()
@@ -56,7 +67,7 @@
     p = Pool(nj)
     for i in range(nj):
         p.apply_async(modelscope_infer_core,
-                      args=(output_dir, split_dir, njob, str(i + 1)))
+                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
     p.close()
     p.join()
 
@@ -81,8 +92,10 @@
 
 if __name__ == "__main__":
     params = {}
+    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
     params["data_dir"] = "./data/test"
     params["output_dir"] = "./results"
-    params["ngpu"] = 1
-    params["njob"] = 1
-    modelscope_infer(params)
+    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+    params["batch_size"] = 64
+    modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 94393ec..295c95d 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
 
 from funasr.utils.compute_wer import compute_wer
 
-
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
+    try:
+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+    except BaseException:
+        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
     decoding_path = os.path.join(params["output_dir"], "decode_results")
     if os.path.exists(decoding_path):
         shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
     # decoding
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
+        model=pretrained_model_path,
         output_dir=decoding_path,
-        batch_size=64
+        batch_size=params["batch_size"]
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
     inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
 if __name__ == '__main__':
     params = {}
     params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
-    modelscope_infer_after_finetune(params)
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+    params["batch_size"] = 64
+    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
new file mode 100644
index 0000000..c1c541b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -0,0 +1,57 @@
+import torch
+import torchaudio
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.2')
+
+waveform, sample_rate = torchaudio.load("waihu.wav")
+speech_length = waveform.shape[1]
+speech = waveform[0]
+
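+# streaming cache, as consumed by the chunk loop below: audio is processed in
+# units of 960 samples (60 ms at 16 kHz); "stride" is the central chunk size and
+# "pad_left"/"pad_right" the left/right context, all counted in these units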
+cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+cache_de = {"decode_fsmn": None}
+cache = {"encoder": cache_en, "decoder": cache_de}
+param_dict = {}
+param_dict["cache"] = cache
+
+first_chunk = True
+speech_buffer = speech
+speech_cache = []
+final_result = ""
+
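+# chunked decoding: the first chunk reads 14400 samples (stride 10 + pad_right 5
+# units of 960) and advances by 4800; later chunks read up to 19200 samples
+# (pad_left 5 + stride 10 + pad_right 5) and advance by 9600 (the 600 ms stride)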
+while len(speech_buffer) >= 960:
+    if first_chunk:
+        if len(speech_buffer) >= 14400:
+            rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict)
+            speech_buffer = speech_buffer[4800:]
+        else:
+            cache_en["stride"] = len(speech_buffer) // 960
+            cache_en["pad_right"] = 0
+            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
+            speech_buffer = []
+        cache_en["start_idx"] = -5
+        first_chunk = False
+    else:
+        cache_en["start_idx"] += 10
+        if len(speech_buffer) >= 4800:
+            cache_en["pad_left"] = 5
+            rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict)
+            speech_buffer = speech_buffer[9600:]
+        else:
+            cache_en["stride"] = len(speech_buffer) // 960 
+            cache_en["pad_right"] = 0
+            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
+            speech_buffer = []
+    if len(rec_result) != 0 and rec_result['text'] != "sil":
+        final_result += rec_result['text']
+    print(rec_result)
+print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
index f08b31f..0b508fb 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -8,9 +8,14 @@
 from funasr.utils.compute_wer import compute_wer
 
 
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
     output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
-    gpu_id = (int(idx) - 1) // njob
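+    # map each decoding job to a device: when ngpu > 0, the 1-based job idx is
+    # pinned to GPU idx - 1; gpu_id = -1 marks a CPU-only job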
+    if ngpu > 0:
+        use_gpu = 1
+        gpu_id = int(idx) - 1
+    else:
+        use_gpu = 0
+        gpu_id = -1
     if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
         gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
     inference_pipline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
+        model=model,
         output_dir=output_dir_job,
-        batch_size=64
+        batch_size=batch_size,
+        ngpu=use_gpu,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
     inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
     # prepare for multi-GPU decoding
     ngpu = params["ngpu"]
     njob = params["njob"]
+    batch_size = params["batch_size"]
     output_dir = params["output_dir"]
+    model = params["model"]
     if os.path.exists(output_dir):
         shutil.rmtree(output_dir)
     os.mkdir(output_dir)
     split_dir = os.path.join(output_dir, "split")
     os.mkdir(split_dir)
-    nj = ngpu * njob
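+    # nj = total number of parallel decoding jobs: one per GPU, or njob CPU workers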
+    if ngpu > 0:
+        nj = ngpu
+    else:
+        nj = njob
     wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
     with open(wav_scp_file) as f:
         lines = f.readlines()
@@ -56,7 +67,7 @@
     p = Pool(nj)
     for i in range(nj):
         p.apply_async(modelscope_infer_core,
-                      args=(output_dir, split_dir, njob, str(i + 1)))
+                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
     p.close()
     p.join()
 
@@ -81,8 +92,10 @@
 
 if __name__ == "__main__":
     params = {}
+    params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
     params["data_dir"] = "./data/test"
     params["output_dir"] = "./results"
-    params["ngpu"] = 1
-    params["njob"] = 1
-    modelscope_infer(params)
+    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+    params["batch_size"] = 64
+    modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index 96102cc..e8fee02 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -4,23 +4,18 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
 
 from funasr.utils.compute_wer import compute_wer
 
-
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
+    try:
+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+    except BaseException:
+        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
     decoding_path = os.path.join(params["output_dir"], "decode_results")
     if os.path.exists(decoding_path):
         shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
     # decoding
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
+        model=pretrained_model_path,
         output_dir=decoding_path,
-        batch_size=64
+        batch_size=params["batch_size"]
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
     inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
 if __name__ == '__main__':
     params = {}
     params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
-    modelscope_infer_after_finetune(params)
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+    params["batch_size"] = 64
+    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
index d91a40a..6593f4e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
index f9fb0db..f067c81 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
new file mode 100644
index 0000000..56fb583
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
new file mode 100644
index 0000000..c54ab8c
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
+        output_dir=output_dir,
+    )
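+    # "decoding_model" chooses the decoding pass; "offline" presumably runs the
+    # offline (second-pass) decoder of the 2-pass UniASR model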
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 030c2e2..d4df29e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
new file mode 100644
index 0000000..8bbce60
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
new file mode 100644
index 0000000..cfd869f
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
+        output_dir=output_dir,
+    )
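+    # "decoding_model" chooses the decoding pass; "offline" presumably runs the
+    # offline (second-pass) decoder of the 2-pass UniASR model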
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
new file mode 100644
index 0000000..5e313e5
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
new file mode 100644
index 0000000..e8c5524
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
+        output_dir=output_dir,
+    )
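+    # "decoding_model" chooses the decoding pass; "offline" presumably runs the
+    # offline (second-pass) decoder of the 2-pass UniASR model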
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 3b39a16..861fefb 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
index dd947d3..eff933e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,8 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index 4860cf7..d73cae2 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 1094bb5..94144ef 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to infer with:
 ```python
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 5f171b4..473019c 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,27 +4,17 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
 
 from funasr.utils.compute_wer import compute_wer
 
-
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
-    if not os.path.exists(os.path.join(params["output_dir"], "punc")):
-        os.makedirs(os.path.join(params["output_dir"], "punc"))
-    if not os.path.exists(os.path.join(params["output_dir"], "vad")):
-        os.makedirs(os.path.join(params["output_dir"], "vad"))
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
+    try:
+        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+    except BaseException:
+        raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
     decoding_path = os.path.join(params["output_dir"], "decode_results")
     if os.path.exists(decoding_path):
         shutil.rmtree(decoding_path)
@@ -33,16 +23,16 @@
     # decoding
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
+        model=pretrained_model_path,
         output_dir=decoding_path,
-        batch_size=64
+        batch_size=params["batch_size"]
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
     inference_pipeline(audio_in=audio_in)
 
     # compute CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
-    if text_in is not None:
+    if os.path.exists(text_in):
         text_proc_file = os.path.join(decoding_path, "1best_recog/token")
         compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
 
@@ -50,8 +40,8 @@
 if __name__ == '__main__':
     params = {}
     params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
-    modelscope_infer_after_finetune(params)
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+    params["batch_size"] = 64
+    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
index 540e3cf..2bac220 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
@@ -4,6 +4,11 @@
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
 
 inference_pipeline = pipeline(
     task=Tasks.punctuation,
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
new file mode 100644
index 0000000..81cb2c6
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -0,0 +1,10 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+    task=Tasks.speaker_diarization,
+    model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+    model_revision="v1.0.0",
+)
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
+print(results)
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
index 3cb31cf..5f4563d 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -14,13 +14,12 @@
 )
 
 # Take audio_list as input: the first audio is the speech to be detected, and the following audios are the voiceprint enrollment utterances of different speakers
-audio_list = [[
+audio_list = [
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
-]]
+]
 
 results = inference_diar_pipline(audio_in=audio_list)
-for rst in results:
-    print(rst["value"])
+print(results)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
index 66b8161..d70ed25 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -1,7 +1,10 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
 import soundfile
-
 
 if __name__ == '__main__':
     output_dir = None
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
index abf4ef5..fb56908 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -1,7 +1,10 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
 import soundfile
-
 
 if __name__ == '__main__':
     output_dir = None
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 318d3d7..f3b4d56 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index b9be3e2..53eee64 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -256,6 +256,9 @@
     elif mode == "paraformer":
         from funasr.bin.asr_inference_paraformer import inference_modelscope
         return inference_modelscope(**kwargs)
+    elif mode == "paraformer_streaming":
+        from funasr.bin.asr_inference_paraformer_streaming import inference_modelscope
+        return inference_modelscope(**kwargs)
     elif mode == "paraformer_vad":
         from funasr.bin.asr_inference_paraformer_vad import inference_modelscope
         return inference_modelscope(**kwargs)
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 4176ba6..6f3dbb1 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -41,8 +41,6 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 import pdb
 
-header_colors = '\033[95m'
-end_colors = '\033[0m'
 
 global_asr_language: str = 'zh-cn'
 global_sample_rate: Union[int, Dict[Any, int]] = {
@@ -55,7 +53,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 6413d92..e45e575 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -50,7 +50,7 @@
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
new file mode 100644
index 0000000..9b572a0
--- /dev/null
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -0,0 +1,907 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import sys
+import time
+import copy
+import os
+import codecs
+import tempfile
+import requests
+from pathlib import Path
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+from typing import Any
+from typing import List
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
+from funasr.modules.beam_search.beam_search import Hypothesis
+from funasr.modules.scorers.ctc import CTCPrefixScorer
+from funasr.modules.scorers.length_bonus import LengthBonus
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+from funasr.tasks.lm import LMTask
+from funasr.text.build_tokenizer import build_tokenizer
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+
+class Speech2Text:
+    """Speech2Text class
+
+    Examples:
+            >>> import soundfile
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
+            >>> audio, rate = soundfile.read("speech.wav")
+            >>> speech2text(audio)
+            [(text, token, token_int, hypothesis object), ...]
+
+    """
+
+    def __init__(
+            self,
+            asr_train_config: Union[Path, str] = None,
+            asr_model_file: Union[Path, str] = None,
+            cmvn_file: Union[Path, str] = None,
+            lm_train_config: Union[Path, str] = None,
+            lm_file: Union[Path, str] = None,
+            token_type: str = None,
+            bpemodel: str = None,
+            device: str = "cpu",
+            maxlenratio: float = 0.0,
+            minlenratio: float = 0.0,
+            dtype: str = "float32",
+            beam_size: int = 20,
+            ctc_weight: float = 0.5,
+            lm_weight: float = 1.0,
+            ngram_weight: float = 0.9,
+            penalty: float = 0.0,
+            nbest: int = 1,
+            frontend_conf: dict = None,
+            hotword_list_or_file: str = None,
+            **kwargs,
+    ):
+        assert check_argument_types()
+
+        # 1. Build ASR model
+        scorers = {}
+        asr_model, asr_train_args = ASRTask.build_model_from_file(
+            asr_train_config, asr_model_file, cmvn_file, device
+        )
+        frontend = None
+        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+        logging.info("asr_model: {}".format(asr_model))
+        logging.info("asr_train_args: {}".format(asr_train_args))
+        asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+        if asr_model.ctc is not None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
+        token_list = asr_model.token_list
+        scorers.update(
+            length_bonus=LengthBonus(len(token_list)),
+        )
+
+        # 2. Build Language model
+        if lm_train_config is not None:
+            lm, lm_train_args = LMTask.build_model_from_file(
+                lm_train_config, lm_file, device
+            )
+            scorers["lm"] = lm.lm
+
+        # 3. Build ngram model
+        # ngram is not supported now
+        ngram = None
+        scorers["ngram"] = ngram
+
+        # 4. Build BeamSearch object
+        # transducer is not supported now
+        beam_search_transducer = None
+
+        weights = dict(
+            decoder=1.0 - ctc_weight,
+            ctc=ctc_weight,
+            lm=lm_weight,
+            ngram=ngram_weight,
+            length_bonus=penalty,
+        )
+        beam_search = BeamSearch(
+            beam_size=beam_size,
+            weights=weights,
+            scorers=scorers,
+            sos=asr_model.sos,
+            eos=asr_model.eos,
+            vocab_size=len(token_list),
+            token_list=token_list,
+            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
+        )
+
+        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
+        for scorer in scorers.values():
+            if isinstance(scorer, torch.nn.Module):
+                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
+
+        logging.info(f"Decoding device={device}, dtype={dtype}")
+
+        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+        if token_type is None:
+            token_type = asr_train_args.token_type
+        if bpemodel is None:
+            bpemodel = asr_train_args.bpemodel
+
+        if token_type is None:
+            tokenizer = None
+        elif token_type == "bpe":
+            if bpemodel is not None:
+                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+            else:
+                tokenizer = None
+        else:
+            tokenizer = build_tokenizer(token_type=token_type)
+        converter = TokenIDConverter(token_list=token_list)
+        logging.info(f"Text tokenizer: {tokenizer}")
+
+        self.asr_model = asr_model
+        self.asr_train_args = asr_train_args
+        self.converter = converter
+        self.tokenizer = tokenizer
+
+        # 6. [Optional] Build hotword list from str, local file or url
+
+        is_use_lm = lm_weight != 0.0 and lm_file is not None
+        if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
+            beam_search = None
+        self.beam_search = beam_search
+        logging.info(f"Beam_search: {self.beam_search}")
+        self.beam_search_transducer = beam_search_transducer
+        self.maxlenratio = maxlenratio
+        self.minlenratio = minlenratio
+        self.device = device
+        self.dtype = dtype
+        self.nbest = nbest
+        self.frontend = frontend
+        self.encoder_downsampling_factor = 1
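+        # conv2d input layers (and the data2vec encoder) subsample frames by 4;
+        # used in __call__ to report encoder lengths in input-frame units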
+        if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf.get("input_layer") == "conv2d":
+            self.encoder_downsampling_factor = 4
+
+    @torch.no_grad()
+    def __call__(
+            self, cache: dict, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+            begin_time: int = 0, end_time: int = None,
+    ):
+        """Inference
+
+        Args:
+                speech: Input speech data
+        Returns:
+                text, token, token_int, hyp
+
+        """
+        assert check_argument_types()
+
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+
+        if self.frontend is not None:
+            feats, feats_len = self.frontend.forward(speech, speech_lengths)
+            feats = to_device(feats, device=self.device)
+            feats_len = feats_len.int()
+            self.asr_model.frontend = None
+        else:
+            feats = speech
+            feats_len = speech_lengths
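+        # infer the low-frame-rate (LFR) stacking factor from the feature
+        # dimension (stacked multiples of 80-dim fbank)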
+        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+        batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
+
+        # a. To device
+        batch = to_device(batch, device=self.device)
+
+        # b. Forward Encoder
+        enc, enc_len = self.asr_model.encode_chunk(**batch)
+        if isinstance(enc, tuple):
+            enc = enc[0]
+        # assert len(enc) == 1, len(enc)
+        enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
+
+        predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
+        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
+                                                                        predictor_outs[2], predictor_outs[3]
+        pre_token_length = pre_token_length.floor().long()
+        if torch.max(pre_token_length) < 1:
+            return []
+        decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
+        decoder_out = decoder_outs
+
+        results = []
+        b, n, d = decoder_out.size()
+        for i in range(b):
+            x = enc[i, :enc_len[i], :]
+            am_scores = decoder_out[i, :pre_token_length[i], :]
+            if self.beam_search is not None:
+                nbest_hyps = self.beam_search(
+                    x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+                )
+
+                nbest_hyps = nbest_hyps[: self.nbest]
+            else:
+                yseq = am_scores.argmax(dim=-1)
+                score = am_scores.max(dim=-1)[0]
+                score = torch.sum(score, dim=-1)
+                # pad with mask tokens to ensure compatibility with sos/eos tokens
+                yseq = torch.tensor(
+                    [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
+                )
+                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+            for hyp in nbest_hyps:
+                assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+                # remove sos/eos and get results
+                last_pos = -1
+                if isinstance(hyp.yseq, list):
+                    token_int = hyp.yseq[1:last_pos]
+                else:
+                    token_int = hyp.yseq[1:last_pos].tolist()
+
+                # remove blank symbol id, which is assumed to be 0
+                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+                # Change integer-ids to tokens
+                token = self.converter.ids2tokens(token_int)
+
+                if self.tokenizer is not None:
+                    text = self.tokenizer.tokens2text(token)
+                else:
+                    text = None
+
+                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+        # assert check_return_type(results)
+        return results
+
+
+class Speech2TextExport:
+    """Speech2TextExport class
+
+    """
+
+    def __init__(
+            self,
+            asr_train_config: Union[Path, str] = None,
+            asr_model_file: Union[Path, str] = None,
+            cmvn_file: Union[Path, str] = None,
+            lm_train_config: Union[Path, str] = None,
+            lm_file: Union[Path, str] = None,
+            token_type: str = None,
+            bpemodel: str = None,
+            device: str = "cpu",
+            maxlenratio: float = 0.0,
+            minlenratio: float = 0.0,
+            dtype: str = "float32",
+            beam_size: int = 20,
+            ctc_weight: float = 0.5,
+            lm_weight: float = 1.0,
+            ngram_weight: float = 0.9,
+            penalty: float = 0.0,
+            nbest: int = 1,
+            frontend_conf: dict = None,
+            hotword_list_or_file: str = None,
+            **kwargs,
+    ):
+
+        # 1. Build ASR model
+        asr_model, asr_train_args = ASRTask.build_model_from_file(
+            asr_train_config, asr_model_file, cmvn_file, device
+        )
+        frontend = None
+        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+        logging.info("asr_model: {}".format(asr_model))
+        logging.info("asr_train_args: {}".format(asr_train_args))
+        asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+        token_list = asr_model.token_list
+
+        logging.info(f"Decoding device={device}, dtype={dtype}")
+
+        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+        if token_type is None:
+            token_type = asr_train_args.token_type
+        if bpemodel is None:
+            bpemodel = asr_train_args.bpemodel
+
+        if token_type is None:
+            tokenizer = None
+        elif token_type == "bpe":
+            if bpemodel is not None:
+                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+            else:
+                tokenizer = None
+        else:
+            tokenizer = build_tokenizer(token_type=token_type)
+        converter = TokenIDConverter(token_list=token_list)
+        logging.info(f"Text tokenizer: {tokenizer}")
+
+        # self.asr_model = asr_model
+        self.asr_train_args = asr_train_args
+        self.converter = converter
+        self.tokenizer = tokenizer
+
+        self.device = device
+        self.dtype = dtype
+        self.nbest = nbest
+        self.frontend = frontend
+
+        model = Paraformer_export(asr_model, onnx=False)
+        self.asr_model = model
+
+    @torch.no_grad()
+    def __call__(
+            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
+    ):
+        """Inference
+
+        Args:
+                speech: Input speech data
+        Returns:
+                text, token, token_int, hyp
+
+        """
+        assert check_argument_types()
+
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+
+        if self.frontend is not None:
+            feats, feats_len = self.frontend.forward(speech, speech_lengths)
+            feats = to_device(feats, device=self.device)
+            feats_len = feats_len.int()
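+            # features are extracted here, so drop the wrapped model's own
+            # frontend to avoid extracting them twice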
+            self.asr_model.frontend = None
+        else:
+            feats = speech
+            feats_len = speech_lengths
+
+        enc_len_batch_total = feats_len.sum()
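+        # heuristic LFR factor from the feature dimension: e.g. 7-frame LFR of
+        # 80-dim fbank gives 560 dims, hence 560 // 80 - 1 = 6; used only for
+        # the RTF statistics below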
+        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+        batch = {"speech": feats, "speech_lengths": feats_len}
+
+        # a. To device
+        batch = to_device(batch, device=self.device)
+
+        decoder_outs = self.asr_model(**batch)
+        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+
+        results = []
+        b, n, d = decoder_out.size()
+        for i in range(b):
+            am_scores = decoder_out[i, :ys_pad_lens[i], :]
+
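+            # greedy search: take the argmax token at each frame and sum the
+            # per-frame best scores as the hypothesis score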
+            yseq = am_scores.argmax(dim=-1)
+            score = am_scores.max(dim=-1)[0]
+            score = torch.sum(score, dim=-1)
+            # copy yseq into a fresh tensor; no sos/eos padding is applied,
+            # the greedy token ids are used as-is
+            yseq = torch.tensor(yseq.tolist(), device=yseq.device)
+            nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+            for hyp in nbest_hyps:
+                assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+                # remove sos/eos and get results
+                last_pos = -1
+                if isinstance(hyp.yseq, list):
+                    token_int = hyp.yseq[1:last_pos]
+                else:
+                    token_int = hyp.yseq[1:last_pos].tolist()
+
+                # remove blank symbol id, which is assumed to be 0
+                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+                # Change integer-ids to tokens
+                token = self.converter.ids2tokens(token_int)
+
+                if self.tokenizer is not None:
+                    text = self.tokenizer.tokens2text(token)
+                else:
+                    text = None
+
+                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+        return results
+
+
+def inference(
+        maxlenratio: float,
+        minlenratio: float,
+        batch_size: int,
+        beam_size: int,
+        ngpu: int,
+        ctc_weight: float,
+        lm_weight: float,
+        penalty: float,
+        log_level: Union[int, str],
+        data_path_and_name_and_type,
+        asr_train_config: Optional[str],
+        asr_model_file: Optional[str],
+        cmvn_file: Optional[str] = None,
+        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+        lm_train_config: Optional[str] = None,
+        lm_file: Optional[str] = None,
+        token_type: Optional[str] = None,
+        key_file: Optional[str] = None,
+        word_lm_train_config: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        allow_variable_data_keys: bool = False,
+        streaming: bool = False,
+        output_dir: Optional[str] = None,
+        dtype: str = "float32",
+        seed: int = 0,
+        ngram_weight: float = 0.9,
+        nbest: int = 1,
+        num_workers: int = 1,
+        **kwargs,
+):
+    inference_pipeline = inference_modelscope(
+        maxlenratio=maxlenratio,
+        minlenratio=minlenratio,
+        batch_size=batch_size,
+        beam_size=beam_size,
+        ngpu=ngpu,
+        ctc_weight=ctc_weight,
+        lm_weight=lm_weight,
+        penalty=penalty,
+        log_level=log_level,
+        asr_train_config=asr_train_config,
+        asr_model_file=asr_model_file,
+        cmvn_file=cmvn_file,
+        raw_inputs=raw_inputs,
+        lm_train_config=lm_train_config,
+        lm_file=lm_file,
+        token_type=token_type,
+        key_file=key_file,
+        word_lm_train_config=word_lm_train_config,
+        bpemodel=bpemodel,
+        allow_variable_data_keys=allow_variable_data_keys,
+        streaming=streaming,
+        output_dir=output_dir,
+        dtype=dtype,
+        seed=seed,
+        ngram_weight=ngram_weight,
+        nbest=nbest,
+        num_workers=num_workers,
+        **kwargs,
+    )
+    return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+def inference_modelscope(
+        maxlenratio: float,
+        minlenratio: float,
+        batch_size: int,
+        beam_size: int,
+        ngpu: int,
+        ctc_weight: float,
+        lm_weight: float,
+        penalty: float,
+        log_level: Union[int, str],
+        # data_path_and_name_and_type,
+        asr_train_config: Optional[str],
+        asr_model_file: Optional[str],
+        cmvn_file: Optional[str] = None,
+        lm_train_config: Optional[str] = None,
+        lm_file: Optional[str] = None,
+        token_type: Optional[str] = None,
+        key_file: Optional[str] = None,
+        word_lm_train_config: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        allow_variable_data_keys: bool = False,
+        dtype: str = "float32",
+        seed: int = 0,
+        ngram_weight: float = 0.9,
+        nbest: int = 1,
+        num_workers: int = 1,
+        output_dir: Optional[str] = None,
+        param_dict: dict = None,
+        **kwargs,
+):
+    assert check_argument_types()
+
+    if word_lm_train_config is not None:
+        raise NotImplementedError("Word LM is not implemented")
+    if ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+
+    export_mode = False
+    if param_dict is not None:
+        hotword_list_or_file = param_dict.get('hotword')
+        export_mode = param_dict.get("export_mode", False)
+    else:
+        hotword_list_or_file = None
+
+    if ngpu >= 1 and torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+        batch_size = 1
+
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+
+    # 2. Build speech2text
+    speech2text_kwargs = dict(
+        asr_train_config=asr_train_config,
+        asr_model_file=asr_model_file,
+        cmvn_file=cmvn_file,
+        lm_train_config=lm_train_config,
+        lm_file=lm_file,
+        token_type=token_type,
+        bpemodel=bpemodel,
+        device=device,
+        maxlenratio=maxlenratio,
+        minlenratio=minlenratio,
+        dtype=dtype,
+        beam_size=beam_size,
+        ctc_weight=ctc_weight,
+        lm_weight=lm_weight,
+        ngram_weight=ngram_weight,
+        penalty=penalty,
+        nbest=nbest,
+        hotword_list_or_file=hotword_list_or_file,
+    )
+    if export_mode:
+        speech2text = Speech2TextExport(**speech2text_kwargs)
+    else:
+        speech2text = Speech2Text(**speech2text_kwargs)
+
+    def _forward(
+            data_path_and_name_and_type,
+            raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+            output_dir_v2: Optional[str] = None,
+            fs: dict = None,
+            param_dict: dict = None,
+            **kwargs,
+    ):
+
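+        # hotwords may be passed via param_dict or kwargs; kwargs takes precedence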
+        hotword_list_or_file = None
+        if param_dict is not None:
+            hotword_list_or_file = param_dict.get('hotword')
+        if 'hotword' in kwargs:
+            hotword_list_or_file = kwargs['hotword']
+        if hotword_list_or_file is not None or 'hotword' in kwargs:
+            speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
+
+        # 3. Build data-iterator
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, torch.Tensor):
+                raw_inputs = raw_inputs.numpy()
+            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        loader = ASRTask.build_streaming_iterator(
+            data_path_and_name_and_type,
+            dtype=dtype,
+            fs=fs,
+            batch_size=batch_size,
+            key_file=key_file,
+            num_workers=num_workers,
+            preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+            collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+            allow_variable_data_keys=allow_variable_data_keys,
+            inference=True,
+        )
+
+        if param_dict is not None:
+            use_timestamp = param_dict.get('use_timestamp', True)
+        else:
+            use_timestamp = True
+
+        forward_time_total = 0.0
+        length_total = 0.0
+        finish_count = 0
+        file_count = 1
+        cache = None
+        # 4. Start the decoding loop
+        # FIXME(kamo): The output format should be discussed
+        asr_result_list = []
+        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        if output_path is not None:
+            writer = DatadirWriter(output_path)
+        else:
+            writer = None
+        if param_dict is not None and "cache" in param_dict:
+            cache = param_dict["cache"]
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+            # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
+            logging.info("decoding, utt_id: {}".format(keys))
+            # N-best list of (text, token, token_int, hyp_object)
+
+            time_beg = time.time()
+            results = speech2text(cache=cache, **batch)
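+            # fall back to a silence placeholder when decoding yields nothing;
+            # the dummy feature length (10) and lfr_factor (6) keep the RTF
+            # statistics below well-defined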
+            if len(results) < 1:
+                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
+            time_end = time.time()
+            forward_time = time_end - time_beg
+            lfr_factor = results[0][-1]
+            length = results[0][-2]
+            forward_time_total += forward_time
+            length_total += length
+            rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(
+                length, forward_time, 100 * forward_time / (length * lfr_factor))
+            logging.info(rtf_cur)
+
+            for batch_id in range(_bs):
+                nbest_results = [results[batch_id][:-2]]
+
+                key = keys[batch_id]
+                for n, result in zip(range(1, nbest + 1), nbest_results):
+                    text, token, token_int, hyp = result[0], result[1], result[2], result[3]
+                    time_stamp = None if len(result) < 5 else result[4]
+                    # Create a directory: outdir/{n}best_recog
+                    if writer is not None:
+                        ibest_writer = writer[f"{n}best_recog"]
+
+                        # Write the result to each file
+                        ibest_writer["token"][key] = " ".join(token)
+                        # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+                        ibest_writer["score"][key] = str(hyp.score)
+                        ibest_writer["rtf"][key] = rtf_cur
+
+                    if text is not None:
+                        if use_timestamp and time_stamp is not None:
+                            postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+                        else:
+                            postprocessed_result = postprocess_utils.sentence_postprocess(token)
+                        time_stamp_postprocessed = ""
+                        if len(postprocessed_result) == 3:
+                            text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result
+                        else:
+                            text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
+                        item = {'key': key, 'value': text_postprocessed}
+                        if time_stamp_postprocessed != "":
+                            item['time_stamp'] = time_stamp_postprocessed
+                        asr_result_list.append(item)
+                        finish_count += 1
+                        # asr_utils.print_progress(finish_count / file_count)
+                        if writer is not None:
+                            ibest_writer["text"][key] = text_postprocessed
+
+                    logging.info("decoding, utt: {}, predictions: {}".format(key, text))
+        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(
+            length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor))
+        logging.info(rtf_avg)
+        if writer is not None:
+            ibest_writer["rtf"]["rtf_avg"] = rtf_avg
+        return asr_result_list
+
+    return _forward
+
+
+def get_parser():
+    parser = config_argparse.ArgumentParser(
+        description="ASR Decoding",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Note(kamo): Use '_' instead of '-' as separator.
+    # '-' is confusing if written in yaml.
+    parser.add_argument(
+        "--log_level",
+        type=lambda x: x.upper(),
+        default="INFO",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+        help="The verbose level of logging",
+    )
+
+    parser.add_argument("--output_dir", type=str, required=True)
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=0,
+        help="The number of gpus. 0 indicates CPU mode",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed")
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float16", "float32", "float64"],
+        help="Data type",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=1,
+        help="The number of workers used for DataLoader",
+    )
+    parser.add_argument(
+        "--hotword",
+        type=str_or_none,
+        default=None,
+        help="hotword file path or hotwords seperated by space"
+    )
+    group = parser.add_argument_group("Input data related")
+    group.add_argument(
+        "--data_path_and_name_and_type",
+        type=str2triple_str,
+        required=False,
+        action="append",
+    )
+    group.add_argument("--key_file", type=str_or_none)
+    group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+    group = parser.add_argument_group("The model configuration related")
+    group.add_argument(
+        "--asr_train_config",
+        type=str,
+        help="ASR training configuration",
+    )
+    group.add_argument(
+        "--asr_model_file",
+        type=str,
+        help="ASR model parameter file",
+    )
+    group.add_argument(
+        "--cmvn_file",
+        type=str,
+        help="Global cmvn file",
+    )
+    group.add_argument(
+        "--lm_train_config",
+        type=str,
+        help="LM training configuration",
+    )
+    group.add_argument(
+        "--lm_file",
+        type=str,
+        help="LM parameter file",
+    )
+    group.add_argument(
+        "--word_lm_train_config",
+        type=str,
+        help="Word LM training configuration",
+    )
+    group.add_argument(
+        "--word_lm_file",
+        type=str,
+        help="Word LM parameter file",
+    )
+    group.add_argument(
+        "--ngram_file",
+        type=str,
+        help="N-gram parameter file",
+    )
+    group.add_argument(
+        "--model_tag",
+        type=str,
+        help="Pretrained model tag. If specify this option, *_train_config and "
+             "*_file will be overwritten",
+    )
+
+    group = parser.add_argument_group("Beam-search related")
+    group.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="The batch size for inference",
+    )
+    group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    group.add_argument("--beam_size", type=int, default=20, help="Beam size")
+    group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+    group.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain max output length. "
+             "If maxlenratio=0.0 (default), it uses a end-detect "
+             "function "
+             "to automatically find maximum hypothesis lengths."
+             "If maxlenratio<0.0, its absolute value is interpreted"
+             "as a constant max output length",
+    )
+    group.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    group.add_argument(
+        "--ctc_weight",
+        type=float,
+        default=0.5,
+        help="CTC weight in joint decoding",
+    )
+    group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
+    group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
+    group.add_argument("--streaming", type=str2bool, default=False)
+
+    group.add_argument(
+        "--frontend_conf",
+        default=None,
+        help="",
+    )
+    group.add_argument("--raw_inputs", type=list, default=None)
+    # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+
+    group = parser.add_argument_group("Text converter related")
+    group.add_argument(
+        "--token_type",
+        type=str_or_none,
+        default=None,
+        choices=["char", "bpe", None],
+        help="The token type for ASR model. "
+             "If not given, refers from the training args",
+    )
+    group.add_argument(
+        "--bpemodel",
+        type=str_or_none,
+        default=None,
+        help="The model path of sentencepiece. "
+             "If not given, refers from the training args",
+    )
+
+    return parser
+
+
+def main(cmd=None):
+    print(get_commandline_args(), file=sys.stderr)
+    parser = get_parser()
+    args = parser.parse_args(cmd)
+    param_dict = {'hotword': args.hotword}
+    kwargs = vars(args)
+    kwargs.pop("config", None)
+    kwargs['param_dict'] = param_dict
+    inference(**kwargs)
+
+
+if __name__ == "__main__":
+    main()
+
+    # from modelscope.pipelines import pipeline
+    # from modelscope.utils.constant import Tasks
+    #
+    # inference_16k_pipline = pipeline(
+    #     task=Tasks.auto_speech_recognition,
+    #     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+    #
+    # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+    # print(rec_result)
+
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index a0e7b47..3f57751 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -58,7 +58,7 @@
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8b31fad..ac71538 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index e5815df..7cb889b 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 7738f4f..85e4518 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@
         param_dict = {
             "extract_profile": True,
             "sv_train_config": "sv.yaml",
-            "sv_model_file": "sv.pth",
+            "sv_model_file": "sv.pb",
         }
         if "param_dict" in kwargs and kwargs["param_dict"] is not None:
             for key in param_dict:
@@ -142,6 +142,9 @@
         else:
             kwargs["param_dict"] = param_dict
         return inference_modelscope(mode=mode, **kwargs)
+    elif mode == "eend-ola":
+        from funasr.bin.eend_ola_inference import inference_modelscope
+        return inference_modelscope(mode=mode, **kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index d65895f..01d3f29 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import torch
+from scipy.signal import medfilt
 from typeguard import check_argument_types
 
 from funasr.models.frontend.wav_frontend import WavFrontendMel23
@@ -34,7 +35,7 @@
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
@@ -146,7 +147,7 @@
         output_dir: Optional[str] = None,
         batch_size: int = 1,
         dtype: str = "float32",
-        ngpu: int = 0,
+        ngpu: int = 1,
         num_workers: int = 0,
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,
@@ -179,7 +180,6 @@
         diar_model_file=diar_model_file,
         device=device,
         dtype=dtype,
-        streaming=streaming,
     )
     logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
     speech2diar = Speech2Diarization.from_pretrained(
@@ -209,7 +209,7 @@
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
-            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+            data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
@@ -236,9 +236,23 @@
             # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
 
             results = speech2diar(**batch)
+
+            # post process
+            a = results[0][0].cpu().numpy()
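+            # frame-level speaker activities of shape (T, num_spk); an 11-frame
+            # median filter along time smooths spurious speaker on/off flips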
+            a = medfilt(a, (11, 1))
+            rst = []
+            for spkid, frames in enumerate(a.T):
+                frames = np.pad(frames, (1, 1), 'constant')
+                changes, = np.where(np.diff(frames, axis=0) != 0)
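+                # consecutive change points delimit active segments; each frame
+                # is 0.1 s (hence the /10.) when writing RTTM start/duration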
+                fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+                for s, e in zip(changes[::2], changes[1::2]):
+                    st = s / 10.
+                    dur = (e - s) / 10.
+                    rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
+
             # Only supporting batch_size==1
-            key, value = keys[0], output_results_str(results, keys[0])
-            item = {"key": key, "value": value}
+            value = "\n".join(rst)
+            item = {"key": keys[0], "value": value}
             result_list.append(item)
             if output_path is not None:
                 output_writer.write(value)
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index ab6d26f..5a0a8e2 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
@@ -54,7 +54,7 @@
             self,
             diar_train_config: Union[Path, str] = None,
             diar_model_file: Union[Path, str] = None,
-            device: str = "cpu",
+            device: Union[str, torch.device] = "cpu",
             batch_size: int = 1,
             dtype: str = "float32",
             streaming: bool = False,
@@ -114,9 +114,19 @@
             # little-endian order: lower bit first
             return (np.array(list(b)[::-1]) == '1').astype(dtype)
 
-        return np.row_stack([int2vec(int(x), vec_dim) for x in seq])
+        # process oov
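+        # a label >= 2**vec_dim cannot be decoded into a speaker bit-vector;
+        # replace it with the nearest (in time) decodable label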
+        seq = np.array([int(x) for x in seq])
+        new_seq = []
+        for i, x in enumerate(seq):
+            if x < 2 ** vec_dim:
+                new_seq.append(x)
+            else:
+                idx_list = np.where(seq < 2 ** vec_dim)[0]
+                idx = np.abs(idx_list - i).argmin()
+                new_seq.append(seq[idx_list[idx]])
+        return np.row_stack([int2vec(x, vec_dim) for x in new_seq])
 
-    def post_processing(self, raw_logits: torch.Tensor, spk_num: int):
+    def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
         logits_idx = raw_logits.argmax(-1)  # B, T, vocab_size -> B, T
         # upsampling outputs to match inputs
         ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
@@ -127,8 +137,14 @@
         ).squeeze(1).long()
         logits_idx = logits_idx[0].tolist()
         pse_labels = [self.token_list[x] for x in logits_idx]
+        if output_format == "pse_labels":
+            return pse_labels, None
+
         multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num]  # remove padding speakers
         multi_labels = self.smooth_multi_labels(multi_labels)
+        if output_format == "binary_labels":
+            return multi_labels, None
+
         spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
         spk_turns = self.calc_spk_turns(multi_labels, spk_list)
         results = OrderedDict()
@@ -149,6 +165,7 @@
             self,
             speech: Union[torch.Tensor, np.ndarray],
             profile: Union[torch.Tensor, np.ndarray],
+            output_format: str = "speaker_turn"
     ):
         """Inference
 
@@ -178,7 +195,7 @@
         batch = to_device(batch, device=self.device)
 
         logits = self.diar_model.prediction_forward(**batch)
-        results, pse_labels = self.post_processing(logits, profile.shape[1])
+        results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
 
         return results, pse_labels
 
@@ -367,7 +384,7 @@
             pse_label_writer = open("{}/labels.txt".format(output_path), "w")
         logging.info("Start to diarize...")
         result_list = []
-        for keys, batch in loader:
+        for idx, (keys, batch) in enumerate(loader):
             assert isinstance(batch, dict), type(batch)
             assert all(isinstance(s, str) for s in keys), keys
             _bs = len(next(iter(batch.values())))
@@ -385,6 +402,9 @@
                 pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
                 pse_label_writer.flush()
 
+            if idx % 100 == 0:
+                logging.info("Processing {:5d}: {}".format(idx, key))
+
         if output_path is not None:
             output_writer.close()
             pse_label_writer.close()
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index a78bccd..7e63bbd 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
+        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2xvector(audio)
         [(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,
         sv_train_config: Optional[str] = "sv.yaml",
-        sv_model_file: Optional[str] =  "sv.pth",
+        sv_model_file: Optional[str] =  "sv.pb",
         model_tag: Optional[str] = None,
         allow_variable_data_keys: bool = True,
         streaming: bool = False,
diff --git a/funasr/datasets/iterable_dataset.py b/funasr/datasets/iterable_dataset.py
index 49c7068..4b2fb1a 100644
--- a/funasr/datasets/iterable_dataset.py
+++ b/funasr/datasets/iterable_dataset.py
@@ -8,6 +8,7 @@
 from typing import Iterator
 from typing import Tuple
 from typing import Union
+from typing import List
 
 import kaldiio
 import numpy as np
@@ -129,7 +130,7 @@
         non_iterable_list = []
         self.path_name_type_list = []
 
-        if not isinstance(path_name_type_list[0], Tuple):
+        if not isinstance(path_name_type_list[0], (Tuple, List)):
             path = path_name_type_list[0]
             name = path_name_type_list[1]
             _type = path_name_type_list[2]
@@ -227,13 +228,9 @@
                 name = self.path_name_type_list[i][1]
                 _type = self.path_name_type_list[i][2]
                 if _type == "sound":
-                    audio_type = os.path.basename(value).split(".")[-1].lower()
-                    if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
-                        raise NotImplementedError(
-                            f'Not supported audio type: {audio_type}')
-                    if audio_type == "pcm":
-                        _type = "pcm"
-
+                    audio_type = os.path.basename(value).lower()
+                    if audio_type.rfind(".pcm") >= 0:
+                        _type = "pcm"
                 func = DATA_TYPES[_type]
                 array = func(value)
                 if self.fs is not None and (name == "speech" or name == "ref_speech"):
@@ -335,11 +332,8 @@
                 # 2.a. Load data streamingly
                 for value, (path, name, _type) in zip(values, self.path_name_type_list):
                     if _type == "sound":
-                        audio_type = os.path.basename(value).split(".")[-1].lower()
-                        if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
-                            raise NotImplementedError(
-                                f'Not supported audio type: {audio_type}')
-                        if audio_type == "pcm":
+                        audio_type = os.path.basename(value).lower()
+                        if audio_type.rfind(".pcm") >= 0:
                             _type = "pcm"
                     func = DATA_TYPES[_type]
                     # Load entry
@@ -391,3 +385,4 @@
 
         if count == 0:
             raise RuntimeError("No iteration")
+
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index caeb426..a016e4e 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -18,15 +18,11 @@
 
 def seg_tokenize(txt, seg_dict):
     out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
     for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "<unk>" + " "
+        if word in seg_dict:
+            out_txt += seg_dict[word] + " "
         else:
-            continue
+            out_txt += "<unk>" + " "
     return out_txt.strip().split()
 
 def tokenize(data,
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 20a3791..98cca1d 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -47,15 +47,11 @@
 
 def seg_tokenize(txt, seg_dict):
     out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
     for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "<unk>" + " "
+        if word in seg_dict:
+            out_txt += seg_dict[word] + " "
         else:
-            continue
+            out_txt += "<unk>" + " "
     return out_txt.strip().split()
 
 def seg_tokenize_wo_pattern(txt, seg_dict):
diff --git a/funasr/export/README.md b/funasr/export/README.md
index c44ad33..c05348e 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -2,6 +2,8 @@
 ## Environments
     torch >= 1.11.0
     modelscope >= 1.2.0
+    torch-quant >= 0.4.0 (required for exporting a quantized torchscript model)
+    # pip install torch-quant -i https://pypi.org/simple
 
 ## Install modelscope and funasr
 
@@ -11,31 +13,46 @@
    `Tips`: torch>=1.11.0
 
    ```shell
-   python -m funasr.export.export_model [model_name] [export_dir] [onnx]
+   python -m funasr.export.export_model \
+       --model-name [model_name] \
+       --export-dir [export_dir] \
+       --type [onnx, torch] \
+       --quantize [true, false] \
+       --fallback-num [fallback_num]
    ```
-   `model_name`: the model is to export. It could be the models from modelscope, or local finetuned model(named: model.pb). 
-   `export_dir`: the dir where the onnx is export.
-    `onnx`: `true`, export onnx format model; `false`, export torchscripts format model.
+   `model-name`: the model to export. It can be a model name from modelscope, or a local finetuned model (named model.pb).
+
+   `export-dir`: the directory where the exported model is saved.
+
+   `type`: `onnx` or `torch`, export an onnx format model or a torchscript format model.
+
+   `quantize`: `true`, export a quantized model as well; `false`, export the fp32 model only.
+
+   `fallback-num`: the number of layers to fall back to fp32 when performing automatic mixed-precision quantization.
+
 
 ## For example
 ### Export onnx format model
 Export model from modelscope
 ```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
 ```
 Export a model from a local path; the model file must be named `model.pb`.
 ```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
 ```
 
 ### Export torchscripts format model
 Export model from modelscope
 ```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
 ```
 
 Export a model from a local path; the model file must be named `model.pb`.
 ```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
 ```
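+
+### Export quantized model
+A sketch combining the flags documented above (the fallback number is illustrative):
+```shell
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize true --fallback-num 10
+```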
 
+## Acknowledge
+Torch model quantization is supported by [BladeDISC](https://github.com/alibaba/BladeDISC), an end-to-end DynamIc Shape Compiler project for machine learning workloads. BladeDISC provides general, transparent, and easy-to-use performance optimization for TensorFlow/PyTorch workloads on GPGPU and CPU backends. If you are interested, please contact us.
+
diff --git a/funasr/export/export_model.py b/funasr/export/export_model.py
index 3cbf6d2..f6ba616 100644
--- a/funasr/export/export_model.py
+++ b/funasr/export/export_model.py
@@ -10,12 +10,20 @@
 from funasr.export.models import get_model
 import numpy as np
 import random
-
+from funasr.utils.types import str2bool
 # torch_version = float(".".join(torch.__version__.split(".")[:2]))
 # assert torch_version > 1.9
 
 class ASRModelExportParaformer:
-    def __init__(self, cache_dir: Union[Path, str] = None, onnx: bool = True):
+    def __init__(
+        self,
+        cache_dir: Union[Path, str] = None,
+        onnx: bool = True,
+        quant: bool = True,
+        fallback_num: int = 0,
+        audio_in: str = None,
+        calib_num: int = 200,
+    ):
         assert check_argument_types()
         self.set_all_random_seed(0)
         if cache_dir is None:
@@ -28,6 +36,11 @@
         )
         print("output dir: {}".format(self.cache_dir))
         self.onnx = onnx
+        self.quant = quant
+        self.fallback_num = fallback_num
+        self.frontend = None
+        self.audio_in = audio_in
+        self.calib_num = calib_num
         
 
     def _export(
@@ -56,6 +69,43 @@
         print("output dir: {}".format(export_dir))
 
 
+    def _torch_quantize(self, model):
+        def _run_calibration_data(m):
+            # run the calibration data through the model; fall back to dummy
+            # inputs as an example when no audio is provided
+            if self.audio_in is not None:
+                feats, feats_len = self.load_feats(self.audio_in)
+                for feat, feat_len in zip(feats, feats_len):
+                    with torch.no_grad():
+                        m(feat, feat_len)
+            else:
+                dummy_input = model.get_dummy_inputs()
+                m(*dummy_input)
+            
+
+        from torch_quant.module import ModuleFilter
+        from torch_quant.quantizer import Backend, Quantizer
+        from funasr.export.models.modules.decoder_layer import DecoderLayerSANM
+        from funasr.export.models.modules.encoder_layer import EncoderLayerSANM
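+        # post-training quantization flow (torch-quant): calibrate an observed
+        # copy of the model, optionally fall back the most sensitive layers to
+        # fp32 via amp, then materialize the quantized model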
+        module_filter = ModuleFilter(include_classes=[EncoderLayerSANM, DecoderLayerSANM])
+        module_filter.exclude_op_types = [torch.nn.Conv1d]
+        quantizer = Quantizer(
+            module_filter=module_filter,
+            backend=Backend.FBGEMM,
+        )
+        model.eval()
+        calib_model = quantizer.calib(model)
+        _run_calibration_data(calib_model)
+        if self.fallback_num > 0:
+            # perform automatic mixed precision quantization
+            amp_model = quantizer.amp(model)
+            _run_calibration_data(amp_model)
+            quantizer.fallback(amp_model, num=self.fallback_num)
+            print('Fallback layers:')
+            print('\n'.join(quantizer.module_filter.exclude_names))
+        quant_model = quantizer.quantize(model)
+        return quant_model
+
+
     def _export_torchscripts(self, model, verbose, path, enc_size=None):
         if enc_size:
             dummy_input = model.get_dummy_inputs(enc_size)
@@ -66,10 +116,49 @@
         model_script = torch.jit.trace(model, dummy_input)
         model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))
 
+        if self.quant:
+            quant_model = self._torch_quantize(model)
+            model_script = torch.jit.trace(quant_model, dummy_input)
+            model_script.save(os.path.join(path, f'{model.model_name}_quant.torchscripts'))
+
+
     def set_all_random_seed(self, seed: int):
         random.seed(seed)
         np.random.seed(seed)
         torch.random.manual_seed(seed)
+
+    def parse_audio_in(self, audio_in):
+        
+        wav_list, name_list = [], []
+        if audio_in.endswith(".scp"):
+            with open(audio_in, 'r') as f:
+                lines = f.readlines()[:self.calib_num]
+            for line in lines:
+                name, path = line.strip().split()
+                name_list.append(name)
+                wav_list.append(path)
+        else:
+            wav_list = [audio_in,]
+            name_list = ["test",]
+        return wav_list, name_list
+    
+    def load_feats(self, audio_in: str = None):
+        import torchaudio
+
+        wav_list, name_list = self.parse_audio_in(audio_in)
+        feats = []
+        feats_len = []
+        for line in wav_list:
+            path = line.strip()
+            waveform, sampling_rate = torchaudio.load(path)
+            if sampling_rate != self.frontend.fs:
+                waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+                                                          new_freq=self.frontend.fs)(waveform)
+            fbank, fbank_len = self.frontend(waveform, [waveform.size(1)])
+            feats.append(fbank)
+            feats_len.append(fbank_len)
+        return feats, feats_len
+    
     def export(self,
                tag_name: str = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
                mode: str = 'paraformer',
@@ -96,6 +185,7 @@
         model, asr_train_args = ASRTask.build_model_from_file(
             asr_train_config, asr_model_file, cmvn_file, 'cpu'
         )
+        self.frontend = model.frontend
         self._export(model, tag_name)
             
 
@@ -107,11 +197,12 @@
 
         # model_script = torch.jit.script(model)
         model_script = model #torch.jit.trace(model)
+        model_path = os.path.join(path, f'{model.model_name}.onnx')
 
         torch.onnx.export(
             model_script,
             dummy_input,
-            os.path.join(path, f'{model.model_name}.onnx'),
+            model_path,
             verbose=verbose,
             opset_version=14,
             input_names=model.get_input_names(),
@@ -119,17 +210,42 @@
             dynamic_axes=model.get_dynamic_axes()
         )
 
+        if self.quant:
+            from onnxruntime.quantization import QuantType, quantize_dynamic
+            import onnx
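+            # dynamic (weight-only) uint8 quantization of MatMul ops; nodes
+            # with 'output' in their names are left unquantized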
+            quant_model_path = os.path.join(path, f'{model.model_name}_quant.onnx')
+            onnx_model = onnx.load(model_path)
+            nodes = [n.name for n in onnx_model.graph.node]
+            nodes_to_exclude = [m for m in nodes if 'output' in m]
+            quantize_dynamic(
+                model_input=model_path,
+                model_output=quant_model_path,
+                op_types_to_quantize=['MatMul'],
+                per_channel=True,
+                reduce_range=False,
+                weight_type=QuantType.QUInt8,
+                nodes_to_exclude=nodes_to_exclude,
+            )
+
 
 if __name__ == '__main__':
-    import sys
-    
-    model_path = sys.argv[1]
-    output_dir = sys.argv[2]
-    onnx = sys.argv[3]
-    onnx = onnx.lower()
-    onnx = onnx == 'true'
-    # model_path = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
-    # output_dir = "../export"
-    export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=onnx)
-    export_model.export(model_path)
-    # export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
\ No newline at end of file
+    import argparse
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model-name', type=str, required=True)
+    parser.add_argument('--export-dir', type=str, required=True)
+    parser.add_argument('--type', type=str, default='onnx', help='export format: "onnx" or "torch"')
+    parser.add_argument('--quantize', type=str2bool, default=False, help='whether to also export a quantized model')
+    parser.add_argument('--fallback-num', type=int, default=0, help='number of amp fallback layers')
+    parser.add_argument('--audio_in', type=str, default=None, help='calibration audio: a wav file or a wav.scp list')
+    parser.add_argument('--calib_num', type=int, default=200, help='max number of calibration utterances')
+    args = parser.parse_args()
+
+    export_model = ASRModelExportParaformer(
+        cache_dir=args.export_dir,
+        onnx=args.type == 'onnx',
+        quant=args.quantize,
+        fallback_num=args.fallback_num,
+        audio_in=args.audio_in,
+        calib_num=args.calib_num,
+    )
+    export_model.export(args.model_name)
diff --git a/funasr/export/models/modules/encoder_layer.py b/funasr/export/models/modules/encoder_layer.py
index d132574..7d01397 100644
--- a/funasr/export/models/modules/encoder_layer.py
+++ b/funasr/export/models/modules/encoder_layer.py
@@ -16,6 +16,7 @@
         self.feed_forward = model.feed_forward
         self.norm1 = model.norm1
         self.norm2 = model.norm2
+        self.in_size = model.in_size
         self.size = model.size
 
     def forward(self, x, mask):
@@ -23,13 +24,12 @@
         residual = x
         x = self.norm1(x)
         x = self.self_attn(x, mask)
-        if x.size(2) == residual.size(2):
+        if self.in_size == self.size:
             x = x + residual
         residual = x
         x = self.norm2(x)
         x = self.feed_forward(x)
-        if x.size(2) == residual.size(2):
-            x = x + residual
+        x = x + residual
 
         return x, mask
 
diff --git a/funasr/export/models/modules/multihead_att.py b/funasr/export/models/modules/multihead_att.py
index 7d685f5..1983db8 100644
--- a/funasr/export/models/modules/multihead_att.py
+++ b/funasr/export/models/modules/multihead_att.py
@@ -64,6 +64,23 @@
         return self.linear_out(context_layer)  # (batch, time1, d_model)
 
 
+def preprocess_for_attn(x, mask, cache, pad_fn):
+    x = x * mask
+    x = x.transpose(1, 2)
+    if cache is None:
+        x = pad_fn(x)
+    else:
+        x = torch.cat((cache[:, :, 1:], x), dim=2)
+        cache = x
+    return x, cache
+
+
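+# torch.fx cannot trace the data-dependent cache branch in preprocess_for_attn,
+# so it is wrapped to stay a leaf call during FX-based quantization tracing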
+torch_version = tuple(int(v) for v in torch.__version__.split(".")[:2])
+if torch_version >= (1, 8):
+    import torch.fx
+    torch.fx.wrap('preprocess_for_attn')
+
+
 class MultiHeadedAttentionSANMDecoder(nn.Module):
     def __init__(self, model):
         super().__init__()
@@ -73,16 +90,7 @@
         self.attn = None
 
     def forward(self, inputs, mask, cache=None):
-        # b, t, d = inputs.size()
-        # mask = torch.reshape(mask, (b, -1, 1))
-        inputs = inputs * mask
-
-        x = inputs.transpose(1, 2)
-        if cache is None:
-            x = self.pad_fn(x)
-        else:
-            x = torch.cat((cache[:, :, 1:], x), dim=2)
-            cache = x
+        x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn)
         x = self.fsmn_block(x)
         x = x.transpose(1, 2)
 
@@ -232,4 +240,4 @@
         new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
         context_layer = context_layer.view(new_context_layer_shape)
         return self.linear_out(context_layer)  # (batch, time1, d_model)
-        
\ No newline at end of file
+        
diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py
index 53f9568..d8df949 100644
--- a/funasr/main_funcs/average_nbest_models.py
+++ b/funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@
             elif n == 1:
                 # The averaged model is same as the best model
                 e, _ = epoch_and_values[0]
-                op = output_dir / f"{e}epoch.pth"
-                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
+                op = output_dir / f"{e}epoch.pb"
+                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
                 if sym_op.is_symlink() or sym_op.exists():
                     sym_op.unlink()
                 sym_op.symlink_to(op.name)
             else:
-                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
+                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
                 logging.info(
                     f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                 )
@@ -83,12 +83,12 @@
                     if e not in _loaded:
                         if oss_bucket is None:
                             _loaded[e] = torch.load(
-                                output_dir / f"{e}epoch.pth",
+                                output_dir / f"{e}epoch.pb",
                                 map_location="cpu",
                             )
                         else:
                             buffer = BytesIO(
-                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
+                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
                             _loaded[e] = torch.load(buffer)
                     states = _loaded[e]
 
@@ -115,13 +115,13 @@
                 else:
                     buffer = BytesIO()
                     torch.save(avg, buffer)
-                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
+                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
                                           buffer.getvalue())
 
-        # 3. *.*.ave.pth is a symlink to the max ave model
+        # 3. *.*.ave.pb is a symlink to the max ave model
         if oss_bucket is None:
-            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
-            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
+            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
+            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
             if sym_op.is_symlink() or sym_op.exists():
                 sym_op.unlink()
             sym_op.symlink_to(op.name)
diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py
index ffa807e..fe365d8 100644
--- a/funasr/main_funcs/pack_funcs.py
+++ b/funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@
 
     Examples:
         tarfile:
-           model.pth
+           model.pb
            some1.file
            some2.file
 
         >>> unpack("tarfile", "out")
-        {'asr_model_file': 'out/model.pth'}
+        {'asr_model_file': 'out/model.pb'}
     """
     input_archive = Path(input_archive)
     outpath = Path(outpath)
diff --git a/funasr/models/decoder/sanm_decoder.py b/funasr/models/decoder/sanm_decoder.py
index ab03f0b..3bfcffc 100644
--- a/funasr/models/decoder/sanm_decoder.py
+++ b/funasr/models/decoder/sanm_decoder.py
@@ -94,6 +94,47 @@
         if self.self_attn:
             if self.normalize_before:
                 tgt = self.norm2(tgt)
+            x, _ = self.self_attn(tgt, tgt_mask)
+            x = residual + self.dropout(x)
+
+        if self.src_attn is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm3(x)
+
+            x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
+
+
+        return x, tgt_mask, memory, memory_mask, cache
+
+    def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
+        """Compute decoded features.
+
+        Args:
+            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
+            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
+            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
+            cache (List[torch.Tensor]): List of cached tensors.
+                Each tensor shape should be (#batch, maxlen_out - 1, size).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, maxlen_out, size).
+            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
+            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
+            torch.Tensor: Encoded memory mask (#batch, maxlen_in).
+            torch.Tensor: Updated FSMN memory cache.
+
+        """
+        # tgt = self.dropout(tgt)
+        residual = tgt
+        if self.normalize_before:
+            tgt = self.norm1(tgt)
+        tgt = self.feed_forward(tgt)
+
+        x = tgt
+        if self.self_attn:
+            if self.normalize_before:
+                tgt = self.norm2(tgt)
             if self.training:
                 cache = None
             x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
@@ -108,7 +149,6 @@
 
 
         return x, tgt_mask, memory, memory_mask, cache
-
 
 class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
     """
@@ -947,6 +987,65 @@
         )
         return logp.squeeze(0), state
 
+    def forward_chunk(
+        self,
+        memory: torch.Tensor,
+        tgt: torch.Tensor,
+        cache: dict = None,
+    ) -> torch.Tensor:
+        """Forward decoder for one streaming chunk.
+
+        Args:
+            memory: encoded memory, float32 (batch, maxlen_in, feat)
+            tgt: input embeddings (batch, maxlen_out, size)
+            cache: dict whose "decode_fsmn" entry holds the per-layer FSMN
+                caches carried across chunks
+        Returns:
+            x: decoded token score before softmax (batch, maxlen_out, token)
+                if use_output_layer is True
+        """
+        x = tgt
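+        # one FSMN memory cache per decoder layer, lazily initialized on the
+        # first chunk and carried across calls via cache["decode_fsmn"]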
+        if cache["decode_fsmn"] is None:
+            cache_layer_num = len(self.decoders)
+            if self.decoders2 is not None:
+                cache_layer_num += len(self.decoders2)
+            new_cache = [None] * cache_layer_num
+        else:
+            new_cache = cache["decode_fsmn"]
+        for i in range(self.att_layer_num):
+            decoder = self.decoders[i]
+            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+                x, None, memory, None, cache=new_cache[i]
+            )
+            new_cache[i] = c_ret
+
+        if self.num_blocks - self.att_layer_num > 1:
+            for i in range(self.num_blocks - self.att_layer_num):
+                j = i + self.att_layer_num
+                decoder = self.decoders2[i]
+                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+                    x, None, memory, None, cache=new_cache[j]
+                )
+                new_cache[j] = c_ret
+
+        for decoder in self.decoders3:
+
+            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
+                x, None, memory, None, cache=None
+            )
+        if self.normalize_before:
+            x = self.after_norm(x)
+        if self.output_layer is not None:
+            x = self.output_layer(x)
+        cache["decode_fsmn"] = new_cache
+        return x
+
     def forward_one_step(
         self,
         tgt: torch.Tensor,
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 44c9de3..02f60af 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -325,12 +325,76 @@
 
         return encoder_out, encoder_out_lens
 
+    def encode_chunk(
+            self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Frontend + Encoder. Note that this method is used by asr_inference.py
+
+        Args:
+                speech: (Batch, Length, ...)
+                speech_lengths: (Batch, )
+        """
+        with autocast(False):
+            # 1. Extract feats
+            feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+            # 2. Data augmentation
+            if self.specaug is not None and self.training:
+                feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+            # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+            if self.normalize is not None:
+                feats, feats_lengths = self.normalize(feats, feats_lengths)
+
+        # Pre-encoder, e.g. used for raw input data
+        if self.preencoder is not None:
+            feats, feats_lengths = self.preencoder(feats, feats_lengths)
+
+        # 4. Forward encoder
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        if self.encoder.interctc_use_conditioning:
+            encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(
+                feats, feats_lengths, cache=cache["encoder"], ctc=self.ctc
+            )
+        else:
+            encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(feats, feats_lengths, cache=cache["encoder"])
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):
+            intermediate_outs = encoder_out[1]
+            encoder_out = encoder_out[0]
+
+        # Post-encoder, e.g. NLU
+        if self.postencoder is not None:
+            encoder_out, encoder_out_lens = self.postencoder(
+                encoder_out, encoder_out_lens
+            )
+
+        assert encoder_out.size(0) == speech.size(0), (
+            encoder_out.size(),
+            speech.size(0),
+        )
+        assert encoder_out.size(1) <= encoder_out_lens.max(), (
+            encoder_out.size(),
+            encoder_out_lens.max(),
+        )
+
+        if intermediate_outs is not None:
+            return (encoder_out, intermediate_outs), encoder_out_lens
+
+        return encoder_out, encoder_out_lens
+
     def calc_predictor(self, encoder_out, encoder_out_lens):
 
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None, encoder_out_mask,
                                                                                   ignore_id=self.ignore_id)
+        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
+
+    def calc_predictor_chunk(self, encoder_out, cache=None):
+
+        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor.forward_chunk(encoder_out, cache["encoder"])
         return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
 
     def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
@@ -341,6 +405,14 @@
         decoder_out = decoder_outs[0]
         decoder_out = torch.log_softmax(decoder_out, dim=-1)
         return decoder_out, ys_pad_lens
+
+    def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
+        decoder_out = self.decoder.forward_chunk(
+            encoder_out, sematic_embeds, cache["decoder"]
+        )
+        decoder_out = torch.log_softmax(decoder_out, dim=-1)
+        return decoder_out
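+
+    # A chunk-streaming pass chains the three *_chunk helpers above. A minimal
+    # sketch, assuming `cache` is a dict with "encoder" and "decoder" entries:
+    #   enc_out, _ = model.encode_chunk(speech, speech_lengths, cache=cache)
+    #   embeds, token_num, _, _ = model.calc_predictor_chunk(enc_out, cache=cache)
+    #   logp = model.cal_decoder_with_predictor_chunk(enc_out, embeds, cache=cache)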
 
     def _extract_feats(
             self, speech: torch.Tensor, speech_lengths: torch.Tensor
@@ -1459,4 +1531,4 @@
                     "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
                                                                                   var_dict_tf[name_tf].shape))
 
-        return var_dict_torch_update
\ No newline at end of file
+        return var_dict_torch_update
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index f589269..097b23a 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -52,15 +52,15 @@
 
         super().__init__()
         self.frontend = frontend
-        self.encoder = encoder
-        self.encoder_decoder_attractor = encoder_decoder_attractor
+        self.enc = encoder
+        self.eda = encoder_decoder_attractor
         self.attractor_loss_weight = attractor_loss_weight
         self.max_n_speaker = max_n_speaker
         if mapping_dict is None:
             mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
             self.mapping_dict = mapping_dict
         # PostNet
-        self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
+        self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
         self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
 
     def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@
         pad_shape = xs.shape
         xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
         xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
-        emb = self.encoder(xs, xs_mask)
+        emb = self.enc(xs, xs_mask)
         emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
         emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
         return emb
@@ -76,8 +76,8 @@
     def forward_post_net(self, logits, ilens):
         maxlen = torch.max(ilens).to(torch.int).item()
         logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
-        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
-        outputs, (_, _) = self.PostNet(logits)
+        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
+        outputs, (_, _) = self.postnet(logits)
         outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
         outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
         outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@
         text = text[:, : text_lengths.max()]
 
         # 1. Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
@@ -190,18 +190,16 @@
                             shuffle: bool = True,
                             threshold: float = 0.5,
                             **kwargs):
-        if self.frontend is not None:
-            speech = self.frontend(speech)
         speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
         emb = self.forward_encoder(speech, speech_lengths)
         if shuffle:
             orders = [np.arange(e.shape[0]) for e in emb]
             for order in orders:
                 np.random.shuffle(order)
-            attractors, probs = self.encoder_decoder_attractor.estimate(
+            attractors, probs = self.eda.estimate(
                 [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
         else:
-            attractors, probs = self.encoder_decoder_attractor.estimate(emb)
+            attractors, probs = self.eda.estimate(emb)
         attractors_active = []
         for p, att, e in zip(probs, attractors, emb):
             if n_speakers and n_speakers >= 0:
@@ -233,10 +231,23 @@
                 pred[i] = pred[i - 1]
             else:
                 pred[i] = 0
-        pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
+        pred = [self.inv_mapping_func(i) for i in pred]
         decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
         decisions = torch.from_numpy(
             np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
             torch.float32)
         decisions = decisions[:, :n_speaker]
         return decisions
+
+    def inv_mapping_func(self, label):
+        if not isinstance(label, int):
+            label = int(label)
+        if label in self.mapping_dict['label2dec']:
+            num = self.mapping_dict['label2dec'][label]
+        else:
+            num = -1
+        return num
+
+    def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
+        pass
\ No newline at end of file
diff --git a/funasr/models/e2e_diar_sond.py b/funasr/models/e2e_diar_sond.py
index 258d780..de669f2 100644
--- a/funasr/models/e2e_diar_sond.py
+++ b/funasr/models/e2e_diar_sond.py
@@ -59,7 +59,8 @@
         normalize_speech_speaker: bool = False,
         ignore_id: int = -1,
         speaker_discrimination_loss_weight: float = 1.0,
-        inter_score_loss_weight: float = 0.0
+        inter_score_loss_weight: float = 0.0,
+        inputs_type: str = "raw",
     ):
         assert check_argument_types()
 
@@ -86,14 +87,12 @@
         )
         self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
         self.pse_embedding = self.generate_pse_embedding()
-        # self.register_buffer("pse_embedding", pse_embedding)
         self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :]).float()
-        # self.register_buffer("power_weight", power_weight)
         self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :]).int()
-        # self.register_buffer("int_token_arr", int_token_arr)
         self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
         self.inter_score_loss_weight = inter_score_loss_weight
         self.forward_steps = 0
+        self.inputs_type = inputs_type
 
     def generate_pse_embedding(self):
         embedding = np.zeros((len(self.token_list), self.max_spk_num), dtype=float)
@@ -125,9 +124,14 @@
             binary_labels: (Batch, frames, max_spk_num)
             binary_labels_lengths: (Batch,)
         """
-        assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
+        assert speech.shape[0] <= binary_labels.shape[0], (speech.shape, binary_labels.shape)
         batch_size = speech.shape[0]
         self.forward_steps = self.forward_steps + 1
+        if self.pse_embedding.device != speech.device:
+            self.pse_embedding = self.pse_embedding.to(speech.device)
+            self.power_weight = self.power_weight.to(speech.device)
+            self.int_token_arr = self.int_token_arr.to(speech.device)
+
         # 1. Network forward
         pred, inter_outputs = self.prediction_forward(
             speech, speech_lengths,
@@ -149,9 +153,13 @@
         # the sequence length of 'pred' might be slightly less than the
         # length of 'spk_labels'. Here we force them to be equal.
         length_diff_tolerance = 2
-        length_diff = pse_labels.shape[1] - pred.shape[1]
-        if 0 < length_diff <= length_diff_tolerance:
-            pse_labels = pse_labels[:, 0: pred.shape[1]]
+        length_diff = abs(pse_labels.shape[1] - pred.shape[1])
+        if length_diff <= length_diff_tolerance:
+            min_len = min(pred.shape[1], pse_labels.shape[1])
+            pse_labels = pse_labels[:, :min_len]
+            pred = pred[:, :min_len]
+            cd_score = cd_score[:, :min_len]
+            ci_score = ci_score[:, :min_len]
 
         loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
         loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
@@ -299,7 +307,7 @@
             speech: torch.Tensor,
             speech_lengths: torch.Tensor,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
-        if self.encoder is not None:
+        if self.encoder is not None and self.inputs_type == "raw":
             speech, speech_lengths = self.encode(speech, speech_lengths)
             speech_mask = ~make_pad_mask(speech_lengths, maxlen=speech.shape[1])
             speech_mask = speech_mask.to(speech.device).unsqueeze(-1).float()
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index 0751a10..57890ef 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -347,6 +347,48 @@
             return (xs_pad, intermediate_outs), olens, None
         return xs_pad, olens, None
 
+    def forward_chunk(self,
+                      xs_pad: torch.Tensor,
+                      ilens: torch.Tensor,
+                      cache: dict = None,
+                      ctc: CTC = None,
+                      ):
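+        # NOTE: attention masks are passed as None throughout, since streaming
+        # chunks arrive unpadded; the positional offset is tracked in `cache`.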
+        xs_pad *= self.output_size() ** 0.5
+        if self.embed is not None:
+            xs_pad = self.embed.forward_chunk(xs_pad, cache)
+
+        encoder_outs = self.encoders0(xs_pad, None, None, None, None)
+        xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        intermediate_outs = []
+        if len(self.interctc_layer_idx) == 0:
+            encoder_outs = self.encoders(xs_pad, None, None, None, None)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        else:
+            for layer_idx, encoder_layer in enumerate(self.encoders):
+                encoder_outs = encoder_layer(xs_pad, None, None, None, None)
+                xs_pad, masks = encoder_outs[0], encoder_outs[1]
+                if layer_idx + 1 in self.interctc_layer_idx:
+                    encoder_out = xs_pad
+
+                    # intermediate outputs are also normalized
+                    if self.normalize_before:
+                        encoder_out = self.after_norm(encoder_out)
+
+                    intermediate_outs.append((layer_idx + 1, encoder_out))
+
+                    if self.interctc_use_conditioning:
+                        ctc_out = ctc.softmax(encoder_out)
+                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), None, None
+        return xs_pad, ilens, None
+
     def gen_tf2torch_map_dict(self):
         tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
         tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 445efca..475a939 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -1,14 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from espnet/espnet.
-from abc import ABC
 from typing import Tuple
 
 import numpy as np
 import torch
 import torchaudio.compliance.kaldi as kaldi
-from funasr.models.frontend.abs_frontend import AbsFrontend
-from typeguard import check_argument_types
 from torch.nn.utils.rnn import pad_sequence
+from typeguard import check_argument_types
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.models.frontend.abs_frontend import AbsFrontend
 
 
 def load_cmvn(cmvn_file):
@@ -275,7 +276,8 @@
     # inputs tensor has catted the cache tensor
     # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
     #               is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
-    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
+    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
+        torch.Tensor, torch.Tensor, int]:
         """
         Apply lfr with data
         """
@@ -376,7 +378,8 @@
             if self.lfr_m != 1 or self.lfr_n != 1:
                 # update self.lfr_splice_cache in self.apply_lfr
                 # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
-                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
+                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
+                                                                                     is_final)
             if self.cmvn_file is not None:
                 mat = self.apply_cmvn(mat, self.cmvn)
             feat_length = mat.size(0)
@@ -398,9 +401,10 @@
         assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
         waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths)  # input shape: B T D
         if feats.shape[0]:
-            #if self.reserve_waveforms is None and self.lfr_m > 1:
+            # if self.reserve_waveforms is None and self.lfr_m > 1:
             #    self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
-            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
+            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
+                (self.reserve_waveforms, waveforms), dim=1)
             if not self.lfr_splice_cache:  # initialize the splice cache
                 for i in range(batch_size):
                     self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -409,7 +413,8 @@
                 lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache)  # B T D
                 feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
                 feats_lengths += lfr_splice_cache_tensor[0].shape[0]
-                frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
+                frame_from_waveforms = int(
+                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                 minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                 feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
                 if self.lfr_m == 1:
@@ -423,14 +428,15 @@
                     self.waveforms = self.waveforms[:, :sample_length]
             else:
                 # update self.reserve_waveforms and self.lfr_splice_cache
-                self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
+                self.reserve_waveforms = self.waveforms[:,
+                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                 for i in range(batch_size):
                     self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
                 return torch.empty(0), feats_lengths
         else:
             if is_final:
                 self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
-                feats = torch.stack(self.lfr_splice_cache) 
+                feats = torch.stack(self.lfr_splice_cache)
                 feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
                 feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
         if is_final:
@@ -444,3 +450,54 @@
         self.reserve_waveforms = None
         self.input_cache = None
         self.lfr_splice_cache = []
+
+
+class WavFrontendMel23(AbsFrontend):
+    """Conventional frontend structure for ASR.
+    """
+
+    def __init__(
+            self,
+            fs: int = 16000,
+            frame_length: int = 25,
+            frame_shift: int = 10,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self.fs = fs
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.n_mels = 23
+
+    def output_size(self) -> int:
+        return self.n_mels * (2 * self.lfr_m + 1)
+
+    def forward(
+            self,
+            input: torch.Tensor,
+            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            waveform = waveform.numpy()
+            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+            mat = eend_ola_feature.transform(mat)
+            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
+            mat = mat[::self.lfr_n]
+            mat = torch.from_numpy(mat)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats,
+                                 batch_first=True,
+                                 padding_value=0.0)
+        return feats_pad, feats_lens
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index 5615373..74f3e68 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -199,6 +199,63 @@
 
         return acoustic_embeds, token_num, alphas, cif_peak
 
+    def forward_chunk(self, hidden, cache=None):
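+        # Chunk-wise CIF: compute firing weights (alphas) for the current chunk,
+        # mask out the context frames, and carry the unfired weight and the
+        # hidden frames after the last firing to the next chunk via `cache`
+        # ("cif_hidden" / "cif_alphas").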
+        h = hidden
+        context = h.transpose(1, 2)
+        queries = self.pad(context)
+        output = torch.relu(self.cif_conv1d(queries))
+        output = output.transpose(1, 2)
+        output = self.cif_output(output)
+        alphas = torch.sigmoid(output)
+        alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+
+        alphas = alphas.squeeze(-1)
+        mask_chunk_predictor = None
+        if cache is not None:
+            # only the frames of the current chunk keep their alphas
+            mask_chunk_predictor = torch.zeros_like(alphas)
+            mask_chunk_predictor[:, cache["pad_left"]:cache["stride"] + cache["pad_left"]] = 1.0
+
+        if mask_chunk_predictor is not None:
+            alphas = alphas * mask_chunk_predictor
+
+        if cache is not None:
+            if cache["cif_hidden"] is not None:
+                hidden = torch.cat((cache["cif_hidden"], hidden), 1)
+            if cache["cif_alphas"] is not None:
+                alphas = torch.cat((cache["cif_alphas"], alphas), -1)
+
+        token_num = alphas.sum(-1)
+        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
+        len_time = alphas.size(-1)
+        last_fire_place = len_time - 1
+        last_fire_remainds = 0.0
+        pre_alphas_length = 0
+ 
+        mask_chunk_peak_predictor = None
+        if cache is not None:
+            # keep the peaks of the cached alphas plus those of the current chunk
+            mask_chunk_peak_predictor = torch.zeros_like(cif_peak)
+            if cache["cif_alphas"] is not None:
+                pre_alphas_length = cache["cif_alphas"].size(-1)
+                mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
+            mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
+
+        if mask_chunk_peak_predictor is not None:
+            cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
+
+        # search backwards for the last frame whose accumulated weight fired
+        for i in range(len_time):
+            if cif_peak[0][len_time - 1 - i] >= self.threshold:
+                last_fire_place = len_time - 1 - i
+                last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
+                break
+        last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
+        cache["cif_hidden"] = hidden[:, last_fire_place:, :]
+        cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
+        token_num_int = token_num.floor().type(torch.int32).item()
+        return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
+
     def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
         b, t, d = hidden.size()
         tail_threshold = self.tail_threshold
diff --git a/funasr/modules/attention.py b/funasr/modules/attention.py
index 6277005..31d5a87 100644
--- a/funasr/modules/attention.py
+++ b/funasr/modules/attention.py
@@ -347,15 +347,17 @@
             mask = torch.reshape(mask, (b, -1, 1))
             if mask_shfit_chunk is not None:
                 mask = mask * mask_shfit_chunk
+            inputs = inputs * mask
 
-        inputs = inputs * mask
         x = inputs.transpose(1, 2)
         x = self.pad_fn(x)
         x = self.fsmn_block(x)
         x = x.transpose(1, 2)
         x += inputs
         x = self.dropout(x)
-        return x * mask
+        if mask is not None:
+            x = x * mask
+        return x
 
     def forward_qkv(self, x):
         """Transform query, key and value.
@@ -505,7 +507,7 @@
             # print("in fsmn, cache is None, x", x.size())
 
             x = self.pad_fn(x)
-            if not self.training and t <= 1:
+            if not self.training:
                 cache = x
         else:
             # print("in fsmn, cache is not None, x", x.size())
@@ -513,7 +515,7 @@
             # if t < self.kernel_size:
             #     x = self.pad_fn(x)
             x = torch.cat((cache[:, :, 1:], x), dim=2)
-            x = x[:, :, -self.kernel_size:]
+            x = x[:, :, -(self.kernel_size+t-1):]
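+            # keep (kernel_size - 1) frames of history plus the current t frames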
             # print("in fsmn, cache is not None, x_cat", x.size())
             cache = x
         x = self.fsmn_block(x)
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
index 4999031..90a63f3 100644
--- a/funasr/modules/eend_ola/encoder.py
+++ b/funasr/modules/eend_ola/encoder.py
@@ -87,7 +87,7 @@
                  n_layers: int,
                  n_units: int,
                  e_units: int = 2048,
-                 h: int = 8,
+                 h: int = 4,
                  dropout_rate: float = 0.1,
                  use_pos_emb: bool = False):
         super(EENDOLATransformerEncoder, self).__init__()
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
index db01b00..45ac982 100644
--- a/funasr/modules/eend_ola/encoder_decoder_attractor.py
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -16,12 +16,12 @@
         self.n_units = n_units
 
     def forward_core(self, xs, zeros):
-        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
+        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
         xs = [self.enc0_dropout(x) for x in xs]
         xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
         xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
         _, (hx, cx) = self.encoder(xs)
-        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
+        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
         max_zlen = torch.max(zlens).to(torch.int).item()
         zeros = [self.enc0_dropout(z) for z in zeros]
         zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
@@ -47,4 +47,4 @@
         zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
         attractors = self.forward_core(xs, zeros)
         probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
-        return attractors, probs
\ No newline at end of file
+        return attractors, probs
diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index b61a61a..e4f9bff 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -405,4 +405,13 @@
         positions = torch.arange(1, timesteps+1)[None, :]
         position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
 
-        return x + position_encoding
\ No newline at end of file
+        return x + position_encoding
+
+    def forward_chunk(self, x, cache=None):
+        start_idx = 0
+        batch_size, timesteps, input_dim = x.size()
+        if cache is not None:
+            start_idx = cache["start_idx"]
+        positions = torch.arange(1, timesteps+start_idx+1)[None, :]
+        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
+        return x + position_encoding[:, start_idx: start_idx + timesteps]
diff --git a/funasr/runtime/grpc/CMakeLists.txt b/funasr/runtime/grpc/CMakeLists.txt
new file mode 100644
index 0000000..56e3074
--- /dev/null
+++ b/funasr/runtime/grpc/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cmake build file for C++ paraformer example.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building paraformer.
+
+cmake_minimum_required(VERSION 3.10)
+
+project(ASR C CXX)
+
+include(common.cmake)
+
+# Proto file
+get_filename_component(rg_proto "../python/grpc/proto/paraformer.proto" ABSOLUTE)
+get_filename_component(rg_proto_path "${rg_proto}" PATH)
+
+# Generated sources
+set(rg_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.cc")
+set(rg_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.h")
+set(rg_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.cc")
+set(rg_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.h")
+add_custom_command(
+      OUTPUT "${rg_proto_srcs}" "${rg_proto_hdrs}" "${rg_grpc_srcs}" "${rg_grpc_hdrs}"
+      COMMAND ${_PROTOBUF_PROTOC}
+      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+        -I "${rg_proto_path}"
+        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+        "${rg_proto}"
+      DEPENDS "${rg_proto}")
+
+
+# Include generated *.pb.h files
+include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+
+include_directories(../onnxruntime/include/)
+link_directories(../onnxruntime/build/src/)
+link_directories(../onnxruntime/build/third_party/webrtc/)
+
+link_directories(${ONNXRUNTIME_DIR}/lib)
+add_subdirectory("../onnxruntime/src" onnx_src)
+
+# rg_grpc_proto
+add_library(rg_grpc_proto
+  ${rg_grpc_srcs}
+  ${rg_grpc_hdrs}
+  ${rg_proto_srcs}
+  ${rg_proto_hdrs})
+
+
+
+target_link_libraries(rg_grpc_proto
+  ${_REFLECTION}
+  ${_GRPC_GRPCPP}
+  ${_PROTOBUF_LIBPROTOBUF})
+
+# Targets paraformer_(server)
+foreach(_target
+  paraformer_server)
+  add_executable(${_target}
+    "${_target}.cc")
+  target_link_libraries(${_target}
+    rg_grpc_proto
+    rapidasr
+    webrtcvad
+    ${EXTRA_LIBS}
+    ${_REFLECTION}
+    ${_GRPC_GRPCPP}
+    ${_PROTOBUF_LIBPROTOBUF})
+endforeach()
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
new file mode 100644
index 0000000..80e55aa
--- /dev/null
+++ b/funasr/runtime/grpc/Readme.md
@@ -0,0 +1,57 @@
+## Paraformer grpc onnx server in C++
+
+
+#### Step 1. Build ../onnxruntime as described in its documentation
+```
+# put the onnx lib, the onnx asr model and vocab.txt into /path/to/asrmodel (e.g. /data/asrmodel)
+ls /data/asrmodel/
+onnxruntime-linux-x64-1.14.0  speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+
+file /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/vocab.txt
+UTF-8 Unicode text
+```
+
+#### Step 2. Compile and install grpc v1.52.0 (to avoid known grpc bugs)
+```
+export GRPC_INSTALL_DIR=/data/soft/grpc
+export PKG_CONFIG_PATH=$GRPC_INSTALL_DIR/lib/pkgconfig
+
+git clone -b v1.52.0 --depth=1  https://github.com/grpc/grpc.git
+cd grpc
+git submodule update --init --recursive
+
+mkdir -p cmake/build
+pushd cmake/build
+cmake -DgRPC_INSTALL=ON \
+      -DgRPC_BUILD_TESTS=OFF \
+      -DCMAKE_INSTALL_PREFIX=$GRPC_INSTALL_DIR \
+      ../..
+make
+make install
+popd
+
+echo "export GRPC_INSTALL_DIR=/data/soft/grpc" >> ~/.bashrc
+echo "export PKG_CONFIG_PATH=\$GRPC_INSTALL_DIR/lib/pkgconfig" >> ~/.bashrc
+echo "export PATH=\$GRPC_INSTALL_DIR/bin/:\$PKG_CONFIG_PATH:\$PATH" >> ~/.bashrc
+source ~/.bashrc
+```
+
+#### Step 3. Compile the grpc onnx paraformer server
+```
+# set -DONNXRUNTIME_DIR=/path/to/asrmodel/onnxruntime-linux-x64-1.14.0
+./rebuild.sh
+```
+
+#### Step 4. Start the grpc paraformer server
+```
+Usage: ./cmake/build/paraformer_server port thread_num /path/to/model_file
+./cmake/build/paraformer_server 10108 4 /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+```
+
+
+
+#### Step 5. Start the python grpc paraformer client on a PC with a microphone
+```
+cd ../python/grpc
+python grpc_main_client_mic.py  --host $server_ip --port 10108
+```
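+
+If you prefer to script your own client, the following minimal sketch shows the message flow the server implements above: stream PCM chunks with `speaking=True`, send one message with `speaking=False` to trigger decoding, then end with `isEnd=True`. It assumes python stubs generated from `../python/grpc/proto/paraformer.proto` via `grpcio-tools` (the module names `paraformer_pb2`/`paraformer_pb2_grpc` and the exact `isEnd` field spelling are assumptions; check your generated code).
+
+```python
+import grpc
+
+import paraformer_pb2 as pb
+import paraformer_pb2_grpc as pb_grpc
+
+
+def request_stream(pcm_bytes, user="demo", language="zh-CN"):
+    # stream 16k/16bit PCM in ~300 ms chunks while "speaking"
+    for i in range(0, len(pcm_bytes), 9600):
+        yield pb.Request(user=user, language=language, speaking=True,
+                         isEnd=False, audio_data=pcm_bytes[i:i + 9600])
+    # speaking=False asks the server to decode the buffered audio
+    yield pb.Request(user=user, language=language, speaking=False, isEnd=False)
+    # isEnd=True terminates the session and clears server-side buffers
+    yield pb.Request(user=user, language=language, speaking=False, isEnd=True)
+
+
+with grpc.insecure_channel("127.0.0.1:10108") as channel:
+    stub = pb_grpc.ASRStub(channel)
+    with open("test.pcm", "rb") as f:
+        for resp in stub.Recognize(request_stream(f.read())):
+            print(resp.action, resp.sentence)
+```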
diff --git a/funasr/runtime/grpc/common.cmake b/funasr/runtime/grpc/common.cmake
new file mode 100644
index 0000000..1326a5b
--- /dev/null
+++ b/funasr/runtime/grpc/common.cmake
@@ -0,0 +1,125 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cmake build file for C++ route_guide example.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building route_guide.
+
+cmake_minimum_required(VERSION 3.5.1)
+
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+  set (CMAKE_CXX_STANDARD 14)
+endif()
+
+if(MSVC)
+  add_definitions(-D_WIN32_WINNT=0x600)
+endif()
+
+find_package(Threads REQUIRED)
+
+if(GRPC_AS_SUBMODULE)
+  # One way to build a project that uses gRPC is to just include the
+  # entire gRPC project tree via "add_subdirectory".
+  # This approach is very simple to use, but there are some potential
+  # disadvantages:
+  # * it includes gRPC's CMakeLists.txt directly into your build script,
+  #   and that can make gRPC's internal settings interfere with your
+  #   own build.
+  # * depending on what's installed on your system, the contents of submodules
+  #   in gRPC's third_party/* might need to be available (and there might be
+  #   additional prerequisites required to build them). Consider using
+  #   the gRPC_*_PROVIDER options to fine-tune the expected behavior.
+  #
+  # A more robust approach to add dependency on gRPC is using
+  # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt).
+
+  # Include the gRPC's cmake build (normally grpc source code would live
+  # in a git submodule called "third_party/grpc", but this example lives in
+  # the same repository as gRPC sources, so we just look a few directories up)
+  add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL)
+  message(STATUS "Using gRPC via add_subdirectory.")
+
+  # After using add_subdirectory, we can now use the grpc targets directly from
+  # this build.
+  set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+  set(_REFLECTION grpc++_reflection)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_PROTOBUF_PROTOC protoc)
+  else()
+    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+  endif()
+  set(_GRPC_GRPCPP grpc++)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+  else()
+    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
+  endif()
+elseif(GRPC_FETCHCONTENT)
+  # Another way is to use CMake's FetchContent module to clone gRPC at
+  # configure time. This makes gRPC's source code available to your project,
+  # similar to a git submodule.
+  message(STATUS "Using gRPC via add_subdirectory (FetchContent).")
+  include(FetchContent)
+  FetchContent_Declare(
+    grpc
+    GIT_REPOSITORY https://github.com/grpc/grpc.git
+    # when using gRPC, you will actually set this to an existing tag, such as
+    # v1.25.0, v1.26.0 etc..
+    # For the purpose of testing, we override the tag used to the commit
+    # that's currently under test.
+    GIT_TAG        vGRPC_TAG_VERSION_OF_YOUR_CHOICE)
+  FetchContent_MakeAvailable(grpc)
+
+  # Since FetchContent uses add_subdirectory under the hood, we can use
+  # the grpc targets directly from this build.
+  set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+  set(_REFLECTION grpc++_reflection)
+  set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>)
+  set(_GRPC_GRPCPP grpc++)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+  else()
+    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
+  endif()
+else()
+  # This branch assumes that gRPC and all its dependencies are already installed
+  # on this system, so they can be located by find_package().
+
+  # Find Protobuf installation
+  # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
+  set(protobuf_MODULE_COMPATIBLE TRUE)
+  find_package(Protobuf CONFIG REQUIRED)
+  message(STATUS "Using protobuf ${Protobuf_VERSION}")
+
+  set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
+  set(_REFLECTION gRPC::grpc++_reflection)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_PROTOBUF_PROTOC protoc)
+  else()
+    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+  endif()
+
+  # Find gRPC installation
+  # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
+  find_package(gRPC CONFIG REQUIRED)
+  message(STATUS "Using gRPC ${gRPC_VERSION}")
+
+  set(_GRPC_GRPCPP gRPC::grpc++)
+  if(CMAKE_CROSSCOMPILING)
+    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+  else()
+    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
+  endif()
+endif()
diff --git a/funasr/runtime/grpc/paraformer_server.cc b/funasr/runtime/grpc/paraformer_server.cc
new file mode 100644
index 0000000..e5814a5
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.cc
@@ -0,0 +1,195 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <memory>
+#include <string>
+
+#include <grpc/grpc.h>
+#include <grpcpp/server.h>
+#include <grpcpp/server_builder.h>
+#include <grpcpp/server_context.h>
+#include <grpcpp/security/server_credentials.h>
+
+#include "paraformer.grpc.pb.h"
+#include "paraformer_server.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+ASRServicer::ASRServicer(const char* model_path, int thread_num) {
+    AsrHanlde = RapidAsrInit(model_path, thread_num);
+    std::cout << "ASRServicer init" << std::endl;
+    init_flag = 0;
+}
+
+void ASRServicer::clear_states(const std::string& user) {
+    clear_buffers(user);
+    clear_transcriptions(user);
+}
+
+void ASRServicer::clear_buffers(const std::string& user) {
+    if (client_buffers.count(user)) {
+        client_buffers.erase(user);
+    }
+}
+
+void ASRServicer::clear_transcriptions(const std::string& user) {
+    if (client_transcription.count(user)) {
+        client_transcription.erase(user);
+    }
+}
+
+void ASRServicer::disconnect(const std::string& user) {
+    clear_states(user);
+    std::cout << "Disconnecting user: " << user << std::endl;
+}
+
+grpc::Status ASRServicer::Recognize(
+    grpc::ServerContext* context,
+    grpc::ServerReaderWriter<Response, Request>* stream) {
+
+    Request req;
+    while (stream->Read(&req)) {
+        if (req.isend()) {
+            std::cout << "asr end" << std::endl;
+            disconnect(req.user());
+            Response res;
+            res.set_sentence(
+                R"({"success": true, "detail": "asr end"})"
+            );
+            res.set_user(req.user());
+            res.set_action("terminate");
+            res.set_language(req.language());
+            stream->Write(res);
+        } else if (req.speaking()) {
+            if (req.audio_data().size() > 0) {
+                auto& buf = client_buffers[req.user()];
+                buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
+            }
+            Response res;
+            res.set_sentence(
+                R"({"success": true, "detail": "speaking"})"
+            );
+            res.set_user(req.user());
+            res.set_action("speaking");
+            res.set_language(req.language());
+            stream->Write(res);
+        } else if (!req.speaking()) {
+            if (client_buffers.count(req.user()) == 0) {
+                Response res;
+                res.set_sentence(
+                    R"({"success": true, "detail": "waiting_for_voice"})"
+                );
+                res.set_user(req.user());
+                res.set_action("waiting");
+                res.set_language(req.language());
+                stream->Write(res);
+            } else {
+                auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                std::string tmp_data = this->client_buffers[req.user()];
+                this->clear_states(req.user());
+                
+                Response res;
+                int data_len_int = tmp_data.length();
+                std::string data_len = std::to_string(data_len_int);
+                // build the JSON payload once; the raw string pieces keep the quotes balanced
+                std::stringstream ss;
+                ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes"})";
+                res.set_sentence(ss.str());
+                res.set_user(req.user());
+                res.set_action("decoding");
+                res.set_language(req.language());
+                stream->Write(res);
+                if (tmp_data.length() < 800) { // minimum input length for the asr model
+                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                    std::string delay_str = std::to_string(end_time - begin_time);
+                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", error: data_is_not_long_enough" << std::endl;
+                    Response res;
+                    std::stringstream ss;
+                    std::string asr_result = "";
+                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+                    std::string result = ss.str();
+                    res.set_sentence(result);
+                    res.set_user(req.user());
+                    res.set_action("finish");
+                    res.set_language(req.language());
+                    stream->Write(res);
+                }
+                else {
+                    RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);   
+                    std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;
+
+                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                    std::string delay_str = std::to_string(end_time - begin_time);
+                    
+                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
+                    Response res;
+                    std::stringstream ss;
+                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+                    std::string result = ss.str();
+                    res.set_sentence(result);
+                    res.set_user(req.user());
+                    res.set_action("finish");
+                    res.set_language(req.language());
+                    stream->Write(res);
+                }
+            }
+        } else {
+            Response res;
+            res.set_sentence(
+                R"({"success": false, "detail": "error, no condition matched! Unknown reason."})"
+            );
+            res.set_user(req.user());
+            res.set_action("terminate");
+            res.set_language(req.language());
+            stream->Write(res);
+        }
+    }    
+    return Status::OK;
+}
+
+
+void RunServer(const std::string& port, int thread_num, const char* model_path) {
+    std::string server_address;
+    server_address = "0.0.0.0:" + port;
+    ASRServicer service(model_path, thread_num);
+
+    ServerBuilder builder;
+    builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+    builder.RegisterService(&service);
+    std::unique_ptr<Server> server(builder.BuildAndStart());
+    std::cout << "Server listening on " << server_address << std::endl;
+    server->Wait();
+}
+
+int main(int argc, char* argv[]) {
+    if (argc < 3)
+    {
+        printf("Usage: %s port thread_num /path/to/model_file\n", argv[0]);
+        exit(-1);
+    }
+
+    RunServer(argv[1], atoi(argv[2]), argv[3]);
+    return 0;
+}
diff --git a/funasr/runtime/grpc/paraformer_server.h b/funasr/runtime/grpc/paraformer_server.h
new file mode 100644
index 0000000..f356d94
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.h
@@ -0,0 +1,56 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpc/grpc.h>
+#include <grpcpp/server.h>
+#include <grpcpp/server_builder.h>
+#include <grpcpp/server_context.h>
+#include <grpcpp/security/server_credentials.h>
+
+#include <unordered_map>
+#include <chrono>
+
+#include "paraformer.grpc.pb.h"
+#include "librapidasrapi.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+typedef struct
+{
+    std::string msg;
+    float  snippet_time;
+}RPASR_RECOG_RESULT;
+
+
+class ASRServicer final : public ASR::Service {
+  private:
+    int init_flag;
+    std::unordered_map<std::string, std::string> client_buffers;
+    std::unordered_map<std::string, std::string> client_transcription;
+
+  public:
+    ASRServicer(const char* model_path, int thread_num);
+    void clear_states(const std::string& user);
+    void clear_buffers(const std::string& user);
+    void clear_transcriptions(const std::string& user);
+    void disconnect(const std::string& user);
+    grpc::Status Recognize(grpc::ServerContext* context, grpc::ServerReaderWriter<Response, Request>* stream);
+    RPASR_HANDLE AsrHanlde;
+};
diff --git a/funasr/runtime/grpc/rebuild.sh b/funasr/runtime/grpc/rebuild.sh
new file mode 100644
index 0000000..9b41ed6
--- /dev/null
+++ b/funasr/runtime/grpc/rebuild.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+
+rm -rf cmake
+mkdir -p cmake/build
+
+cd cmake/build
+
+# point -DONNXRUNTIME_DIR at your onnxruntime directory
+cmake -DCMAKE_BUILD_TYPE=release -DONNXRUNTIME_DIR=/data/asrmodel/onnxruntime-linux-x64-1.14.0 ../..
+make
+
+echo "Built cmake/build/paraformer_server successfully!"
diff --git a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md
index fa2f276..41c63c6 100644
--- a/funasr/runtime/onnxruntime/readme.md
+++ b/funasr/runtime/onnxruntime/readme.md
@@ -41,8 +41,8 @@
 ```
 Export the onnx model ([details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)); for example, to export a model from modelscope:
 
-```
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+```shell
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
 ```
 
 ## Building Guidance for Linux/Unix
diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
index 43dfb6b..53bf9d0 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -237,7 +237,7 @@
 
     size_t nOffset = 0;
 
-#define WAV_HEADER_SIZE 44
+
 
     speech_len = nBufLen / 2;
     speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@
             speech_data[i] = (float)speech_buff[i] / scale;
         }
 
-
+        AudioFrame* frame = new AudioFrame(speech_len);
+        frame_queue.push(frame);
         return true;
 
     }
diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
index 1f8f7ca..f5f9d66 100644
--- a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
+++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -26,8 +26,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadwav(szBuf,nLen);
-		audio.split();
+		if (!audio.loadwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -58,8 +59,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szBuf, nLen);
-		audio.split();
+		if (!audio.loadpcmwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -91,8 +93,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szFileName);
-		audio.split();
+		if (!audio.loadpcmwav(szFileName))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -125,7 +128,7 @@
 		Audio audio(1);
 		if(!audio.loadwav(szWavfile))
 			return nullptr;
-		audio.split();
+		//audio.split();
 
 		float* buff;
 		int len;
diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/tester/tester.cpp
index b9a85b7..ba5c61c 100644
--- a/funasr/runtime/onnxruntime/tester/tester.cpp
+++ b/funasr/runtime/onnxruntime/tester/tester.cpp
@@ -8,7 +8,7 @@
 #include "librapidasrapi.h"
 
 #include <iostream>
-
+#include <fstream>
 using namespace std;
 
 int main(int argc, char *argv[])
@@ -40,10 +40,13 @@
 
 
     gettimeofday(&start, NULL);
-
-    RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
-    gettimeofday(&end, NULL);
     float snippet_time = 0.0f;
+
+    RPASR_RESULT Result = RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    gettimeofday(&end, NULL);
+
     if (Result)
     {
         string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@
     }
     else
     {
-        cout <<("no return data!");
+        cout <<"no return data!";
     }
-  
-    printf("Audio length %lfs.\n", (double)snippet_time);
+ 
+ 
+    //char* buff = nullptr;
+    //int len = 0;
+    //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
+    //if (ifs.is_open())
+    //{
+    //    ifs.seekg(0, std::ios::end);
+    //    len = ifs.tellg();
+    //    ifs.seekg(0, std::ios::beg);
 
+    //    buff = new char[len];
+
+    //    ifs.read(buff, len);
+
+
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    //    RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+    //    gettimeofday(&end, NULL);
+    //   
+    //    if (Result)
+    //    {
+    //        string msg = RapidAsrGetResult(Result, 0);
+    //        setbuf(stdout, NULL);
+    //        cout << "Result: \"";
+    //        cout << msg << endl;
+    //        cout << "\"." << endl;
+    //        snippet_time = RapidAsrGetRetSnippetTime(Result);
+    //        RapidAsrFreeResult(Result);
+    //    }
+    //    else
+    //    {
+    //        cout <<"no return data!";
+    //    }
+  
+    //   
+    //delete[]buff;
+    //}
+
+ 
+    printf("Audio length %lfs.\n", (double)snippet_time);
     seconds = (end.tv_sec - start.tv_sec);
     long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
     printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);
diff --git a/funasr/runtime/python/benchmark_libtorch.md b/funasr/runtime/python/benchmark_libtorch.md
new file mode 100644
index 0000000..6c068fe
--- /dev/null
+++ b/funasr/runtime/python/benchmark_libtorch.md
@@ -0,0 +1,45 @@
+# Benchmark 
+
+### Data set:
+Aishell1 [test set](https://www.openslr.org/33/), the total audio duration is 36108.919 seconds.
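+RTF here is the total processing time divided by the total audio duration, and the speedup rate is its reciprocal (e.g. 3522 s / 36108.919 s ≈ 0.0976, a speedup of about 10.3).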
+
+### Tools
+- Install ModelScope and FunASR
+
+    ```shell
+    pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+    pip install --editable ./
+    cd funasr/runtime/python/utils
+    pip install -r requirements.txt
+    ```
+
+- Recipe
+
+    Set the model, data path and output_dir in the script, then run:
+
+    ```shell
+    nohup bash test_rtf.sh &> log.txt &
+    ```
+
+
+
+## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) 
+
+
+### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+| 1 (torch fp32)   |        3522        | 0.0976 |     10.3     |
+|  1 (torch int8)  |        1746        | 0.0484 |     20.7     |
+| 32 (torch fp32)  |        236         | 0.0066 |    152.7     |
+| 32 (torch int8)  |        114         | 0.0032 |    317.4     |
+| 64 (torch fp32)  |        235         | 0.0065 |    153.7     |
+| 64 (torch int8)  |        113         | 0.0031 |    319.2     |
+
+
+[//]: # (### Intel&#40;R&#41; Xeon&#40;R&#41; Platinum 8163 CPU @ 2.50GHz    32core-64processor   without avx512_vnni)
+
+
+## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
diff --git a/funasr/runtime/python/benchmark_onnx.md b/funasr/runtime/python/benchmark_onnx.md
new file mode 100644
index 0000000..ca7556b
--- /dev/null
+++ b/funasr/runtime/python/benchmark_onnx.md
@@ -0,0 +1,89 @@
+# Benchmark 
+
+### Data set:
+Aishell1 [test set](https://www.openslr.org/33/), the total audio duration is 36108.919 seconds.
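+RTF here is the total processing time divided by the total audio duration, and the speedup rate is its reciprocal (e.g. 2806 s / 36108.919 s ≈ 0.0777, a speedup of about 12.9).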
+
+### Tools
+- Install ModelScope and FunASR
+
+    ```shell
+    pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+    git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+    pip install --editable ./
+    cd funasr/runtime/python/utils
+    pip install -r requirements.txt
+    ```
+
+- Recipe
+
+    Set the model, data path and output_dir in the script, then run:
+
+    ```shell
+    nohup bash test_rtf.sh &> log.txt &
+    ```
+
+
+## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) 
+
+### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |   RTF   | Speedup Rate |
+|:----------------:|:------------------:|:-------:|:------------:|
+|  1 (onnx fp32)   |        2806        | 0.0777  |     12.9     |
+|  1 (onnx int8)   |        1611        | 0.0446  |     22.4     |
+|  8 (onnx fp32)   |        538         | 0.0149  |     67.1     |
+|  8 (onnx int8)   |        210         | 0.0058  |    172.4     |
+|  16 (onnx fp32)  |        288         | 0.0080  |    125.2     |
+|  16 (onnx int8)  |        117         | 0.0032  |    309.9     |
+|  32 (onnx fp32)  |        167         | 0.0046  |    216.5     |
+|  32 (onnx int8)  |         86         | 0.0024  |    420.0     |
+|  64 (onnx fp32)  |        158         | 0.0044  |    228.1     |
+|  64 (onnx int8)  |         82         | 0.0023  |    442.8     |
+|  96 (onnx fp32)  |        151         | 0.0042  |    238.0     |
+|  96 (onnx int8)  |         80         | 0.0022  |    452.0     |
+
+
+### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+|  1 (onnx fp32)   |        2613        | 0.0724 |     13.8     |
+|  1 (onnx int8)   |        1321        | 0.0366 |     22.4     |
+|  32 (onnx fp32)  |        170         | 0.0047 |    212.7     |
+|  32 (onnx int8)  |        89          | 0.0025 |    407.0     |
+|  64 (onnx fp32)  |        166         | 0.0046 |    217.1     |
+|  64 (onnx int8)  |         87         | 0.0024 |    414.7     |
+
+
+### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz    32core-64processor   without avx512_vnni
+
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+|  1 (onnx fp32)   |        2959        | 0.0820 |     12.2     |
+|  1 (onnx int8)   |        2814        | 0.0778 |     12.8     |
+|  16 (onnx fp32)  |        373         | 0.0103 |     96.9     |
+|  16 (onnx int8)  |        331         | 0.0091 |    109.0     |
+|  32 (onnx fp32)  |        211         | 0.0058 |    171.4     |
+|  32 (onnx int8)  |        181         | 0.0050 |    200.0     |
+|  64 (onnx fp32)  |        153         | 0.0042 |    235.9     |
+|  64 (onnx int8)  |        103         | 0.0029 |    349.9     |
+|  96 (onnx fp32)  |        146         | 0.0041 |    247.0     |
+|  96 (onnx int8)  |        108         | 0.0030 |    334.1     |
+
+## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
+
+### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+|  1 (onnx fp32)   |        1173        | 0.0325 |     30.8     |
+|  1 (onnx int8)   |        976         | 0.0270 |     37.0     |
+|  16 (onnx fp32)  |         91         | 0.0025 |    395.2     |
+|  16 (onnx int8)  |         78         | 0.0022 |    463.0     |
+|  32 (onnx fp32)  |         60         | 0.0017 |    598.8     |
+|  32 (onnx int8)  |         40         | 0.0011 |    892.9     |
+|  64 (onnx fp32)  |         55         | 0.0015 |    653.6     |
+|  64 (onnx int8)  |         31         | 0.0009 |    1162.8    |
+|  96 (onnx fp32)  |         57         | 0.0016 |    632.9     |
+|  96 (onnx int8)  |         33         | 0.0009 |    1098.9    |
diff --git a/funasr/runtime/python/grpc/grpc_main_server.py b/funasr/runtime/python/grpc/grpc_main_server.py
index e862ac4..ae386fa 100644
--- a/funasr/runtime/python/grpc/grpc_main_server.py
+++ b/funasr/runtime/python/grpc/grpc_main_server.py
@@ -10,7 +10,7 @@
                         # interceptors=(AuthInterceptor('Bearer mysecrettoken'),)
                            )
       paraformer_pb2_grpc.add_ASRServicer_to_server(
-          ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir), server)
+          ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir, vad_model=args.vad_model, punc_model=args.punc_model), server)
       port = "[::]:" + str(args.port)
       server.add_insecure_port(port)
       server.start()
@@ -34,7 +34,16 @@
                         type=str,
                         default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                         help="model from modelscope")
-                        
+    parser.add_argument("--vad_model",
+                        type=str,
+                        default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+                        help="VAD model from modelscope")
+    
+    parser.add_argument("--punc_model",
+                        type=str,
+                        default="",
+                        help="punctuation model from modelscope")
+    
     parser.add_argument("--sample_rate",
                         type=int,
                         default=16000,
@@ -50,6 +59,7 @@
                         type=str,
                         default="/nfs/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                         help="onnx model dir")
+    
                         
 
 
diff --git a/funasr/runtime/python/grpc/grpc_server.py b/funasr/runtime/python/grpc/grpc_server.py
index 95fe96c..0fdf30c 100644
--- a/funasr/runtime/python/grpc/grpc_server.py
+++ b/funasr/runtime/python/grpc/grpc_server.py
@@ -8,7 +8,7 @@
 
 
 class ASRServicer(paraformer_pb2_grpc.ASRServicer):
-    def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir):
+    def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir, vad_model='', punc_model=''):
         print("ASRServicer init")
         self.backend = backend
         self.init_flag = 0
@@ -21,7 +21,7 @@
                 from modelscope.utils.constant import Tasks
             except ImportError:
                 raise ImportError(f"Please install modelscope")
-            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model)
+            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model)
         elif self.backend == "onnxruntime":
             try:
                 from rapid_paraformer.paraformer_onnx import Paraformer
diff --git a/funasr/runtime/python/libtorch/README.md b/funasr/runtime/python/libtorch/README.md
index 1e2d919..cf5bbcc 100644
--- a/funasr/runtime/python/libtorch/README.md
+++ b/funasr/runtime/python/libtorch/README.md
@@ -19,11 +19,11 @@
 
        - `e.g.`, Export model from modelscope
          ```shell
-         python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+         python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
          ```
       - `e.g.`, Export model from local path; the model file must be named `model.pb`.
          ```shell
-         python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+         python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
          ```
 
 
diff --git a/funasr/runtime/python/libtorch/setup.py b/funasr/runtime/python/libtorch/setup.py
index 0f9e40d..c50e497 100644
--- a/funasr/runtime/python/libtorch/setup.py
+++ b/funasr/runtime/python/libtorch/setup.py
@@ -28,7 +28,7 @@
     install_requires=["librosa", "onnxruntime>=1.7.0",
                       "scipy", "numpy>=1.19.3",
                       "typeguard", "kaldi-native-fbank",
-                      "PyYAML>=5.1.2"],
+                      "PyYAML>=5.1.2", "torch-quant >= 0.4.0"],
     packages=find_packages(include=["torch_paraformer*"]),
     keywords=[
         'funasr,paraformer'
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py b/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
index 3545ccf..3c0606d 100644
--- a/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
+++ b/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
@@ -24,12 +24,16 @@
                  device_id: Union[str, int] = "-1",
                  plot_timestamp_to: str = "",
                  pred_bias: int = 1,
+                 quantize: bool = False,
+                 intra_op_num_threads: int = 1,
                  ):
 
         if not Path(model_dir).exists():
             raise FileNotFoundError(f'{model_dir} does not exist.')
 
         model_file = os.path.join(model_dir, 'model.torchscripts')
+        if quantize:
+            model_file = os.path.join(model_dir, 'model_quant.torchscripts')
         config_file = os.path.join(model_dir, 'config.yaml')
         cmvn_file = os.path.join(model_dir, 'am.mvn')
         config = read_yaml(config_file)
@@ -58,26 +62,28 @@
                 am_scores, valid_token_lens = outputs[0], outputs[1]
                 if len(outputs) == 4:
                     # for BiCifParaformer Inference
-                    us_alphas, us_cif_peak = outputs[2], outputs[3]
+                    us_alphas, us_peaks = outputs[2], outputs[3]
                 else:
-                    us_alphas, us_cif_peak = None, None
+                    us_alphas, us_peaks = None, None
             except:
                 #logging.warning(traceback.format_exc())
                 logging.warning("input wav is silence or noise")
                 preds = ['']
             else:
-                am_scores, valid_token_lens = am_scores.detach().cpu().numpy(), valid_token_lens.detach().cpu().numpy()
                 preds = self.decode(am_scores, valid_token_lens)
-                if us_cif_peak is None:
+                if us_peaks is None:
                     for pred in preds:
+                        pred = sentence_postprocess(pred)
                         asr_res.append({'preds': pred})
                 else:
-                    for pred, us_cif_peak_ in zip(preds, us_cif_peak):
-                        text, tokens = pred
-                        timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
+                    for pred, us_peaks_ in zip(preds, us_peaks):
+                        raw_tokens = pred
+                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
+                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
+                        # logging.warning(timestamp)
                         if len(self.plot_timestamp_to):
-                            self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
-                        asr_res.append({'preds': text, 'timestamp': timestamp})
+                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
+                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
         return asr_res
 
     def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -178,6 +184,6 @@
         # Change integer-ids to tokens
         token = self.converter.ids2tokens(token_int)
         token = token[:valid_token_num-self.pred_bias]
-        texts = sentence_postprocess(token)
-        return texts
+        # texts = sentence_postprocess(token)
+        return token
 
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py b/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py
new file mode 100755
index 0000000..349a3f6
--- /dev/null
+++ b/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py
@@ -0,0 +1,157 @@
+import os
+import numpy as np
+import sys
+
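+# Align each hypothesis line against the reference line sharing the same
+# utterance key (Levenshtein distance with backtrace), then report
+# corpus-level %WER/%SER and write a per-utterance breakdown to cer_detail_file.
+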
+def compute_wer(ref_file,
+                hyp_file,
+                cer_detail_file):
+    rst = {
+        'Wrd': 0,
+        'Corr': 0,
+        'Ins': 0,
+        'Del': 0,
+        'Sub': 0,
+        'Snt': 0,
+        'Err': 0.0,
+        'S.Err': 0.0,
+        'wrong_words': 0,
+        'wrong_sentences': 0
+    }
+
+    hyp_dict = {}
+    ref_dict = {}
+    with open(hyp_file, 'r') as hyp_reader:
+        for line in hyp_reader:
+            key = line.strip().split()[0]
+            value = line.strip().split()[1:]
+            hyp_dict[key] = value
+    with open(ref_file, 'r') as ref_reader:
+        for line in ref_reader:
+            key = line.strip().split()[0]
+            value = line.strip().split()[1:]
+            ref_dict[key] = value
+
+    cer_detail_writer = open(cer_detail_file, 'w')
+    for hyp_key in hyp_dict:
+        if hyp_key in ref_dict:
+           out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
+           rst['Wrd'] += out_item['nwords']
+           rst['Corr'] += out_item['cor']
+           rst['wrong_words'] += out_item['wrong']
+           rst['Ins'] += out_item['ins']
+           rst['Del'] += out_item['del']
+           rst['Sub'] += out_item['sub']
+           rst['Snt'] += 1
+           if out_item['wrong'] > 0:
+               rst['wrong_sentences'] += 1
+           cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
+           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
+           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
+
+    if rst['Wrd'] > 0:
+        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
+    if rst['Snt'] > 0:
+        rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)
+
+    cer_detail_writer.write('\n')
+    cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words'])+ " / " + str(rst['Wrd']) +
+                            ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
+    cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
+    cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')
+
+     
+def compute_wer_by_line(hyp,
+                        ref):
+    hyp = list(map(lambda x: x.lower(), hyp))
+    ref = list(map(lambda x: x.lower(), ref))
+
+    len_hyp = len(hyp)
+    len_ref = len(ref)
+
+    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)
+
+    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)
+
+    for i in range(len_hyp + 1):
+        cost_matrix[i][0] = i
+    for j in range(len_ref + 1):
+        cost_matrix[0][j] = j
+
+    for i in range(1, len_hyp + 1):
+        for j in range(1, len_ref + 1):
+            if hyp[i - 1] == ref[j - 1]:
+                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
+            else:
+                substitution = cost_matrix[i - 1][j - 1] + 1
+                insertion = cost_matrix[i - 1][j] + 1
+                deletion = cost_matrix[i][j - 1] + 1
+
+                compare_val = [substitution, insertion, deletion]
+
+                min_val = min(compare_val)
+                operation_idx = compare_val.index(min_val) + 1
+                cost_matrix[i][j] = min_val
+                ops_matrix[i][j] = operation_idx
+
+    match_idx = []
+    i = len_hyp
+    j = len_ref
+    rst = {
+        'nwords': len_ref,
+        'cor': 0,
+        'wrong': 0,
+        'ins': 0,
+        'del': 0,
+        'sub': 0
+    }
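+    # Backtrace from the bottom-right corner of the cost matrix; ops codes:
+    # 0 = match (diagonal), 1 = substitution, 2 = insertion (extra hyp token),
+    # 3 = deletion (ref token absent from hyp).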
+    while i >= 0 or j >= 0:
+        i_idx = max(0, i)
+        j_idx = max(0, j)
+
+        if ops_matrix[i_idx][j_idx] == 0:  # correct
+            if i - 1 >= 0 and j - 1 >= 0:
+                match_idx.append((j - 1, i - 1))
+                rst['cor'] += 1
+
+            i -= 1
+            j -= 1
+
+        elif ops_matrix[i_idx][j_idx] == 2:  # insert
+            i -= 1
+            rst['ins'] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 3:  # delete
+            j -= 1
+            rst['del'] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
+            i -= 1
+            j -= 1
+            rst['sub'] += 1
+
+        if i < 0 and j >= 0:
+            rst['del'] += 1
+        elif j < 0 and i >= 0:
+            rst['ins'] += 1
+
+    match_idx.reverse()
+    wrong_cnt = cost_matrix[len_hyp][len_ref]
+    rst['wrong'] = wrong_cnt
+
+    return rst
+
+def print_cer_detail(rst):
+    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
+            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
+            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(rst['cor']/rst['nwords'])
+            + ",cer:" + '{:.2%}'.format(rst['wrong']/rst['nwords']))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 4:
+        print("usage : python compute-wer.py test.ref test.hyp test.wer")
+        sys.exit(0)
+
+    ref_file = sys.argv[1]
+    hyp_file = sys.argv[2]
+    cer_detail_file = sys.argv[3]
+    compute_wer(ref_file, hyp_file, cer_detail_file)
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py b/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
index 767e864..3a01812 100644
--- a/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
+++ b/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
@@ -1,11 +1,11 @@
 import numpy as np
 
 
-def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0):
+def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
     if not len(char_list):
         return []
     START_END_THRESHOLD = 5
-    MAX_TOKEN_DURATION = 14
+    MAX_TOKEN_DURATION = 30
     TIME_RATE = 10.0 * 6 / 1000 / 3  #  3 times upsampled
     cif_peak = us_cif_peak.reshape(-1)
     num_frames = cif_peak.shape[-1]
@@ -16,7 +16,7 @@
     new_char_list = []
     # for bicif model trained with large data, cif2 actually fires when a character starts
     # so treat the frames between two peaks as the duration of the former token
-    fire_place = np.where(cif_peak>1.0-1e-4)[0] - 1.5  # np format
+    fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset  # np format
     num_peak = len(fire_place)
     assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
     # begin silence
@@ -27,7 +27,7 @@
     # tokens timestamp
     for i in range(len(fire_place)-1):
         new_char_list.append(char_list[i])
-        if MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
+        if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
             timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
         else:
             # cut the duration to token and sil of the 0-weight frames last long
@@ -48,11 +48,12 @@
             timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
             timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
     assert len(new_char_list) == len(timestamp_list)
-    res_total = []
+    res_str = ""
     for char, timestamp in zip(new_char_list, timestamp_list):
-        res_total.append([char, timestamp[0], timestamp[1]])  # += "{} {} {};".format(char, timestamp[0], timestamp[1])
+        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
     res = []
     for char, timestamp in zip(new_char_list, timestamp_list):
         if char != '<sil>':
             res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
-    return res, res_total
\ No newline at end of file
+    return res_str, res
+    
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index 6ed9849..e2a09f1 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -24,11 +24,11 @@
 
        - `e.g.`, Export model from modelscope
          ```shell
-         python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+         python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
          ```
       - `e.g.`, Export model from local path; the model file must be named `model.pb`.
          ```shell
-         python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+         python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
          ```
 
 
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
index 850f007..5567940 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
@@ -26,12 +26,16 @@
                  device_id: Union[str, int] = "-1",
                  plot_timestamp_to: str = "",
                  pred_bias: int = 1,
+                 quantize: bool = False,
+                 intra_op_num_threads: int = 4,
                  ):
 
         if not Path(model_dir).exists():
             raise FileNotFoundError(f'{model_dir} does not exist.')
 
         model_file = os.path.join(model_dir, 'model.onnx')
+        if quantize:
+            model_file = os.path.join(model_dir, 'model_quant.onnx')
         config_file = os.path.join(model_dir, 'config.yaml')
         cmvn_file = os.path.join(model_dir, 'am.mvn')
         config = read_yaml(config_file)
@@ -42,7 +46,7 @@
             cmvn_file=cmvn_file,
             **config['frontend_conf']
         )
-        self.ort_infer = OrtInferSession(model_file, device_id)
+        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
         self.batch_size = batch_size
         self.plot_timestamp_to = plot_timestamp_to
         self.pred_bias = pred_bias
@@ -60,25 +64,28 @@
                 am_scores, valid_token_lens = outputs[0], outputs[1]
                 if len(outputs) == 4:
                     # for BiCifParaformer Inference
-                    us_alphas, us_cif_peak = outputs[2], outputs[3]
+                    us_alphas, us_peaks = outputs[2], outputs[3]
                 else:
-                    us_alphas, us_cif_peak = None, None
+                    us_alphas, us_peaks = None, None
             except ONNXRuntimeError:
                 #logging.warning(traceback.format_exc())
                 logging.warning("input wav is silence or noise")
                 preds = ['']
             else:
                 preds = self.decode(am_scores, valid_token_lens)
-                if us_cif_peak is None:
+                if us_peaks is None:
                     for pred in preds:
+                        pred = sentence_postprocess(pred)
                         asr_res.append({'preds': pred})
                 else:
-                    for pred, us_cif_peak_ in zip(preds, us_cif_peak):
-                        text, tokens = pred
-                        timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
+                    for pred, us_peaks_ in zip(preds, us_peaks):
+                        raw_tokens = pred
+                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
+                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
+                        # logging.warning(timestamp)
                         if len(self.plot_timestamp_to):
-                            self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
-                        asr_res.append({'preds': text, 'timestamp': timestamp})
+                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
+                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
         return asr_res
 
     def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -177,6 +184,6 @@
         # Change integer-ids to tokens
         token = self.converter.ids2tokens(token_int)
         token = token[:valid_token_num-self.pred_bias]
-        texts = sentence_postprocess(token)
-        return texts
+        # texts = sentence_postprocess(token)
+        return token
 
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
index dd702f3..3a01812 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
@@ -48,12 +48,12 @@
             timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
             timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
     assert len(new_char_list) == len(timestamp_list)
-    res_total = []
+    res_str = ""
     for char, timestamp in zip(new_char_list, timestamp_list):
-        res_total.append([char, timestamp[0], timestamp[1]])  # += "{} {} {};".format(char, timestamp[0], timestamp[1])
+        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
     res = []
     for char, timestamp in zip(new_char_list, timestamp_list):
         if char != '<sil>':
             res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
-    return res, res_total
+    return res_str, res
     
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
index 392fe6b..2edde11 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
@@ -1,6 +1,5 @@
 # -*- encoding: utf-8 -*-
-# @Author: SWHL
-# @Contact: liekkaskono@163.com
+
 import functools
 import logging
 import pickle
@@ -147,10 +146,10 @@
 
 
 class OrtInferSession():
-    def __init__(self, model_file, device_id=-1):
+    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
         device_id = str(device_id)
         sess_opt = SessionOptions()
-        sess_opt.intra_op_num_threads = 4
+        sess_opt.intra_op_num_threads = intra_op_num_threads
         sess_opt.log_severity_level = 4
         sess_opt.enable_cpu_mem_arena = False
         sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index f062500..299910f 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -20,8 +20,8 @@
     version=VERSION_NUM,
     platforms="Any",
     description="Using paraformer with ONNXRuntime",
-    author="SWHL",
-    author_email="liekkaskono@163.com",
+    author="FunASR",
+    author_email="funasr@list.alibaba-inc.com",
     url="https://github.com/alibaba-damo-academy/FunASR",
     license='MIT',
     long_description=get_readme(),
diff --git a/funasr/runtime/python/utils/requirements.txt b/funasr/runtime/python/utils/requirements.txt
new file mode 100644
index 0000000..600eb80
--- /dev/null
+++ b/funasr/runtime/python/utils/requirements.txt
@@ -0,0 +1,2 @@
+onnx
+torch-quant >= 0.4.0
\ No newline at end of file
diff --git a/funasr/runtime/python/utils/split_scp.pl b/funasr/runtime/python/utils/split_scp.pl
new file mode 100755
index 0000000..0876dcb
--- /dev/null
+++ b/funasr/runtime/python/utils/split_scp.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# See ../../COPYING for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#  http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can.  If you use the utt2spk
+# option it will make sure these chunks coincide with speaker boundaries.  In
+# this case, if there are more chunks than speakers (and in some other
+# circumstances), some of the resulting chunks will be empty and it will print
+# an error message and exit with nonzero status.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+use warnings;
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+$one_based = 0;
+
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
+    if ($ARGV[0] eq "-j") {
+        shift @ARGV;
+        $num_jobs = shift @ARGV;
+        $job_id = shift @ARGV;
+    }
+    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
+        $utt2spk_file=$1;
+        shift;
+    }
+    if ($ARGV[0] eq '--one-based') {
+        $one_based = 1;
+        shift @ARGV;
+    }
+}
+
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+                       $job_id - $one_based >= $num_jobs)) {
+  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+      ($one_based ? " --one-based" : "") . "'\n"
+}
+
+$one_based
+    and $job_id--;
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+    die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+    @OUTPUTS = @ARGV;
+} else {
+    for ($j = 0; $j < $num_jobs; $j++) {
+        if ($j == $job_id) {
+            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+            else { push @OUTPUTS, "-"; }
+        } else {
+            push @OUTPUTS, "/dev/null";
+        }
+    }
+}
+
+if ($utt2spk_file ne "") {  # We have the --utt2spk option...
+    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+    while(<$u_fh>) {
+        @A = split;
+        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
+        ($u,$s) = @A;
+        $utt2spk{$u} = $s;
+    }
+    close $u_fh;
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+    @spkrs = ();
+    while(<$i_fh>) {
+        @A = split;
+        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
+        $u = $A[0];
+        $s = $utt2spk{$u};
+        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
+        if(!defined $spk_count{$s}) {
+            push @spkrs, $s;
+            $spk_count{$s} = 0;
+            $spk_data{$s} = [];  # ref to new empty array.
+        }
+        $spk_count{$s}++;
+        push @{$spk_data{$s}}, $_;
+    }
+    # Now split as equally as possible ..
+    # First allocate spks to files by allocating an approximately
+    # equal number of speakers.
+    $numspks = @spkrs;  # number of speakers.
+    $numscps = @OUTPUTS; # number of output files.
+    if ($numspks < $numscps) {
+      die "$0: Refusing to split data because number of speakers $numspks " .
+          "is less than the number of output .scp files $numscps\n";
+    }
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scparray[$scpidx] = []; # [] is array reference.
+    }
+    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+        $scpidx = int(($spkidx*$numscps) / $numspks);
+        $spk = $spkrs[$spkidx];
+        push @{$scparray[$scpidx]}, $spk;
+        $scpcount[$scpidx] += $spk_count{$spk};
+    }
+
+    # Now will try to reassign beginning + ending speakers
+    # to different scp's and see if it gets more balanced.
+    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+    # We can show that if considering changing just 2 scp's, we minimize
+    # this by minimizing the squared difference in sizes.  This is
+    # equivalent to minimizing the absolute difference in sizes.  This
+    # shows this method is bound to converge.
+
+    $changed = 1;
+    while($changed) {
+        $changed = 0;
+        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+            # First try to reassign ending spk of this scp.
+            if($scpidx < $numscps-1) {
+                $sz = @{$scparray[$scpidx]};
+                if($sz > 0) {
+                    $spk = $scparray[$scpidx]->[$sz-1];
+                    $count = $spk_count{$spk};
+                    $nutt1 = $scpcount[$scpidx];
+                    $nutt2 = $scpcount[$scpidx+1];
+                    if( abs( ($nutt2+$count) - ($nutt1-$count))
+                        < abs($nutt2 - $nutt1))  { # Would decrease
+                        # size-diff by reassigning spk...
+                        $scpcount[$scpidx+1] += $count;
+                        $scpcount[$scpidx] -= $count;
+                        pop @{$scparray[$scpidx]};
+                        unshift @{$scparray[$scpidx+1]}, $spk;
+                        $changed = 1;
+                    }
+                }
+            }
+            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+                $spk = $scparray[$scpidx]->[0];
+                $count = $spk_count{$spk};
+                $nutt1 = $scpcount[$scpidx-1];
+                $nutt2 = $scpcount[$scpidx];
+                if( abs( ($nutt2-$count) - ($nutt1+$count))
+                    < abs($nutt2 - $nutt1))  { # Would decrease
+                    # size-diff by reassigning spk...
+                    $scpcount[$scpidx-1] += $count;
+                    $scpcount[$scpidx] -= $count;
+                    shift @{$scparray[$scpidx]};
+                    push @{$scparray[$scpidx-1]}, $spk;
+                    $changed = 1;
+                }
+            }
+        }
+    }
+    # Now print out the files...
+    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+                         : open($f_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
+        $count = 0;
+        if(@{$scparray[$scpidx]} == 0) {
+            print STDERR "$0: Error: split_scp.pl producing empty .scp file " .
+                         "$scpfile (too many splits and too few speakers?)\n";
+            $error = 1;
+        } else {
+            foreach $spk ( @{$scparray[$scpidx]} ) {
+                print $f_fh @{$spk_data{$spk}};
+                $count += $spk_count{$spk};
+            }
+            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
+        }
+        close($f_fh);
+    }
+} else {
+   # This block is the "normal" case where there is no --utt2spk
+   # option and we just break into equal size chunks.
+
+    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+
+    $numscps = @OUTPUTS;  # size of array.
+    @F = ();
+    while(<$i_fh>) {
+        push @F, $_;
+    }
+    $numlines = @F;
+    if($numlines == 0) {
+        print STDERR "$0: error: empty input scp file $inscp\n";
+        $error = 1;
+    }
+    $linesperscp = int( $numlines / $numscps); # the "whole part"..
+    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
+    $remainder = $numlines - ($linesperscp * $numscps);
+    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+    # [just doing int() rounds down].
+    $n = 0;
+    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+        $scpfile = $OUTPUTS[$scpidx];
+        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+                         : open($o_fh, '>&', \*STDOUT)) ||
+            die "$0: Could not open scp file $scpfile for writing: $!\n";
+        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
+            print $o_fh $F[$n++];
+        }
+        close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
+    }
+    $n == $numlines || die "$n != $numlines [code error]";
+}
+
+exit ($error);
diff --git a/funasr/runtime/python/utils/test_rtf.py b/funasr/runtime/python/utils/test_rtf.py
new file mode 100644
index 0000000..536ee2d
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.py
@@ -0,0 +1,55 @@
+
+import time
+import sys
+import librosa
+from funasr.utils.types import str2bool
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--model_dir', type=str, required=True)
+parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
+parser.add_argument('--wav_file', type=str, default=None, help='wav.scp file: each line is "utt_id wav_path"')
+parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
+parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
+args = parser.parse_args()
+
+
+if args.backend == "onnx":
+	from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
+else:
+	from funasr.runtime.python.libtorch.torch_paraformer import Paraformer
+	
+model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)
+
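+# args.wav_file is a Kaldi-style wav.scp: each line is "utt_id <tab or space> wav_path"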
+wav_file_f = open(args.wav_file, 'r')
+wav_files = wav_file_f.readlines()
+
+# warm-up
+total = 0.0
+num = 30
+wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
+for i in range(num):
+	beg_time = time.time()
+	result = model(wav_path)
+	end_time = time.time()
+	duration = end_time-beg_time
+	total += duration
+	print(result)
+	print("num: {}, time, {}, avg: {}, rtf: {}".format(len(wav_path), duration, total/(i+1), (total/(i+1))/5.53))
+
+# infer time
+beg_time = time.time()
+for i, wav_path_i in enumerate(wav_files):
+	wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+	result = model(wav_path)
+end_time = time.time()
+duration = (end_time-beg_time)*1000
+print("total_time_comput_ms: {}".format(int(duration)))
+
+duration_time = 0.0
+for i, wav_path_i in enumerate(wav_files):
+	wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+	waveform, _ = librosa.load(wav_path, sr=16000)
+	duration_time += len(waveform)/16.0
+print("total_time_wav_ms: {}".format(int(duration_time)))
+
+print("total_rtf: {:.5}".format(duration/duration_time))
\ No newline at end of file
diff --git a/funasr/runtime/python/utils/test_rtf.sh b/funasr/runtime/python/utils/test_rtf.sh
new file mode 100644
index 0000000..dcce6c4
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.sh
@@ -0,0 +1,71 @@
+
+nj=32
+stage=0
+
+scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
+export_root="/nfs/zhifu.gzf/export"
+split_scps_tool=split_scp.pl
+rtf_tool=test_rtf.py
+
+model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+backend="onnx" # "torch"
+quantize='true' # 'false'
+tag=${model_name}/${backend}_quantize_${quantize}
+
+logs_outputs_dir=${export_root}/logs/${tag}/split$nj
+mkdir -p ${logs_outputs_dir}
+echo ${logs_outputs_dir}
+
+
+if [ ${stage} -le 0 ];then
+
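+    # Stage 0: export the model once (onnx or torchscript), optionally int8-quantized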
+    python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp}
+
+fi
+
+
+if [ ${stage} -le 1 ];then
+
+model_dir=${export_root}/${model_name}
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $logs_outputs_dir/wav.$JOB.scp"
+done
+
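+# Split the input wav.scp into ${nj} even shards, one per concurrent job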
+perl ${split_scps_tool} $scp ${split_scps}
+
+
+for JOB in $(seq ${nj}); do
+  {
+    core_id=`expr $JOB - 1`
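+    # Pin each benchmark process to its own CPU core to avoid contention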
+    taskset -c ${core_id} python ${rtf_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${logs_outputs_dir}/wav.$JOB.scp --quantize ${quantize} &> ${logs_outputs_dir}/log.$JOB.txt
+  }&
+
+done
+wait
+
+
+rm -rf ${logs_outputs_dir}/total_time_comput.txt
+rm -rf ${logs_outputs_dir}/total_time_wav.txt
+rm -rf ${logs_outputs_dir}/total_rtf.txt
+for JOB in $(seq ${nj}); do
+  {
+    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_comput" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_time_comput.txt
+    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_wav" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_time_wav.txt
+    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_rtf" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_rtf.txt
+  }
+
+done
+
+total_time_comput=`cat ${logs_outputs_dir}/total_time_comput.txt | awk 'BEGIN {max = 0} {if ($1+0>max+0) max=$1} END {print max}'`
+total_time_wav=`cat ${logs_outputs_dir}/total_time_wav.txt | awk '{sum +=$1};END {print sum}'`
+rtf=`awk 'BEGIN{printf "%.5f\n",'$total_time_comput'/'$total_time_wav'}'`
+speed=`awk 'BEGIN{printf "%.2f\n",1/'$rtf'}'`
+
+echo "total_time_comput_ms: $total_time_comput"
+echo "total_time_wav: $total_time_wav"
+echo "total_rtf: $rtf, speech: $speed"
+
+fi
\ No newline at end of file
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index cc5b708..8080ef8 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -639,12 +639,12 @@
                  "and exclude_keys excludes keys of model states for the initialization."
                  "e.g.\n"
                  "  # Load all parameters"
-                 "  --init_param some/where/model.pth\n"
+                 "  --init_param some/where/model.pb\n"
                  "  # Load only decoder parameters"
-                 "  --init_param some/where/model.pth:decoder:decoder\n"
+                 "  --init_param some/where/model.pb:decoder:decoder\n"
                  "  # Load only decoder parameters excluding decoder.embed"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
         )
         group.add_argument(
             "--ignore_init_mismatch",
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 36499a2..e151473 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -826,7 +826,7 @@
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index ae7ee9b..096a5c8 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -507,7 +507,7 @@
             config_file: Union[Path, str] = None,
             model_file: Union[Path, str] = None,
             cmvn_file: Union[Path, str] = None,
-            device: str = "cpu",
+            device: Union[str, torch.device] = "cpu",
     ):
         """Build model from the files.
 
@@ -553,7 +553,7 @@
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
@@ -562,12 +562,27 @@
                 model.load_state_dict(model_dict)
             else:
                 model_dict = torch.load(model_file, map_location=device)
+        model_dict = cls.filter_model_dict(model_dict, model.state_dict())
         model.load_state_dict(model_dict)
         if model_name_pth is not None and not os.path.exists(model_name_pth):
             torch.save(model_dict, model_name_pth)
             logging.info("model_file is saved to pth: {}".format(model_name_pth))
 
         return model, args
+
+    @classmethod
+    def filter_model_dict(cls, src_dict: dict, dest_dict: dict):
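+        # Keep only checkpoint entries that exist in the current model; log
+        # dropped keys and warn about model parameters missing from the checkpoint.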
+        from collections import OrderedDict
+        new_dict = OrderedDict()
+        for key, value in src_dict.items():
+            if key in dest_dict:
+                new_dict[key] = value
+            else:
+                logging.info("{} is no longer needed in this model.".format(key))
+        for key, value in dest_dict.items():
+            if key not in new_dict:
+                logging.warning("{} is missed in checkpoint.".format(key))
+        return new_dict
 
     @classmethod
     def convert_tf2torch(
@@ -750,47 +765,47 @@
             cls, args: argparse.Namespace, train: bool
     ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
         assert check_argument_types()
-        if args.use_preprocessor:
-            retval = CommonPreprocessor(
-                train=train,
-                token_type=args.token_type,
-                token_list=args.token_list,
-                bpemodel=None,
-                non_linguistic_symbols=None,
-                text_cleaner=None,
-                g2p_type=None,
-                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
-                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
-                # NOTE(kamo): Check attribute existence for backward compatibility
-                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
-                rir_apply_prob=args.rir_apply_prob
-                if hasattr(args, "rir_apply_prob")
-                else 1.0,
-                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
-                noise_apply_prob=args.noise_apply_prob
-                if hasattr(args, "noise_apply_prob")
-                else 1.0,
-                noise_db_range=args.noise_db_range
-                if hasattr(args, "noise_db_range")
-                else "13_15",
-                speech_volume_normalize=args.speech_volume_normalize
-                if hasattr(args, "rir_scp")
-                else None,
-            )
-        else:
-            retval = None
-        assert check_return_type(retval)
-        return retval
+        # if args.use_preprocessor:
+        #     retval = CommonPreprocessor(
+        #         train=train,
+        #         token_type=args.token_type,
+        #         token_list=args.token_list,
+        #         bpemodel=None,
+        #         non_linguistic_symbols=None,
+        #         text_cleaner=None,
+        #         g2p_type=None,
+        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
+        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
+        #         # NOTE(kamo): Check attribute existence for backward compatibility
+        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
+        #         rir_apply_prob=args.rir_apply_prob
+        #         if hasattr(args, "rir_apply_prob")
+        #         else 1.0,
+        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
+        #         noise_apply_prob=args.noise_apply_prob
+        #         if hasattr(args, "noise_apply_prob")
+        #         else 1.0,
+        #         noise_db_range=args.noise_db_range
+        #         if hasattr(args, "noise_db_range")
+        #         else "13_15",
+        #         speech_volume_normalize=args.speech_volume_normalize
+        #         if hasattr(args, "rir_scp")
+        #         else None,
+        #     )
+        # else:
+        #     retval = None
+        # assert check_return_type(retval)
+        return None
 
     @classmethod
     def required_data_names(
             cls, train: bool = True, inference: bool = False
     ) -> Tuple[str, ...]:
         if not inference:
-            retval = ("speech", "profile", "binary_labels")
+            retval = ("speech", )
         else:
             # Recognition mode
-            retval = ("speech")
+            retval = ("speech", )
         return retval
 
     @classmethod
@@ -823,7 +838,7 @@
 
         # 2. Encoder
         encoder_class = encoder_choices.get_class(args.encoder)
-        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        encoder = encoder_class(**args.encoder_conf)
 
         # 3. EncoderDecoderAttractor
         encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)
diff --git a/funasr/tasks/sv.py b/funasr/tasks/sv.py
index 1b08c4d..bef5dc5 100644
--- a/funasr/tasks/sv.py
+++ b/funasr/tasks/sv.py
@@ -501,7 +501,7 @@
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index 8e3f05e..e9b18cd 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -52,13 +52,13 @@
         init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>
 
     Examples:
-        >>> load_pretrained_model("somewhere/model.pth", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
+        >>> load_pretrained_model("somewhere/model.pb", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
         >>> load_pretrained_model(
-        ...     "somewhere/model.pth:decoder:decoder:decoder.embed", model
+        ...     "somewhere/model.pb:decoder:decoder:decoder.embed", model
         ... )
-        >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
+        >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
     """
     sps = init_param.split(":", 4)
     if len(sps) == 4:
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 50bce47..efe2009 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -205,9 +205,9 @@
         else:
             scaler = None
 
-        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
+        if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
             cls.resume(
-                checkpoint=output_dir / "checkpoint.pth",
+                checkpoint=output_dir / "checkpoint.pb",
                 model=model,
                 optimizers=optimizers,
                 schedulers=schedulers,
@@ -361,7 +361,7 @@
                         },
                         buffer,
                     )
-                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
+                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
                 else:
                     torch.save(
                         {
@@ -374,7 +374,7 @@
                             ],
                             "scaler": scaler.state_dict() if scaler is not None else None,
                         },
-                        output_dir / "checkpoint.pth",
+                        output_dir / "checkpoint.pb",
                     )
 
                 # 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@
                     buffer = BytesIO()
                     torch.save(model.state_dict(), buffer)
                     trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
-                                                                       f"{iepoch}epoch.pth"),buffer.getvalue())
+                                                                       f"{iepoch}epoch.pb"),buffer.getvalue())
                 else:
-                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
+                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")
 
-                # Creates a sym link latest.pth -> {iepoch}epoch.pth
+                # Creates a sym link latest.pb -> {iepoch}epoch.pb
                 if trainer_options.use_pai:
-                    p = os.path.join(trainer_options.output_dir, "latest.pth")
+                    p = os.path.join(trainer_options.output_dir, "latest.pb")
                     if trainer_options.oss_bucket.object_exists(p):
                         trainer_options.oss_bucket.delete_object(p)
                     trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
+                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
                 else:
-                    p = output_dir / "latest.pth"
+                    p = output_dir / "latest.pb"
                     if p.is_symlink() or p.exists():
                         p.unlink()
-                    p.symlink_to(f"{iepoch}epoch.pth")
+                    p.symlink_to(f"{iepoch}epoch.pb")
 
                 _improved = []
                 for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@
                         # Creates sym links if it's the best result
                         if best_epoch == iepoch:
                             if trainer_options.use_pai:
-                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
+                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
                                 if trainer_options.oss_bucket.object_exists(p):
                                     trainer_options.oss_bucket.delete_object(p)
                                 trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
+                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
                             else:
-                                p = output_dir / f"{_phase}.{k}.best.pth"
+                                p = output_dir / f"{_phase}.{k}.best.pb"
                                 if p.is_symlink() or p.exists():
                                     p.unlink()
-                                p.symlink_to(f"{iepoch}epoch.pth")
+                                p.symlink_to(f"{iepoch}epoch.pb")
                             _improved.append(f"{_phase}.{k}")
                 if len(_improved) == 0:
                     logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@
                         type="model",
                         metadata={"improved": _improved},
                     )
-                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
+                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
                     aliases = [
                         f"epoch-{iepoch}",
                         "best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@
 
                 for e in range(1, iepoch):
                     if trainer_options.use_pai:
-                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
+                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
                         if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
                             trainer_options.oss_bucket.delete_object(p)
                             _removed.append(str(p))
                     else:
-                        p = output_dir / f"{e}epoch.pth"
+                        p = output_dir / f"{e}epoch.pb"
                         if p.exists() and e not in nbests:
                             p.unlink()
                             _removed.append(str(p))
diff --git a/funasr/utils/asr_utils.py b/funasr/utils/asr_utils.py
index 0f0e4c3..4067b04 100644
--- a/funasr/utils/asr_utils.py
+++ b/funasr/utils/asr_utils.py
@@ -58,14 +58,15 @@
     if r_recog_type is None and audio_in is not None:
         # audio_in is wav, recog_type is wav_file
         if os.path.isfile(audio_in):
-            audio_type = os.path.basename(audio_in).split(".")[-1].lower()
-            if audio_type in SUPPORT_AUDIO_TYPE_SETS:
-                r_recog_type = 'wav'
-                r_audio_format = 'wav'
-            elif audio_type == "scp":
+            audio_type = os.path.basename(audio_in).lower()
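+            # NOTE: substring matching also accepts names like "utt.wav.bak";
+            # use str.endswith() if a strict extension check is wanted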
+            for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+                if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+                    r_recog_type = 'wav'
+                    r_audio_format = 'wav'
+                    break
+            if audio_type.rfind(".scp") >= 0:
                 r_recog_type = 'wav'
                 r_audio_format = 'scp'
-            else:
+            if r_recog_type is None:
                 raise NotImplementedError(
                     f'Not supported audio type: {audio_type}')
 
@@ -128,13 +129,15 @@
 def get_sr_from_wav(fname: str):
     fs = None
     if os.path.isfile(fname):
-        audio_type = os.path.basename(fname).split(".")[-1].lower()
-        if audio_type in SUPPORT_AUDIO_TYPE_SETS:
-            if audio_type == "pcm":
-                fs = None
-            else:
-                audio, fs = torchaudio.load(fname)
-        elif audio_type == "scp":
+        audio_type = os.path.basename(fname).lower()
+        for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+            if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+                if support_audio_type == "pcm":
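+                    # raw PCM carries no header, so the sampling rate stays unknown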
+                    fs = None
+                else:
+                    audio, fs = torchaudio.load(fname)
+                break
+        if audio_type.rfind(".scp") >= 0:
             with open(fname, encoding="utf-8") as f:
                 for line in f:
                     wav_path = line.split()[1]
@@ -147,9 +150,7 @@
         for file in dir_files:
             file_path = os.path.join(fname, file)
             if os.path.isfile(file_path):
-                audio_type = os.path.basename(file_path).split(".")[-1].lower()
-                if audio_type in SUPPORT_AUDIO_TYPE_SETS:
-                    fs = get_sr_from_wav(file_path)
+                fs = get_sr_from_wav(file_path)
             elif os.path.isdir(file_path):
                 fs = get_sr_from_wav(file_path)
 
@@ -165,12 +166,12 @@
         file_path = os.path.join(dir_path, file)
         if os.path.isfile(file_path):
             if ends == ".wav" or ends == ".WAV":
-                audio_type = os.path.basename(file_path).split(".")[-1].lower()
-                if audio_type in SUPPORT_AUDIO_TYPE_SETS:
-                    return True
-                else:
-                    raise NotImplementedError(
-                        f'Not supported audio type: {audio_type}')
+                audio_type = os.path.basename(file_path).lower()
+                for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+                    if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+                        return True
+                raise NotImplementedError(
+                    f'Not supported audio type: {audio_type}')
             elif file_path.endswith(ends):
                 return True
         elif os.path.isdir(file_path):
@@ -185,9 +186,10 @@
     for file in dir_files:
         file_path = os.path.join(dir_path, file)
         if os.path.isfile(file_path):
-            audio_type = os.path.basename(file_path).split(".")[-1].lower()
-            if audio_type in SUPPORT_AUDIO_TYPE_SETS:
-                wav_list.append(file_path)
+            audio_type = os.path.basename(file_path).lower()
+            for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+                if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+                    wav_list.append(file_path)
+                    break
         elif os.path.isdir(file_path):
             recursion_dir_all_wav(wav_list, file_path)
 
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index 2475548..40756d8 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -106,17 +106,18 @@
         if num in abbr_begin:
             if time_stamp is not None:
                 begin = time_stamp[ts_nums[num]][0]
-            word_lists.append(words[num].upper())
+            abbr_word = words[num].upper()
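+            # merge the single letters into one token, e.g. "a b c" -> "ABC"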
             num += 1
             while num < words_size:
                 if num in abbr_end:
-                    word_lists.append(words[num].upper())
+                    abbr_word += words[num].upper()
                     last_num = num
                     break
                 else:
                     if words[num].encode('utf-8').isalpha():
-                        word_lists.append(words[num].upper())
+                        abbr_word += words[num].upper()
                 num += 1
+            word_lists.append(abbr_word)
             if time_stamp is not None:
                 end = time_stamp[ts_nums[num]][1]
                 ts_lists.append([begin, end])
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index f5a238e..423110c 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,6 +1,10 @@
 import torch
 import copy
+import codecs
 import logging
+import edit_distance
+import argparse
 import numpy as np
 from typing import Any, List, Tuple, Union
 
@@ -9,7 +13,8 @@
                        us_peaks, 
                        char_list, 
                        vad_offset=0.0, 
-                       force_time_shift=-1.5
+                       force_time_shift=-1.5,
+                       sil_in_str=True
                        ):
     if not len(char_list):
         return []
@@ -62,6 +67,8 @@
             timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
     res_txt = ""
     for char, timestamp in zip(new_char_list, timestamp_list):
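+        # optionally drop <sil> tokens from the human-readable string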
+        if not sil_in_str and char == '<sil>':
+            continue
         res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
     res = []
     for char, timestamp in zip(new_char_list, timestamp_list):
@@ -121,4 +128,181 @@
     return res
 
 
+class AverageShiftCalculator:
+    def __init__(self):
+        logging.warning("Calculating average shift.")
+
+    def __call__(self, file1, file2):
+        uttid_list1, ts_dict1 = self.read_timestamps(file1)
+        uttid_list2, ts_dict2 = self.read_timestamps(file2)
+        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
+        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
+        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
+        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))
+
+    def _intersection(self, list1, list2):
+        set1 = set(list1)
+        set2 = set(list2)
+        if set1 == set2:
+            logging.warning("Uttid same checked.")
+            return set1
+        itsc = list(set1 & set2)
+        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
+        return itsc
+
+    def read_timestamps(self, file):
+        # read timestamps file in standard format
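+        # expected line format, one utterance per line:
+        #   <uttid> <char> <start> <end>;<char> <start> <end>; ...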
+        uttid_list = []
+        ts_dict = {}
+        with codecs.open(file, 'r', encoding='utf-8') as fin:
+            for line in fin.readlines():
+                text = ''
+                ts_list = []
+                line = line.rstrip()
+                uttid = line.split()[0]
+                uttid_list.append(uttid)
+                body = " ".join(line.split()[1:])
+                for pd in body.split(';'):
+                    if not len(pd):
+                        continue
+                    char, start, end = pd.lstrip(" ").split(' ')
+                    text += char + ','
+                    ts_list.append((float(start), float(end)))
+                ts_dict[uttid] = (text[:-1], ts_list)
+        logging.warning("File {} read done.".format(file))
+        return uttid_list, ts_dict
+
+    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
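+        # average shift numerator: sum of |start1-start2| + |end1-end2| over aligned tokens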
+        shift_time = 0
+        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
+            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
+        num_tokens = len(filtered_timestamp_list1)
+        return shift_time, num_tokens
+
+    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
+        # calculate the average shift between the two sets of timestamps;
+        # when the character sequences differ, align them with edit distance
+        # and score only the characters that match on both sides
+        self._accumulated_shift = 0
+        self._accumulated_tokens = 0
+        self.max_shift = 0
+        self.max_shift_uttid = None
+        for uttid in uttid_list:
+            (t1, ts1) = ts_dict1[uttid]
+            (t2, ts2) = ts_dict2[uttid]
+            _align, _align3 = [], []
+            fts1, fts2 = [], []
+            _t1, _t2 = [], []
+            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
+            s = sm.get_opcodes()
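+            # walk the opcodes once, building 0/1 masks over t2 (_align) and
+            # t1 (_align3); only positions tagged "equal" are kept for scoring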
+            for j in range(len(s)):
+                if s[j][0] == "replace" or s[j][0] == "insert":
+                    _align.append(0)
+                if s[j][0] == "replace" or s[j][0] == "delete":
+                    _align3.append(0)
+                elif s[j][0] == "equal":
+                    _align.append(1)
+                    _align3.append(1)
+                else:
+                    continue
+            # use _align (from sm) to index t2
+            for a, ts, t in zip(_align, ts2, t2.split(',')):
+                if a: 
+                    fts2.append(ts)
+                    _t2.append(t)
+            # use _align3 (also from sm) to index t1
+            for a, ts, t in zip(_align3, ts1, t1.split(',')):
+                if a: 
+                    fts1.append(ts)
+                    _t1.append(t)
+            if len(fts1) == len(fts2):
+                shift_time, num_tokens = self._shift(fts1, fts2)
+                self._accumulated_shift += shift_time
+                self._accumulated_tokens += num_tokens
+                if shift_time/num_tokens > self.max_shift:
+                    self.max_shift = shift_time/num_tokens
+                    self.max_shift_uttid = uttid
+            else:
+                logging.warning("Length mismatch in {}, skipped.".format(uttid))
+        return self._accumulated_shift / self._accumulated_tokens
+
+
+def convert_external_alphas(alphas_file, text_file, output_file):
+    from funasr.models.predictor.cif import cif_wo_hidden
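+    # alphas are frame-level CIF weights; cif_wo_hidden integrates them and
+    # emits a peak each time the accumulated weight crosses the threshold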
+    with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3:
+        for line1, line2 in zip(f1.readlines(), f2.readlines()):
+            line1 = line1.rstrip()
+            line2 = line2.rstrip()
+            assert line1.split()[0] == line2.split()[0]
+            uttid = line1.split()[0]
+            alphas = [float(i) for i in line1.split()[1:]]
+            new_alphas = np.array(remove_chunk_padding(alphas))
+            new_alphas[-1] += 1e-4
+            text = line2.split()[1:]
+            if len(text) + 1 != int(new_alphas.sum()):
+                # force resize
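+                # rescale so the weights integrate to len(text) + 1 fires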
+                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
+            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
+            if " " in text:
+                text = text.split()
+            else:
+                text = [i for i in text]
+            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text, 
+                                                     force_time_shift=-7.0, 
+                                                     sil_in_str=False)
+            f3.write("{} {}\n".format(uttid, res_str))
+
+
+def remove_chunk_padding(alphas):
+    # remove the padding part in alphas if using chunk paraformer for GPU
+    START_ZERO = 45
+    MID_ZERO = 75
+    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
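+    # stream layout: START_ZERO padded frames, then repeating blocks of
+    # [REAL_FRAMES valid | MID_ZERO padded]; only the valid frames are kept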
+    alphas = alphas[START_ZERO:]  # remove the padding at beginning
+    new_alphas = []
+    while True:
+        new_alphas = new_alphas + alphas[:REAL_FRAMES]
+        alphas = alphas[REAL_FRAMES+MID_ZERO:]
+        if len(alphas) < REAL_FRAMES:
+            break
+    return new_alphas
+
+SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']
+
+
+def main(args):
+    if args.mode == 'cal_aas':
+        asc = AverageShiftCalculator()
+        asc(args.input, args.input2)
+    elif args.mode == 'read_ext_alphas':
+        convert_external_alphas(args.input, args.input2, args.output)
+    else:
+        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
+
+
+if __name__ == '__main__':
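+    # usage sketch (hypothetical file paths):
+    #   python funasr/utils/timestamp_tools.py --mode cal_aas \
+    #       --input ts_a.txt --input2 ts_b.txt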
+    parser = argparse.ArgumentParser(description='timestamp tools')
+    parser.add_argument('--mode', 
+                        default=None, 
+                        type=str, 
+                        choices=SUPPORTED_MODES, 
+                        help='timestamp related toolbox')
+    parser.add_argument('--input', default=None, type=str, help='input file path')
+    parser.add_argument('--output', default=None, type=str, help='output file name')
+    parser.add_argument('--input2', default=None, type=str, help='input2 file path')
+    parser.add_argument('--kaldi-ts-type', 
+                        default='v2', 
+                        type=str, 
+                        choices=['v0', 'v1', 'v2'], 
+                        help='kaldi timestamp to write')
+    args = parser.parse_args()
+    main(args)
 
diff --git a/setup.py b/setup.py
index 087d90d..6bb3ac3 100644
--- a/setup.py
+++ b/setup.py
@@ -13,11 +13,11 @@
     "install": [
         "setuptools>=38.5.1",
         # "configargparse>=1.2.1",
-        "typeguard>=2.7.0",
+        "typeguard==2.13.3",
         "humanfriendly",
         "scipy>=1.4.1",
         # "filelock",
-        "librosa>=0.8.0",
+        "librosa==0.8.1",
         "jamo==0.4.1",  # For kss
         "PyYAML>=5.1.2",
         "soundfile>=0.10.2",
@@ -41,6 +41,8 @@
         # PAI
         "oss2",
         "kaldi-native-fbank",
+        # timestamp
+        "edit-distance"
     ],
     # train: The modules invoked when training only.
     "train": [
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 70dbe89..b3c5a24 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -451,8 +451,8 @@
 
     def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
         inference_pipeline = pipeline(
-            task=Tasks.,
-            model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
+            task=Tasks.auto_speech_recognition,
+            model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
             param_dict={"decoding_model": "offline"})
diff --git a/tests/test_sv_inference_pipeline.py b/tests/test_sv_inference_pipeline.py
index 265f839..54ab564 100644
--- a/tests/test_sv_inference_pipeline.py
+++ b/tests/test_sv_inference_pipeline.py
@@ -1,5 +1,6 @@
 import unittest
 
+import numpy as np
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger

--
Gitblit v1.9.1