From 73a7cf596bd1ebc3bd0c9674a21072f9eaf4cc57 Mon Sep 17 00:00:00 2001
From: dyyzhmm <dyyzhmm@163.com>
Date: Thu, 16 Mar 2023 15:24:56 +0800
Subject: [PATCH] Merge pull request #3 from alibaba-damo-academy/main

---
 egs/aishell/data2vec_transformer_finetune/run.sh                                                                                |    2 
 egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py                                       |   10 
 funasr/torch_utils/load_pretrained_model.py                                                                                     |   10 
 egs/aishell/paraformerbert/run.sh                                                                                               |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py                                      |   13 
 funasr/train/trainer.py                                                                                                         |   36 +-
 setup.py                                                                                                                        |    4 
 egs/callhome/diarization/sond/unit_test.py                                                                                      |    8 
 egs/alimeeting/diarization/sond/infer_alimeeting_test.py                                                                        |    2 
 egs/aishell/conformer/run.sh                                                                                                    |    2 
 egs/aishell/transformer/run.sh                                                                                                  |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md                           |    2 
 funasr/bin/asr_inference_paraformer.py                                                                                          |    2 
 funasr/bin/asr_inference_uniasr_vad.py                                                                                          |    2 
 funasr/runtime/onnxruntime/src/Audio.cpp                                                                                        |    5 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py |    2 
 funasr/bin/asr_inference_paraformer_vad_punc.py                                                                                 |    2 
 tests/test_asr_inference_pipeline.py                                                                                            |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py                                       |   13 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py        |    2 
 funasr/bin/sond_inference.py                                                                                                    |    2 
 funasr/bin/asr_inference_mfcca.py                                                                                               |    2 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md               |    2 
 funasr/tasks/diar.py                                                                                                            |   70 ++--
 funasr/tasks/abs_task.py                                                                                                        |    8 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md                                         |    2 
 README.md                                                                                                                       |   30 -
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py                                       |   13 
 egs/aishell/data2vec_paraformer_finetune/run.sh                                                                                 |    2 
 funasr/runtime/onnxruntime/src/librapidasrapi.cpp                                                                               |   17 
 funasr/bin/asr_inference_uniasr.py                                                                                              |    2 
 egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py                      |    7 
 funasr/bin/asr_inference.py                                                                                                     |    2 
 funasr/utils/timestamp_tools.py                                                                                                 |  186 +++++++++++
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md                      |    2 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md                                    |    2 
 funasr/bin/eend_ola_inference.py                                                                                                |   26 +
 egs/aishell2/paraformer/run.sh                                                                                                  |    2 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md                                       |    2 
 funasr/models/e2e_diar_eend_ola.py                                                                                              |   35 +
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py                           |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py                                    |   35 ++
 funasr/modules/eend_ola/encoder_decoder_attractor.py                                                                            |    6 
 funasr/modules/eend_ola/encoder.py                                                                                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py             |    2 
 funasr/bin/asr_inference_rnnt.py                                                                                                |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py           |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py               |    2 
 funasr/runtime/onnxruntime/tester/tester.cpp                                                                                    |   57 +++
 egs/aishell2/paraformerbert/run.sh                                                                                              |    2 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py                         |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py                                    |   35 ++
 funasr/main_funcs/pack_funcs.py                                                                                                 |    4 
 egs/alimeeting/diarization/sond/unit_test.py                                                                                    |    8 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py                      |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py                |    2 
 egs/aishell2/transformerLM/run.sh                                                                                               |    2 
 funasr/tasks/sv.py                                                                                                              |    2 
 egs/mars/sd/local_run.sh                                                                                                        |    2 
 funasr/bin/diar_inference_launch.py                                                                                             |    5 
 funasr/runtime/python/README.md                                                                                                 |   21 +
 funasr/tasks/asr.py                                                                                                             |    4 
 egs/aishell2/conformer/run.sh                                                                                                   |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py                                   |   35 ++
 egs/alimeeting/diarization/sond/run.sh                                                                                          |    6 
 funasr/models/frontend/wav_frontend.py                                                                                          |   77 ++++
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py      |    2 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py      |    2 
 egs/aishell2/transformer/run.sh                                                                                                 |    2 
 funasr/main_funcs/average_nbest_models.py                                                                                       |   18 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md                                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md                            |    3 
 egs/aishell/paraformer/run.sh                                                                                                   |    2 
 funasr/bin/sv_inference.py                                                                                                      |    4 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py                                |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py              |    2 
 78 files changed, 684 insertions(+), 217 deletions(-)

diff --git a/README.md b/README.md
index 0d1079b..23f1abe 100644
--- a/README.md
+++ b/README.md
@@ -15,36 +15,10 @@
 | [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 | [**Contact**](#contact)
 
+
 ## What's new: 
 
-### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
-- We support a new feature: exporting paraformer models into [onnx and torchscript](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. Locally finetuned models are also supported.
-- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python): you can deploy the runtime without modelscope or funasr. For the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, onnxruntime brings a 3x RTF speedup (0.110->0.038) on CPU, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
-- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc): you can build an ASR service with grpc by deploying the modelscope pipeline or onnxruntime.
-- We release a new model, [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports hotword customization based on incentive enhancement and improves the recall and precision of hotwords.
-- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary): timestamp prediction accuracy is much improved, achieving an accumulated average shift (AAS) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
-- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
-- We release several new UniASR models: 
-[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
-[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), 
-[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), 
-[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), 
-[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
-- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is used to initialize a paraformer model that is then finetuned on AISHELL-1.
-- We release a new feature: the `VAD`, `ASR` and `PUNC` models can be integrated freely, using either models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) or local finetuned models. See the [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We optimize the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhancing recall and precision and fixing bad cases of missing punctuation marks.
-- Various new audio input types are now supported by the modelscope inference pipeline, including mp3, flac, ogg, opus...
-### 2023.1.16, funasr-0.1.6, modelscope-1.2.0
-- We release a new version model, [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrates the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, the [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model,
 the [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp prediction together. The model can take inputs that are several hours long.
-- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
-- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which can predict the punctuation of ASR models' results. It can be freely integrated with any ASR model in [Model Zoo](docs/modelscope_models.md).
-- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
-- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
-- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which can extract speaker embeddings and further perform speaker verification on paired utterances. It will be used for speaker diarization in a future version.
-- We improve the modelscope pipeline to speed up inference, by integrating model building into pipeline building.
-- Various new audio input types are now supported by the modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
+For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
 
 ## Highlights
 - Many types of typical models are supported, e.g., [Transformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).
diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 41db45d..09ddab8 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index cada164..d033ce2 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 7ab8626..26222e6 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.cer_ctc.ave_10best.pth
+inference_asr_model=valid.cer_ctc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index 2b0f144..53b5f90 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 96310ab..2487eac 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 4c307b0..f66a338 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index bd6d81e..f9ea69a 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index 2b7d841..e1ea4fe 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index d0407d4..239a7e3 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index a5a14ec..6f2dd4d 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
index 28e3762..9e7a713 100755
--- a/egs/aishell2/transformerLM/run.sh
+++ b/egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@
 tag=exp1
 model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
 lm_exp=${exp_dir}/exp/${model_dir}
-inference_lm=valid.loss.ave.pth       # Language model path for decoding.
+inference_lm=valid.loss.ave.pb       # Language model path for decoding.
 
 stage=0
 stop_stage=3
diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
index 0988f5d..b4d534b 100644
--- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
+++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@
 
 def main():
     diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
-    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
+    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
     output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
     data_path_and_name_and_type = [
         ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh
index 7e9a7f7..19ae40c 100644
--- a/egs/alimeeting/diarization/sond/run.sh
+++ b/egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@
   echo "Downloading Pre-trained model..."
   git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
   git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
-  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
+  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
   cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
-  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
+  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
   cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
   echo "Done."
@@ -30,7 +30,7 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
   echo "Calculating diarization results..."
-  python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
+  python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
   python local/convert_label_to_rttm.py \
     outputs/labels.txt \
     data/test_rmsil/raw_rmsil_map.scp \
diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py
index 84a4247..0f40ab2 100644
--- a/egs/alimeeting/diarization/sond/unit_test.py
+++ b/egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
 
 def test_fbank_cpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
 
 def test_fbank_gpu_infer():
     diar_config_path = "config_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py
index 519ac56..a48eda1 100644
--- a/egs/callhome/diarization/sond/unit_test.py
+++ b/egs/callhome/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
 
 def test_fbank_cpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
 
 def test_fbank_gpu_infer():
     diar_config_path = "sond_fbank.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
 
 def test_wav_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     data_path_and_name_and_type = [
         ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
 
 def test_without_profile_gpu_infer():
     diar_config_path = "config.yaml"
-    diar_model_path = "sond.pth"
+    diar_model_path = "sond.pb"
     output_dir = "./outputs"
     raw_inputs = [[
         "data/unit_test/raw_inputs/record.wav",
diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh
index 3b319f4..4516e9f 100755
--- a/egs/mars/sd/local_run.sh
+++ b/egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@
 model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index 56c282c..b326067 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index e163999..2f038a8 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
index 9097e7a..16aeada 100644
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
index e714a3d..333b66a 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@
     params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./example_data/validation"
-    params["decoding_model_name"] = "valid.acc.ave.pth"
+    params["decoding_model_name"] = "valid.acc.ave.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
index 6c34ed0..f1f29fa 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
index 6140bb7..8cb537b 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 94393ec..f26f237 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index 96102cc..726009d 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
index d91a40a..6593f4e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
index f9fb0db..f067c81 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
new file mode 100644
index 0000000..56fb583
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
new file mode 100644
index 0000000..c54ab8c
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 030c2e2..d4df29e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
new file mode 100644
index 0000000..8bbce60
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
new file mode 100644
index 0000000..cfd869f
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
new file mode 100644
index 0000000..5e313e5
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params["output_dir"]):
+        os.makedirs(params["output_dir"], exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params["data_dir"])
+    kwargs = dict(
+        model=params["model"],
+        model_revision=params["model_revision"],
+        data_dir=ds_dict,
+        dataset_type=params["dataset_type"],
+        work_dir=params["output_dir"],
+        batch_bins=params["batch_bins"],
+        max_epoch=params["max_epoch"],
+        lr=params["lr"])
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = {}
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data"
+    params["batch_bins"] = 2000
+    params["dataset_type"] = "small"
+    params["max_epoch"] = 50
+    params["lr"] = 0.00005
+    params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
+    params["model_revision"] = None
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
new file mode 100644
index 0000000..e8c5524
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
+    output_dir = "./results"
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
+        output_dir=output_dir,
+    )
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 3b39a16..861fefb 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
index dd947d3..eff933e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,8 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding,
+      e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index 4860cf7..d73cae2 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "20epoch.pth"
+    params["decoding_model_name"] = "20epoch.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 1094bb5..94144ef 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@
 - Modify inference related parameters in `infer_after_finetune.py`
     - <strong>output_dir:</strong> # result dir
     - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
 
 - Then you can run the pipeline to finetune with:
 ```python
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 5f171b4..3712cb8 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -53,5 +53,5 @@
     params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
     params["output_dir"] = "./checkpoint"
     params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
     modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
new file mode 100644
index 0000000..81cb2c6
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -0,0 +1,10 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+    task=Tasks.speaker_diarization,
+    model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+    model_revision="v1.0.0",
+)
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
+print(results)
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
index 3cb31cf..5f4563d 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -14,13 +14,12 @@
 )
 
 # Use audio_list as input: the first audio is the speech to be detected, and the following audios are the voiceprint enrollment speech of different speakers
-audio_list = [[
+audio_list = [
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
     "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
-]]
+]
 
 results = inference_diar_pipline(audio_in=audio_list)
-for rst in results:
-    print(rst["value"])
+print(results)
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 318d3d7..f3b4d56 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 4176ba6..888d4d2 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -55,7 +55,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 6413d92..e45e575 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -50,7 +50,7 @@
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index a0e7b47..3f57751 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -58,7 +58,7 @@
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
index 6cd7061..4a9ff0b 100644
--- a/funasr/bin/asr_inference_rnnt.py
+++ b/funasr/bin/asr_inference_rnnt.py
@@ -49,7 +49,7 @@
 
     Examples:
             >>> import soundfile
-            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
             >>> audio, rate = soundfile.read("speech.wav")
             >>> speech2text(audio)
             [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8b31fad..ac71538 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index e5815df..7cb889b 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2text(audio)
         [(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 7738f4f..85e4518 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@
         param_dict = {
             "extract_profile": True,
             "sv_train_config": "sv.yaml",
-            "sv_model_file": "sv.pth",
+            "sv_model_file": "sv.pb",
         }
         if "param_dict" in kwargs and kwargs["param_dict"] is not None:
             for key in param_dict:
@@ -142,6 +142,9 @@
         else:
             kwargs["param_dict"] = param_dict
         return inference_modelscope(mode=mode, **kwargs)
+    elif mode == "eend-ola":
+        from funasr.bin.eend_ola_inference import inference_modelscope
+        return inference_modelscope(mode=mode, **kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index d65895f..01d3f29 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -16,6 +16,7 @@
 
 import numpy as np
 import torch
+from scipy.signal import medfilt
 from typeguard import check_argument_types
 
 from funasr.models.frontend.wav_frontend import WavFrontendMel23
@@ -34,7 +35,7 @@
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
@@ -146,7 +147,7 @@
         output_dir: Optional[str] = None,
         batch_size: int = 1,
         dtype: str = "float32",
-        ngpu: int = 0,
+        ngpu: int = 1,
         num_workers: int = 0,
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,
@@ -179,7 +180,6 @@
         diar_model_file=diar_model_file,
         device=device,
         dtype=dtype,
-        streaming=streaming,
     )
     logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
     speech2diar = Speech2Diarization.from_pretrained(
@@ -209,7 +209,7 @@
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
-            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+            data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
         loader = EENDOLADiarTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
@@ -236,9 +236,23 @@
             # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
 
             results = speech2diar(**batch)
+
+            # post process
+            a = results[0][0].cpu().numpy()
+            a = medfilt(a, (11, 1))
+            rst = []
+            for spkid, frames in enumerate(a.T):
+                frames = np.pad(frames, (1, 1), 'constant')
+                changes, = np.where(np.diff(frames, axis=0) != 0)
+                fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+                for s, e in zip(changes[::2], changes[1::2]):
+                    st = s / 10.
+                    dur = (e - s) / 10.
+                    rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
+
             # Only supporting batch_size==1
-            key, value = keys[0], output_results_str(results, keys[0])
-            item = {"key": key, "value": value}
+            value = "\n".join(rst)
+            item = {"key": keys[0], "value": value}
             result_list.append(item)
             if output_path is not None:
                 output_writer.write(value)
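
The new post-processing above turns the per-frame speaker-activity output into RTTM-style `SPEAKER` lines: median-filter each speaker track, pad, locate on/off changes, and emit start/duration at 10 frames per second. A minimal standalone sketch of that logic (the `frames_to_rttm` name and the toy `activity` input are illustrative, not part of the patch):

```python
import numpy as np
from scipy.signal import medfilt

def frames_to_rttm(activity: np.ndarray, key: str = "utt1") -> str:
    """activity: (T, n_speakers) binary matrix, one row per 100 ms frame."""
    smoothed = medfilt(activity, (11, 1))                  # median-smooth each speaker track
    fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
    lines = []
    for spkid, frames in enumerate(smoothed.T):
        frames = np.pad(frames, (1, 1), "constant")        # close segments at the borders
        changes, = np.where(np.diff(frames, axis=0) != 0)  # on/off boundaries
        for s, e in zip(changes[::2], changes[1::2]):
            lines.append(fmt.format(key, s / 10.0, (e - s) / 10.0, "{}_{}".format(key, spkid)))
    return "\n".join(lines)

# toy input: speaker 0 talks for the first 5 s, speaker 1 for the last 5 s
act = np.zeros((100, 2))
act[:50, 0] = 1.0
act[50:, 1] = 1.0
print(frames_to_rttm(act))
```
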
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index ab6d26f..936dc21 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@
     Examples:
         >>> import soundfile
         >>> import numpy as np
-        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
         >>> profile = np.load("profiles.npy")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2diar(audio, profile)
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index a78bccd..7e63bbd 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@
 
     Examples:
         >>> import soundfile
-        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
+        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
         >>> audio, rate = soundfile.read("speech.wav")
         >>> speech2xvector(audio)
         [(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@
         log_level: Union[int, str] = "INFO",
         key_file: Optional[str] = None,
         sv_train_config: Optional[str] = "sv.yaml",
-        sv_model_file: Optional[str] =  "sv.pth",
+        sv_model_file: Optional[str] =  "sv.pb",
         model_tag: Optional[str] = None,
         allow_variable_data_keys: bool = True,
         streaming: bool = False,
diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py
index 53f9568..d8df949 100644
--- a/funasr/main_funcs/average_nbest_models.py
+++ b/funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@
             elif n == 1:
                 # The averaged model is same as the best model
                 e, _ = epoch_and_values[0]
-                op = output_dir / f"{e}epoch.pth"
-                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
+                op = output_dir / f"{e}epoch.pb"
+                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
                 if sym_op.is_symlink() or sym_op.exists():
                     sym_op.unlink()
                 sym_op.symlink_to(op.name)
             else:
-                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
+                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
                 logging.info(
                     f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                 )
@@ -83,12 +83,12 @@
                     if e not in _loaded:
                         if oss_bucket is None:
                             _loaded[e] = torch.load(
-                                output_dir / f"{e}epoch.pth",
+                                output_dir / f"{e}epoch.pb",
                                 map_location="cpu",
                             )
                         else:
                             buffer = BytesIO(
-                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
+                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
                             _loaded[e] = torch.load(buffer)
                     states = _loaded[e]
 
@@ -115,13 +115,13 @@
                 else:
                     buffer = BytesIO()
                     torch.save(avg, buffer)
-                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
+                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
                                           buffer.getvalue())
 
-        # 3. *.*.ave.pth is a symlink to the max ave model
+        # 3. *.*.ave.pb is a symlink to the max ave model
         if oss_bucket is None:
-            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
-            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
+            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
+            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
             if sym_op.is_symlink() or sym_op.exists():
                 sym_op.unlink()
             sym_op.symlink_to(op.name)
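
For reference, the `*.ave_{n}best.pb` files produced here are a parameter-wise mean over the selected epoch checkpoints. A minimal sketch under that assumption (the paths are illustrative; the real code above also handles OSS buckets and non-float buffers):

```python
import torch

def average_checkpoints(paths):
    """Parameter-wise mean of state dicts saved with torch.save(model.state_dict())."""
    avg = None
    for p in paths:
        states = torch.load(p, map_location="cpu")
        if avg is None:
            avg = {k: v.clone().float() for k, v in states.items()}
        else:
            for k in avg:
                avg[k] += states[k].float()
    # NOTE: integer buffers (e.g. BatchNorm counters) need separate handling; omitted here.
    return {k: v / len(paths) for k, v in avg.items()}

# e.g. torch.save(average_checkpoints(["3epoch.pb", "7epoch.pb"]), "valid.acc.ave_2best.pb")
```
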
diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py
index ffa807e..fe365d8 100644
--- a/funasr/main_funcs/pack_funcs.py
+++ b/funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@
 
     Examples:
         tarfile:
-           model.pth
+           model.pb
            some1.file
            some2.file
 
         >>> unpack("tarfile", "out")
-        {'asr_model_file': 'out/model.pth'}
+        {'asr_model_file': 'out/model.pb'}
     """
     input_archive = Path(input_archive)
     outpath = Path(outpath)
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index f589269..097b23a 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -52,15 +52,15 @@
 
         super().__init__()
         self.frontend = frontend
-        self.encoder = encoder
-        self.encoder_decoder_attractor = encoder_decoder_attractor
+        self.enc = encoder
+        self.eda = encoder_decoder_attractor
         self.attractor_loss_weight = attractor_loss_weight
         self.max_n_speaker = max_n_speaker
         if mapping_dict is None:
             mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
             self.mapping_dict = mapping_dict
         # PostNet
-        self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
+        self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
         self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
 
     def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@
         pad_shape = xs.shape
         xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
         xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
-        emb = self.encoder(xs, xs_mask)
+        emb = self.enc(xs, xs_mask)
         emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
         emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
         return emb
@@ -76,8 +76,8 @@
     def forward_post_net(self, logits, ilens):
         maxlen = torch.max(ilens).to(torch.int).item()
         logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
-        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
-        outputs, (_, _) = self.PostNet(logits)
+        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
+        outputs, (_, _) = self.postnet(logits)
         outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
         outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
         outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@
         text = text[:, : text_lengths.max()]
 
         # 1. Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
@@ -190,18 +190,16 @@
                             shuffle: bool = True,
                             threshold: float = 0.5,
                             **kwargs):
-        if self.frontend is not None:
-            speech = self.frontend(speech)
         speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
         emb = self.forward_encoder(speech, speech_lengths)
         if shuffle:
             orders = [np.arange(e.shape[0]) for e in emb]
             for order in orders:
                 np.random.shuffle(order)
-            attractors, probs = self.encoder_decoder_attractor.estimate(
+            attractors, probs = self.eda.estimate(
                 [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
         else:
-            attractors, probs = self.encoder_decoder_attractor.estimate(emb)
+            attractors, probs = self.eda.estimate(emb)
         attractors_active = []
         for p, att, e in zip(probs, attractors, emb):
             if n_speakers and n_speakers >= 0:
@@ -233,10 +231,23 @@
                 pred[i] = pred[i - 1]
             else:
                 pred[i] = 0
-        pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
+        pred = [self.inv_mapping_func(i) for i in pred]
         decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
         decisions = torch.from_numpy(
             np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
             torch.float32)
         decisions = decisions[:, :n_speaker]
         return decisions
+
+    def inv_mapping_func(self, label):
+
+        if not isinstance(label, int):
+            label = int(label)
+        if label in self.mapping_dict['label2dec'].keys():
+            num = self.mapping_dict['label2dec'][label]
+        else:
+            num = -1
+        return num
+
+    def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
+        pass
\ No newline at end of file
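
The new `inv_mapping_func` looks a PIT label up in `mapping_dict['label2dec']`, and the caller then expands the resulting decimal code into per-speaker on/off decisions with `bin(num)[2:].zfill(...)[::-1]`. A small worked example of that last step (the code value 6 is purely illustrative):

```python
max_n_speaker = 3
num = 6                                            # decimal code from mapping_dict['label2dec']
bits = bin(num)[2:].zfill(max_n_speaker)[::-1]     # '110' -> '011': bit i <-> speaker i
active = [spk for spk, b in enumerate(bits) if b == "1"]
print(bits, active)                                # '011' [1, 2]: speakers 1 and 2 are active
```
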
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 445efca..475a939 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -1,14 +1,15 @@
 # Copyright (c) Alibaba, Inc. and its affiliates.
 # Part of the implementation is borrowed from espnet/espnet.
-from abc import ABC
 from typing import Tuple
 
 import numpy as np
 import torch
 import torchaudio.compliance.kaldi as kaldi
-from funasr.models.frontend.abs_frontend import AbsFrontend
-from typeguard import check_argument_types
 from torch.nn.utils.rnn import pad_sequence
+from typeguard import check_argument_types
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.models.frontend.abs_frontend import AbsFrontend
 
 
 def load_cmvn(cmvn_file):
@@ -275,7 +276,8 @@
     # inputs tensor has catted the cache tensor
     # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
     #               is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
-    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
+    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
+        torch.Tensor, torch.Tensor, int]:
         """
         Apply lfr with data
         """
@@ -376,7 +378,8 @@
             if self.lfr_m != 1 or self.lfr_n != 1:
                 # update self.lfr_splice_cache in self.apply_lfr
                 # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
-                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
+                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
+                                                                                     is_final)
             if self.cmvn_file is not None:
                 mat = self.apply_cmvn(mat, self.cmvn)
             feat_length = mat.size(0)
@@ -398,9 +401,10 @@
         assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
         waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths)  # input shape: B T D
         if feats.shape[0]:
-            #if self.reserve_waveforms is None and self.lfr_m > 1:
+            # if self.reserve_waveforms is None and self.lfr_m > 1:
             #    self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
-            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
+            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
+                (self.reserve_waveforms, waveforms), dim=1)
             if not self.lfr_splice_cache:  # initialize splice_cache
                 for i in range(batch_size):
                     self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -409,7 +413,8 @@
                 lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache)  # B T D
                 feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
                 feats_lengths += lfr_splice_cache_tensor[0].shape[0]
-                frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
+                frame_from_waveforms = int(
+                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                 minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                 feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
                 if self.lfr_m == 1:
@@ -423,14 +428,15 @@
                     self.waveforms = self.waveforms[:, :sample_length]
             else:
                 # update self.reserve_waveforms and self.lfr_splice_cache
-                self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
+                self.reserve_waveforms = self.waveforms[:,
+                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                 for i in range(batch_size):
                     self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
                 return torch.empty(0), feats_lengths
         else:
             if is_final:
                 self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
-                feats = torch.stack(self.lfr_splice_cache) 
+                feats = torch.stack(self.lfr_splice_cache)
                 feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
                 feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
         if is_final:
@@ -444,3 +450,54 @@
         self.reserve_waveforms = None
         self.input_cache = None
         self.lfr_splice_cache = []
+
+
+class WavFrontendMel23(AbsFrontend):
+    """Conventional frontend structure for ASR.
+    """
+
+    def __init__(
+            self,
+            fs: int = 16000,
+            frame_length: int = 25,
+            frame_shift: int = 10,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self.fs = fs
+        self.frame_length = frame_length
+        self.frame_shift = frame_shift
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.n_mels = 23
+
+    def output_size(self) -> int:
+        return self.n_mels * (2 * self.lfr_m + 1)
+
+    def forward(
+            self,
+            input: torch.Tensor,
+            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+        batch_size = input.size(0)
+        feats = []
+        feats_lens = []
+        for i in range(batch_size):
+            waveform_length = input_lengths[i]
+            waveform = input[i][:waveform_length]
+            waveform = waveform.numpy()
+            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+            mat = eend_ola_feature.transform(mat)
+            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
+            mat = mat[::self.lfr_n]
+            mat = torch.from_numpy(mat)
+            feat_length = mat.size(0)
+            feats.append(mat)
+            feats_lens.append(feat_length)
+
+        feats_lens = torch.as_tensor(feats_lens)
+        feats_pad = pad_sequence(feats,
+                                 batch_first=True,
+                                 padding_value=0.0)
+        return feats_pad, feats_lens
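
A hedged usage sketch of the new `WavFrontendMel23`: it only demonstrates the `(feats, feats_lens)` contract and the `output_size() == n_mels * (2 * lfr_m + 1)` relation visible above; the constructor arguments and the random 3-second waveform are placeholders, not values from a real recipe:

```python
import torch
from funasr.models.frontend.wav_frontend import WavFrontendMel23

frontend = WavFrontendMel23(fs=8000)   # 8 kHz to match the callhome-8k EEND-OLA model
wav = torch.randn(1, 8000 * 3)         # one 3-second waveform (placeholder input)
wav_lens = torch.tensor([wav.shape[1]])
feats, feats_lens = frontend(wav, wav_lens)
print(feats.shape, feats_lens)         # feats.shape[-1] should equal frontend.output_size()
```
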
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
index 4999031..90a63f3 100644
--- a/funasr/modules/eend_ola/encoder.py
+++ b/funasr/modules/eend_ola/encoder.py
@@ -87,7 +87,7 @@
                  n_layers: int,
                  n_units: int,
                  e_units: int = 2048,
-                 h: int = 8,
+                 h: int = 4,
                  dropout_rate: float = 0.1,
                  use_pos_emb: bool = False):
         super(EENDOLATransformerEncoder, self).__init__()
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
index db01b00..45ac982 100644
--- a/funasr/modules/eend_ola/encoder_decoder_attractor.py
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -16,12 +16,12 @@
         self.n_units = n_units
 
     def forward_core(self, xs, zeros):
-        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
+        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
         xs = [self.enc0_dropout(x) for x in xs]
         xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
         xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
         _, (hx, cx) = self.encoder(xs)
-        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
+        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
         max_zlen = torch.max(zlens).to(torch.int).item()
         zeros = [self.enc0_dropout(z) for z in zeros]
         zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
@@ -47,4 +47,4 @@
         zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
         attractors = self.forward_core(xs, zeros)
         probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
-        return attractors, probs
\ No newline at end of file
+        return attractors, probs
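
The dtype change above (lengths as plain CPU `int64` instead of `float32` on the input device) matters because `torch.nn.utils.rnn.pack_padded_sequence` expects its `lengths` argument as a 1-D CPU int64 tensor; on recent PyTorch releases, CUDA lengths raise a RuntimeError. A minimal illustration of the accepted form:

```python
import torch
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence

xs = [torch.randn(5, 4), torch.randn(3, 4)]                       # variable-length sequences
lens = torch.tensor([x.shape[0] for x in xs], dtype=torch.int64)  # CPU int64, as in the fix
packed = pack_padded_sequence(pad_sequence(xs, batch_first=True),
                              lens, batch_first=True, enforce_sorted=False)
print(packed.batch_sizes)                                         # tensor([2, 2, 2, 1, 1])
```
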
diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
index 43dfb6b..53bf9d0 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -237,7 +237,7 @@
 
     size_t nOffset = 0;
 
-#define WAV_HEADER_SIZE 44
+
 
     speech_len = nBufLen / 2;
     speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@
             speech_data[i] = (float)speech_buff[i] / scale;
         }
 
-
+        AudioFrame* frame = new AudioFrame(speech_len);
+        frame_queue.push(frame);
         return true;
 
     }
diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
index 1f8f7ca..f5f9d66 100644
--- a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
+++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -26,8 +26,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadwav(szBuf,nLen);
-		audio.split();
+		if (!audio.loadwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -58,8 +59,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szBuf, nLen);
-		audio.split();
+		if (!audio.loadpcmwav(szBuf, nLen))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -91,8 +93,9 @@
 			return nullptr;
 
 		Audio audio(1);
-		audio.loadpcmwav(szFileName);
-		audio.split();
+		if (!audio.loadpcmwav(szFileName))
+			return nullptr;
+		//audio.split();
 
 		float* buff;
 		int len;
@@ -125,7 +128,7 @@
 		Audio audio(1);
 		if(!audio.loadwav(szWavfile))
 			return nullptr;
-		audio.split();
+		//audio.split();
 
 		float* buff;
 		int len;
diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/tester/tester.cpp
index b9a85b7..ba5c61c 100644
--- a/funasr/runtime/onnxruntime/tester/tester.cpp
+++ b/funasr/runtime/onnxruntime/tester/tester.cpp
@@ -8,7 +8,7 @@
 #include "librapidasrapi.h"
 
 #include <iostream>
-
+#include <fstream>
 using namespace std;
 
 int main(int argc, char *argv[])
@@ -40,10 +40,13 @@
 
 
     gettimeofday(&start, NULL);
-
-    RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
-    gettimeofday(&end, NULL);
     float snippet_time = 0.0f;
+
+
+     RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    gettimeofday(&end, NULL);
+   
     if (Result)
     {
         string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@
     }
     else
     {
-        cout <<("no return data!");
+        cout <<"no return data!";
     }
-  
-    printf("Audio length %lfs.\n", (double)snippet_time);
+ 
+ 
+    //char* buff = nullptr;
+    //int len = 0;
+    //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
+    //if (ifs.is_open())
+    //{
+    //    ifs.seekg(0, std::ios::end);
+    //    len = ifs.tellg();
+    //    ifs.seekg(0, std::ios::beg);
 
+    //    buff = new char[len];
+
+    //    ifs.read(buff, len);
+
+
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+    //    RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
+    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+    //    gettimeofday(&end, NULL);
+    //   
+    //    if (Result)
+    //    {
+    //        string msg = RapidAsrGetResult(Result, 0);
+    //        setbuf(stdout, NULL);
+    //        cout << "Result: \"";
+    //        cout << msg << endl;
+    //        cout << "\"." << endl;
+    //        snippet_time = RapidAsrGetRetSnippetTime(Result);
+    //        RapidAsrFreeResult(Result);
+    //    }
+    //    else
+    //    {
+    //        cout <<"no return data!";
+    //    }
+  
+    //   
+    //delete[]buff;
+    //}
+
+ 
+    printf("Audio length %lfs.\n", (double)snippet_time);
     seconds = (end.tv_sec - start.tv_sec);
     long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
     printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);
diff --git a/funasr/runtime/python/README.md b/funasr/runtime/python/README.md
new file mode 100644
index 0000000..c47f8e7
--- /dev/null
+++ b/funasr/runtime/python/README.md
@@ -0,0 +1,21 @@
+Benchmark of [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) on the Aishell1 test set; the total audio duration is 36108.919 seconds.
+
+(Note: the service has been fully warmed up.)
+
+ Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni
+
+| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+|  1 (onnx fp32)   |        2806        | 0.0777 |     12.9     |
+|  1 (onnx int8)   |        1611        | 0.0446 |     22.4     |
+|  8 (onnx fp32)   |        538         | 0.0149 |     67.1     |
+|  8 (onnx int8)   |        210         | 0.0058 |    172.4     |
+|  16 (onnx fp32)  |        288         | 0.0080 |    125.2     |
+|  16 (onnx int8)  |        117         | 0.0032 |    309.9     |
+|  32 (onnx fp32)  |        167         | 0.0046 |    216.5     |
+|  32 (onnx int8)  |        107         | 0.0030 |    338.0     |
+|  64 (onnx fp32)  |        158         | 0.0044 |    228.1     |
+|  64 (onnx int8)  |         82         | 0.0023 |    442.8     |
+|  96 (onnx fp32)  |        151         | 0.0042 |    238.0     |
+|  96 (onnx int8)  |         80         | 0.0022 |    452.0     |
+
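
A quick sanity check of how the table's columns relate (plain arithmetic on the figures above): RTF is processing time divided by the total audio duration, and the speedup rate is its inverse; small differences in other rows are consistent with the processing times being rounded to whole seconds.

```python
audio_duration = 36108.919     # seconds of audio in the Aishell1 test set (from above)
processing_time = 2806.0       # seconds, "1 (onnx fp32)" row

rtf = processing_time / audio_duration
speedup = audio_duration / processing_time
print(f"RTF = {rtf:.4f}, speedup = {speedup:.1f}")   # RTF = 0.0777, speedup = 12.9
```
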
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index e0884ce..3f20b4f 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -639,12 +639,12 @@
                  "and exclude_keys excludes keys of model states for the initialization."
                  "e.g.\n"
                  "  # Load all parameters"
-                 "  --init_param some/where/model.pth\n"
+                 "  --init_param some/where/model.pb\n"
                  "  # Load only decoder parameters"
-                 "  --init_param some/where/model.pth:decoder:decoder\n"
+                 "  --init_param some/where/model.pb:decoder:decoder\n"
                  "  # Load only decoder parameters excluding decoder.embed"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
-                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
+                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
         )
         group.add_argument(
             "--ignore_init_mismatch",
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 36499a2..e151473 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -826,7 +826,7 @@
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@
             if "model.ckpt-" in model_name or ".bin" in model_name:
                 model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                             '.pb')) if ".bin" in model_name else os.path.join(
-                    model_dir, "{}.pth".format(model_name))
+                    model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index ae7ee9b..9875f6a 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -553,7 +553,7 @@
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
@@ -750,47 +750,47 @@
             cls, args: argparse.Namespace, train: bool
     ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
         assert check_argument_types()
-        if args.use_preprocessor:
-            retval = CommonPreprocessor(
-                train=train,
-                token_type=args.token_type,
-                token_list=args.token_list,
-                bpemodel=None,
-                non_linguistic_symbols=None,
-                text_cleaner=None,
-                g2p_type=None,
-                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
-                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
-                # NOTE(kamo): Check attribute existence for backward compatibility
-                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
-                rir_apply_prob=args.rir_apply_prob
-                if hasattr(args, "rir_apply_prob")
-                else 1.0,
-                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
-                noise_apply_prob=args.noise_apply_prob
-                if hasattr(args, "noise_apply_prob")
-                else 1.0,
-                noise_db_range=args.noise_db_range
-                if hasattr(args, "noise_db_range")
-                else "13_15",
-                speech_volume_normalize=args.speech_volume_normalize
-                if hasattr(args, "rir_scp")
-                else None,
-            )
-        else:
-            retval = None
-        assert check_return_type(retval)
-        return retval
+        # if args.use_preprocessor:
+        #     retval = CommonPreprocessor(
+        #         train=train,
+        #         token_type=args.token_type,
+        #         token_list=args.token_list,
+        #         bpemodel=None,
+        #         non_linguistic_symbols=None,
+        #         text_cleaner=None,
+        #         g2p_type=None,
+        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
+        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
+        #         # NOTE(kamo): Check attribute existence for backward compatibility
+        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
+        #         rir_apply_prob=args.rir_apply_prob
+        #         if hasattr(args, "rir_apply_prob")
+        #         else 1.0,
+        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
+        #         noise_apply_prob=args.noise_apply_prob
+        #         if hasattr(args, "noise_apply_prob")
+        #         else 1.0,
+        #         noise_db_range=args.noise_db_range
+        #         if hasattr(args, "noise_db_range")
+        #         else "13_15",
+        #         speech_volume_normalize=args.speech_volume_normalize
+        #         if hasattr(args, "rir_scp")
+        #         else None,
+        #     )
+        # else:
+        #     retval = None
+        # assert check_return_type(retval)
+        return None
 
     @classmethod
     def required_data_names(
             cls, train: bool = True, inference: bool = False
     ) -> Tuple[str, ...]:
         if not inference:
-            retval = ("speech", "profile", "binary_labels")
+            retval = ("speech", )
         else:
             # Recognition mode
-            retval = ("speech")
+            retval = ("speech", )
         return retval
 
     @classmethod
@@ -823,7 +823,7 @@
 
         # 2. Encoder
         encoder_class = encoder_choices.get_class(args.encoder)
-        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        encoder = encoder_class(**args.encoder_conf)
 
         # 3. EncoderDecoderAttractor
         encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)
diff --git a/funasr/tasks/sv.py b/funasr/tasks/sv.py
index 1b08c4d..bef5dc5 100644
--- a/funasr/tasks/sv.py
+++ b/funasr/tasks/sv.py
@@ -501,7 +501,7 @@
                 if ".bin" in model_name:
                     model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                 else:
-                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                 if os.path.exists(model_name_pth):
                     logging.info("model_file is load from pth: {}".format(model_name_pth))
                     model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index 8e3f05e..e9b18cd 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -52,13 +52,13 @@
         init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>
 
     Examples:
-        >>> load_pretrained_model("somewhere/model.pth", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
-        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
+        >>> load_pretrained_model("somewhere/model.pb", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
+        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
         >>> load_pretrained_model(
-        ...     "somewhere/model.pth:decoder:decoder:decoder.embed", model
+        ...     "somewhere/model.pb:decoder:decoder:decoder.embed", model
         ... )
-        >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
+        >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
     """
     sps = init_param.split(":", 4)
     if len(sps) == 4:
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 50bce47..efe2009 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -205,9 +205,9 @@
         else:
             scaler = None
 
-        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
+        if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
             cls.resume(
-                checkpoint=output_dir / "checkpoint.pth",
+                checkpoint=output_dir / "checkpoint.pb",
                 model=model,
                 optimizers=optimizers,
                 schedulers=schedulers,
@@ -361,7 +361,7 @@
                         },
                         buffer,
                     )
-                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
+                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
                 else:
                     torch.save(
                         {
@@ -374,7 +374,7 @@
                             ],
                             "scaler": scaler.state_dict() if scaler is not None else None,
                         },
-                        output_dir / "checkpoint.pth",
+                        output_dir / "checkpoint.pb",
                     )
 
                 # 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@
                     buffer = BytesIO()
                     torch.save(model.state_dict(), buffer)
                     trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
-                                                                       f"{iepoch}epoch.pth"),buffer.getvalue())
+                                                                       f"{iepoch}epoch.pb"),buffer.getvalue())
                 else:
-                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
+                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")
 
-                # Creates a sym link latest.pth -> {iepoch}epoch.pth
+                # Creates a sym link latest.pb -> {iepoch}epoch.pb
                 if trainer_options.use_pai:
-                    p = os.path.join(trainer_options.output_dir, "latest.pth")
+                    p = os.path.join(trainer_options.output_dir, "latest.pb")
                     if trainer_options.oss_bucket.object_exists(p):
                         trainer_options.oss_bucket.delete_object(p)
                     trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
+                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
                 else:
-                    p = output_dir / "latest.pth"
+                    p = output_dir / "latest.pb"
                     if p.is_symlink() or p.exists():
                         p.unlink()
-                    p.symlink_to(f"{iepoch}epoch.pth")
+                    p.symlink_to(f"{iepoch}epoch.pb")
 
                 _improved = []
                 for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@
                         # Creates sym links if it's the best result
                         if best_epoch == iepoch:
                             if trainer_options.use_pai:
-                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
+                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
                                 if trainer_options.oss_bucket.object_exists(p):
                                     trainer_options.oss_bucket.delete_object(p)
                                 trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
-                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
+                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
                             else:
-                                p = output_dir / f"{_phase}.{k}.best.pth"
+                                p = output_dir / f"{_phase}.{k}.best.pb"
                                 if p.is_symlink() or p.exists():
                                     p.unlink()
-                                p.symlink_to(f"{iepoch}epoch.pth")
+                                p.symlink_to(f"{iepoch}epoch.pb")
                             _improved.append(f"{_phase}.{k}")
                 if len(_improved) == 0:
                     logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@
                         type="model",
                         metadata={"improved": _improved},
                     )
-                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
+                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
                     aliases = [
                         f"epoch-{iepoch}",
                         "best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@
 
                 for e in range(1, iepoch):
                     if trainer_options.use_pai:
-                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
+                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
                         if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
                             trainer_options.oss_bucket.delete_object(p)
                             _removed.append(str(p))
                     else:
-                        p = output_dir / f"{e}epoch.pth"
+                        p = output_dir / f"{e}epoch.pb"
                         if p.exists() and e not in nbests:
                             p.unlink()
                             _removed.append(str(p))
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index f5a238e..423110c 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,6 +1,10 @@
 import torch
 import copy
+import codecs
 import logging
+import edit_distance
+import argparse
+import pdb
 import numpy as np
 from typing import Any, List, Tuple, Union
 
@@ -9,7 +13,8 @@
                        us_peaks, 
                        char_list, 
                        vad_offset=0.0, 
-                       force_time_shift=-1.5
+                       force_time_shift=-1.5,
+                       sil_in_str=True
                        ):
     if not len(char_list):
         return []
@@ -62,6 +67,8 @@
             timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
     res_txt = ""
     for char, timestamp in zip(new_char_list, timestamp_list):
+        #if char != '<sil>':
+        if not sil_in_str and char == '<sil>': continue
         res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
     res = []
     for char, timestamp in zip(new_char_list, timestamp_list):
@@ -121,4 +128,181 @@
     return res
 
 
+class AverageShiftCalculator():
+    def __init__(self):
+        logging.warning("Calculating average shift.")
+    def __call__(self, file1, file2):
+        uttid_list1, ts_dict1 = self.read_timestamps(file1)
+        uttid_list2, ts_dict2 = self.read_timestamps(file2)
+        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
+        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
+        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
+        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))
+
+    def _intersection(self, list1, list2):
+        set1 = set(list1)
+        set2 = set(list2)
+        if set1 == set2:
+            logging.warning("Uttid same checked.")
+            return set1
+        itsc = list(set1 & set2)
+        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
+        return itsc
+
+    def read_timestamps(self, file):
+        # read timestamps file in standard format
+        uttid_list = []
+        ts_dict = {}
+        with codecs.open(file, 'r') as fin:
+            for line in fin.readlines():
+                text = ''
+                ts_list = []
+                line = line.rstrip()
+                uttid = line.split()[0]
+                uttid_list.append(uttid)
+                body = " ".join(line.split()[1:])
+                for pd in body.split(';'):
+                    if not len(pd): continue
+                    # pdb.set_trace() 
+                    char, start, end = pd.lstrip(" ").split(' ')
+                    text += char + ','
+                    ts_list.append((float(start), float(end)))
+                # ts_lists.append(ts_list)
+                ts_dict[uttid] = (text[:-1], ts_list)
+        logging.warning("File {} read done.".format(file))
+        return uttid_list, ts_dict
+
+    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
+        shift_time = 0
+        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
+            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
+        num_tokens = len(filtered_timestamp_list1)
+        return shift_time, num_tokens
+
+    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
+        # calculate average shift between timestamp1 and timestamp2
+        # when characters differ, use edit distance alignment
+        # and calculate the error between the same characters
+        self._accumlated_shift = 0
+        self._accumlated_tokens = 0
+        self.max_shift = 0
+        self.max_shift_uttid = None
+        for uttid in uttid_list:
+            (t1, ts1) = ts_dict1[uttid]
+            (t2, ts2) = ts_dict2[uttid]
+            _align, _align2, _align3 = [], [], []
+            fts1, fts2 = [], []
+            _t1, _t2 = [], []
+            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
+            s = sm.get_opcodes()
+            for j in range(len(s)):
+                if s[j][0] == "replace" or s[j][0] == "insert":
+                    _align.append(0)
+                if s[j][0] == "replace" or s[j][0] == "delete":
+                    _align3.append(0)
+                elif s[j][0] == "equal":
+                    _align.append(1)
+                    _align3.append(1)
+                else:
+                    continue
+            # use s to index t2
+            for a, ts , t in zip(_align, ts2, t2.split(',')):
+                if a: 
+                    fts2.append(ts)
+                    _t2.append(t)
+            sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(','))
+            s = sm2.get_opcodes()
+            for j in range(len(s)):
+                if s[j][0] == "replace" or s[j][0] == "insert":
+                    _align2.append(0)
+                elif s[j][0] == "equal":
+                    _align2.append(1)
+                else:
+                    continue
+            # use s2 to index t1
+            for a, ts, t in zip(_align3, ts1, t1.split(',')):
+                if a: 
+                    fts1.append(ts)
+                    _t1.append(t)
+            if len(fts1) == len(fts2):
+                shift_time, num_tokens = self._shift(fts1, fts2)
+                self._accumlated_shift += shift_time
+                self._accumlated_tokens += num_tokens
+                if shift_time/num_tokens > self.max_shift:
+                    self.max_shift = shift_time/num_tokens
+                    self.max_shift_uttid = uttid
+            else:
+                logging.warning("length mismatch")
+        return self._accumlated_shift / self._accumlated_tokens
+
+
+def convert_external_alphas(alphas_file, text_file, output_file):
+    from funasr.models.predictor.cif import cif_wo_hidden
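+    # Converts frame-level CIF alphas ("uttid a1 a2 ..." per line) plus a matching
+    # text file into token-level timestamps, writing one utterance per output line.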
+    with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3:
+        for line1, line2 in zip(f1.readlines(), f2.readlines()):
+            line1 = line1.rstrip()
+            line2 = line2.rstrip()
+            assert line1.split()[0] == line2.split()[0]
+            uttid = line1.split()[0]
+            alphas = [float(i) for i in line1.split()[1:]]
+            new_alphas = np.array(remove_chunk_padding(alphas))
+            new_alphas[-1] += 1e-4
+            text = line2.split()[1:]
+            if len(text) + 1 != int(new_alphas.sum()):
+                # force resize
+                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
+            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
+            if " " in text:
+                text = text.split()
+            else:
+                text = [i for i in text]
+            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text, 
+                                                     force_time_shift=-7.0, 
+                                                     sil_in_str=False)
+            f3.write("{} {}\n".format(uttid, res_str))
+
+
+def remove_chunk_padding(alphas):
+    # remove the padding part in alphas if using chunk paraformer for GPU
+    START_ZERO = 45
+    MID_ZERO = 75
+    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
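+    # assumed frame layout (inferred from the constants above):
+    #   [START_ZERO pad][REAL_FRAMES data][MID_ZERO pad][REAL_FRAMES data]...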
+    alphas = alphas[START_ZERO:]  # remove the padding at beginning
+    new_alphas = []
+    while True:
+        new_alphas = new_alphas + alphas[:REAL_FRAMES]
+        alphas = alphas[REAL_FRAMES+MID_ZERO:]
+        if len(alphas) < REAL_FRAMES: break
+    return new_alphas
+
+SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']
+
+
+def main(args):
+    if args.mode == 'cal_aas':
+        asc = AverageShiftCalculator()
+        asc(args.input, args.input2)
+    elif args.mode == 'read_ext_alphas':
+        convert_external_alphas(args.input, args.input2, args.output)
+    else:
+        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
+
+
+if __name__ == '__main__':
+    parser = argparse.ArgumentParser(description='timestamp tools')
+    parser.add_argument('--mode', 
+                        default=None, 
+                        type=str, 
+                        choices=SUPPORTED_MODES, 
+                        help='which timestamp tool to run')
+    parser.add_argument('--input', default=None, type=str, help='input file path')
+    parser.add_argument('--output', default=None, type=str, help='output file name')
+    parser.add_argument('--input2', default=None, type=str, help='input2 file path')
+    parser.add_argument('--kaldi-ts-type', 
+                        default='v2', 
+                        type=str, 
+                        choices=['v0', 'v1', 'v2'], 
+                        help='kaldi timestamp format version to write')
+    args = parser.parse_args()
+    main(args)
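+    # Example invocations (script path and file names are illustrative):
+    #   python timestamp_tools.py --mode cal_aas --input ts_file1 --input2 ts_file2
+    #   python timestamp_tools.py --mode read_ext_alphas --input alphas --input2 text --output ts_out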
 
diff --git a/setup.py b/setup.py
index 087d90d..c854769 100644
--- a/setup.py
+++ b/setup.py
@@ -17,7 +17,7 @@
         "humanfriendly",
         "scipy>=1.4.1",
         # "filelock",
-        "librosa>=0.8.0",
+        "librosa==0.8.1",
         "jamo==0.4.1",  # For kss
         "PyYAML>=5.1.2",
         "soundfile>=0.10.2",
@@ -41,6 +41,8 @@
         # PAI
         "oss2",
         "kaldi-native-fbank",
+        # timestamp
+        "edit-distance"
     ],
     # train: The modules invoked when training only.
     "train": [
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 70dbe89..32b8af5 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -451,7 +451,7 @@
 
     def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
         inference_pipeline = pipeline(
-            task=Tasks.,
+            task=Tasks.auto_speech_recognition,
             model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',

--
Gitblit v1.9.1