From 8a08405b668e06c4670b4c13f6793e193f21a21d Mon Sep 17 00:00:00 2001
From: Yabin Li <wucong.lyb@alibaba-inc.com>
Date: 星期一, 08 五月 2023 11:43:08 +0800
Subject: [PATCH] Merge branch 'main' into dev_apis

---
 funasr/runtime/onnxruntime/readme.md                                                                                               |   12 
 funasr/runtime/grpc/Readme.md                                                                                                      |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py                                 |   24 
 docs/model_zoo/modelscope_models.md                                                                                                |  126 +
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md                                                                       |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py                                |    1 
 funasr/bin/asr_inference_paraformer.py                                                                                             |    3 
 funasr/bin/vad_inference.py                                                                                                        |    2 
 funasr/runtime/websocket/readme.md                                                                                                 |  108 +
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py                                                   |   11 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py                                  |    4 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md                                                             |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh                                           |    1 
 funasr/runtime/onnxruntime/src/audio.cpp                                                                                           |   11 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py                                                               |   12 
 funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py                                                                     |   45 
 funasr/models/encoder/sanm_encoder.py                                                                                              |   24 
 funasr/runtime/python/websocket/README.md                                                                                          |   58 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py                               |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py                             |    4 
 funasr/tasks/abs_task.py                                                                                                           |    6 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh                                                                       |    1 
 funasr/export/models/CT_Transformer.py                                                                                             |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh                                |    1 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py                    |   16 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md                      |    1 
 funasr/runtime/onnxruntime/src/e2e-vad.h                                                                                           |   63 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md                         |    1 
 funasr/bin/build_trainer.py                                                                                                        |    3 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh                                                              |    1 
 docs/reference/papers.md                                                                                                           |    0 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh                          |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py                                |    4 
 funasr/runtime/websocket/websocketsrv.cpp                                                                                          |  158 ++
 docs/index.rst                                                                                                                     |   18 
 funasr/runtime/websocket/websocketsrv.h                                                                                            |   93 +
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py                              |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py                                |    4 
 egs_modelscope/speaker_diarization/TEMPLATE/README.md                                                                              |   81 +
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py                   |    4 
 docs/reference/build_task.md                                                                                                       |    0 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py                                        |    4 
 egs_modelscope/punctuation/TEMPLATE/infer.sh                                                                                       |   66 
 funasr/bin/asr_inference_paraformer_streaming.py                                                                                   |  441 +----
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py                                                                        |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py                  |    4 
 funasr/runtime/onnxruntime/CMakeLists.txt                                                                                          |    3 
 docs/modelscope_pipeline/quick_start.md                                                                                            |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py                              |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py                                |    1 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md                             |    1 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh                   |    0 
 funasr/models/e2e_asr_paraformer.py                                                                                                |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py                    |   37 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py                                 |    4 
 egs_modelscope/tp/TEMPLATE/infer.py                                                                                                |    0 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py                             |    4 
 docs/runtime/websocket_cpp.md                                                                                                      |    1 
 funasr/runtime/websocket/websocketclient.cpp                                                                                       |  221 +++
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils                      |    1 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh                                                                        |    1 
 fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py                                                              |    2 
 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py                                           |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py                             |    4 
 funasr/models/e2e_asr_contextual_paraformer.py                                                                                     |  372 +++++
 funasr/runtime/python/websocket/parse_args.py                                                                                      |   40 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py                                 |    4 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py                                            |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py                                 |    4 
 funasr/torch_utils/load_pretrained_model.py                                                                                        |    2 
 egs_modelscope/speaker_verification/TEMPLATE/README.md                                                                             |  121 +
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py                              |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py                                         |    4 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md                             |    1 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py                                                              |    1 
 funasr/runtime/python/grpc/Readme.md                                                                                               |    2 
 funasr/train/trainer.py                                                                                                            |   14 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py                           |   39 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py                              |    4 
 docs/modelscope_pipeline/punc_pipeline.md                                                                                          |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py                          |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py                            |    4 
 egs_modelscope/vad/TEMPLATE/README.md                                                                                              |   38 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py                              |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py                                                                       |    1 
 egs_modelscope/punctuation/TEMPLATE/infer.py                                                                                       |   23 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py                                  |    4 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py                               |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py                                          |    4 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md                  |  264 +++
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py                                                                         |    4 
 egs_modelscope/tp/TEMPLATE/infer.sh                                                                                                |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py                              |    4 
 README.md                                                                                                                          |   12 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py                                          |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py                                                                 |    4 
 funasr/datasets/large_datasets/dataset.py                                                                                          |   37 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py                              |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py                              |    4 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py                   |   33 
 docs/installation/installation.md                                                                                                  |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py                              |    1 
 funasr/datasets/large_datasets/utils/hotword_utils.py                                                                              |   32 
 funasr/datasets/large_datasets/utils/tokenize.py                                                                                   |    8 
 docs/modelscope_pipeline/sv_pipeline.md                                                                                            |    1 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh                       |    6 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py                              |    4 
 funasr/runtime/websocket/websocketmain.cpp                                                                                         |  157 ++
 funasr/modules/embedding.py                                                                                                        |   13 
 funasr/version.txt                                                                                                                 |    2 
 .gitignore                                                                                                                         |    3 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py                           |    4 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py                            |    4 
 funasr/datasets/large_datasets/utils/padding.py                                                                                    |   58 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py                                           |    1 
 funasr/runtime/python/websocket/ws_server_online.py                                                                                |   93 +
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py                                        |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh                              |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py                           |    4 
 docs/reference/application.md                                                                                                      |    0 
 egs_modelscope/speaker_verification/TEMPLATE/infer.py                                                                              |   15 
 egs_modelscope/tp/TEMPLATE/README.md                                                                                               |   42 
 funasr/runtime/python/websocket/ws_server_offline.py                                                                               |  150 ++
 docs/README.md                                                                                                                     |   19 
 egs_modelscope/asr/paraformer/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/infer.py                       |    4 
 docs/installation/docker.md                                                                                                        |    0 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py                                                                        |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py                                 |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh                                |    1 
 docs/reference/FQA.md                                                                                                              |    0 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md                               |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md                                                                      |    1 
 funasr/runtime/python/onnxruntime/setup.py                                                                                         |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py                               |    6 
 egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py                      |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py                                                                  |    4 
 funasr/tasks/asr.py                                                                                                                |    3 
 egs_modelscope/punctuation/TEMPLATE/README.md                                                                                      |  110 +
 docs/modelscope_pipeline/itn_pipeline.md                                                                                           |   63 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh                              |    1 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py                               |    4 
 docs/model_zoo/huggingface_models.md                                                                                               |    0 
 /dev/null                                                                                                                          |  185 --
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md                                          |    1 
 funasr/runtime/websocket/CMakeLists.txt                                                                                            |   64 
 docs/modelscope_pipeline/sd_pipeline.md                                                                                            |    1 
 funasr/bin/vad_inference_online.py                                                                                                 |    1 
 funasr/models/predictor/cif.py                                                                                                     |  128 +
 egs_modelscope/asr/TEMPLATE/README.md                                                                                              |   58 
 egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py                      |    4 
 funasr/runtime/python/websocket/ws_client.py                                                                                       |  226 +++
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py                             |    4 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md                               |    1 
 egs_modelscope/punctuation/TEMPLATE/utils                                                                                          |    1 
 egs_modelscope/lm/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/infer.py                                                    |    4 
 161 files changed, 3,421 insertions(+), 911 deletions(-)

diff --git a/.gitignore b/.gitignore
index 33b8c39..c4b031f 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,5 @@
 .egg*
 dist
 build
-funasr.egg-info
\ No newline at end of file
+funasr.egg-info
+docs/_build
\ No newline at end of file
diff --git a/README.md b/README.md
index 665f425..64d6d89 100644
--- a/README.md
+++ b/README.md
@@ -13,10 +13,10 @@
 | [**Highlights**](#highlights)
 | [**Installation**](#installation)
 | [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
-| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
+| [**Tutorial_CN**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
 | [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
 | [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
-| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/modelscope_models.md)
+| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
 | [**Contact**](#contact)
 | [**M2MET2.0 Challenge**](https://github.com/alibaba-damo-academy/FunASR#multi-channel-multi-party-meeting-transcription-20-m2met20-challenge)
 
@@ -28,7 +28,7 @@
 
 ## Highlights
 - FunASR supports speech recognition(ASR), Multi-talker ASR, Voice Activity Detection(VAD), Punctuation Restoration, Language Models, Speaker Verification and Speaker diarization.   
-- We have released large number of academic and industrial pretrained models on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)
+- We have released a large number of academic and industrial pretrained models on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), refer to [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
 - The pretrained model [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) obtains the best performance on many tasks in [SpeechIO leaderboard](https://github.com/SpeechColab/Leaderboard)
 - FunASR supplies a easy-to-use pipeline to finetune pretrained models from [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)
 - Compared to [Espnet](https://github.com/espnet/espnet) framework, the training speed of large-scale datasets in FunASR is much faster owning to the optimized dataloader.
@@ -60,12 +60,8 @@
 # pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple
 ```
 
-For more details, please ref to [installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)
+For more details, please refer to [installation](https://alibaba-damo-academy.github.io/FunASR/en/installation/installation.html)
 
-[//]: # ()
-[//]: # (## Usage)
-
-[//]: # (For users who are new to FunASR and ModelScope, please refer to FunASR Docs&#40;[CN]&#40;https://alibaba-damo-academy.github.io/FunASR/cn/index.html&#41; / [EN]&#40;https://alibaba-damo-academy.github.io/FunASR/en/index.html&#41;&#41;)
 
 ## Contact
 
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..4e16b04
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,19 @@
+# FunASR document generation
+
+## Generate HTML
+For convenience, we provide users with the ability to generate local HTML manually.
+
+First, you should install the following packages, which are required for building HTML:
+```sh
+conda activate funasr
+pip install requests sphinx nbsphinx sphinx_markdown_tables sphinx_rtd_theme recommonmark
+```
+
+Then you can generate HTML manually.
+
+```sh
+cd docs
+make html
+```
+
+The generated files are all contained in the "FunASR/docs/_build" directory. You can access the FunASR documentation by simply opening the "html/index.html" file in your browser from this directory.
\ No newline at end of file
diff --git a/docs/index.rst b/docs/index.rst
index e6aff5f..c2656bd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,8 +17,8 @@
    :maxdepth: 1
    :caption: Installation
 
-   ./installation.md
-   ./docker.md
+   ./installation/installation.md
+   ./installation/docker.md
 
 .. toctree::
    :maxdepth: 1
@@ -44,6 +44,7 @@
    ./modelscope_pipeline/tp_pipeline.md
    ./modelscope_pipeline/sv_pipeline.md
    ./modelscope_pipeline/sd_pipeline.md
+   ./modelscope_pipeline/itn_pipeline.md
 
 .. toctree::
    :maxdepth: 1
@@ -56,8 +57,8 @@
    :maxdepth: 1
    :caption: Model Zoo
 
-   ./modelscope_models.md
-   ./huggingface_models.md
+   ./model_zoo/modelscope_models.md
+   ./model_zoo/huggingface_models.md
 
 .. toctree::
    :maxdepth: 1
@@ -70,6 +71,7 @@
    ./runtime/grpc_python.md
    ./runtime/grpc_cpp.md
    ./runtime/websocket_python.md
+   ./runtime/websocket_cpp.md
 
 .. toctree::
    :maxdepth: 1
@@ -84,25 +86,25 @@
    :maxdepth: 1
    :caption: Funasr Library
 
-   ./build_task.md
+   ./reference/build_task.md
 
 .. toctree::
    :maxdepth: 1
    :caption: Papers
 
-   ./papers.md
+   ./reference/papers.md
 
 .. toctree::
    :maxdepth: 1
    :caption: Application
 
-   ./application.md
+   ./reference/application.md
 
 .. toctree::
    :maxdepth: 1
    :caption: FQA
 
-   ./FQA.md
+   ./reference/FQA.md
 
 
 Indices and tables
diff --git a/docs/docker.md b/docs/installation/docker.md
similarity index 100%
rename from docs/docker.md
rename to docs/installation/docker.md
diff --git a/docs/installation.md b/docs/installation/installation.md
similarity index 100%
rename from docs/installation.md
rename to docs/installation/installation.md
diff --git a/docs/huggingface_models.md b/docs/model_zoo/huggingface_models.md
similarity index 100%
rename from docs/huggingface_models.md
rename to docs/model_zoo/huggingface_models.md
diff --git a/docs/model_zoo/modelscope_models.md b/docs/model_zoo/modelscope_models.md
new file mode 100644
index 0000000..1b7f475
--- /dev/null
+++ b/docs/model_zoo/modelscope_models.md
@@ -0,0 +1,126 @@
+# Pretrained Models on ModelScope
+
+## Model License
+-  Apache License 2.0
+
+## Model Zoo
+Here we provide several pretrained models on different datasets. The details of models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
+
+### Speech Recognition Models
+#### Paraformer Models
+
+|                                                                     Model Name                                                                     | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+|        [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Paraformer-large-long](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which could deal with arbitrary length input wav                                                                                |
+| [Paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords. |
+|              [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)              | CN & EN  | Alibaba Speech Data (50000hours) |    8358    |    68M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+|           [Paraformer-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)           | CN & EN  | Alibaba Speech Data (50000hours) |    8404    |    68M    |     Online     | Which could deal with streaming input                                                                                           |
+|  [Paraformer-large-online](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Online     | Which could deal with streaming input                                                                                                    |
+|       [Paraformer-tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary)       |    CN    |  Alibaba Speech Data (200hours)  |    544     |   5.2M    |    Offline     | Lightweight Paraformer model which supports Mandarin command words recognition                                                  |
+|                   [Paraformer-aishell](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary)                   |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
+|       [ParaformerBert-aishell](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)       |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
+|        [Paraformer-aishell2](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)         |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
+|    [ParaformerBert-aishell2](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)     |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
+
+
+#### UniASR Models
+
+|                                                                    Model Name                                                                     |    Language     |           Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:-------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:---------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+|             [UniASR](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)             |     CN & EN     | Alibaba Speech Data (60000 hours) |    8358    |   100M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|      [UniASR-large](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)       |     CN & EN     | Alibaba Speech Data (60000 hours) |    8358    |   220M    |    Offline     | UniASR streaming offline unifying models                                                                                                    |
+|          [UniASR English](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/summary)           |       EN        | Alibaba Speech Data (10000 hours) |    1080     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|          [UniASR Russian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/summary)           |       RU        | Alibaba Speech Data (5000 hours)  |    1664     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|           [UniASR Japanese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/summary)           |       JA        | Alibaba Speech Data (5000 hours)  |    5977     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|           [UniASR Korean](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/summary)           |       KO        | Alibaba Speech Data (2000 hours)  |    6400     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+| [UniASR Cantonese (CHS)](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/summary) | Cantonese (CHS) | Alibaba Speech Data (5000 hours)  |    1468     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|         [UniASR Indonesian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/summary)         |       ID        | Alibaba Speech Data (1000 hours)  |    1067     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|           [UniASR Vietnamese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary)           |       VI        | Alibaba Speech Data (1000 hours)  |    1001     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|          [UniASR Spanish](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/summary)           |       ES        | Alibaba Speech Data (1000 hours)  |    3445     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|         [UniASR Portuguese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/summary)         |       PT        | Alibaba Speech Data (1000 hours)  |    1617     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|           [UniASR French](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary)           |       FR        | Alibaba Speech Data (1000 hours)  |    3472     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|           [UniASR German](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary)           |       DE        | Alibaba Speech Data (1000 hours)  |    3690     |    95M    |     Online     | UniASR streaming online unifying models                                                                                                    |
+|            [UniASR Persian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary)             |       FA        | Alibaba Speech Data (1000 hours)  |    1257     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|                [UniASR Burmese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary)                 |       MY        | Alibaba Speech Data (1000 hours)  |    696     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|                [UniASR Hebrew](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary)                 |       HE        | Alibaba Speech Data (1000 hours)  |    1085    |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+|              [UniASR Urdu](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary)                      |       UR        | Alibaba Speech Data (1000 hours)  |    877     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
+
+
+
+#### Conformer Models
+
+|                                                       Model Name                                                       | Language |     Training Data     | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:----------------------------------------------------------------------------------------------------------------------:|:--------:|:---------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)   |   CN     |  AISHELL (178hours)   |    4234    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Conformer](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)   |   CN     | AISHELL-2 (1000hours) |    5212    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary)   |   EN     | Alibaba Speech Data (10000hours) |    4199    |    220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+
+
+#### RNN-T Models
+
+### Multi-talker Speech Recognition Models
+
+#### MFCCA Models
+
+|                                                  Model Name                                                   | Language |               Training Data                | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:-------------------------------------------------------------------------------------------------------------:|:--------:|:------------------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+| [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)    |   CN     | AliMeeting、AISHELL-4、Simudata (917hours)   |     4950   |    45M    |    Offline     | Duration of input wav <= 20s, number of input wav channels <= 8 |
+
+
+
+### Voice Activity Detection Models
+
+|                                           Model Name                                           |        Training Data         | Parameters | Sampling Rate | Notes |
+|:----------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:-------------:|:------|
+| [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) | Alibaba Speech Data (5000hours) |    0.4M    |     16000     |       |
+|   [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-8k-common/summary)        | Alibaba Speech Data (5000hours) |    0.4M    |     8000      |       |
+
+### Punctuation Restoration Models
+
+|                                                         Model Name                                                         |        Training Data         | Parameters | Vocab Size| Offline/Online | Notes |
+|:--------------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:--------------:|:------|
+|      [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)      | Alibaba Text Data |    70M     |    272727     |    Offline     |   offline punctuation model    |
+| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary)      | Alibaba Text Data |    70M     |    272727     |     Online     |  online punctuation model     |
+
+### Language Models
+
+|                                                       Model Name                                                       |        Training Data         | Parameters | Vocab Size | Notes |
+|:----------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:------|
+| [Transformer](https://www.modelscope.cn/models/damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/summary)      | Alibaba Speech Data (?hours) |    57M     |    8404    |       |
+
+### Speaker Verification Models
+
+|                                                  Model Name                                                   |   Training Data   | Parameters | Number of Speakers | Notes |
+|:-------------------------------------------------------------------------------------------------------------:|:-----------------:|:----------:|:----------:|:------|
+| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours)  |   17.5M    |    3465    |    Xvector, speaker verification, Chinese   |
+| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) |    61M     |    6135    |   Xvector, speaker verification, English    |
+
+### Speaker Diarization Models
+
+|                                                    Model Name                                                    |    Training Data    | Parameters | Notes |
+|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
+| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/summary) | AliMeeting (120 hours) |   40.5M    |    Speaker diarization, profiles and records, Chinese |
+| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/summary)    |  CallHome (60 hours)  |     12M     |    Speaker diarization, profiles and records, English   |
+
+### Timestamp Prediction Models
+
+|                                                    Model Name                                     |  Language  |    Training Data    | Parameters | Notes |
+|:--------------------------------------------------------------------------------------------------:|:--------------:|:-------------------:|:----------:|:------|
+| [TP-Aligner](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) | CN | Alibaba Speech Data (50000hours) |   37.8M    |    Timestamp prediction, Mandarin, middle size |
+
+### Inverse Text Normalization (ITN) Models
+
+|                                                    Model Name                                                    | Language | Parameters | Notes                    |
+|:----------------------------------------------------------------------------------------------------------------:|:--------:|:----------:|:-------------------------|
+| [English](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-en/summary) |    EN    |   1.54M    | ITN, ASR post-processing |
+| [Russian](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ru/summary) |    RU    |   17.79M   | ITN, ASR post-processing |
+| [Japanese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ja/summary) |    JA    |    6.8M    | ITN, ASR post-processing |
+| [Korean](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ko/summary) |    KO    |   1.28M    | ITN, ASR post-processing |
+| [Indonesian](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-id/summary) |    ID    |   2.06M    | ITN, ASR post-processing |
+| [Vietnamese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-vi/summary) |    VI    |   0.92M    | ITN, ASR post-processing |
+| [Tagalog](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-tl/summary) |    TL    |    0.65M     | ITN, ASR post-processing |
+| [Spanish](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-es/summary) |    ES    |   1.32M    | ITN, ASR post-processing |
+| [Portuguese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-pt/summary) |    PT    |   1.28M    | ITN, ASR post-processing |
+| [French](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-fr/summary) |    FR    |   4.39M    | ITN, ASR post-processing |
+| [German](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-de/summary) |    DE    |   3.95M    | ITN, ASR post-processing |
diff --git a/docs/modelscope_models.md b/docs/modelscope_models.md
deleted file mode 100644
index 5f94a09..0000000
--- a/docs/modelscope_models.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Pretrained Models on ModelScope
-
-## Model License
--  Apache License 2.0
-
-## Model Zoo
-Here we provided several pretrained models on different datasets. The details of models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
-
-### Speech Recognition Models
-#### Paraformer Models
-
-|                                                                     Model Name                                                                     | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:--------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-|        [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-| [Paraformer-large-long](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which ould deal with arbitrary length input wav                                                                                 |
-| [Paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords. |
-|              [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)              | CN & EN  | Alibaba Speech Data (50000hours) |    8358    |    68M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-|          [Paraformer-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)           | CN & EN  | Alibaba Speech Data (50000hours) |    8404    |    68M    |     Online     | Which could deal with streaming input                                                                                           |
-|       [Paraformer-tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary)       |    CN    |  Alibaba Speech Data (200hours)  |    544     |   5.2M    |    Offline     | Lightweight Paraformer model which supports Mandarin command words recognition                                                  |
-|                   [Paraformer-aishell](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary)                   |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
-|       [ParaformerBert-aishell](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)       |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
-|        [Paraformer-aishell2](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)         |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
-|    [ParaformerBert-aishell2](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)     |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
-
-
-#### UniASR Models
-
-|                                                               Model Name                                                               | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:--------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-|       [UniASR](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8358    |   100M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-| [UniASR-large](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8358    |   220M    |    Offline     | UniASR streaming offline unifying models                                                                                                    |
-|           [UniASR Burmese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary)           | Burmese  |  Alibaba Speech Data (? hours)   |    696     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-|           [UniASR Hebrew](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary)           |  Hebrew  |  Alibaba Speech Data (? hours)   |    1085    |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-|       [UniASR Urdu](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary)                  |   Urdu   |  Alibaba Speech Data (? hours)   |    877     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-
-#### Conformer Models
-
-|                                                       Model Name                                                       | Language |     Training Data     | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:----------------------------------------------------------------------------------------------------------------------:|:--------:|:---------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)   |   CN     |  AISHELL (178hours)   |    4234    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-| [Conformer](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)   |   CN     | AISHELL-2 (1000hours) |    5212    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-
-
-#### RNN-T Models
-
-### Multi-talker Speech Recognition Models
-
-#### MFCCA Models
-
-|                                                  Model Name                                                   | Language |               Training Data                | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:-------------------------------------------------------------------------------------------------------------:|:--------:|:------------------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-| [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)    |   CN     | AliMeeting銆丄ISHELL-4銆丼imudata (917hours)   |     4950   |    45M    |    Offline     | Duration of input wav <= 20s, channel of input wav <= 8 channel |
-
-
-
-### Voice Activity Detection Models
-
-|                                           Model Name                                           |        Training Data         | Parameters | Sampling Rate | Notes |
-|:----------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:-------------:|:------|
-| [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) | Alibaba Speech Data (5000hours) |    0.4M    |     16000     |       |
-|   [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-8k-common/summary)        | Alibaba Speech Data (5000hours) |    0.4M    |     8000      |       |
-
-### Punctuation Restoration Models
-
-|                                                         Model Name                                                         |        Training Data         | Parameters | Vocab Size| Offline/Online | Notes |
-|:--------------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:--------------:|:------|
-|      [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)      | Alibaba Text Data |    70M     |    272727     |    Offline     |   offline punctuation model    |
-| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary)      | Alibaba Text Data |    70M     |    272727     |     Online     |  online punctuation model     |
-
-### Language Models
-
-|                                                       Model Name                                                       |        Training Data         | Parameters | Vocab Size | Notes |
-|:----------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:------|
-| [Transformer](https://www.modelscope.cn/models/damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/summary)      | Alibaba Speech Data (?hours) |    57M     |    8404    |       |
-
-### Speaker Verification Models
-
-|                                                  Model Name                                                   |   Training Data   | Parameters | Number Speaker | Notes |
-|:-------------------------------------------------------------------------------------------------------------:|:-----------------:|:----------:|:----------:|:------|
-| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours)  |   17.5M    |    3465    |    Xvector, speaker verification, Chinese   |
-| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) |    61M     |    6135    |   Xvector, speaker verification, English    |
-
-### Speaker Diarization Models
-
-|                                                    Model Name                                                    |    Training Data    | Parameters | Notes |
-|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
-| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/summary) | AliMeeting (120 hours) |   40.5M    |    Speaker diarization, profiles and records, Chinese |
-| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/summary)    |  CallHome (60 hours)  |     12M     |    Speaker diarization, profiles and records, English   |
-
-### Timestamp Prediction Models
-
-|                                                    Model Name                                     |  Language  |    Training Data    | Parameters | Notes |
-|:--------------------------------------------------------------------------------------------------:|:--------------:|:-------------------:|:----------:|:------|
-| [TP-Aligner](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) | CN | Alibaba Speech Data (50000hours) |   37.8M    |    Timestamp prediction, Mandarin, middle size |
diff --git a/docs/modelscope_pipeline/itn_pipeline.md b/docs/modelscope_pipeline/itn_pipeline.md
new file mode 100644
index 0000000..2336842
--- /dev/null
+++ b/docs/modelscope_pipeline/itn_pipeline.md
@@ -0,0 +1,63 @@
+# Inverse Text Normalization (ITN)
+
+> **Note**: 
+> The modelscope pipeline supports inference with all the models in the [model zoo](https://modelscope.cn/models?page=1&tasks=inverse-text-processing&type=audio). Here we take the Japanese ITN model as an example to demonstrate the usage.
+
+## Inference
+
+### Quick start
+#### [Japanese ITN model](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ja/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.inverse_text_processing,
+    model='damo/speech_inverse_text_processing_fun-text-processing-itn-ja',
+    model_revision=None)
+
+itn_result = inference_pipeline(text_in='百二十三')
+print(itn_result)
+# 123
+```
+- read text data directly.
+```python
+rec_result = inference_pipeline(text_in='一九九九年に誕生した同商品にちなみ、約三十年前、二十四歳の頃の幸四郎の写真を公開。')
+# 1999年に誕生した同商品にちなみ、約30年前、24歳の頃の幸四郎の写真を公開。
+```
+- text stored via url, example: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt
+```python
+rec_result = inference_pipeline(text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt')
+```
+
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/fun_text_processing/inverse_text_normalization)
+
+### API-reference
+#### Define pipeline
+- `task`: `Tasks.inverse_text_processing`
+- `model`: model name in [model zoo](https://modelscope.cn/models?page=1&tasks=inverse-text-processing&type=audio), or model path in local disk
+- `output_dir`: `None` (Default), the output path of results if set
+- `model_revision`: `None` (Default), setting the model version
+
+#### Infer pipeline
+- `text_in`: the input to decode, which could be:
+  - text bytes, `e.g.`: "一九九九年に誕生した同商品にちなみ、約三十年前、二十四歳の頃の幸四郎の写真を公開。"
+  - text file, `e.g.`: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt
+  In the case of `text file` input, `output_dir` must be set to save the output results
+
+## Modify Your Own ITN Model
+The rule-based ITN code is open-sourced in [FunTextProcessing](https://github.com/alibaba-damo-academy/FunASR/tree/main/fun_text_processing), so users can modify the grammar rules for different languages on their own. Taking Japanese as an example, users can add their own whitelist in ```FunASR/fun_text_processing/inverse_text_normalization/ja/data/whitelist.tsv```. After modifying the grammar rules, users can export and evaluate their own ITN models in a local directory.
+
+### Export ITN Model
+Export the ITN model via ```FunASR/fun_text_processing/inverse_text_normalization/export_models.py```. An example of exporting the ITN model to a local folder is shown below.
+```shell
+cd FunASR/fun_text_processing/inverse_text_normalization/
+python export_models.py --language ja --export_dir ./itn_models/
+```
+
+### Evaluate ITN Model
+Users can evaluate their own ITN model in local directory via ```FunASR/fun_text_processing/inverse_text_normalization/inverse_normalize.py```. Here is an example:
+```shell
+cd FunASR/fun_text_processing/inverse_text_normalization/
+python inverse_normalize.py --input_file ja_itn_example.txt --cache_dir ./itn_models/ --output_file output.txt --language=ja
+```
\ No newline at end of file
diff --git a/docs/modelscope_pipeline/punc_pipeline.md b/docs/modelscope_pipeline/punc_pipeline.md
deleted file mode 100644
index a0203d7..0000000
--- a/docs/modelscope_pipeline/punc_pipeline.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Punctuation Restoration
-
-## Inference with pipeline
-
-### Quick start
-
-### Inference with you data
-
-### Inference with multi-threads on CPU
-
-### Inference with multi GPU
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modelscope_pipeline/punc_pipeline.md b/docs/modelscope_pipeline/punc_pipeline.md
new file mode 120000
index 0000000..4ef4711
--- /dev/null
+++ b/docs/modelscope_pipeline/punc_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/punctuation/TEMPLATE/README.md
\ No newline at end of file
diff --git a/docs/modelscope_pipeline/quick_start.md b/docs/modelscope_pipeline/quick_start.md
index 436fb1d..7e35e91 100644
--- a/docs/modelscope_pipeline/quick_start.md
+++ b/docs/modelscope_pipeline/quick_start.md
@@ -1,7 +1,7 @@
 # Quick Start
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take typic model as example to demonstrate the usage.
+> The modelscope pipeline supports inference and finetuning with all the models in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope). Here we take a typical model as an example to demonstrate the usage.
 
 
 ## Inference with pipeline
diff --git a/docs/modelscope_pipeline/sd_pipeline.md b/docs/modelscope_pipeline/sd_pipeline.md
deleted file mode 100644
index 1330fe6..0000000
--- a/docs/modelscope_pipeline/sd_pipeline.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Speaker Diarization
-
-## Inference with pipeline
-
-### Quick start
-
-### Inference with you data
-
-### Inference with multi-threads on CPU
-
-### Inference with multi GPU
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modelscope_pipeline/sd_pipeline.md b/docs/modelscope_pipeline/sd_pipeline.md
new file mode 120000
index 0000000..9c3ac98
--- /dev/null
+++ b/docs/modelscope_pipeline/sd_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/speaker_diarization/TEMPLATE/README.md
\ No newline at end of file
diff --git a/docs/modelscope_pipeline/sv_pipeline.md b/docs/modelscope_pipeline/sv_pipeline.md
deleted file mode 100644
index c57db38..0000000
--- a/docs/modelscope_pipeline/sv_pipeline.md
+++ /dev/null
@@ -1,20 +0,0 @@
-# Speaker Verification
-
-## Inference with pipeline
-
-### Quick start
-
-### Inference with you data
-
-### Inference with multi-threads on CPU
-
-### Inference with multi GPU
-
-## Finetune with pipeline
-
-### Quick start
-
-### Finetune with your data
-
-## Inference with your finetuned model
-
diff --git a/docs/modelscope_pipeline/sv_pipeline.md b/docs/modelscope_pipeline/sv_pipeline.md
new file mode 120000
index 0000000..3217355
--- /dev/null
+++ b/docs/modelscope_pipeline/sv_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/speaker_verification/TEMPLATE/README.md
\ No newline at end of file
diff --git a/docs/FQA.md b/docs/reference/FQA.md
similarity index 100%
rename from docs/FQA.md
rename to docs/reference/FQA.md
diff --git a/docs/application.md b/docs/reference/application.md
similarity index 100%
rename from docs/application.md
rename to docs/reference/application.md
diff --git a/docs/build_task.md b/docs/reference/build_task.md
similarity index 100%
rename from docs/build_task.md
rename to docs/reference/build_task.md
diff --git a/docs/papers.md b/docs/reference/papers.md
similarity index 100%
rename from docs/papers.md
rename to docs/reference/papers.md
diff --git a/docs/runtime/websocket_cpp.md b/docs/runtime/websocket_cpp.md
new file mode 120000
index 0000000..8a87df5
--- /dev/null
+++ b/docs/runtime/websocket_cpp.md
@@ -0,0 +1 @@
+../../funasr/runtime/websocket/readme.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/README.md b/egs_modelscope/asr/TEMPLATE/README.md
index 83c462d..7ff04eb 100644
--- a/egs_modelscope/asr/TEMPLATE/README.md
+++ b/egs_modelscope/asr/TEMPLATE/README.md
@@ -1,7 +1,7 @@
 # Speech Recognition
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take the typic models as examples to demonstrate the usage.
+> The modelscope pipeline supports inference and finetuning with all the models in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope). Here we take typical models as examples to demonstrate the usage.
 
 ## Inference
 
@@ -19,22 +19,24 @@
 rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
 print(rec_result)
 ```
-#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
 ```python
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
-    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.4'
     )
 import soundfile
 speech, sample_rate = soundfile.read("example/asr_example.wav")
 
-param_dict = {"cache": dict(), "is_final": False}
-chunk_stride = 7680# 480ms
-# first chunk, 480ms
+chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+chunk_stride = chunk_size[1] * 960 # 600ms or 480ms
+# first chunk, 600ms
 speech_chunk = speech[0:chunk_stride] 
 rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
 print(rec_result)
-# next chunk, 480ms
+# next chunk, 600ms
 speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
 rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
 print(rec_result)
@@ -42,7 +44,7 @@
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
 
 #### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
-There are three decoding mode for UniASR model(`fast`銆乣normal`銆乣offline`), for more model detailes, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`). For more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
 ```python
 decoding_model = "fast" # "fast"銆�"normal"銆�"offline"
 inference_pipeline = pipeline(
@@ -59,7 +61,7 @@
 Undo
 
 #### [MFCCA Model](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
-For more model detailes, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+For more model details, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -74,15 +76,15 @@
 print(rec_result)
 ```
 
-#### API-reference
-##### Define pipeline
+### API-reference
+#### Define pipeline
 - `task`: `Tasks.auto_speech_recognition`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
 - `batch_size`: `1` (Default), batch size when decoding
-##### Infer pipeline
+#### Infer pipeline
 - `audio_in`: the input to decode, which could be: 
   - wav_path, `e.g.`: asr_example.wav,
   - pcm_path, `e.g.`: asr_example.pcm, 
@@ -100,20 +102,20 @@
 ### Inference with multi-thread CPUs or multi GPUs
 FunASR also offer recipes [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
-- Setting parameters in `infer.sh`
-    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-    - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-    - `output_dir`: output dir of the recognition results
-    - `batch_size`: `64` (Default), batch size of inference on gpu
-    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
-    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
-    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
-    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
-    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
-    - `decoding_mode`: `normal` (Default), decoding mode for UniASR model(fast銆乶ormal銆乷ffline)
-    - `hotword_txt`: `None` (Default), hotword file for contextual paraformer model(the hotword file name ends with .txt")
+#### Settings of `infer.sh`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+- `output_dir`: output dir of the recognition results
+- `batch_size`: `64` (Default), batch size of inference on gpu
+- `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+- `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+- `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+- `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+- `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
+- `decoding_mode`: `normal` (Default), decoding mode for UniASR model (`fast`, `normal`, `offline`)
+- `hotword_txt`: `None` (Default), hotword file for contextual paraformer model (the hotword file name ends with `.txt`)
 
-- Decode with multi GPUs:
+#### Decode with multi GPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
@@ -123,7 +125,7 @@
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
-- Decode with multi-thread CPUs:
+#### Decode with multi-thread CPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
@@ -133,7 +135,7 @@
     --njob 64
 ```
 
-- Results
+#### Results
 
 The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
 
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
deleted file mode 100644
index c68a8cd..0000000
--- a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
similarity index 82%
rename from egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
rename to egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
index 3594815..87bb652 100644
--- a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
similarity index 82%
rename from egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
rename to egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
index b55b59f..3b0164a 100644
--- a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
index 77b2cbd..7a6b750 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer.py
@@ -16,13 +16,13 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k",
         output_dir=output_dir_job,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in)
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
index 0d06377..f07f308 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer.py
@@ -16,13 +16,13 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch",
         output_dir=output_dir_job,
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in)
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py
new file mode 100644
index 0000000..f6026d6
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py
@@ -0,0 +1,11 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
+    model_revision='v3.0.0'
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
deleted file mode 100755
index 333b66a..0000000
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
-        output_dir=decoding_path,
-        batch_size=1
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if text_in is not None:
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
-        text_proc_file2 = os.path.join(decoding_path, "1best_recog/token_nosep")
-        with open(text_proc_file, 'r') as hyp_reader:
-                with open(text_proc_file2, 'w') as hyp_writer:
-                    for line in hyp_reader:
-                        new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
-                        hyp_writer.write(new_context+'\n')
-        text_in2 = os.path.join(decoding_path, "1best_recog/ref_text_nosep")
-        with open(text_in, 'r') as ref_reader:
-            with open(text_in2, 'w') as ref_writer:
-                for line in ref_reader:
-                    new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
-                    ref_writer.write(new_context+'\n')
-
-
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.sp.cer"))
-        compute_wer(text_in2, text_proc_file2, os.path.join(decoding_path, "text.nosp.cer"))
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950"
-    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./example_data/validation"
-    params["decoding_model_name"] = "valid.acc.ave.pb"
-    modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
deleted file mode 100644
index 49c0aeb..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# ModelScope Model
-
-## How to infer using a pretrained Paraformer-large Model
-
-### Inference
-
-You can use the pretrain model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # Support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-    - <strong>batch_size:</strong> # Set batch size in inference.
-    - <strong>param_dict:</strong> # Set the hotword list in inference.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py
new file mode 100644
index 0000000..9d08923
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py
@@ -0,0 +1,37 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        model_revision="v1.0.2",
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = modelscope_args(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", data_path="./data")
+    params.output_dir = "./checkpoint"              # model save path
+    params.data_path = "./example_data/"            # data path
+    params.dataset_type = "large"                   # finetuning the contextual paraformer model can only use the large dataset type
+    params.batch_bins = 200000                      # batch size; if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds
+    params.max_epoch = 20                           # maximum number of training epochs
+    params.lr = 0.0002                              # learning rate
+
+    modelscope_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
index e60f6d9..6325626 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
@@ -12,7 +12,7 @@
 batch_size=64
 gpu_inference=true    # whether to perform gpu decoding
 gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
-njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+njob=10    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
 checkpoint_dir=
 checkpoint_name="valid.cer_ctc.ave.pb"
 hotword_txt=None
@@ -55,8 +55,8 @@
             --audio_in ${output_dir}/split/wav.$JOB.scp \
             --output_dir ${output_dir}/output.$JOB \
             --batch_size ${batch_size} \
-            --gpuid ${gpuid} \
-            --hotword_txt ${hotword_txt}
+            --hotword_txt ${hotword_txt} \
+            --gpuid ${gpuid}
         }&
     done
     wait
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
index 18897b1..97e9fce 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
@@ -19,11 +19,15 @@
         os.makedirs(work_dir)
     wav_file_path = os.path.join(work_dir, "wav.scp")
     
+    counter = 0
     with codecs.open(wav_file_path, 'w') as fin: 
         for line in ds_dict:
+            counter += 1
             wav = line["Audio:FILE"]
             idx = wav.split("/")[-1].split(".")[0]
             fin.writelines(idx + " " + wav + "\n")
+            if counter == 50:
+                break
     audio_in = wav_file_path         
 
     inference_pipeline = pipeline(
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
new file mode 100644
index 0000000..b566454
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -0,0 +1,39 @@
+import os
+import logging
+import torch
+import soundfile
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+os.environ["MODELSCOPE_CACHE"] = "./"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.4'
+)
+
+model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")
+speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
+speech_length = speech.shape[0]
+
+sample_offset = 0
+chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
+stride_size =  chunk_size[1] * 960
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+final_result = ""
+
+for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+    if sample_offset + stride_size >= speech_length - 1:
+        stride_size = speech_length - sample_offset
+        param_dict["is_final"] = True
+    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
+                                    param_dict=param_dict)
+    if len(rec_result) != 0:
+        final_result += rec_result['text'] + " "
+        print(rec_result)
+print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
deleted file mode 100644
index c740f71..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
-    - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
-    - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.sh`
-    - <strong>model:</strong> # model name on ModelScope
-    - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-    - <strong>output_dir:</strong> # result dir
-    - <strong>batch_size:</strong> # batchsize of inference
-    - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
-    - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
-    - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
-
-- Decode with multi GPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --batch_size 64 \
-    --gpu_inference true \
-    --gpuid_list "0,1"
-```
-
-- Decode with multi-thread CPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --gpu_inference false \
-    --njob 64
-```
-
-- Results
-
-The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
-
-If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
-    - <strong>modelscope_model_name: </strong> # model name on ModelScope
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-    - <strong>batch_size:</strong> # batchsize of inference  
-
-- Then you can run the pipeline to finetune with:
-```python
-    python infer_after_finetune.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
deleted file mode 100644
index 2d311dd..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.hub.snapshot_download import snapshot_download
-
-from funasr.utils.compute_wer import compute_wer
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-
-    try:
-        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
-    except BaseException:
-        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
-    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=pretrained_model_path,
-        output_dir=decoding_path,
-        batch_size=params["batch_size"]
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
-
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
-    params["batch_size"] = 64
-    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/infer.py
index d1fbca2..00be793 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/infer.py
@@ -16,14 +16,14 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch",
         output_dir=output_dir_job,
         batch_size=64
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in)
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
deleted file mode 100644
index c68a8cd..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
similarity index 79%
rename from egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
rename to egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
index 8a6c87b..2863c1a 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
@@ -4,12 +4,12 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
         output_dir=output_dir,
-        batch_size=32,
+        batch_size=1,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
similarity index 82%
rename from egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
rename to egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
index dec7de0..f2db74e 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
index 2eb9cc8..6672bbf 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -14,24 +14,26 @@
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.2')
+    model_revision='v1.0.4'
+)
 
 model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
 speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
 speech_length = speech.shape[0]
 
 sample_offset = 0
-step = 4800  #300ms
-param_dict = {"cache": dict(), "is_final": False}
+chunk_size = [8, 8, 4] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
+stride_size =  chunk_size[1] * 960
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
 final_result = ""
 
-for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
-    if sample_offset + step >= speech_length - 1:
-        step = speech_length - sample_offset
+for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+    if sample_offset + stride_size >= speech_length - 1:
+        stride_size = speech_length - sample_offset
         param_dict["is_final"] = True
-    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step],
+    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                     param_dict=param_dict)
-    if len(rec_result) != 0 and rec_result['text'] != "sil" and rec_result['text'] != "waiting_for_more_voice":
-        final_result += rec_result['text']
-    print(rec_result)
-print(final_result)
+    if len(rec_result) != 0:
+        final_result += rec_result['text'] + " "
+        print(rec_result)
+print(final_result.strip())
diff --git a/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
index df18903..f4c4fc2 100644
--- a/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
index 83d6805..63bed40 100644
--- a/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformerbert/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://modelscope.oss-cn-beijing.aliyuncs.com/test/audios/asr_example.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
index c151149..862f881 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cantonese-CHS.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
index ac73adf..d4f8d76 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_cantonese-CHS.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py
index 227f4bf..347d316 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py
index 74d9764..936d6d7 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
index 5ace7e4..f82c1f4 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_de.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
index f8d91b8..48b4807 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_de.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
index 49b884b..98f31b6 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
index 57a3afd..423c503 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_en.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
index 510f008..75e22a0 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_es.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
index 2ec5940..cb1b4fa 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_es.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
index 040265d..e6c39c2 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
@@ -16,14 +16,14 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline",
         output_dir=output_dir_job,
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
index 055e4eb..124d5ed 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
@@ -16,14 +16,14 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online",
         output_dir=output_dir_job,
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
index 6aedeea..627d132 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_fr.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
index 2f3e833..305d990 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_fr.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
index c54ab8c..e0d1a4d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
index 219c9ec..e53c37e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_id.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
index ad2671a..75ec783 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_id.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
index 1a174bb..68cc41d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ja.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
index f15bc2d..a741e18 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ja.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
index 618b3f6..b87bcbb 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ko.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
index 135e8f8..9be791c 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ko.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
index cfd869f..b3a9058 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
index 2dcb663..4a43e7c 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_pt.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
index aff2a9a..7029fd9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_pt.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
index 95f447d..3c9d364 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ru.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
index 88c06b4..95da479 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ru.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
index e8c5524..04b02fe 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
index 9472104..4218f3d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_vi.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
index 4a844fc..355e412 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
@@ -4,10 +4,10 @@
 if __name__ == "__main__":
     audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_vi.wav"
     output_dir = "./results"
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
+    rec_result = inference_pipeline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py
index 40686ac..3520989 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py
index dfe934d..a3e2a00 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
index ce8988e..13d2a2e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
@@ -16,14 +16,14 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline",
         output_dir=output_dir_job,
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in)
 
 def modelscope_infer(params):
     # prepare for multi-GPU decoding
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
index 8b4a04d..876d51c 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
@@ -16,14 +16,14 @@
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
     else:
         os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online",
         output_dir=output_dir_job,
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
index 1c1e303..8ec4288 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py
index 94c1b68..3ab16ea 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/infer.py
@@ -4,11 +4,11 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
         model="damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipeline(audio_in=audio_in)
     print(rec_result)
 
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 94144ef..83c462d 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -1,46 +1,246 @@
-# ModelScope Model
+# Speech Recognition
 
-## How to finetune and infer using a pretrained Paraformer-large Model
+> **Note**: 
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take typical models as examples to demonstrate the usage.
 
-### Finetune
+## Inference
 
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
+### Quick start
+#### [Paraformer Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 ```python
-    python finetune.py
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+```python
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+    )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
+
+param_dict = {"cache": dict(), "is_final": False}
+chunk_stride = 7680# 480ms
+# first chunk, 480ms
+speech_chunk = speech[0:chunk_stride] 
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 480ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+#### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`). For more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+```python
+decoding_model = "fast" # "fast", "normal", "offline"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
+    param_dict={"decoding_model": decoding_model})
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+The `fast` and `normal` decoding modes are fake streaming, which can be used to evaluate recognition accuracy.
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
+#### [RNN-T-online model]()
+Undo
+
+#### [MFCCA Model](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+For more model details, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
+    model_revision='v3.0.0'
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
 ```
 
-### Inference
+#### API-reference
+##### Define pipeline
+- `task`: `Tasks.auto_speech_recognition`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+##### Infer pipeline
+- `audio_in`: the input to decode, which could be: 
+  - wav_path, `e.g.`: asr_example.wav,
+  - pcm_path, `e.g.`: asr_example.pcm, 
+  - audio bytes stream, `e.g.`: bytes data from a microphone
+  - audio sample points, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+  - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`: 
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
 
-Or you can use the finetuned model for inference directly.
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs or multiple GPUs.
 
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
+- Setting parameters in `infer.sh`
+    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+    - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+    - `output_dir`: output dir of the recognition results
+    - `batch_size`: `64` (Default), batch size of inference on gpu
+    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
+    - `decoding_mode`: `normal` (Default), decoding mode for UniASR model (fast, normal, offline)
+    - `hotword_txt`: `None` (Default), hotword file for contextual paraformer model (the hotword file name ends with `.txt`)
 
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
+- Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1"
 ```
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-
-- Then you can run the pipeline to finetune with:
-```python
-    python infer_after_finetune.py
+- Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64
 ```
 
 - Results
 
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
+
+
+## Finetune with pipeline
+
+### Quick start
+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+```python
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+    kwargs = dict(
+        model=params.model,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    from funasr.utils.modelscope_param import modelscope_args
+    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    params.output_dir = "./checkpoint"                      # path where the model is saved
+    params.data_path = "speech_asr_aishell1_trainsets"      # data path: either a dataset uploaded to modelscope or local data
+    params.dataset_type = "small"                           # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
+    params.batch_bins = 2000                                # batch size: if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds
+    params.max_epoch = 50                                   # maximum number of training epochs
+    params.lr = 0.00005                                     # learning rate
+    
+    modelscope_finetune(params)
+```
+
+```shell
+python finetune.py &> log.txt &
+```
+
+### Finetune with your data
+
+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+    - `output_dir`: result dir
+    - `data_dir`: the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+    - `dataset_type`: for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+    - `batch_bins`: batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
+    - `max_epoch`: number of training epoch
+    - `lr`: learning rate
+
+- Training data formats:
+```sh
+cat ./example_data/text
+BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
+BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
+english_example_1 hello world
+english_example_2 go swim 去 游 泳
+
+cat ./example_data/wav.scp
+BAC009S0002W0122 /mnt/data/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /mnt/data/wav/train/S0002/BAC009S0002W0123.wav
+english_example_1 /mnt/data/wav/train/S0002/english_example_1.wav
+english_example_2 /mnt/data/wav/train/S0002/english_example_2.wav
+```
+
+- Then you can run the pipeline to finetune with:
+```shell
+python finetune.py
+```
+If you want to finetune with multiple GPUs, you can run:
+```shell
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
+```
+## Inference with your finetuned model
+
+- The parameters in [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) are set the same way as described in [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/egs_modelscope/asr/TEMPLATE#inference-with-multi-thread-cpus-or-multi-gpus); `model` is the name of the modelscope model that you finetuned.
+
+- Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1" \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
+```
+- Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64 \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
+```
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
new file mode 100644
index 0000000..2fce734
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -0,0 +1,16 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':
+    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+    output_dir = None
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+        output_dir=output_dir
+    )
+    rec_result = inference_pipeline(audio_in=audio_in)
+    print(rec_result)
+
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index df471d6..5bc205c 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -1,19 +1,28 @@
+import os
+import shutil
+import argparse
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
-if __name__ == '__main__':
-    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
-        model_revision="v1.2.1",
-        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-        vad_model_revision="v1.1.8",
-        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
-        punc_model_revision="v1.1.6",
-        ngpu=1,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
     )
-    rec_result = inference_pipeline(audio_in=audio_in)
-    print(rec_result)
+    inference_pipeline(audio_in=args.audio_in)
 
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--decoding_mode', type=str, default="normal")
+    parser.add_argument('--hotword_txt', type=str, default=None)
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
similarity index 100%
rename from egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
rename to egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
deleted file mode 100644
index 473019c..0000000
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.hub.snapshot_download import snapshot_download
-
-from funasr.utils.compute_wer import compute_wer
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-
-    try:
-        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
-    except BaseException:
-        raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=pretrained_model_path,
-        output_dir=decoding_path,
-        batch_size=params["batch_size"]
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
-
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
-    params["batch_size"] = 64
-    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils
new file mode 120000
index 0000000..3d3dd06
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils
@@ -0,0 +1 @@
+../../asr/TEMPLATE/utils
\ No newline at end of file
diff --git a/egs_modelscope/lm/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/infer.py b/egs_modelscope/lm/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/infer.py
index ec309b2..628cdd8 100644
--- a/egs_modelscope/lm/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/lm/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/infer.py
@@ -6,12 +6,12 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
-inference_pipline = pipeline(
+inference_pipeline = pipeline(
     task=Tasks.language_score_prediction,
     model='damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch',
     output_dir="./tmp/"
 )
 
-rec_result = inference_pipline(text_in=inputs)
+rec_result = inference_pipeline(text_in=inputs)
 print(rec_result)
 
diff --git a/egs_modelscope/punctuation/TEMPLATE/README.md b/egs_modelscope/punctuation/TEMPLATE/README.md
new file mode 100644
index 0000000..08814ea
--- /dev/null
+++ b/egs_modelscope/punctuation/TEMPLATE/README.md
@@ -0,0 +1,110 @@
+# Punctuation Restoration
+
+> **Note**: 
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the CT-Transformer punctuation model as an example to demonstrate the usage.
+
+## Inference
+
+### Quick start
+#### [CT-Transformer model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.punctuation,
+    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+    model_revision=None)
+
+rec_result = inference_pipeline(text_in='example/punc_example.txt')
+print(rec_result)
+```
+- text bytes, e.g., bytes data read directly from a file by the user
+```python
+rec_result = inference_pipeline(text_in='我们都是木头人不会讲话不会动')
+```
+- text file url, e.g., https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt
+```python
+rec_result = inference_pipeline(text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/punc_example.txt')
+```
+
+#### [CT-Transformer Realtime model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.punctuation,
+    model='damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727',
+    model_revision=None,
+)
+
+inputs = "跨境河流是养育沿岸|人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员|在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险|向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切|愿意进一步完善双方联合工作机制|凡是|中方能做的我们|都会去做而且会做得更好我请印度朋友们放心中国在上游的|任何开发利用都会经过科学|规划和论证兼顾上下游的利益"
+vads = inputs.split("|")
+rec_result_all="outputs:"
+param_dict = {"cache": []}
+for vad in vads:
+    rec_result = inference_pipeline(text_in=vad, param_dict=param_dict)
+    rec_result_all += rec_result['text']
+
+print(rec_result_all)
+```
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/238)
+
+
+### API-reference
+#### Define pipeline
+- `task`: `Tasks.punctuation`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `output_dir`: `None` (Default), the output path of results if set
+- `model_revision`: `None` (Default), setting the model version
+
+#### Infer pipeline
+- `text_in`: the input to decode, which could be:
+  - text bytes, `e.g.`: "我们都是木头人不会讲话不会动"
+  - text file, `e.g.`: example/punc_example.txt
+  In this case of `text file` input, `output_dir` must be set to save the output results
+- `param_dict`: reserving the cache which is necessary in realtime mode. 
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [egs_modelscope/punctuation/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punctuation/TEMPLATE/infer.sh) to decode with multi-thread CPUs or multiple GPUs. It is an offline recipe and only supports offline models.
+
+#### Settings of `infer.sh`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `data_dir`: the dataset dir needs to include `punc.txt`
+- `output_dir`: output dir of the recognition results
+- `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+- `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+- `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+- `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+- `checkpoint_name`: only used for infer finetuned models, `punc.pb` (Default), which checkpoint is used to infer
+
+#### Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 1 \
+    --gpu_inference true \
+    --gpuid_list "0,1"
+```
+#### Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 1
+```
+
+## Finetune with pipeline
+
+### Quick start
+
+### Finetune with your data
+
+## Inference with your finetuned model
+
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.py b/egs_modelscope/punctuation/TEMPLATE/infer.py
new file mode 100644
index 0000000..edcefbe
--- /dev/null
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.py
@@ -0,0 +1,23 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.punctuation,
+        model=args.model,
+        output_dir=args.output_dir,
+    )
+    inference_pipeline(text_in=args.text_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch")
+    parser.add_argument('--text_in', type=str, default="./data/test/punc.txt")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/TEMPLATE/infer.sh b/egs_modelscope/punctuation/TEMPLATE/infer.sh
new file mode 100644
index 0000000..0af502e
--- /dev/null
+++ b/egs_modelscope/punctuation/TEMPLATE/infer.sh
@@ -0,0 +1,66 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="punc.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if ${gpu_inference} == "true"; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/text.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/punc.txt ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --text_in ${output_dir}/split/text.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/final_res
+    if [ -f "${output_dir}/output.1/infer.out" ]; then
+      for i in $(seq "${nj}"); do
+          cat "${output_dir}/output.${i}/infer.out"
+      done | sort -k1 >"${output_dir}/final_res/infer.out"
+    fi
+fi
+
diff --git a/egs_modelscope/punctuation/TEMPLATE/utils b/egs_modelscope/punctuation/TEMPLATE/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/punctuation/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py
similarity index 100%
rename from egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
rename to egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
deleted file mode 100644
index b125d48..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-task=Tasks.punctuation,
-    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
-
-- Setting parameters in `modelscope_common_infer.sh`
-    - <strong>model:</strong> damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch  # pre-trained model, download from modelscope
-    - <strong>text_in:</strong> input path, text or url
-    - <strong>output_dir:</strong> the result dir
-- Then you can run the pipeline to infer with: 
-```sh
-    python ./infer.py
-```
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
deleted file mode 100644
index 367be79..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-1	璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭
-2	浠庡瓨鍌ㄤ笂鏉ヨ浠呬粎鏄叏鏅浘鐗囧畠灏变細鏄浘鐗囩殑鍥涘�嶇殑瀹归噺鐒跺悗鍏ㄦ櫙鐨勮棰戜細鏄櫘閫氳棰戝叓鍊嶇殑杩欎釜瀛樺偍鐨勫瑕佹眰鑰屼笁d鐨勬ā鍨嬩細鏄浘鐗囩殑鍗佸�嶈繖閮藉鎴戜滑浠婂ぉ杩愯鍦ㄧ殑浜戣绠楃殑骞冲彴瀛樺偍鐨勫钩鍙版彁鍑轰簡鏇撮珮鐨勮姹�
-3	閭ｄ粖澶╃殑浼氬氨鍒拌繖閲屽惂 happy new year 鏄庡勾瑙�
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
similarity index 89%
rename from egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
rename to egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
index 0da8d25..20994d3 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
@@ -12,12 +12,12 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
-inference_pipline = pipeline(
+inference_pipeline = pipeline(
     task=Tasks.punctuation,
     model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
     model_revision="v1.1.7",
     output_dir="./tmp/"
 )
 
-rec_result = inference_pipline(text_in=inputs)
+rec_result = inference_pipeline(text_in=inputs)
 print(rec_result)
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/TEMPLATE/README.md b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
new file mode 100644
index 0000000..ba179ed
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
@@ -0,0 +1,81 @@
+# Speaker Diarization
+
+> **Note**: 
+> The modelscope pipeline supports all the models in 
+[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) 
+to inference and finetune. Here we take the model of xvector_sv as example to demonstrate the usage.
+
+## Inference with pipeline
+### Quick start
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# initialize pipeline
+inference_diar_pipline = pipeline(
+    mode="sond_demo",
+    num_workers=0,
+    task=Tasks.speaker_diarization,
+    diar_model_config="sond.yaml",
+    model='damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch',
+    model_revision="v1.0.5",
+    sv_model="damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch",
+    sv_model_revision="v1.2.2",
+)
+
+# input: a list of audio in which the first item is a speech recording to detect speakers, 
+# and the following wav files are used to extract speaker embeddings.
+audio_list = [
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/record.wav",
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk1.wav",
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk2.wav",
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk3.wav",
+    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/speaker_diarization/spk4.wav",
+]
+
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
+```
+
+### API-reference
+#### Define pipeline
+- `task`: `Tasks.speaker_diarization`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+- `smooth_size`: `83` (Default), the window size to perform smoothing
+- `dur_threshold`: `10` (Default), segments shorter than 100 ms will be dropped
+- `out_format`: `vad` (Default), the output format, choices `["vad", "rttm"]`. 
+  - vad format: spk1: [1.0, 3.0], [5.0, 8.0]
+  - rttm format: "SPEAKER test1 0 1.00 2.00 <NA> <NA> spk1 <NA> <NA>" and "SPEAKER test1 0 5.00 3.00 <NA> <NA> spk1 <NA> <NA>"
+
+#### Infer pipeline for speaker diarization
+- `audio_in`: the input to process, which could be: 
+  - list of url: `e.g.`: waveform files at a website
+  - list of local file path: `e.g.`: path/to/a.wav
+  - ("wav.scp,speech,sound", "profile.scp,profile,kaldi_ark"): a script file of waveform files and another script file of speaker profiles (extracted with the [model](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary))
+    ```text
+    wav.scp
+    test1 path/to/enroll1.wav
+    test2 path/to/enroll2.wav
+    
+    profile.scp
+    test1 path/to/profile.ark:11
+    test2 path/to/profile.ark:234
+    ```
+    The profile.ark file contains speaker embeddings in a kaldi-like style. 
+    Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) for more details.
+
+### Inference with your data
+For single input, we recommend the "list of local file path" mode for inference.
+For multiple inputs, we recommend the last mode with pre-organized wav.scp and profile.scp.
+
+### Inference with multi-threads on CPU
+We recommend the last mode with split wav.scp and profile.scp. Then, run inference for each split part.
+Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) to find a similar process.
+
+### Inference with multi GPU
+Similar to CPU, please set `ngpu=1` for inference on GPU.
+Besides, you should use `CUDA_VISIBLE_DEVICES=0` to specify a GPU device.
+Please refer to [README.md](../../speaker_verification/TEMPLATE/README.md) to find a similar process.
diff --git a/egs_modelscope/speaker_verification/TEMPLATE/README.md b/egs_modelscope/speaker_verification/TEMPLATE/README.md
new file mode 100644
index 0000000..d6736e3
--- /dev/null
+++ b/egs_modelscope/speaker_verification/TEMPLATE/README.md
@@ -0,0 +1,121 @@
+# Speaker Verification
+
+> **Note**: 
+> The modelscope pipeline supports all the models in 
+[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) 
+to inference and finetune. Here we take the model of xvector_sv as example to demonstrate the usage.
+
+## Inference with pipeline
+
+### Quick start
+#### Speaker verification
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_sv_pipline = pipeline(
+    task=Tasks.speaker_verification,
+    model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
+)
+
+# The same speaker
+rec_result = inference_sv_pipline(audio_in=(
+    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
+    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
+print("Similarity", rec_result["scores"])
+
+# Different speakers
+rec_result = inference_sv_pipline(audio_in=(
+    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav',
+    'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav'))
+print("Similarity", rec_result["scores"])
+```
+#### Speaker embedding extraction
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# Define extraction pipeline
+inference_sv_pipline = pipeline(
+    task=Tasks.speaker_verification,
+    model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch'
+)
+# Extract speaker embedding
+rec_result = inference_sv_pipline(
+    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav')
+speaker_embedding = rec_result["spk_embedding"]
+```
+For the full demo code, please refer to [infer.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/speaker_verification/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/infer.py).
+
+### API-reference
+#### Define pipeline
+- `task`: `Tasks.speaker_verification`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+- `sv_threshold`: `0.9465` (Default), the similarity threshold to determine 
+whether utterances belong to the same speaker (it should be in (0, 1))
+
+#### Infer pipeline for speaker embedding extraction
+- `audio_in`: the input to process, which could be: 
+  - url (str): `e.g.`: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav
+  - local_path: `e.g.`: path/to/a.wav
+  - wav.scp: `e.g.`: path/to/wav1.scp
+    ```text
+    wav.scp
+    test1 path/to/enroll1.wav
+    test2 path/to/enroll2.wav
+    ```
+  - bytes: `e.g.`: raw bytes data from a microphone
+  - fbank1.scp,speech,kaldi_ark: `e.g.`: extracted 80-dimensional fbank features
+with kaldi toolkits.
+
+#### Infer pipeline for speaker verification
+- `audio_in`: the input to process, which could be: 
+  - Tuple(url1, url2): `e.g.`: (https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav, https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_different.wav)
+  - Tuple(local_path1, local_path2): `e.g.`: (path/to/a.wav, path/to/b.wav)  
+  - Tuple(wav1.scp, wav2.scp): `e.g.`: (path/to/wav1.scp, path/to/wav2.scp)
+    ```text
+    wav1.scp
+    test1 path/to/enroll1.wav
+    test2 path/to/enroll2.wav
+    
+    wav2.scp
+    test1 path/to/same1.wav
+    test2 path/to/diff2.wav
+    ```
+  - Tuple(bytes, bytes): `e.g.`: raw bytes data from a microphone
+  - Tuple("fbank1.scp,speech,kaldi_ark", "fbank2.scp,speech,kaldi_ark"): `e.g.`: extracted 80-dimensional fbank features
+with kaldi toolkits.
+
+### Inference with your data
+Use wav1.scp or fbank.scp to organize your own data to extract speaker embeddings or perform speaker verification. 
+In this case, the `output_dir` should be set to save all the embeddings or scores.
+
+### Inference with multi-threads on CPU
+You can run inference with multiple threads on CPU using the following steps:
+1. Set `ngpu=0` while defining the pipeline in `infer.py`.
+2. Split wav.scp to several files `e.g.: 4`
+  ```shell
+  split -l $((`wc -l < wav.scp`/4+1)) --numeric-suffixes wav.scp splits/wav.scp.
+  ```
+3. Start to extract embeddings
+  ```shell
+  for wav_scp in `ls splits/wav.scp.*`; do
+    python infer.py ${wav_scp} outputs/$(basename ${wav_scp})
+  done
+  ```
+4. The embeddings will be saved in `outputs/*`
+
+### Inference with multi GPU
+Similar to inference on CPU, the differences are as follows:
+
+Step 1. Set `ngpu=1` while defining the pipeline in `infer.py`.
+
+Step 3. specify the gpu device with `CUDA_VISIBLE_DEVICES`:
+```shell
+  for wav_scp in `ls splits/wav.scp.*`; do
+    CUDA_VISIBLE_DEVICES=1 python infer.py ${wav_scp} outputs/$(basename ${wav_scp})
+  done
+  ```
diff --git a/egs_modelscope/speaker_verification/TEMPLATE/infer.py b/egs_modelscope/speaker_verification/TEMPLATE/infer.py
new file mode 100644
index 0000000..efab097
--- /dev/null
+++ b/egs_modelscope/speaker_verification/TEMPLATE/infer.py
@@ -0,0 +1,15 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import sys
+
+# Define extraction pipeline
+inference_sv_pipline = pipeline(
+    task=Tasks.speaker_verification,
+    model='damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch',
+    output_dir=sys.argv[2],
+)
+# Extract speaker embedding
+rec_result = inference_sv_pipline(
+    audio_in=sys.argv[1],
+
+)
diff --git a/egs_modelscope/tp/TEMPLATE/README.md b/egs_modelscope/tp/TEMPLATE/README.md
index 2678a7f..7cc8508 100644
--- a/egs_modelscope/tp/TEMPLATE/README.md
+++ b/egs_modelscope/tp/TEMPLATE/README.md
@@ -8,12 +8,12 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
-inference_pipline = pipeline(
+inference_pipeline = pipeline(
     task=Tasks.speech_timestamp,
     model='damo/speech_timestamp_prediction-v1-16k-offline',
     output_dir=None)
 
-rec_result = inference_pipline(
+rec_result = inference_pipeline(
     audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
     text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',)
 print(rec_result)
@@ -23,15 +23,15 @@
 
 
 
-#### API-reference
-##### Define pipeline
+### API-reference
+#### Define pipeline
 - `task`: `Tasks.speech_timestamp`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
 - `batch_size`: `1` (Default), batch size when decoding
-##### Infer pipeline
+#### Infer pipeline
 - `audio_in`: the input speech to predict, which could be: 
   - wav_path, `e.g.`: asr_example.wav (wav in local or url), 
   - wav.scp, kaldi style wav list (`wav_id wav_path`), `e.g.`: 
@@ -59,37 +59,37 @@
     ```
 
 ### Inference with multi-thread CPUs or multi GPUs
-FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+FunASR also offers recipes [egs_modelscope/tp/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/tp/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
-- Setting parameters in `infer.sh`
-    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-    - `data_dir`: the dataset dir **must** include `wav.scp` and `text.scp`
-    - `output_dir`: output dir of the recognition results
-    - `batch_size`: `64` (Default), batch size of inference on gpu
-    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
-    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
-    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
-    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
-    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
+#### Settings of `infer.sh`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `data_dir`: the dataset dir **must** include `wav.scp` and `text.txt`
+- `output_dir`: output dir of the recognition results
+- `batch_size`: `64` (Default), batch size of inference on gpu
+- `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+- `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+- `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+- `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+- `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
 
-- Decode with multi GPUs:
+#### Decode with multi GPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
-    --batch_size 64 \
+    --batch_size 1 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
-- Decode with multi-thread CPUs:
+#### Decode with multi-thread CPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 64
+    --njob 1
 ```
 
 ## Finetune with pipeline
diff --git a/egs_modelscope/tp/TEMPLATE/infer.py b/egs_modelscope/tp/TEMPLATE/infer.py
deleted file mode 120000
index df5dff2..0000000
--- a/egs_modelscope/tp/TEMPLATE/infer.py
+++ /dev/null
@@ -1 +0,0 @@
-../speech_timestamp_prediction-v1-16k-offline/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py b/egs_modelscope/tp/TEMPLATE/infer.py
similarity index 100%
rename from egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
rename to egs_modelscope/tp/TEMPLATE/infer.py
diff --git a/egs_modelscope/tp/TEMPLATE/infer.sh b/egs_modelscope/tp/TEMPLATE/infer.sh
index 2a923bb..bae62e8 100644
--- a/egs_modelscope/tp/TEMPLATE/infer.sh
+++ b/egs_modelscope/tp/TEMPLATE/infer.sh
@@ -37,7 +37,7 @@
     split_texts="$split_texts $output_dir/split/text.$JOB.scp"
 done
 perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
-perl utils/split_scp.pl ${data_dir}/text.scp ${split_texts}
+perl utils/split_scp.pl ${data_dir}/text.txt ${split_texts}
 
 if [ -n "${checkpoint_dir}" ]; then
   python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
deleted file mode 100644
index 5488aaa..0000000
--- a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>text_in:</strong> # support text, text url.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py
new file mode 100644
index 0000000..bcc5128
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py
@@ -0,0 +1,12 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.speech_timestamp,
+    model='damo/speech_timestamp_prediction-v1-16k-offline',
+    output_dir=None)
+
+rec_result = inference_pipeline(
+    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
+    text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',)
+print(rec_result)
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/README.md b/egs_modelscope/vad/TEMPLATE/README.md
index 6f746d5..4c6f8c2 100644
--- a/egs_modelscope/vad/TEMPLATE/README.md
+++ b/egs_modelscope/vad/TEMPLATE/README.md
@@ -1,7 +1,7 @@
 # Voice Activity Detection
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take the model of FSMN-VAD as example to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take the model of FSMN-VAD as example to demonstrate the usage.
 
 ## Inference
 
@@ -43,15 +43,15 @@
 
 
 
-#### API-reference
-##### Define pipeline
+### API-reference
+#### Define pipeline
 - `task`: `Tasks.voice_activity_detection`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
 - `batch_size`: `1` (Default), batch size when decoding
-##### Infer pipeline
+#### Infer pipeline
 - `audio_in`: the input to decode, which could be: 
   - wav_path, `e.g.`: asr_example.wav,
   - pcm_path, `e.g.`: asr_example.pcm, 
@@ -69,35 +69,35 @@
 ### Inference with multi-thread CPUs or multi GPUs
 FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
-- Setting parameters in `infer.sh`
-    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-    - `data_dir`: the dataset dir needs to include `wav.scp`
-    - `output_dir`: output dir of the recognition results
-    - `batch_size`: `64` (Default), batch size of inference on gpu
-    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
-    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
-    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
-    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
-    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
+#### Settings of `infer.sh`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `data_dir`: the dataset dir needs to include `wav.scp`
+- `output_dir`: output dir of the recognition results
+- `batch_size`: `64` (Default), batch size of inference on gpu
+- `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+- `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+- `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+- `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+- `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
 
-- Decode with multi GPUs:
+#### Decode with multi GPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
-    --batch_size 64 \
+    --batch_size 1 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
-- Decode with multi-thread CPUs:
+#### Decode with multi-thread CPUs:
 ```shell
     bash infer.sh \
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 64
+    --njob 1
 ```
 
 ## Finetune with pipeline
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
deleted file mode 100644
index 6d9cd30..0000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
similarity index 82%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
index 2bf3251..bbc16c5 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
@@ -4,12 +4,12 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
         model_revision='v1.2.0',
         output_dir=output_dir,
         batch_size=1,
     )
-    segments_result = inference_pipline(audio_in=audio_in)
+    segments_result = inference_pipeline(audio_in=audio_in)
     print(segments_result)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
similarity index 89%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
index 02e919d..65693b5 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
@@ -8,7 +8,7 @@
 
 if __name__ == '__main__':
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
         model_revision='v1.2.0',
@@ -30,7 +30,7 @@
         else:
             is_final = False
         param_dict['is_final'] = is_final
-        segments_result = inference_pipline(audio_in=speech[sample_offset: sample_offset + step],
+        segments_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step],
                                             param_dict=param_dict)
         print(segments_result)
 
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
deleted file mode 100644
index 6d9cd30..0000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
similarity index 82%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
index 2e50275..84863d0 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
@@ -4,12 +4,12 @@
 if __name__ == '__main__':
     audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example_8k.wav'
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",
         model_revision='v1.2.0',
         output_dir=output_dir,
         batch_size=1,
     )
-    segments_result = inference_pipline(audio_in=audio_in)
+    segments_result = inference_pipeline(audio_in=audio_in)
     print(segments_result)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
similarity index 89%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
index a8cc912..5b67da7 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
@@ -8,7 +8,7 @@
 
 if __name__ == '__main__':
     output_dir = None
-    inference_pipline = pipeline(
+    inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",
         model_revision='v1.2.0',
@@ -30,7 +30,7 @@
         else:
             is_final = False
         param_dict['is_final'] = is_final
-        segments_result = inference_pipline(audio_in=speech[sample_offset: sample_offset + step],
+        segments_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + step],
                                             param_dict=param_dict)
         print(segments_result)
 
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
index 6b2fce5..539acbc 100644
--- a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
+++ b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
@@ -27,7 +27,7 @@
         graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))
         graph_thousand = pynini.string_file(get_abs_path("data/numbers/thousand.tsv"))
 
-        graph_cents = pynini.cross("seratus", "100") | pynini.cross("ratus", "100") | pynini.union(graph_hundreds, pynutil.insert("00"))
+        graph_cents = pynini.cross("seratus", "100") | pynini.cross("ratus", "100") | pynini.union(graph_hundreds, pynutil.insert("0"))
         graph_hundred = pynini.cross("ratus", "") | pynini.cross("seratus", "")
 
         graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("00"))
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 5546c92..5335860 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -41,6 +41,7 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
 from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
 from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
 from funasr.bin.tp_inference import SpeechText2Timestamp
@@ -236,7 +237,7 @@
         pre_token_length = pre_token_length.round().long()
         if torch.max(pre_token_length) < 1:
             return []
-        if not isinstance(self.asr_model, ContextualParaformer):
+        if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model, NeatContextualParaformer):
             if self.hotword_list:
                 logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
             decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 821f694..4f04d02 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -8,6 +8,7 @@
 import codecs
 import tempfile
 import requests
+import yaml
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -40,10 +41,11 @@
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_none
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
-from funasr.models.frontend.wav_frontend import WavFrontend
-from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.models.frontend.wav_frontend import WavFrontend, WavFrontendOnline
 from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+
 np.set_printoptions(threshold=np.inf)
+
 
 class Speech2Text:
     """Speech2Text class
@@ -89,7 +91,7 @@
         )
         frontend = None
         if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
-            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+            frontend = WavFrontendOnline(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
 
         logging.info("asr_model: {}".format(asr_model))
         logging.info("asr_train_args: {}".format(asr_train_args))
@@ -189,8 +191,7 @@
 
     @torch.no_grad()
     def __call__(
-            self, cache: dict, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
-            begin_time: int = 0, end_time: int = None,
+            self, cache: dict, speech: Union[torch.Tensor], speech_lengths: Union[torch.Tensor] = None
     ):
         """Inference
 
@@ -201,38 +202,62 @@
 
         """
         assert check_argument_types()
-
-        # Input as audio signal
-        if isinstance(speech, np.ndarray):
-            speech = torch.tensor(speech)
-        if self.frontend is not None:
-            feats, feats_len = self.frontend.forward(speech, speech_lengths)
-            feats = to_device(feats, device=self.device)
-            feats_len = feats_len.int()
+        results = []
+        cache_en = cache["encoder"]
+        if speech.shape[1] < 16 * 60 and cache_en["is_final"]:
+            if cache_en["start_idx"] == 0:
+                return []
+            cache_en["tail_chunk"] = True
+            feats = cache_en["feats"]
+            feats_len = torch.tensor([feats.shape[1]])
             self.asr_model.frontend = None
+            results = self.infer(feats, feats_len, cache)
+            return results
         else:
-            feats = speech
-            feats_len = speech_lengths
-        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
-        feats_len = cache["encoder"]["stride"] + cache["encoder"]["pad_left"] + cache["encoder"]["pad_right"]
-        feats = feats[:,cache["encoder"]["start_idx"]:cache["encoder"]["start_idx"]+feats_len,:]
-        feats_len = torch.tensor([feats_len])
-        batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
+            if self.frontend is not None:
+                feats, feats_len = self.frontend.forward(speech, speech_lengths, cache_en["is_final"])
+                feats = to_device(feats, device=self.device)
+                feats_len = feats_len.int()
+                self.asr_model.frontend = None
+            else:
+                feats = speech
+                feats_len = speech_lengths
 
-        # a. To device
+            if feats.shape[1] != 0:
+                if cache_en["is_final"]:
+                    if feats.shape[1] + cache_en["chunk_size"][2] < cache_en["chunk_size"][1]:
+                        cache_en["last_chunk"] = True
+                    else:
+                        # first chunk
+                        feats_chunk1 = feats[:, :cache_en["chunk_size"][1], :]
+                        feats_len = torch.tensor([feats_chunk1.shape[1]])
+                        results_chunk1 = self.infer(feats_chunk1, feats_len, cache)
+
+                        # last chunk
+                        cache_en["last_chunk"] = True
+                        feats_chunk2 = feats[:, -(feats.shape[1] + cache_en["chunk_size"][2] - cache_en["chunk_size"][1]):, :]
+                        feats_len = torch.tensor([feats_chunk2.shape[1]])
+                        results_chunk2 = self.infer(feats_chunk2, feats_len, cache)
+
+                        return [" ".join(results_chunk1 + results_chunk2)]
+
+                results = self.infer(feats, feats_len, cache)
+
+        return results
+
+    @torch.no_grad()
+    def infer(self, feats: Union[torch.Tensor], feats_len: Union[torch.Tensor], cache: List = None):
+        batch = {"speech": feats, "speech_lengths": feats_len}
         batch = to_device(batch, device=self.device)
-
         # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache)
+        enc, enc_len = self.asr_model.encode_chunk(feats, feats_len, cache=cache)
         if isinstance(enc, tuple):
             enc = enc[0]
         # assert len(enc) == 1, len(enc)
         enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
 
         predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
-        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
-                                                                        predictor_outs[2], predictor_outs[3]
-        pre_token_length = pre_token_length.floor().long()
+        pre_acoustic_embeds, pre_token_length= predictor_outs[0], predictor_outs[1]
         if torch.max(pre_token_length) < 1:
             return []
         decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
@@ -274,168 +299,11 @@
 
                 # Change integer-ids to tokens
                 token = self.converter.ids2tokens(token_int)
+                token = " ".join(token)
 
-                if self.tokenizer is not None:
-                    text = self.tokenizer.tokens2text(token)
-                else:
-                    text = None
-
-                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+                results.append(token)
 
         # assert check_return_type(results)
-        return results
-
-
-class Speech2TextExport:
-    """Speech2TextExport class
-
-    """
-
-    def __init__(
-            self,
-            asr_train_config: Union[Path, str] = None,
-            asr_model_file: Union[Path, str] = None,
-            cmvn_file: Union[Path, str] = None,
-            lm_train_config: Union[Path, str] = None,
-            lm_file: Union[Path, str] = None,
-            token_type: str = None,
-            bpemodel: str = None,
-            device: str = "cpu",
-            maxlenratio: float = 0.0,
-            minlenratio: float = 0.0,
-            dtype: str = "float32",
-            beam_size: int = 20,
-            ctc_weight: float = 0.5,
-            lm_weight: float = 1.0,
-            ngram_weight: float = 0.9,
-            penalty: float = 0.0,
-            nbest: int = 1,
-            frontend_conf: dict = None,
-            hotword_list_or_file: str = None,
-            **kwargs,
-    ):
-
-        # 1. Build ASR model
-        asr_model, asr_train_args = ASRTask.build_model_from_file(
-            asr_train_config, asr_model_file, cmvn_file, device
-        )
-        frontend = None
-        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
-            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
-
-        logging.info("asr_model: {}".format(asr_model))
-        logging.info("asr_train_args: {}".format(asr_train_args))
-        asr_model.to(dtype=getattr(torch, dtype)).eval()
-
-        token_list = asr_model.token_list
-
-        logging.info(f"Decoding device={device}, dtype={dtype}")
-
-        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
-        if token_type is None:
-            token_type = asr_train_args.token_type
-        if bpemodel is None:
-            bpemodel = asr_train_args.bpemodel
-
-        if token_type is None:
-            tokenizer = None
-        elif token_type == "bpe":
-            if bpemodel is not None:
-                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
-            else:
-                tokenizer = None
-        else:
-            tokenizer = build_tokenizer(token_type=token_type)
-        converter = TokenIDConverter(token_list=token_list)
-        logging.info(f"Text tokenizer: {tokenizer}")
-
-        # self.asr_model = asr_model
-        self.asr_train_args = asr_train_args
-        self.converter = converter
-        self.tokenizer = tokenizer
-
-        self.device = device
-        self.dtype = dtype
-        self.nbest = nbest
-        self.frontend = frontend
-
-        model = Paraformer_export(asr_model, onnx=False)
-        self.asr_model = model
-
-    @torch.no_grad()
-    def __call__(
-            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
-    ):
-        """Inference
-
-        Args:
-                speech: Input speech data
-        Returns:
-                text, token, token_int, hyp
-
-        """
-        assert check_argument_types()
-
-        # Input as audio signal
-        if isinstance(speech, np.ndarray):
-            speech = torch.tensor(speech)
-
-        if self.frontend is not None:
-            feats, feats_len = self.frontend.forward(speech, speech_lengths)
-            feats = to_device(feats, device=self.device)
-            feats_len = feats_len.int()
-            self.asr_model.frontend = None
-        else:
-            feats = speech
-            feats_len = speech_lengths
-
-        enc_len_batch_total = feats_len.sum()
-        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
-        batch = {"speech": feats, "speech_lengths": feats_len}
-
-        # a. To device
-        batch = to_device(batch, device=self.device)
-
-        decoder_outs = self.asr_model(**batch)
-        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
-
-        results = []
-        b, n, d = decoder_out.size()
-        for i in range(b):
-            am_scores = decoder_out[i, :ys_pad_lens[i], :]
-
-            yseq = am_scores.argmax(dim=-1)
-            score = am_scores.max(dim=-1)[0]
-            score = torch.sum(score, dim=-1)
-            # pad with mask tokens to ensure compatibility with sos/eos tokens
-            yseq = torch.tensor(
-                yseq.tolist(), device=yseq.device
-            )
-            nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
-
-            for hyp in nbest_hyps:
-                assert isinstance(hyp, (Hypothesis)), type(hyp)
-
-                # remove sos/eos and get results
-                last_pos = -1
-                if isinstance(hyp.yseq, list):
-                    token_int = hyp.yseq[1:last_pos]
-                else:
-                    token_int = hyp.yseq[1:last_pos].tolist()
-
-                # remove blank symbol id, which is assumed to be 0
-                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
-
-                # Change integer-ids to tokens
-                token = self.converter.ids2tokens(token_int)
-
-                if self.tokenizer is not None:
-                    text = self.tokenizer.tokens2text(token)
-                else:
-                    text = None
-
-                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
-
         return results
 
 
@@ -536,8 +404,6 @@
         **kwargs,
 ):
     assert check_argument_types()
-    ncpu = kwargs.get("ncpu", 1)
-    torch.set_num_threads(ncpu)
 
     if word_lm_train_config is not None:
         raise NotImplementedError("Word LM is not implemented")
@@ -580,11 +446,9 @@
         penalty=penalty,
         nbest=nbest,
     )
-    if export_mode:
-        speech2text = Speech2TextExport(**speech2text_kwargs)
-    else:
-        speech2text = Speech2Text(**speech2text_kwargs)
-        
+
+    speech2text = Speech2Text(**speech2text_kwargs)
+
     def _load_bytes(input):
         middle_data = np.frombuffer(input, dtype=np.int16)
         middle_data = np.asarray(middle_data)
@@ -599,7 +463,46 @@
         offset = i.min + abs_max
         array = np.frombuffer((middle_data.astype(dtype) - offset) / abs_max, dtype=np.float32)
         return array
-    
+
+    def _read_yaml(yaml_path: Union[str, Path]) -> Dict:
+        """Load the YAML file at `yaml_path`; raise FileNotFoundError if it is missing."""
+        if not Path(yaml_path).exists():
+            raise FileNotFoundError(f'The {yaml_path} does not exist.')
+        with open(str(yaml_path), 'rb') as f:
+            # NOTE(review): yaml.Loader can construct arbitrary objects; only load trusted configs.
+            return yaml.load(f, Loader=yaml.Loader)
+
+    def _prepare_cache(cache: dict = None, chunk_size=[5,10,5], batch_size=1):
+        """Return `cache` unchanged if it has entries; otherwise fill it with fresh encoder/decoder state."""
+        if cache is None:  # a `{}` default would be one dict shared (and mutated) across calls
+            cache = {}
+        if len(cache) > 0:
+            return cache
+        config = _read_yaml(asr_train_config)
+        enc_output_size = config["encoder_conf"]["output_size"]
+        feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
+        cache["encoder"] = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
+                            "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
+                            "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
+        cache["decoder"] = {"decode_fsmn": None}
+
+        return cache
+
+    def _cache_reset(cache: dict = None, chunk_size=[5,10,5], batch_size=1):
+        """Replace the encoder/decoder entries of a non-empty `cache` with fresh state and return it."""
+        if cache is None:  # a `{}` default would be one dict shared across calls
+            return {}
+        if len(cache) > 0:
+            config = _read_yaml(asr_train_config)
+            enc_output_size = config["encoder_conf"]["output_size"]
+            feats_dims = config["frontend_conf"]["n_mels"] * config["frontend_conf"]["lfr_m"]
+            cache["encoder"] = {"start_idx": 0, "cif_hidden": torch.zeros((batch_size, 1, enc_output_size)),
+                                "cif_alphas": torch.zeros((batch_size, 1)), "chunk_size": chunk_size, "last_chunk": False,
+                                "feats": torch.zeros((batch_size, chunk_size[0] + chunk_size[2], feats_dims)), "tail_chunk": False}
+            cache["decoder"] = {"decode_fsmn": None}
+
+        return cache
+
     def _forward(
             data_path_and_name_and_type,
             raw_inputs: Union[np.ndarray, torch.Tensor] = None,
@@ -610,123 +513,56 @@
     ):
 
         # 3. Build data-iterator
-        is_final = False
-        cache = {}
-        if param_dict is not None and "cache" in param_dict:
-            cache = param_dict["cache"]
-        if param_dict is not None and "is_final" in param_dict:
-            is_final = param_dict["is_final"]
-
         if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "bytes":
             raw_inputs = _load_bytes(data_path_and_name_and_type[0])
             raw_inputs = torch.tensor(raw_inputs)
         if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
             raw_inputs = torchaudio.load(data_path_and_name_and_type[0])[0][0]
-            is_final = True
         if data_path_and_name_and_type is None and raw_inputs is not None:
             if isinstance(raw_inputs, np.ndarray):
                 raw_inputs = torch.tensor(raw_inputs)
+        is_final = False
+        cache = {}
+        chunk_size = [5, 10, 5]
+        if param_dict is not None and "cache" in param_dict:
+            cache = param_dict["cache"]
+        if param_dict is not None and "is_final" in param_dict:
+            is_final = param_dict["is_final"]
+        if param_dict is not None and "chunk_size" in param_dict:
+            chunk_size = param_dict["chunk_size"]
+
         # 7 .Start for-loop
         # FIXME(kamo): The output format should be discussed about
+        raw_inputs = torch.unsqueeze(raw_inputs, axis=0)
         asr_result_list = []
-        results = []
-        asr_result = ""
-        wait = True
-        if len(cache) == 0:
-            cache["encoder"] = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None, "is_final": is_final, "left": 0, "right": 0}
-            cache_de = {"decode_fsmn": None}
-            cache["decoder"] = cache_de
-            cache["first_chunk"] = True
-            cache["speech"] = []
-            cache["accum_speech"] = 0
-
-        if raw_inputs is not None:
-            if len(cache["speech"]) == 0:
-                cache["speech"] = raw_inputs
-            else:
-                cache["speech"] = torch.cat([cache["speech"], raw_inputs], dim=0)
-            cache["accum_speech"] += len(raw_inputs)
-            while cache["accum_speech"] >= 960:
-                if cache["first_chunk"]:
-                    if cache["accum_speech"] >= 14400:
-                        speech = torch.unsqueeze(cache["speech"], axis=0)
-                        speech_length = torch.tensor([len(cache["speech"])])
-                        cache["encoder"]["pad_left"] = 5 
-                        cache["encoder"]["pad_right"] = 5 
-                        cache["encoder"]["stride"] = 10
-                        cache["encoder"]["left"] = 5
-                        cache["encoder"]["right"] = 0
-                        results = speech2text(cache, speech, speech_length)
-                        cache["accum_speech"] -= 4800
-                        cache["first_chunk"] = False
-                        cache["encoder"]["start_idx"] = -5
-                        cache["encoder"]["is_final"] = False
-                        wait = False
-                    else:
-                        if is_final:
-                            cache["encoder"]["stride"] = len(cache["speech"]) // 960
-                            cache["encoder"]["pad_left"] = 0
-                            cache["encoder"]["pad_right"] = 0
-                            speech = torch.unsqueeze(cache["speech"], axis=0)
-                            speech_length = torch.tensor([len(cache["speech"])])
-                            results = speech2text(cache, speech, speech_length)
-                            cache["accum_speech"] = 0
-                            wait = False
-                        else:
-                            break
+        cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
+        item = {}
+        if data_path_and_name_and_type is not None and data_path_and_name_and_type[2] == "sound":
+            sample_offset = 0
+            speech_length = raw_inputs.shape[1]
+            stride_size =  chunk_size[1] * 960
+            cache = _prepare_cache(cache, chunk_size=chunk_size, batch_size=1)
+            final_result = ""
+            for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+                if sample_offset + stride_size >= speech_length - 1:
+                    stride_size = speech_length - sample_offset
+                    cache["encoder"]["is_final"] = True
                 else:
-                    if cache["accum_speech"] >= 19200:
-                        cache["encoder"]["start_idx"] += 10
-                        cache["encoder"]["stride"] = 10
-                        cache["encoder"]["pad_left"] = 5
-                        cache["encoder"]["pad_right"] = 5
-                        cache["encoder"]["left"] = 0
-                        cache["encoder"]["right"] = 0
-                        speech = torch.unsqueeze(cache["speech"], axis=0)
-                        speech_length = torch.tensor([len(cache["speech"])])
-                        results = speech2text(cache, speech, speech_length)
-                        cache["accum_speech"] -= 9600
-                        wait = False
-                    else:
-                        if is_final:
-                            cache["encoder"]["is_final"] = True
-                            if cache["accum_speech"] >= 14400:
-                                cache["encoder"]["start_idx"] += 10
-                                cache["encoder"]["stride"] = 10
-                                cache["encoder"]["pad_left"] = 5
-                                cache["encoder"]["pad_right"] = 5
-                                cache["encoder"]["left"] = 0
-                                cache["encoder"]["right"] = cache["accum_speech"] // 960 - 15
-                                speech = torch.unsqueeze(cache["speech"], axis=0)
-                                speech_length = torch.tensor([len(cache["speech"])])
-                                results = speech2text(cache, speech, speech_length)
-                                cache["accum_speech"] -= 9600
-                                wait = False
-                            else:
-                                cache["encoder"]["start_idx"] += 10
-                                cache["encoder"]["stride"] = cache["accum_speech"] // 960 - 5
-                                cache["encoder"]["pad_left"] = 5
-                                cache["encoder"]["pad_right"] = 0
-                                cache["encoder"]["left"] = 0
-                                cache["encoder"]["right"] = 0
-                                speech = torch.unsqueeze(cache["speech"], axis=0)
-                                speech_length = torch.tensor([len(cache["speech"])])
-                                results = speech2text(cache, speech, speech_length)
-                                cache["accum_speech"] = 0
-                                wait = False
-                        else:
-                            break
-                
-                if len(results) >= 1:
-                    asr_result += results[0][0]
-            if asr_result == "":
-                asr_result = "sil"
-            if wait:
-                asr_result = "waiting_for_more_voice"
-            item = {'key': "utt", 'value': asr_result}
-            asr_result_list.append(item)
+                    cache["encoder"]["is_final"] = False
+                input_lens = torch.tensor([stride_size])
+                asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size], input_lens)
+                if len(asr_result) != 0: 
+                    final_result += " ".join(asr_result) + " "
+            item = {'key': "utt", 'value': final_result.strip()}
         else:
-            return []
+            input_lens = torch.tensor([raw_inputs.shape[1]])
+            cache["encoder"]["is_final"] = is_final
+            asr_result = speech2text(cache, raw_inputs, input_lens)
+            item = {'key': "utt", 'value': " ".join(asr_result)}
+
+        asr_result_list.append(item)
+        if is_final:
+            cache = _cache_reset(cache, chunk_size=chunk_size, batch_size=1)
         return asr_result_list
 
     return _forward
@@ -910,15 +746,4 @@
 
 if __name__ == "__main__":
     main()
-
-    # from modelscope.pipelines import pipeline
-    # from modelscope.utils.constant import Tasks
-    #
-    # inference_16k_pipline = pipeline(
-    #     task=Tasks.auto_speech_recognition,
-    #     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
-    #
-    # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
-    # print(rec_result)
-
 
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index 94f7262..5c30fdb 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -83,7 +83,8 @@
         finetune_configs = yaml.safe_load(f)
         # set data_types
         if dataset_type == "large":
-            finetune_configs["dataset_conf"]["data_types"] = "sound,text"
+            if 'data_types' not in finetune_configs['dataset_conf']:
+                finetune_configs["dataset_conf"]["data_types"] = "sound,text"
     finetune_configs = update_dct(configs, finetune_configs)
     for key, value in finetune_configs.items():
         if hasattr(args, key):
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 387b622..f9dc397 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -352,7 +352,6 @@
                 item = {'key': keys[i], 'value': results[i]}
                 vad_results.append(item)
                 if writer is not None:
-                    results[i] = json.loads(results[i])
                     ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
@@ -466,7 +465,6 @@
                         item = {'key': keys[i], 'value': results[i]}
                         vad_results.append(item)
                         if writer is not None:
-                            results[i] = json.loads(results[i])
                             ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
index 4d02620..e1dbcf2 100644
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -243,7 +243,6 @@
                         item = {'key': keys[i], 'value': results[i]}
                         vad_results.append(item)
                         if writer is not None:
-                            results[i] = json.loads(results[i])
                             ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index b0e1b8f..8c224d8 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -101,7 +101,7 @@
                 if data_type == "kaldi_ark":
                     ark_reader = ReadHelper('ark:{}'.format(data_file))
                     reader_list.append(ark_reader)
-                elif data_type == "text" or data_type == "sound":
+                elif data_type == "text" or data_type == "sound" or data_type == 'text_hotword':
                     text_reader = open(data_file, "r")
                     reader_list.append(text_reader)
                 elif data_type == "none":
@@ -131,6 +131,13 @@
                         sample_dict["sampling_rate"] = sampling_rate
                         if data_name == "speech":
                             sample_dict["key"] = key
+                    elif data_type == "text_hotword":
+                        text = item
+                        segs = text.strip().split()
+                        sample_dict[data_name] = segs[1:]
+                        if "key" not in sample_dict:
+                            sample_dict["key"] = segs[0]
+                        sample_dict['hw_tag'] = 1
                     else:
                         text = item
                         segs = text.strip().split()
@@ -167,14 +174,38 @@
     shuffle = conf.get('shuffle', True)
     data_names = conf.get("data_names", "speech,text")
     data_types = conf.get("data_types", "kaldi_ark,text")
-    dataset = AudioDataset(scp_lists, data_names, data_types, frontend_conf=frontend_conf, shuffle=shuffle, mode=mode)
+
+    pre_hwfile = conf.get("pre_hwlist", None)
+    pre_prob = conf.get("pre_prob", 0)  # unused yet
+
+    hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
+                 "double_rate": conf.get("double_rate", 0.1),
+                 "hotword_min_length": conf.get("hotword_min_length", 2),
+                 "hotword_max_length": conf.get("hotword_max_length", 8),
+                 "pre_prob": conf.get("pre_prob", 0.0)}
+
+    if pre_hwfile is not None:
+        pre_hwlist = []
+        with open(pre_hwfile, 'r') as fin:
+            for line in fin.readlines():
+                pre_hwlist.append(line.strip())
+    else:
+        pre_hwlist = None
+
+    dataset = AudioDataset(scp_lists, 
+                           data_names, 
+                           data_types, 
+                           frontend_conf=frontend_conf, 
+                           shuffle=shuffle, 
+                           mode=mode, 
+                           )
 
     filter_conf = conf.get('filter_conf', {})
     filter_fn = partial(filter, **filter_conf)
     dataset = FilterIterDataPipe(dataset, fn=filter_fn)
 
     if "text" in data_names:
-        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer}
+        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
         tokenize_fn = partial(tokenize, **vocab)
         dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
 
diff --git a/funasr/datasets/large_datasets/utils/hotword_utils.py b/funasr/datasets/large_datasets/utils/hotword_utils.py
new file mode 100644
index 0000000..fccfea6
--- /dev/null
+++ b/funasr/datasets/large_datasets/utils/hotword_utils.py
@@ -0,0 +1,32 @@
+import random
+
+def sample_hotword(length, hotword_min_length, hotword_max_length,
+                   sample_rate, double_rate, pre_prob, pre_index=None):
+    """Randomly pick hotword span(s) inside a token sequence of ``length`` items.
+
+    Returns ``[-1]`` (no hotword), ``[start, end]`` (one span, inclusive indices),
+    ``[start1, end1, start2, end2]`` (two spans) or ``pre_index`` unchanged.
+    A span is sampled with probability ``sample_rate``; two spans are attempted
+    with probability ``double_rate`` when the sequence is long enough.
+    """
+    if length < hotword_min_length or random.random() >= sample_rate:
+        return [-1]
+    if pre_prob > 0 and random.random() < pre_prob and pre_index is not None:
+        return pre_index
+    if length == hotword_min_length:
+        return [0, length - 1]
+    if random.random() < double_rate and length > hotword_max_length + hotword_min_length + 2:
+        # two hotwords: the first one starts in the leading third of the sentence
+        _max_hw_length = min(hotword_max_length, length // 2)
+        start1 = random.randint(0, length // 3)
+        end1 = random.randint(start1 + hotword_min_length - 1, start1 + _max_hw_length - 1)
+        if end1 + 1 > length - hotword_min_length:
+            # no room for a second hotword: randint would raise ValueError on the empty range
+            return [start1, end1]
+        start2 = random.randint(end1 + 1, length - hotword_min_length)
+        end2 = random.randint(min(length - 1, start2 + hotword_min_length - 1), min(length - 1, start2 + hotword_max_length - 1))
+        return [start1, end1, start2, end2]
+    # single hotword
+    start = random.randint(0, length - hotword_min_length)
+    end = random.randint(min(length - 1, start + hotword_min_length - 1), min(length - 1, start + hotword_max_length - 1))
+    return [start, end]
\ No newline at end of file
diff --git a/funasr/datasets/large_datasets/utils/padding.py b/funasr/datasets/large_datasets/utils/padding.py
index e0feac6..20ba7a3 100644
--- a/funasr/datasets/large_datasets/utils/padding.py
+++ b/funasr/datasets/large_datasets/utils/padding.py
@@ -13,15 +13,16 @@
     batch = {}
     data_names = data[0].keys()
     for data_name in data_names:
-        if data_name == "key" or data_name =="sampling_rate":
+        if data_name == "key" or data_name == "sampling_rate":
             continue
         else:
-            if data[0][data_name].dtype.kind == "i":
-                pad_value = int_pad_value
-                tensor_type = torch.int64
-            else:
-                pad_value = float_pad_value
-                tensor_type = torch.float32
+            if data_name != 'hotword_indxs':
+                if data[0][data_name].dtype.kind == "i":
+                    pad_value = int_pad_value
+                    tensor_type = torch.int64
+                else:
+                    pad_value = float_pad_value
+                    tensor_type = torch.float32
 
             tensor_list = [torch.tensor(np.copy(d[data_name]), dtype=tensor_type) for d in data]
             tensor_lengths = torch.tensor([len(d[data_name]) for d in data], dtype=torch.int32)
@@ -31,4 +32,47 @@
             batch[data_name] = tensor_pad
             batch[data_name + "_lengths"] = tensor_lengths
 
+    # DHA, EAHC NOT INCLUDED
+    if "hotword_indxs" in batch:
+        # if hotword indxs in batch
+        # use it to slice hotwords out
+        hotword_list = []
+        hotword_lengths = []
+        text = batch['text']
+        text_lengths = batch['text_lengths']
+        hotword_indxs = batch['hotword_indxs']
+        num_hw = sum([int(i) for i in batch['hotword_indxs_lengths'] if i != 1]) // 2
+        B, t1 = text.shape
+        t1 += 1  # TODO: as parameter which is same as predictor_bias
+        ideal_attn = torch.zeros(B, t1, num_hw+1)
+        nth_hw = 0
+        for b, (hotword_indx, one_text, length) in enumerate(zip(hotword_indxs, text, text_lengths)):
+            ideal_attn[b][:,-1] = 1
+            if hotword_indx[0] != -1:
+                start, end = int(hotword_indx[0]), int(hotword_indx[1])
+                hotword = one_text[start: end+1]
+                hotword_list.append(hotword)
+                hotword_lengths.append(end-start+1)
+                ideal_attn[b][start:end+1, nth_hw] = 1
+                ideal_attn[b][start:end+1, -1] = 0
+                nth_hw += 1
+                if len(hotword_indx) == 4 and hotword_indx[2] != -1:
+                    # the second hotword if exist
+                    start, end = int(hotword_indx[2]), int(hotword_indx[3])
+                    hotword_list.append(one_text[start: end+1])
+                    hotword_lengths.append(end-start+1)
+                    ideal_attn[b][start:end+1, nth_hw-1] = 1
+                    ideal_attn[b][start:end+1, -1] = 0
+                    nth_hw += 1
+        hotword_list.append(torch.tensor([1]))
+        hotword_lengths.append(1)
+        hotword_pad = pad_sequence(hotword_list,
+                                batch_first=True,
+                                padding_value=0)
+        batch["hotword_pad"] = hotword_pad
+        batch["hotword_lengths"] = torch.tensor(hotword_lengths, dtype=torch.int32)
+        batch['ideal_attn'] = ideal_attn
+        del batch['hotword_indxs']
+        del batch['hotword_indxs_lengths']
+
     return keys, batch
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 0d2fd84..f0f0c66 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import re
 import numpy as np
+from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword
 
 def forward_segment(text, seg_dict):
     word_list = []
@@ -38,7 +39,8 @@
              vocab=None,
              seg_dict=None,
              punc_dict=None,
-             bpe_tokenizer=None):
+             bpe_tokenizer=None,
+             hw_config=None):
     assert "text" in data
     assert isinstance(vocab, dict)
     text = data["text"]
@@ -53,6 +55,10 @@
         text = seg_tokenize(text, seg_dict)
 
     length = len(text)
+    if 'hw_tag' in data:
+        hotword_indxs = sample_hotword(length, **hw_config)
+        data['hotword_indxs'] = hotword_indxs
+        del data['hw_tag']
     for i in range(length):
         x = text[i]
         if i == length-1 and "punc" in data and x.startswith("vad:"):
diff --git a/funasr/export/models/CT_Transformer.py b/funasr/export/models/CT_Transformer.py
index 932e3af..2319c4a 100644
--- a/funasr/export/models/CT_Transformer.py
+++ b/funasr/export/models/CT_Transformer.py
@@ -53,7 +53,7 @@
 
     def get_dummy_inputs(self):
         length = 120
-        text_indexes = torch.randint(0, self.embed.num_embeddings, (2, length))
+        text_indexes = torch.randint(0, self.embed.num_embeddings, (2, length)).type(torch.int32)
         text_lengths = torch.tensor([length-20, length], dtype=torch.int32)
         return (text_indexes, text_lengths)
 
@@ -130,7 +130,7 @@
 
     def get_dummy_inputs(self):
         length = 120
-        text_indexes = torch.randint(0, self.embed.num_embeddings, (1, length))
+        text_indexes = torch.randint(0, self.embed.num_embeddings, (1, length)).type(torch.int32)
         text_lengths = torch.tensor([length], dtype=torch.int32)
         vad_mask = torch.ones(length, length, dtype=torch.float32)[None, None, :, :]
         sub_masks = torch.ones(length, length, dtype=torch.float32)
diff --git a/funasr/models/e2e_asr_contextual_paraformer.py b/funasr/models/e2e_asr_contextual_paraformer.py
new file mode 100644
index 0000000..dc820db
--- /dev/null
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@@ -0,0 +1,372 @@
+import logging
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import numpy as np
+
+import torch
+from typeguard import check_argument_types
+
+from funasr.layers.abs_normalize import AbsNormalize
+from funasr.models.ctc import CTC
+from funasr.models.decoder.abs_decoder import AbsDecoder
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.models.frontend.abs_frontend import AbsFrontend
+from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
+from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
+from funasr.models.specaug.abs_specaug import AbsSpecAug
+from funasr.modules.add_sos_eos import add_sos_eos
+from funasr.modules.nets_utils import make_pad_mask, pad_list
+from funasr.modules.nets_utils import th_accuracy
+from funasr.torch_utils.device_funcs import force_gatherable
+from funasr.models.e2e_asr_paraformer import Paraformer
+
+
+if LooseVersion(torch.__version__) < LooseVersion("1.6.0"):
+    # torch.cuda.amp is not imported on these versions: use a do-nothing context manager instead
+    @contextmanager
+    def autocast(enabled=True):
+        yield
+else:
+    from torch.cuda.amp import autocast
+
+
+class NeatContextualParaformer(Paraformer):
+    def __init__(
+        self,
+        vocab_size: int,
+        token_list: Union[Tuple[str, ...], List[str]],
+        frontend: Optional[AbsFrontend],
+        specaug: Optional[AbsSpecAug],
+        normalize: Optional[AbsNormalize],
+        preencoder: Optional[AbsPreEncoder],
+        encoder: AbsEncoder,
+        postencoder: Optional[AbsPostEncoder],
+        decoder: AbsDecoder,
+        ctc: CTC,
+        ctc_weight: float = 0.5,
+        interctc_weight: float = 0.0,
+        ignore_id: int = -1,
+        blank_id: int = 0,
+        sos: int = 1,
+        eos: int = 2,
+        lsm_weight: float = 0.0,
+        length_normalized_loss: bool = False,
+        report_cer: bool = True,
+        report_wer: bool = True,
+        sym_space: str = "<space>",
+        sym_blank: str = "<blank>",
+        extract_feats_in_collect_stats: bool = True,
+        predictor = None,
+        predictor_weight: float = 0.0,
+        predictor_bias: int = 0,
+        sampling_ratio: float = 0.2,
+        target_buffer_length: int = -1,
+        inner_dim: int = 256,
+        bias_encoder_type: str = 'lstm',
+        use_decoder_embedding: bool = False,
+        crit_attn_weight: float = 0.0,
+        crit_attn_smooth: float = 0.0,
+        bias_encoder_dropout_rate: float = 0.0,
+    ):
+        """Set up the base Paraformer, then add the hotword (bias) modules.
+
+        Arguments from ``vocab_size`` through ``sampling_ratio`` are passed
+        straight to ``Paraformer.__init__``. Arguments specific to this class:
+
+        inner_dim: embedding size and LSTM input/hidden size of the bias encoder.
+        bias_encoder_type: ``'lstm'`` or ``'mean'``; any other value is only logged.
+        bias_encoder_dropout_rate: ``dropout`` argument of the bias LSTM.
+        use_decoder_embedding: stored; selects ``decoder.embed`` over ``bias_embed`` later.
+        target_buffer_length: when > 0, hotword buffer state is initialised.
+        crit_attn_weight / crit_attn_smooth: stored; ``L1Loss`` is created if weight > 0.
+        """
+        assert check_argument_types()
+        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
+        assert 0.0 <= interctc_weight < 1.0, interctc_weight
+
+        super().__init__(
+            vocab_size=vocab_size, token_list=token_list,
+            frontend=frontend, specaug=specaug, normalize=normalize,
+            preencoder=preencoder, encoder=encoder, postencoder=postencoder,
+            decoder=decoder, ctc=ctc, ctc_weight=ctc_weight, interctc_weight=interctc_weight,
+            ignore_id=ignore_id, blank_id=blank_id, sos=sos, eos=eos,
+            lsm_weight=lsm_weight, length_normalized_loss=length_normalized_loss,
+            report_cer=report_cer, report_wer=report_wer,
+            sym_space=sym_space, sym_blank=sym_blank,
+            extract_feats_in_collect_stats=extract_feats_in_collect_stats,
+            predictor=predictor, predictor_weight=predictor_weight,
+            predictor_bias=predictor_bias, sampling_ratio=sampling_ratio,
+        )
+
+        # Hotword ("bias") encoder: both supported types share the embedding table;
+        # only 'lstm' additionally owns a recurrent encoder.
+        if bias_encoder_type in ('lstm', 'mean'):
+            logging.warning("enable bias encoder sampling and contextual training")
+            if bias_encoder_type == 'lstm':
+                self.bias_encoder = torch.nn.LSTM(
+                    inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate
+                )
+            self.bias_embed = torch.nn.Embedding(vocab_size, inner_dim)
+        else:
+            # NOTE(review): only logged; no bias modules are created in this case
+            logging.error("Unsupport bias encoder type: {}".format(bias_encoder_type))
+
+        self.target_buffer_length = target_buffer_length
+        if target_buffer_length > 0:
+            # hotword buffer bookkeeping, only initialised when a buffer length is configured
+            self.hotword_buffer = None
+            self.length_record = []
+            self.current_buffer_length = 0
+        self.use_decoder_embedding = use_decoder_embedding
+        self.crit_attn_weight = crit_attn_weight
+        self.crit_attn_smooth = crit_attn_smooth
+        if crit_attn_weight > 0:
+            self.attn_loss = torch.nn.L1Loss()
+
+    def forward(
+            self,
+            speech: torch.Tensor,
+            speech_lengths: torch.Tensor,
+            text: torch.Tensor,
+            text_lengths: torch.Tensor,
+            hotword_pad: torch.Tensor,
+            hotword_lengths: torch.Tensor,
+            ideal_attn: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        """Encoder + optional CTC branches + contextual decoder; returns (loss, stats, weight).
+
+        Args:
+                speech, speech_lengths: (Batch, Length, ...) and (Batch, )
+                text, text_lengths: (Batch, Length) and (Batch,)
+                hotword_pad, hotword_lengths: hotword token ids and their lengths, passed to the decoder branch
+                ideal_attn: passed to `_calc_att_clas_loss`, which currently does not read it
+        """
+        assert text_lengths.dim() == 1, text_lengths.shape
+        # Check that the batch size agrees across speech and text inputs
+        assert (
+                speech.shape[0]
+                == speech_lengths.shape[0]
+                == text.shape[0]
+                == text_lengths.shape[0]
+        ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+        batch_size = speech.shape[0]
+        self.step_cur += 1
+        # for data-parallel: trim padding down to the longest item of this batch
+        text = text[:, : text_lengths.max()]
+        speech = speech[:, :speech_lengths.max()]
+
+        # 1. Encoder
+        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):  # (final output, intermediate outputs)
+            intermediate_outs = encoder_out[1]
+            encoder_out = encoder_out[0]
+
+        loss_att, acc_att, cer_att, wer_att = None, None, None, None
+        loss_ctc, cer_ctc = None, None
+        loss_pre = None
+        loss_ideal = None
+
+        stats = dict()
+
+        # 2a. CTC branch (skipped when ctc_weight == 0.0)
+        if self.ctc_weight != 0.0:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+            # Collect CTC branch stats
+            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+            stats["cer_ctc"] = cer_ctc
+
+        # Intermediate CTC (optional)
+        loss_interctc = 0.0
+        if self.interctc_weight != 0.0 and intermediate_outs is not None:
+            for layer_idx, intermediate_out in intermediate_outs:
+                # we assume intermediate_out has the same length & padding
+                # as those of encoder_out
+                loss_ic, cer_ic = self._calc_ctc_loss(
+                    intermediate_out, encoder_out_lens, text, text_lengths
+                )
+                loss_interctc = loss_interctc + loss_ic
+
+                # Collect intermediate CTC stats
+                stats["loss_interctc_layer{}".format(layer_idx)] = (
+                    loss_ic.detach() if loss_ic is not None else None
+                )
+                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+            loss_interctc = loss_interctc / len(intermediate_outs)
+
+            # blend final and intermediate CTC losses; NOTE(review): loss_ctc is still None here if ctc_weight == 0.0
+            loss_ctc = (1 - self.interctc_weight) * loss_ctc + self.interctc_weight * loss_interctc
+
+        # 2b. Attention decoder branch (skipped when ctc_weight == 1.0)
+        if self.ctc_weight != 1.0:
+            loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+            )
+
+        # 3. CTC-Att loss definition
+        if self.ctc_weight == 0.0:
+            loss = loss_att + loss_pre * self.predictor_weight
+        elif self.ctc_weight == 1.0:
+            loss = loss_ctc
+        else:
+            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
+
+        if loss_ideal is not None:
+            loss = loss + loss_ideal * self.crit_attn_weight
+            stats["loss_ideal"] = loss_ideal.detach().cpu()
+
+        # Collect Attn branch stats
+        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+        stats["acc"] = acc_att
+        stats["cer"] = cer_att
+        stats["wer"] = wer_att
+        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
+
+        stats["loss"] = torch.clone(loss.detach())
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+    
+    def _calc_att_clas_loss(
+            self,
+            encoder_out: torch.Tensor,
+            encoder_out_lens: torch.Tensor,
+            ys_pad: torch.Tensor,
+            ys_pad_lens: torch.Tensor,
+            hotword_pad: torch.Tensor,
+            hotword_lengths: torch.Tensor,
+            ideal_attn: torch.Tensor,
+    ):
+        """Compute decoder (attention) and predictor losses with hotword context.
+
+        ``hotword_pad`` / ``hotword_lengths`` hold the padded hotword token ids and
+        their lengths; one representation per hotword is passed to the sampler and
+        the decoder as ``contextual_info``.
+
+        Returns ``(loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal)``;
+        ``loss_ideal`` is always ``None`` and ``ideal_attn`` is not read, because the
+        attention-supervision loss is currently disabled.
+        """
+        pad_mask = make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))
+        encoder_out_mask = (~pad_mask[:, None, :]).to(encoder_out.device)
+        if self.predictor_bias == 1:
+            # keep the second output of add_sos_eos (presumably the eos-appended targets); lengths grow by the bias
+            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
+            ys_pad_lens = ys_pad_lens + self.predictor_bias
+        predictor_outs = self.predictor(encoder_out, ys_pad, encoder_out_mask, ignore_id=self.ignore_id)
+        pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs
+
+        # Bias encoder: embed the hotwords and keep the output at each hotword's last valid step.
+        embed_layer = self.decoder.embed if self.use_decoder_embedding else self.bias_embed
+        hw_states, (_, _) = self.bias_encoder(embed_layer(hotword_pad))
+        row_ids = list(range(hotword_pad.shape[0]))
+        last_steps = [n - 1 for n in hotword_lengths.detach().cpu().tolist()]
+        selected = hw_states[row_ids, last_steps]
+        # share the same hotword representations across every utterance of the batch
+        contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device)
+
+        # Sampler (optional): mixes target embeddings into the predictor output.
+        decoder_out_1st = None
+        sematic_embeds = pre_acoustic_embeds
+        if self.sampling_ratio > 0.0:
+            if self.step_cur < 2:
+                logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+            sematic_embeds, decoder_out_1st = self.sampler(
+                encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, contextual_info
+            )
+        elif self.step_cur < 2:
+            logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+
+        # Forward decoder
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
+        )
+        decoder_out, _ = decoder_outs[0], decoder_outs[1]
+        if decoder_out_1st is None:
+            decoder_out_1st = decoder_out
+        # The ideal-attention (L1) loss is disabled, so nothing is computed from ideal_attn.
+        loss_ideal = None
+
+        # Losses and accuracy; accuracy uses the first-pass output when the sampler ran.
+        loss_att = self.criterion_att(decoder_out, ys_pad)
+        acc_att = th_accuracy(
+            decoder_out_1st.view(-1, self.vocab_size),
+            ys_pad,
+            ignore_label=self.ignore_id,
+        )
+        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
+
+        # CER/WER are only computed outside training and when an error calculator exists.
+        cer_att, wer_att = None, None
+        if not self.training and self.error_calculator is not None:
+            ys_hat = decoder_out_1st.argmax(dim=-1)
+            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+        return loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal
+    
+    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, contextual_info):
+        """Swap some predictor embeddings for target-token embeddings; return (mixed embeds, no-grad decoder output)."""
+        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
+        ys_pad = ys_pad * tgt_mask[:, :, 0]  # padded target positions become 0
+        if self.share_embedding:
+            ys_pad_embed = self.decoder.output_layer.weight[ys_pad]
+        else:
+            ys_pad_embed = self.decoder.embed(ys_pad)
+        with torch.no_grad():
+            decoder_outs = self.decoder(
+                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, contextual_info=contextual_info
+            )
+            decoder_out, _ = decoder_outs[0], decoder_outs[1]
+            pred_tokens = decoder_out.argmax(-1)
+            nonpad_positions = ys_pad.ne(self.ignore_id)  # NOTE(review): ys_pad was already multiplied by tgt_mask above
+            seq_lens = (nonpad_positions).sum(1)
+            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)  # correct first-pass predictions per row
+            input_mask = torch.ones_like(nonpad_positions)
+            bsz, seq_len = ys_pad.size()
+            for li in range(bsz):
+                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()  # wrong tokens * ratio
+                if target_num > 0:
+                    input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].to(pre_acoustic_embeds.device), value=0)
+            input_mask = input_mask.eq(1)
+            input_mask = input_mask.masked_fill(~nonpad_positions, False)
+            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
+
+        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
+            input_mask_expand_dim, 0)  # mask True -> predictor embedding, False -> target embedding
+        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
+
+    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
+        """Decode with hotword context; return ``(log_softmax(decoder_out), ys_pad_lens)``.
+
+        ``hw_list`` is a list of hotword token-id sequences. When it is ``None`` a
+        single one-token placeholder ``[1]`` is encoded instead, and the bias
+        encoder output is used without being repeated over the batch.
+        """
+        embed_layer = self.decoder.embed if self.use_decoder_embedding else self.bias_embed
+        if hw_list is not None:
+            lengths = [len(hw) for hw in hw_list]
+            ids = pad_list([torch.Tensor(hw).long() for hw in hw_list], 0).to(encoder_out.device)
+            packed = torch.nn.utils.rnn.pack_padded_sequence(
+                embed_layer(ids), lengths, batch_first=True, enforce_sorted=False
+            )
+            # final hidden state per hotword, tiled over the batch dimension
+            _, (last_hidden, _) = self.bias_encoder(packed)
+            contextual_info = last_hidden.repeat(encoder_out.shape[0], 1, 1)
+        else:
+            # no hotwords supplied: encode the placeholder token only
+            placeholder = [torch.Tensor([1]).long().to(encoder_out.device)]
+            contextual_info, (_, _) = self.bias_encoder(embed_layer(pad_list(placeholder, 0)))
+
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
+        )
+        decoder_out = torch.log_softmax(decoder_outs[0], dim=-1)
+        return decoder_out, ys_pad_lens
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 699d85f..d02783f 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -712,9 +712,9 @@
 
     def calc_predictor_chunk(self, encoder_out, cache=None):
 
-        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = \
+        pre_acoustic_embeds, pre_token_length = \
             self.predictor.forward_chunk(encoder_out, cache["encoder"])
-        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
+        return pre_acoustic_embeds, pre_token_length
 
     def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
         decoder_outs = self.decoder.forward_chunk(
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index f2502bb..2a68011 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -6,9 +6,11 @@
 import logging
 import torch
 import torch.nn as nn
+import torch.nn.functional as F
 from funasr.modules.streaming_utils.chunk_utilis import overlap_chunk
 from typeguard import check_argument_types
 import numpy as np
+from funasr.torch_utils.device_funcs import to_device
 from funasr.modules.nets_utils import make_pad_mask
 from funasr.modules.attention import MultiHeadedAttention, MultiHeadedAttentionSANM, MultiHeadedAttentionSANMwithMask
 from funasr.modules.embedding import SinusoidalPositionEncoder, StreamSinusoidalPositionEncoder
@@ -349,6 +351,23 @@
             return (xs_pad, intermediate_outs), olens, None
         return xs_pad, olens, None
 
+    def _add_overlap_chunk(self, feats: torch.Tensor, cache: dict = None):
+        """Prepend the cached frames in ``cache["feats"]`` to ``feats`` (dim 1) and refresh that cache entry."""
+        if not cache:  # no streaming state (None or empty dict): nothing to prepend
+            return feats
+        cache["feats"] = to_device(cache["feats"], device=feats.device)
+        overlap_feats = torch.cat((cache["feats"], feats), dim=1)
+        sizes = cache["chunk_size"]
+        if not cache["is_final"]:
+            cache["feats"] = overlap_feats[:, -(sizes[0] + sizes[2]):, :]
+            return overlap_feats
+        cache["feats"] = overlap_feats[:, -sizes[0]:, :]
+        if not cache["last_chunk"]:
+            # right-pad dim 1 (F.pad's default fill value) up to the summed chunk size
+            padding_length = sum(sizes) - overlap_feats.shape[1]
+            overlap_feats = F.pad(overlap_feats.transpose(1, 2), (0, padding_length)).transpose(1, 2)
+        return overlap_feats
+
     def forward_chunk(self,
                       xs_pad: torch.Tensor,
                       ilens: torch.Tensor,
@@ -360,7 +379,10 @@
             xs_pad = xs_pad
         else:
             xs_pad = self.embed(xs_pad, cache)
-
+        if cache["tail_chunk"]:
+            xs_pad = to_device(cache["feats"], device=xs_pad.device)
+        else:
+            xs_pad = self._add_overlap_chunk(xs_pad, cache)
         encoder_outs = self.encoders0(xs_pad, None, None, None, None)
         xs_pad, masks = encoder_outs[0], encoder_outs[1]
         intermediate_outs = []
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index a5273f8..c59e245 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -2,6 +2,7 @@
 from torch import nn
 import logging
 import numpy as np
+from funasr.torch_utils.device_funcs import to_device
 from funasr.modules.nets_utils import make_pad_mask
 from funasr.modules.streaming_utils.utils import sequence_mask
 
@@ -200,7 +201,7 @@
         return acoustic_embeds, token_num, alphas, cif_peak
 
     def forward_chunk(self, hidden, cache=None):
-        b, t, d = hidden.size()
+        batch_size, len_time, hidden_size = hidden.shape
         h = hidden
         context = h.transpose(1, 2)
         queries = self.pad(context)
@@ -211,58 +212,81 @@
         alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
 
         alphas = alphas.squeeze(-1)
-        mask_chunk_predictor = None
-        if cache is not None:
-            mask_chunk_predictor = None
-            mask_chunk_predictor = torch.zeros_like(alphas)
-            mask_chunk_predictor[:, cache["pad_left"]:cache["stride"] + cache["pad_left"]] = 1.0
-       
-        if mask_chunk_predictor is not None:
-            alphas = alphas * mask_chunk_predictor
-      
-        if cache is not None:
-            if cache["is_final"]:
-                alphas[:, cache["stride"] + cache["pad_left"] - 1] += 0.45
-            if cache["cif_hidden"] is not None:
-                hidden = torch.cat((cache["cif_hidden"], hidden), 1)
-            if cache["cif_alphas"] is not None:
-                alphas = torch.cat((cache["cif_alphas"], alphas), -1)
 
-        token_num = alphas.sum(-1)
-        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
-        len_time = alphas.size(-1)
-        last_fire_place = len_time - 1
-        last_fire_remainds = 0.0
-        pre_alphas_length = 0
-        last_fire = False
- 
-        mask_chunk_peak_predictor = None
-        if cache is not None:
-            mask_chunk_peak_predictor = None
-            mask_chunk_peak_predictor = torch.zeros_like(cif_peak)
-            if cache["cif_alphas"] is not None:
-                pre_alphas_length = cache["cif_alphas"].size(-1)
-                mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
-            mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
-            
-        if mask_chunk_peak_predictor is not None:
-            cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
-        
-        for i in range(len_time):
-            if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
-                last_fire_place = len_time - 1 - i
-                last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
-                last_fire = True
-                break
-        if last_fire:
-           last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
-           cache["cif_hidden"] = hidden[:, last_fire_place:, :]
-           cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
-        else:
-           cache["cif_hidden"] = hidden
-           cache["cif_alphas"] = alphas
-        token_num_int = token_num.floor().type(torch.int32).item()
-        return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
+        token_length = []
+        list_fires = []
+        list_frames = []
+        cache_alphas = []
+        cache_hiddens = []
+
+        if cache is not None and "chunk_size" in cache:
+            alphas[:, :cache["chunk_size"][0]] = 0.0
+            alphas[:, sum(cache["chunk_size"][:2]):] = 0.0
+        if cache is not None and "cif_alphas" in cache and "cif_hidden" in cache:
+            cache["cif_hidden"] = to_device(cache["cif_hidden"], device=hidden.device)
+            cache["cif_alphas"] = to_device(cache["cif_alphas"], device=alphas.device)
+            hidden = torch.cat((cache["cif_hidden"], hidden), dim=1)
+            alphas = torch.cat((cache["cif_alphas"], alphas), dim=1)
+        if cache is not None and "last_chunk" in cache and cache["last_chunk"]:
+            tail_hidden = torch.zeros((batch_size, 1, hidden_size), device=hidden.device)
+            tail_alphas = torch.tensor([[self.tail_threshold]], device=alphas.device)
+            tail_alphas = torch.tile(tail_alphas, (batch_size, 1))
+            hidden = torch.cat((hidden, tail_hidden), dim=1)
+            alphas = torch.cat((alphas, tail_alphas), dim=1)
+
+        len_time = alphas.shape[1]
+        for b in range(batch_size):
+            integrate = 0.0
+            frames = torch.zeros((hidden_size), device=hidden.device)
+            list_frame = []
+            list_fire = []
+            for t in range(len_time):
+                alpha = alphas[b][t]
+                if alpha + integrate < self.threshold:
+                    integrate += alpha
+                    list_fire.append(integrate)
+                    frames += alpha * hidden[b][t]
+                else:
+                    frames += (self.threshold - integrate) * hidden[b][t]
+                    list_frame.append(frames)
+                    integrate += alpha
+                    list_fire.append(integrate)
+                    integrate -= self.threshold
+                    frames = integrate * hidden[b][t]
+
+            cache_alphas.append(integrate)
+            if integrate > 0.0:
+                cache_hiddens.append(frames / integrate)
+            else:
+                cache_hiddens.append(frames)
+
+            token_length.append(torch.tensor(len(list_frame), device=alphas.device))
+            list_fires.append(list_fire)
+            list_frames.append(list_frame)
+
+        cache["cif_alphas"] = torch.stack(cache_alphas, axis=0)
+        cache["cif_alphas"] = torch.unsqueeze(cache["cif_alphas"], axis=0)
+        cache["cif_hidden"] = torch.stack(cache_hiddens, axis=0)
+        cache["cif_hidden"] = torch.unsqueeze(cache["cif_hidden"], axis=0)
+
+        max_token_len = max(token_length)
+        if max_token_len == 0:
+             return hidden, torch.stack(token_length, 0)
+        list_ls = []
+        for b in range(batch_size):
+            pad_frames = torch.zeros((max_token_len - token_length[b], hidden_size), device=alphas.device)
+            if token_length[b] == 0:
+                list_ls.append(pad_frames)
+            else:
+                list_frames[b] = torch.stack(list_frames[b])
+                list_ls.append(torch.cat((list_frames[b], pad_frames), dim=0))
+
+        cache["cif_alphas"] = torch.stack(cache_alphas, axis=0)
+        cache["cif_alphas"] = torch.unsqueeze(cache["cif_alphas"], axis=0)
+        cache["cif_hidden"] = torch.stack(cache_hiddens, axis=0)
+        cache["cif_hidden"] = torch.unsqueeze(cache["cif_hidden"], axis=0)
+        return torch.stack(list_ls, 0), torch.stack(token_length, 0)
+
 
     def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
         b, t, d = hidden.size()
diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index c347e24..aaac80a 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -425,21 +425,14 @@
         return encoding.type(dtype)
 
     def forward(self, x, cache=None):
-        start_idx = 0
-        pad_left = 0
-        pad_right = 0
         batch_size, timesteps, input_dim = x.size()
+        start_idx = 0
         if cache is not None:
             start_idx = cache["start_idx"]
-            pad_left = cache["left"]
-            pad_right = cache["right"]
+            cache["start_idx"] += timesteps
         positions = torch.arange(1, timesteps+start_idx+1)[None, :]
         position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
-        outputs = x + position_encoding[:, start_idx: start_idx + timesteps]
-        outputs = outputs.transpose(1, 2)
-        outputs = F.pad(outputs, (pad_left, pad_right))
-        outputs = outputs.transpose(1, 2)
-        return outputs
+        return x + position_encoding[:, start_idx: start_idx + timesteps]
 
 class StreamingRelPositionalEncoding(torch.nn.Module):
     """Relative positional encoding.
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
index da92559..4499441 100644
--- a/funasr/runtime/grpc/Readme.md
+++ b/funasr/runtime/grpc/Readme.md
@@ -1,4 +1,4 @@
-# Using funasr with grpc-cpp
+# Service with grpc-cpp
 
 ## For the Server
 
diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt
index 25b816f..9f6013f 100644
--- a/funasr/runtime/onnxruntime/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/CMakeLists.txt
@@ -38,5 +38,4 @@
     include_directories(${PROJECT_SOURCE_DIR}/third_party/glog)
     set(BUILD_TESTING OFF)
     add_subdirectory(third_party/glog)
-endif()
-
+endif()
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md
index ab032bf..f3dc3b6 100644
--- a/funasr/runtime/onnxruntime/readme.md
+++ b/funasr/runtime/onnxruntime/readme.md
@@ -4,9 +4,10 @@
 ### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)
 
 ```shell
-pip3 install torch torchaudio
-pip install -U modelscope
-pip install -U funasr
+# pip3 install torch torchaudio
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+# pip install -U modelscope funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple
 ```
 
 ### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)
@@ -126,5 +127,6 @@
 
 ## Acknowledge
 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
-2. We acknowledge [mayong](https://github.com/RapidAI/RapidASR/tree/main/cpp_onnx) for contributing the onnxruntime(cpp api).
-3. We borrowed a lot of code from [FastASR](https://github.com/chenkui164/FastASR) for audio frontend and text-postprocess.
+2. We acknowledge mayong for contributing the onnxruntime of Paraformer and CT_Transformer, [repo-asr](https://github.com/RapidAI/RapidASR/tree/main/cpp_onnx), [repo-punc](https://github.com/RapidAI/RapidPunc).
+3. We acknowledge [ChinaTelecom](https://github.com/zhuzizyf/damo-fsmn-vad-infer-httpserver) for contributing the VAD runtime.
+4. We borrowed a lot of code from [FastASR](https://github.com/chenkui164/FastASR) for audio frontend and text-postprocess.
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 6113614..2ecd3e6 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -247,6 +247,15 @@
         return false;
     }
     
+    if (!header.Validate()) {
+        return false;
+    }
+
+    header.SeekToDataChunk(is);
+    if (!is) {
+        return false;
+    }
+    
     *sampling_rate = header.sample_rate;
     // header.subchunk2_size contains the number of bytes in the data.
     // As we assume each sample contains two bytes, so it is divided by 2 here
@@ -389,8 +398,10 @@
     FILE* fp;
     fp = fopen(filename, "rb");
     if (fp == nullptr)
+	{
         LOG(ERROR) << "Failed to read " << filename;
         return false;
+	}
     fseek(fp, 0, SEEK_END);
     uint32_t n_file_len = ftell(fp);
     fseek(fp, 0, SEEK_SET);
diff --git a/funasr/runtime/onnxruntime/src/e2e-vad.h b/funasr/runtime/onnxruntime/src/e2e-vad.h
index 90f2635..0e0b50f 100644
--- a/funasr/runtime/onnxruntime/src/e2e-vad.h
+++ b/funasr/runtime/onnxruntime/src/e2e-vad.h
@@ -1,6 +1,7 @@
 /**
  * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  * MIT License  (https://opensource.org/licenses/MIT)
+ * Collaborators: zhuzizyf(China Telecom Shanghai)
 */
 
 #include <utility>
@@ -381,10 +382,11 @@
     int max_end_sil_frame_cnt_thresh;
     float speech_noise_thres;
     std::vector<std::vector<float>> scores;
+    int idx_pre_chunk = 0;
     bool max_time_out;
     std::vector<float> decibel;
-    std::vector<float> data_buf;
-    std::vector<float> data_buf_all;
+    int data_buf_size = 0;
+    int data_buf_all_size = 0;
     std::vector<float> waveform;
 
     void AllResetDetection() {
@@ -409,10 +411,11 @@
         max_end_sil_frame_cnt_thresh = vad_opts.max_end_silence_time - vad_opts.speech_to_sil_time_thres;
         speech_noise_thres = vad_opts.speech_noise_thres;
         scores.clear();
+        idx_pre_chunk = 0;
         max_time_out = false;
         decibel.clear();
-        data_buf.clear();
-        data_buf_all.clear();
+        data_buf_size = 0;  // assign the member; declaring `int` here would only create a shadowing local
+        data_buf_all_size = 0;  // same: reset the member counter itself
         waveform.clear();
         ResetDetection();
     }
@@ -432,18 +435,17 @@
     void ComputeDecibel() {
         int frame_sample_length = int(vad_opts.frame_length_ms * vad_opts.sample_rate / 1000);
         int frame_shift_length = int(vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
-        if (data_buf_all.empty()) {
-            data_buf_all = waveform;
-            data_buf = data_buf_all;
+        if (data_buf_all_size == 0) {
+          data_buf_all_size = waveform.size();
+          data_buf_size = data_buf_all_size;
         } else {
-            data_buf_all.insert(data_buf_all.end(), waveform.begin(), waveform.end());
+          data_buf_all_size += waveform.size();
         }
         for (int offset = 0; offset < waveform.size() - frame_sample_length + 1; offset += frame_shift_length) {
             float sum = 0.0;
             for (int i = 0; i < frame_sample_length; i++) {
                 sum += waveform[offset + i] * waveform[offset + i];
             }
-//      float decibel = 10 * log10(sum + 0.000001);
             this->decibel.push_back(10 * log10(sum + 0.000001));
         }
     }
@@ -451,29 +453,19 @@
     void ComputeScores(const std::vector<std::vector<float>> &scores) {
         vad_opts.nn_eval_block_size = scores.size();
         frm_cnt += scores.size();
-        if (this->scores.empty()) {
-            this->scores = scores;  // the first calculation
-        } else {
-            this->scores.insert(this->scores.end(), scores.begin(), scores.end());
-        }
+        this->scores = scores;
     }
 
     void PopDataBufTillFrame(int frame_idx) {
       int frame_sample_length = int(vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
-      int start_pos=-1;
-      int data_length= data_buf.size();
       while (data_buf_start_frame < frame_idx) {
-        if (data_length >= frame_sample_length) {
+        if (data_buf_size >= frame_sample_length) {
           data_buf_start_frame += 1;
-          start_pos= data_buf_start_frame* frame_sample_length;
-          data_length=data_buf_all.size()-start_pos;
+          data_buf_size = data_buf_all_size - data_buf_start_frame * frame_sample_length;
         } else {
+          // Fewer samples left than one frame: stop, otherwise this loop never terminates.
           break;
         }
       }
-      if (start_pos!=-1){
-        data_buf.resize(data_length);
-        std::copy(data_buf_all.begin() + start_pos, data_buf_all.end(), data_buf.begin());
-      }
     }
 
@@ -487,9 +479,9 @@
             expected_sample_number += int(extra_sample);
         }
         if (end_point_is_sent_end) {
-            expected_sample_number = std::max(expected_sample_number, int(data_buf.size()));
+            expected_sample_number = std::max(expected_sample_number, data_buf_size);
         }
-        if (data_buf.size() < expected_sample_number) {
+        if (data_buf_size < expected_sample_number) {
             std::cout << "error in calling pop data_buf\n";
         }
         if (output_data_buf.size() == 0 || first_frm_is_start_point) {
@@ -503,27 +495,20 @@
         if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
             std::cout << "warning\n";
         }
-        int out_pos = (int) cur_seg.buffer.size();
+
         int data_to_pop;
         if (end_point_is_sent_end) {
             data_to_pop = expected_sample_number;
         } else {
             data_to_pop = int(frm_cnt * vad_opts.frame_in_ms * vad_opts.sample_rate / 1000);
         }
-        if (data_to_pop > int(data_buf.size())) {
+        if (data_to_pop > data_buf_size) {
             std::cout << "VAD data_to_pop is bigger than data_buf.size()!!!\n";
-            data_to_pop = (int) data_buf.size();
-            expected_sample_number = (int) data_buf.size();
+            data_to_pop = data_buf_size;
+            expected_sample_number = data_buf_size;
         }
         cur_seg.doa = 0;
-        for (int sample_cpy_out = 0; sample_cpy_out < data_to_pop; sample_cpy_out++) {
-            cur_seg.buffer.push_back(data_buf.back());
-            out_pos++;
-        }
-        for (int sample_cpy_out = data_to_pop; sample_cpy_out < expected_sample_number; sample_cpy_out++) {
-            cur_seg.buffer.push_back(data_buf.back());
-            out_pos++;
-        }
+        
         if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
             std::cout << "Something wrong with the VAD algorithm\n";
         }
@@ -619,7 +604,7 @@
         if (sil_pdf_ids.size() > 0) {
             std::vector<float> sil_pdf_scores;
             for (auto sil_pdf_id: sil_pdf_ids) {
-                sil_pdf_scores.push_back(scores[t][sil_pdf_id]);
+                sil_pdf_scores.push_back(scores[t - idx_pre_chunk][sil_pdf_id]);
             }
             sum_score = accumulate(sil_pdf_scores.begin(), sil_pdf_scores.end(), 0.0);
             noise_prob = log(sum_score) * vad_opts.speech_2_noise_ratio;
@@ -663,6 +648,7 @@
             frame_state = GetFrameState(frm_cnt - 1 - i);
             DetectOneFrame(frame_state, frm_cnt - 1 - i, false);
         }
+        idx_pre_chunk += scores.size();
         return 0;
     }
 
diff --git a/funasr/runtime/onnxruntime/third_party/install_openblas.sh b/funasr/runtime/onnxruntime/third_party/install_openblas.sh
deleted file mode 100644
index 4a41012..0000000
--- a/funasr/runtime/onnxruntime/third_party/install_openblas.sh
+++ /dev/null
@@ -1,39 +0,0 @@
-#!/usr/bin/env bash
-
-OPENBLAS_VERSION=0.3.13
-
-WGET=${WGET:-wget}
-
-set -e
-
-if ! command -v gfortran 2>/dev/null; then
-  echo "$0: gfortran is not installed.  Please install it, e.g. by:"
-  echo " apt-get install gfortran"
-  echo "(if on Debian or Ubuntu), or:"
-  echo " yum install gcc-gfortran"
-  echo "(if on RedHat/CentOS).  On a Mac, if brew is installed, it's:"
-  echo " brew install gfortran"
-  exit 1
-fi
-
-
-tarball=OpenBLAS-$OPENBLAS_VERSION.tar.gz
-
-rm -rf xianyi-OpenBLAS-* OpenBLAS OpenBLAS-*.tar.gz
-
-if [ -d "$DOWNLOAD_DIR" ]; then
-  cp -p "$DOWNLOAD_DIR/$tarball" .
-else
-  url=$($WGET -qO- "https://api.github.com/repos/xianyi/OpenBLAS/releases/tags/v${OPENBLAS_VERSION}" | python -c 'import sys,json;print(json.load(sys.stdin)["tarball_url"])')
-  test -n "$url"
-  $WGET -t3 -nv -O $tarball "$url"
-fi
-
-tar xzf $tarball
-mv xianyi-OpenBLAS-* OpenBLAS
-
-make PREFIX=$(pwd)/OpenBLAS/install USE_LOCKING=1 USE_THREAD=0 -C OpenBLAS all install
-if [ $? -eq 0 ]; then
-   echo "OpenBLAS is installed successfully."
-   rm $tarball
-fi
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/wave/asr_example.wav b/funasr/runtime/onnxruntime/wave/asr_example.wav
deleted file mode 100644
index be33a3c..0000000
--- a/funasr/runtime/onnxruntime/wave/asr_example.wav
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/onnxruntime/wave/long.wav b/funasr/runtime/onnxruntime/wave/long.wav
deleted file mode 100644
index 22b383a..0000000
--- a/funasr/runtime/onnxruntime/wave/long.wav
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/onnxruntime/wave/short.wav b/funasr/runtime/onnxruntime/wave/short.wav
deleted file mode 100644
index bf13bb1..0000000
--- a/funasr/runtime/onnxruntime/wave/short.wav
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/onnxruntime/wave/test.pcm.bytes b/funasr/runtime/onnxruntime/wave/test.pcm.bytes
deleted file mode 100644
index f3962c6..0000000
--- a/funasr/runtime/onnxruntime/wave/test.pcm.bytes
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/onnxruntime/wave/test.pcm.wav b/funasr/runtime/onnxruntime/wave/test.pcm.wav
deleted file mode 100644
index b83d56c..0000000
--- a/funasr/runtime/onnxruntime/wave/test.pcm.wav
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/python/grpc/Readme.md b/funasr/runtime/python/grpc/Readme.md
index 895013a..742268b 100644
--- a/funasr/runtime/python/grpc/Readme.md
+++ b/funasr/runtime/python/grpc/Readme.md
@@ -1,4 +1,4 @@
-# Using funasr with grpc-python
+# Service with grpc-python
 We can send streaming audio data to server in real-time with grpc client every 10 ms e.g., and get transcribed text when stop speaking.
 The audio data is in streaming, the asr inference process is in offline.
 
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
index b5b3312..3cda80d 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/utils/e2e_vad.py
@@ -229,10 +229,11 @@
         self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
         self.speech_noise_thres = self.vad_opts.speech_noise_thres
         self.scores = None
+        self.idx_pre_chunk = 0
         self.max_time_out = False
         self.decibel = []
-        self.data_buf = None
-        self.data_buf_all = None
+        self.data_buf_size = 0
+        self.data_buf_all_size = 0
         self.waveform = None
         self.ResetDetection()
 
@@ -259,10 +260,11 @@
         self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
         self.speech_noise_thres = self.vad_opts.speech_noise_thres
         self.scores = None
+        self.idx_pre_chunk = 0
         self.max_time_out = False
         self.decibel = []
-        self.data_buf = None
-        self.data_buf_all = None
+        self.data_buf_size = 0
+        self.data_buf_all_size = 0
         self.waveform = None
         self.ResetDetection()
 
@@ -280,11 +282,11 @@
     def ComputeDecibel(self) -> None:
         frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
         frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        if self.data_buf_all is None:
-            self.data_buf_all = self.waveform[0]  # self.data_buf is pointed to self.waveform[0]
-            self.data_buf = self.data_buf_all
+        if self.data_buf_all_size == 0:
+            self.data_buf_all_size = len(self.waveform[0])
+            self.data_buf_size = self.data_buf_all_size
         else:
-            self.data_buf_all = np.concatenate((self.data_buf_all, self.waveform[0]))
+            self.data_buf_all_size += len(self.waveform[0])
         for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
             self.decibel.append(
                 10 * math.log10(np.square((self.waveform[0][offset: offset + frame_sample_length])).sum() + \
@@ -294,17 +296,14 @@
         # scores = self.encoder(feats, in_cache)  # return B * T * D
         self.vad_opts.nn_eval_block_size = scores.shape[1]
         self.frm_cnt += scores.shape[1]  # count total frames
-        if self.scores is None:
-            self.scores = scores  # the first calculation
-        else:
-            self.scores = np.concatenate((self.scores, scores), axis=1)
+        self.scores=scores
 
     def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
         while self.data_buf_start_frame < frame_idx:
-            if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
+            if self.data_buf_size >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                 self.data_buf_start_frame += 1
-                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
-                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
+                self.data_buf_size = self.data_buf_all_size-self.data_buf_start_frame * int(
+                    self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
 
     def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
                            last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
@@ -315,8 +314,8 @@
                                       self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
             expected_sample_number += int(extra_sample)
         if end_point_is_sent_end:
-            expected_sample_number = max(expected_sample_number, len(self.data_buf))
-        if len(self.data_buf) < expected_sample_number:
+            expected_sample_number = max(expected_sample_number, self.data_buf_size)
+        if self.data_buf_size < expected_sample_number:
             print('error in calling pop data_buf\n')
 
         if len(self.output_data_buf) == 0 or first_frm_is_start_point:
@@ -334,10 +333,10 @@
             data_to_pop = expected_sample_number
         else:
             data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        if data_to_pop > len(self.data_buf):
-            print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
-            data_to_pop = len(self.data_buf)
-            expected_sample_number = len(self.data_buf)
+        if data_to_pop > self.data_buf_size:
+            print('VAD data_to_pop is bigger than self.data_buf_size!!!\n')
+            data_to_pop = self.data_buf_size
+            expected_sample_number = self.data_buf_size
 
         cur_seg.doa = 0
         for sample_cpy_out in range(0, data_to_pop):
@@ -420,7 +419,7 @@
         assert len(self.sil_pdf_ids) == self.vad_opts.silence_pdf_num
         if len(self.sil_pdf_ids) > 0:
             assert len(self.scores) == 1  # 只支持batch_size = 1的测试
-            sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
+            sil_pdf_scores = [self.scores[0][t - self.idx_pre_chunk][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
             sum_score = sum(sil_pdf_scores)
             noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
             total_score = 1.0
@@ -502,7 +501,7 @@
             frame_state = FrameState.kFrameStateInvalid
             frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
             self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
-
+        self.idx_pre_chunk += self.scores.shape[1]
         return 0
 
     def DetectLastFrames(self) -> int:
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index 06603f0..3fafd53 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -13,7 +13,7 @@
 
 
 MODULE_NAME = 'funasr_onnx'
-VERSION_NUM = '0.0.6'
+VERSION_NUM = '0.0.7'
 
 setuptools.setup(
     name=MODULE_NAME,
diff --git a/funasr/runtime/python/websocket/ASR_client.py b/funasr/runtime/python/websocket/ASR_client.py
deleted file mode 100644
index fe67981..0000000
--- a/funasr/runtime/python/websocket/ASR_client.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import pyaudio
-# import websocket #区别服务端这里是 websocket-client库
-import time
-import websockets
-import asyncio
-from queue import Queue
-# import threading
-import argparse
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--host",
-                    type=str,
-                    default="localhost",
-                    required=False,
-                    help="host ip, localhost, 0.0.0.0")
-parser.add_argument("--port",
-                    type=int,
-                    default=10095,
-                    required=False,
-                    help="grpc server port")
-parser.add_argument("--chunk_size",
-                    type=int,
-                    default=300,
-                    help="ms")
-
-args = parser.parse_args()
-
-voices = Queue()
-
-
-    
-# 其他函数可以通过调用send(data)来发送数据，例如：
-async def record():
-    #print("2")
-    global voices 
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 1
-    RATE = 16000
-    CHUNK = int(RATE / 1000 * args.chunk_size)
-
-    p = pyaudio.PyAudio()
-
-    stream = p.open(format=FORMAT,
-                    channels=CHANNELS,
-                    rate=RATE,
-                    input=True,
-                    frames_per_buffer=CHUNK)
-
-    while True:
-
-        data = stream.read(CHUNK)
-        
-        voices.put(data)
-        #print(voices.qsize())
-
-        await asyncio.sleep(0.01)
-    
-
-
-async def ws_send():
-    global voices
-    global websocket
-    print("started to sending data!")
-    while True:
-        while not voices.empty():
-            data = voices.get()
-            voices.task_done()
-            try:
-                await websocket.send(data) # 通过ws对象发送数据
-            except Exception as e:
-                print('Exception occurred:', e)
-            await asyncio.sleep(0.01)
-        await asyncio.sleep(0.01)
-
-
-
-async def message():
-    global websocket
-    while True:
-        try:
-            print(await websocket.recv())
-        except Exception as e:
-            print("Exception:", e)          
-        
-
-
-async def ws_client():
-    global websocket # 定义一个全局变量ws，用于保存websocket连接对象
-    # uri = "ws://11.167.134.197:8899"
-    uri = "ws://{}:{}".format(args.host, args.port)
-    #ws = await websockets.connect(uri, subprotocols=["binary"]) # 创建一个长连接
-    async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None):
-        task = asyncio.create_task(record()) # 创建一个后台任务录音
-        task2 = asyncio.create_task(ws_send()) # 创建一个后台任务发送
-        task3 = asyncio.create_task(message()) # 创建一个后台接收消息的任务
-        await asyncio.gather(task, task2, task3)
-
-
-asyncio.get_event_loop().run_until_complete(ws_client()) # 启动协程
-asyncio.get_event_loop().run_forever()
diff --git a/funasr/runtime/python/websocket/ASR_server.py b/funasr/runtime/python/websocket/ASR_server.py
deleted file mode 100644
index 827df7b..0000000
--- a/funasr/runtime/python/websocket/ASR_server.py
+++ /dev/null
@@ -1,185 +0,0 @@
-import asyncio
-import websockets
-import time
-from queue import Queue
-import threading
-import argparse
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-import logging
-import tracemalloc
-tracemalloc.start()
-
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-
-websocket_users = set()  #缁存姢瀹㈡埛绔垪琛�
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--host",
-                    type=str,
-                    default="0.0.0.0",
-                    required=False,
-                    help="host ip, localhost, 0.0.0.0")
-parser.add_argument("--port",
-                    type=int,
-                    default=10095,
-                    required=False,
-                    help="grpc server port")
-parser.add_argument("--asr_model",
-                    type=str,
-                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
-                    help="model from modelscope")
-parser.add_argument("--vad_model",
-                    type=str,
-                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-                    help="model from modelscope")
-
-parser.add_argument("--punc_model",
-                    type=str,
-                    default="",
-                    help="model from modelscope")
-parser.add_argument("--ngpu",
-                    type=int,
-                    default=1,
-                    help="0 for cpu, 1 for gpu")
-
-args = parser.parse_args()
-
-print("model loading")
- 
-
-# vad
-inference_pipeline_vad = pipeline(
-    task=Tasks.voice_activity_detection,
-    model=args.vad_model,
-    model_revision=None,
-    output_dir=None,
-    batch_size=1,
-    mode='online',
-    ngpu=args.ngpu,
-)
-# param_dict_vad = {'in_cache': dict(), "is_final": False}
-  
-# asr
-param_dict_asr = {}
-# param_dict["hotword"] = "灏忎簲 灏忎簲鏈�"  # 璁剧疆鐑瘝锛岀敤绌烘牸闅斿紑
-inference_pipeline_asr = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model=args.asr_model,
-    param_dict=param_dict_asr,
-    ngpu=args.ngpu,
-)
-if args.punc_model != "":
-    # param_dict_punc = {'cache': list()}
-    inference_pipeline_punc = pipeline(
-        task=Tasks.punctuation,
-        model=args.punc_model,
-        model_revision=None,
-        ngpu=args.ngpu,
-    )
-else:
-    inference_pipeline_punc = None
-
-print("model loaded")
-
-
-
-async def ws_serve(websocket, path):
-    #speek = Queue()
-    frames = []  # 瀛樺偍鎵�鏈夌殑甯ф暟鎹�
-    buffer = []  # 瀛樺偍缂撳瓨涓殑甯ф暟鎹紙鏈�澶氫袱涓墖娈碉級
-    RECORD_NUM = 0
-    global websocket_users
-    speech_start, speech_end = False, False
-    # 璋冪敤asr鍑芥暟
-    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
-    websocket.param_dict_punc = {'cache': list()}
-    websocket.speek = Queue()  #websocket 娣诲姞杩涢槦鍒楀璞� 璁゛sr璇诲彇璇煶鏁版嵁鍖�
-    websocket.send_msg = Queue()   #websocket 娣诲姞涓槦鍒楀璞�  璁﹚s鍙戦�佹秷鎭埌瀹㈡埛绔�
-    websocket_users.add(websocket)
-    ss = threading.Thread(target=asr, args=(websocket,))
-    ss.start()
-    
-    try:
-        async for message in websocket:
-            #voices.put(message)
-            #print("put")
-            #await websocket.send("123")
-            buffer.append(message)
-            if len(buffer) > 2:
-                buffer.pop(0)  # 濡傛灉缂撳瓨瓒呰繃涓や釜鐗囨锛屽垯鍒犻櫎鏈�鏃╃殑涓�涓�
-              
-            if speech_start:
-                frames.append(message)
-                RECORD_NUM += 1
-            speech_start_i, speech_end_i = vad(message, websocket)
-            #print(speech_start_i, speech_end_i)
-            if speech_start_i:
-                speech_start = speech_start_i
-                frames = []
-                frames.extend(buffer)  # 鎶婁箣鍓�2涓闊虫暟鎹揩鍔犲叆
-            if speech_end_i or RECORD_NUM > 300:
-                speech_start = False
-                audio_in = b"".join(frames)
-                websocket.speek.put(audio_in)
-                frames = []  # 娓呯┖鎵�鏈夌殑甯ф暟鎹�
-                buffer = []  # 娓呯┖缂撳瓨涓殑甯ф暟鎹紙鏈�澶氫袱涓墖娈碉級
-                RECORD_NUM = 0
-            if not websocket.send_msg.empty():
-                await websocket.send(websocket.send_msg.get())
-                websocket.send_msg.task_done()
-
-     
-    except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)    # 閾炬帴鏂紑
-        websocket_users.remove(websocket)
-    except websockets.InvalidState:
-        print("InvalidState...")    # 鏃犳晥鐘舵��
-    except Exception as e:
-        print("Exception:", e)
- 
-
-def asr(websocket):  # ASR鎺ㄧ悊
-        global inference_pipeline_asr, inference_pipeline_punc
-        # global param_dict_punc
-        global websocket_users
-        while websocket in  websocket_users:
-            if not websocket.speek.empty():
-                audio_in = websocket.speek.get()
-                websocket.speek.task_done()
-                if len(audio_in) > 0:
-                    rec_result = inference_pipeline_asr(audio_in=audio_in)
-                    if inference_pipeline_punc is not None and 'text' in rec_result:
-                        rec_result = inference_pipeline_punc(text_in=rec_result['text'], param_dict=websocket.param_dict_punc)
-                    # print(rec_result)
-                    if "text" in rec_result:
-                        websocket.send_msg.put(rec_result["text"]) # 瀛樺叆鍙戦�侀槦鍒�  鐩存帴璋冪敤send鍙戦�佷笉浜�
-               
-            time.sleep(0.1)
-
-def vad(data, websocket):  # VAD鎺ㄧ悊
-    global inference_pipeline_vad
-    #print(type(data))
-    # print(param_dict_vad)
-    segments_result = inference_pipeline_vad(audio_in=data, param_dict=websocket.param_dict_vad)
-    # print(segments_result)
-    # print(param_dict_vad)
-    speech_start = False
-    speech_end = False
-    
-    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
-        return speech_start, speech_end
-    if segments_result["text"][0][0] != -1:
-        speech_start = True
-    if segments_result["text"][0][1] != -1:
-        speech_end = True
-    return speech_start, speech_end
-
- 
-start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
-asyncio.get_event_loop().run_until_complete(start_server)
-asyncio.get_event_loop().run_forever()
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/README.md b/funasr/runtime/python/websocket/README.md
index 73f8aeb..473c37a 100644
--- a/funasr/runtime/python/websocket/README.md
+++ b/funasr/runtime/python/websocket/README.md
@@ -1,11 +1,10 @@
-# Using funasr with websocket
-We can send streaming audio data to server in real-time with grpc client every 300 ms e.g., and get transcribed text when stop speaking.
-The audio data is in streaming, the asr inference process is in offline.
+# Service with websocket-python
 
+This is a demo using funasr pipeline with websocket python-api. 
 
 ## For the Server
 
-Install the modelscope and funasr
+### Install the modelscope and funasr
 
 ```shell
 pip install -U modelscope funasr
@@ -14,18 +13,31 @@
 git clone https://github.com/alibaba/FunASR.git && cd FunASR
 ```
 
-Install the requirements for server
+### Install the requirements for server
 
 ```shell
 cd funasr/runtime/python/websocket
 pip install -r requirements_server.txt
 ```
 
-Start server
-
+### Start server
+#### ASR offline server
 ```shell
-python ASR_server.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+python ws_server_offline.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
 ```
+
+#### ASR streaming server
+```shell
+python ws_server_online.py --port 10095 --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
+```
+
+#### ASR offline/online 2pass server
+
+[//]: # (```shell)
+
+[//]: # (python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+
+[//]: # (```)
 
 ## For the client
 
@@ -36,11 +48,33 @@
 pip install -r requirements_client.txt
 ```
 
-Start client
-
+### Start client
+#### ASR offline client
+##### Recording from microphone
 ```shell
-python ASR_client.py --host "127.0.0.1" --port 10095 --chunk_size 300
+# --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100
+```
+##### Loading from wav.scp (kaldi style)
+```shell
+# --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100 --audio_in "./data/wav.scp" --send_without_sleep --output_dir "./results"
+```
+#### ASR streaming client
+##### Recording from microphone
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --words_max_print 100
+```
+##### Loading from wav.scp (kaldi style)
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --audio_in "./data/wav.scp" --words_max_print 100 --output_dir "./results"
 ```
 
+#### ASR offline/online 2pass client
+
 ## Acknowledge
-1. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service.
+1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We acknowledge [zhaoming](https://github.com/zhaomingwork/FunASR/tree/fix_bug_for_python_websocket) for contributing the websocket service.
+3. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service of offline model.
diff --git a/funasr/runtime/python/websocket/parse_args.py b/funasr/runtime/python/websocket/parse_args.py
new file mode 100644
index 0000000..d170be8
--- /dev/null
+++ b/funasr/runtime/python/websocket/parse_args.py
@@ -0,0 +1,40 @@
+# -*- encoding: utf-8 -*-
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument("--host",
+                    type=str,
+                    default="0.0.0.0",
+                    required=False,
+                    help="host ip, localhost, 0.0.0.0")
+parser.add_argument("--port",
+                    type=int,
+                    default=10095,
+                    required=False,
+                    help="websocket server port")
+parser.add_argument("--asr_model",
+                    type=str,
+                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+                    help="model from modelscope")
+parser.add_argument("--asr_model_online",
+                    type=str,
+                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online",
+                    help="model from modelscope")
+parser.add_argument("--vad_model",
+                    type=str,
+                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+                    help="model from modelscope")
+parser.add_argument("--punc_model",
+                    type=str,
+                    default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
+                    help="model from modelscope")
+parser.add_argument("--ngpu",
+                    type=int,
+                    default=1,
+                    help="0 for cpu, 1 for gpu")
+parser.add_argument("--ncpu",
+                    type=int,
+                    default=1,
+                    help="cpu cores")
+
+args = parser.parse_args()
+print(args)
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/ws_client.py b/funasr/runtime/python/websocket/ws_client.py
new file mode 100644
index 0000000..9de31a4
--- /dev/null
+++ b/funasr/runtime/python/websocket/ws_client.py
@@ -0,0 +1,226 @@
+# -*- encoding: utf-8 -*-
+import os
+import time
+import websockets
+import asyncio
+# import threading
+import argparse
+import json
+import traceback
+from multiprocessing import Process
+from funasr.fileio.datadir_writer import DatadirWriter
+
+parser = argparse.ArgumentParser()
+parser.add_argument("--host",
+                    type=str,
+                    default="localhost",
+                    required=False,
+                    help="host ip, localhost, 0.0.0.0")
+parser.add_argument("--port",
+                    type=int,
+                    default=10095,
+                    required=False,
+                    help="websocket server port")
+parser.add_argument("--chunk_size",
+                    type=str,
+                    default="5, 10, 5",
+                    help="chunk")
+parser.add_argument("--chunk_interval",
+                    type=int,
+                    default=10,
+                    help="chunk")
+parser.add_argument("--audio_in",
+                    type=str,
+                    default=None,
+                    help="audio_in")
+parser.add_argument("--send_without_sleep",
+                    action="store_true",
+                    default=False,
+                    help="if audio_in is set, send_without_sleep")
+parser.add_argument("--test_thread_num",
+                    type=int,
+                    default=1,
+                    help="test_thread_num")
+parser.add_argument("--words_max_print",
+                    type=int,
+                    default=100,
+                    help="chunk")
+parser.add_argument("--output_dir",
+                    type=str,
+                    default=None,
+                    help="output_dir")
+
+args = parser.parse_args()
+args.chunk_size = [int(x) for x in args.chunk_size.split(",")]
+print(args)
+# voices = asyncio.Queue()
+from queue import Queue
+voices = Queue()
+
+ibest_writer = None
+if args.output_dir is not None:
+    writer = DatadirWriter(args.output_dir)
+    ibest_writer = writer[f"1best_recog"]
+
+async def record_microphone():
+    is_finished = False
+    import pyaudio
+    #print("2")
+    global voices 
+    FORMAT = pyaudio.paInt16
+    CHANNELS = 1
+    RATE = 16000
+    chunk_size = 60*args.chunk_size[1]/args.chunk_interval
+    CHUNK = int(RATE / 1000 * chunk_size)
+
+    p = pyaudio.PyAudio()
+
+    stream = p.open(format=FORMAT,
+                    channels=CHANNELS,
+                    rate=RATE,
+                    input=True,
+                    frames_per_buffer=CHUNK)
+    is_speaking = True
+    while True:
+
+        data = stream.read(CHUNK)
+        data = data.decode('ISO-8859-1')
+        message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "audio": data, "is_speaking": is_speaking, "is_finished": is_finished})
+        
+        voices.put(message)
+
+        await asyncio.sleep(0.005)
+
+async def record_from_scp():
+    """Stream the audio named by ``args.audio_in`` into the ``voices`` queue.
+
+    ``args.audio_in`` is either a kaldi-style ``.scp`` list ("name path" per
+    line) or the path of a single wav file.  Each file is cut into chunks and
+    queued as JSON messages; a final ``{"is_finished": true}`` message follows.
+    """
+    import wave
+    global voices
+    is_finished = False
+    if args.audio_in.endswith(".scp"):
+        # Close the list file as soon as it has been read.
+        with open(args.audio_in) as f_scp:
+            wavs = f_scp.readlines()
+    else:
+        wavs = [args.audio_in]
+    for wav in wavs:
+        wav_splits = wav.strip().split()
+        if not wav_splits:
+            continue  # blank line in the scp file
+        wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
+        wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
+        with wave.open(wav_path, "rb") as wav_file:
+            frames = wav_file.readframes(wav_file.getnframes())
+
+        audio_bytes = bytes(frames)
+        # Bytes per chunk; the constants assume 16 kHz, 16-bit mono audio.
+        stride = int(60*args.chunk_size[1]/args.chunk_interval/1000*16000*2)
+        chunk_num = (len(audio_bytes)-1)//stride + 1
+        is_speaking = True
+        for i in range(chunk_num):
+            if i == chunk_num-1:
+                is_speaking = False
+            beg = i*stride
+            data = audio_bytes[beg:beg+stride]
+            data = data.decode('ISO-8859-1')
+            message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "is_speaking": is_speaking, "audio": data, "is_finished": is_finished, "wav_name": wav_name})
+            voices.put(message)
+            sleep_duration = 0.001 if args.send_without_sleep else 60*args.chunk_size[1]/args.chunk_interval/1000
+            await asyncio.sleep(sleep_duration)
+
+    is_finished = True
+    message = json.dumps({"is_finished": is_finished})
+    voices.put(message)
+
+async def ws_send():
+    global voices
+    global websocket
+    print("started to sending data!")
+    while True:
+        while not voices.empty():
+            data = voices.get()
+            voices.task_done()
+            try:
+                await websocket.send(data)
+            except Exception as e:
+                print('Exception occurred:', e)
+                traceback.print_exc()
+                exit(0)
+            await asyncio.sleep(0.005)
+        await asyncio.sleep(0.005)
+
+
+
+async def message(id):
+    global websocket
+    text_print = ""
+    while True:
+        try:
+            meg = await websocket.recv()
+            meg = json.loads(meg)
+            # print(meg, end = '')
+            # print("\r")
+            # print(meg)
+            wav_name = meg.get("wav_name", "demo")
+            print(wav_name)
+            text = meg["text"]
+            if ibest_writer is not None:
+                ibest_writer["text"][wav_name] = text
+            if meg["mode"] == "online":
+                text_print += " {}".format(text)
+            else:
+                text_print += "{}".format(text)
+            text_print = text_print[-args.words_max_print:]
+            os.system('clear')
+            print("\rpid"+str(id)+": "+text_print)
+        except Exception as e:
+            print("Exception:", e)
+            traceback.print_exc()
+            exit(0)
+
+async def print_messge():
+    global websocket
+    while True:
+        try:
+            meg = await websocket.recv()
+            meg = json.loads(meg)
+            print(meg)
+        except Exception as e:
+            print("Exception:", e)
+            traceback.print_exc()
+            exit(0)
+
+async def ws_client(id):
+    global websocket
+    uri = "ws://{}:{}".format(args.host, args.port)
+    async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None):
+        if args.audio_in is not None:
+            task = asyncio.create_task(record_from_scp())
+        else:
+            task = asyncio.create_task(record_microphone())
+        task2 = asyncio.create_task(ws_send())
+        task3 = asyncio.create_task(message(id))
+        await asyncio.gather(task, task2, task3)
+
+def one_thread(id):
+   asyncio.get_event_loop().run_until_complete(ws_client(id)) # 鍚姩鍗忕▼
+   asyncio.get_event_loop().run_forever()
+
+
+if __name__ == '__main__':
+    # One client process per requested test "thread".
+    process_list = []
+    for i in range(args.test_thread_num):
+        p = Process(target=one_thread, args=(i,))
+        p.start()
+        process_list.append(p)
+    # Wait for every child, not just the last one started.
+    for p in process_list:
+        p.join()
+    print('end')
+ 
+
diff --git a/funasr/runtime/python/websocket/ws_server_offline.py b/funasr/runtime/python/websocket/ws_server_offline.py
new file mode 100644
index 0000000..15578f6
--- /dev/null
+++ b/funasr/runtime/python/websocket/ws_server_offline.py
@@ -0,0 +1,150 @@
+import asyncio
+import json
+import websockets
+import time
+import logging
+import tracemalloc
+import numpy as np
+
+from parse_args import args
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
+
+tracemalloc.start()
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+
+websocket_users = set()
+
+print("model loading")
+# asr
+inference_pipeline_asr = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model=args.asr_model,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+    model_revision=None)
+
+
+# vad
+inference_pipeline_vad = pipeline(
+    task=Tasks.voice_activity_detection,
+    model=args.vad_model,
+    model_revision=None,
+    output_dir=None,
+    batch_size=1,
+    mode='online',
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+)
+
+if args.punc_model != "":
+    inference_pipeline_punc = pipeline(
+        task=Tasks.punctuation,
+        model=args.punc_model,
+        model_revision=None,
+        ngpu=args.ngpu,
+        ncpu=args.ncpu,
+    )
+else:
+    inference_pipeline_punc = None
+
+print("model loaded")
+
+async def ws_serve(websocket, path):
+    """Serve one client: run VAD on every audio chunk and ASR on every detected segment."""
+    frames = []
+    frames_asr = []
+    global websocket_users
+    websocket_users.add(websocket)
+    websocket.param_dict_asr = {}
+    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
+    websocket.param_dict_punc = {'cache': list()}
+    websocket.vad_pre_idx = 0
+    speech_start = False
+
+    try:
+        async for message in websocket:
+            message = json.loads(message)
+            is_finished = message["is_finished"]
+            if not is_finished:
+                audio = bytes(message['audio'], 'ISO-8859-1')
+                frames.append(audio)
+                duration_ms = len(audio)//32  # 32 bytes per ms; assumes 16 kHz, 16-bit mono
+                websocket.vad_pre_idx += duration_ms
+                is_speaking = message["is_speaking"]
+                websocket.param_dict_vad["is_final"] = not is_speaking
+                websocket.wav_name = message.get("wav_name", "demo")
+                if speech_start:
+                    frames_asr.append(audio)
+                speech_start_i, speech_end_i = await async_vad(websocket, audio)
+                if speech_start_i:
+                    speech_start = True
+                    # Chunks to rewind to the VAD start; a chunk under 1 ms must not divide by zero.
+                    beg_bias = (websocket.vad_pre_idx-speech_start_i)//max(duration_ms, 1)
+                    # frames[-0:] would be the whole history, so keep at least the newest chunk.
+                    frames_asr = frames[-max(beg_bias, 1):]
+                if speech_end_i or not is_speaking:
+                    audio_in = b"".join(frames_asr)
+                    await async_asr(websocket, audio_in)
+                    frames_asr = []
+                    speech_start = False
+                    if not is_speaking:
+                        websocket.vad_pre_idx = 0
+                        frames = []
+                        websocket.param_dict_vad = {'in_cache': dict()}
+                    else:
+                        frames = frames[-20:]
+    except websockets.ConnectionClosed:
+        print("ConnectionClosed...", websocket_users)
+    except websockets.InvalidState:
+        print("InvalidState...")
+    except Exception as e:
+        print("Exception:", e)
+    finally:
+        # A clean close ends the loop without raising, so always drop the client here.
+        websocket_users.discard(websocket)
+
+
+async def async_vad(websocket, audio_in):
+
+    segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
+
+    speech_start = False
+    speech_end = False
+    
+    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
+        return speech_start, speech_end
+    if segments_result["text"][0][0] != -1:
+        speech_start = segments_result["text"][0][0]
+    if segments_result["text"][0][1] != -1:
+        speech_end = True
+    return speech_start, speech_end
+
+
+async def async_asr(websocket, audio_in):
+            if len(audio_in) > 0:
+                # print(len(audio_in))
+                audio_in = load_bytes(audio_in)
+                
+                rec_result = inference_pipeline_asr(audio_in=audio_in,
+                                                    param_dict=websocket.param_dict_asr)
+                # print(rec_result)
+                if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"])>0:
+                    rec_result = inference_pipeline_punc(text_in=rec_result['text'],
+                                                         param_dict=websocket.param_dict_punc)
+                    # print(rec_result)
+                message = json.dumps({"mode": "offline", "text": rec_result["text"], "wav_name": websocket.wav_name})
+                await websocket.send(message)
+                
+                
+ 
+
+
+start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
+asyncio.get_event_loop().run_until_complete(start_server)
+asyncio.get_event_loop().run_forever()
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/ws_server_online.py b/funasr/runtime/python/websocket/ws_server_online.py
new file mode 100644
index 0000000..b1cd4ea
--- /dev/null
+++ b/funasr/runtime/python/websocket/ws_server_online.py
@@ -0,0 +1,93 @@
+import asyncio
+import json
+import websockets
+import time
+from queue import Queue
+import threading
+import logging
+import tracemalloc
+import numpy as np
+
+from parse_args import args
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
+
+tracemalloc.start()
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+
+websocket_users = set()
+
+
+print("model loading")
+
+inference_pipeline_asr_online = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model=args.asr_model_online,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+    model_revision='v1.0.4')
+
+print("model loaded")
+
+
+
+async def ws_serve(websocket, path):
+    """Serve one streaming client: decode every ``chunk_interval`` chunks with the online model."""
+    frames_online = []
+    global websocket_users
+    websocket.send_msg = Queue()
+    websocket_users.add(websocket)
+    websocket.param_dict_asr_online = {"cache": dict()}
+    websocket.speek_online = Queue()
+
+    try:
+        async for message in websocket:
+            message = json.loads(message)
+            is_finished = message["is_finished"]
+            if not is_finished:
+                audio = bytes(message['audio'], 'ISO-8859-1')
+
+                is_speaking = message["is_speaking"]
+                websocket.param_dict_asr_online["is_final"] = not is_speaking
+                websocket.wav_name = message.get("wav_name", "demo")
+                websocket.param_dict_asr_online["chunk_size"] = message["chunk_size"]
+
+                frames_online.append(audio)
+                if len(frames_online) % message["chunk_interval"] == 0 or not is_speaking:
+                    audio_in = b"".join(frames_online)
+                    await async_asr_online(websocket,audio_in)
+                    frames_online = []
+    except websockets.ConnectionClosed:
+        print("ConnectionClosed...", websocket_users)
+    except websockets.InvalidState:
+        print("InvalidState...")
+    except Exception as e:
+        print("Exception:", e)
+    finally:
+        # A clean close ends the loop without raising, so always drop the client here.
+        websocket_users.discard(websocket)
+ 
+async def async_asr_online(websocket,audio_in):
+            if len(audio_in) > 0:
+                audio_in = load_bytes(audio_in)
+                rec_result = inference_pipeline_asr_online(audio_in=audio_in,
+                                                           param_dict=websocket.param_dict_asr_online)
+                if websocket.param_dict_asr_online["is_final"]:
+                    websocket.param_dict_asr_online["cache"] = dict()
+                if "text" in rec_result:
+                    if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
+                        # if len(rec_result["text"])>0:
+                        #     rec_result["text"][0]=rec_result["text"][0] #.replace(" ","")
+                        message = json.dumps({"mode": "online", "text": rec_result["text"], "wav_name": websocket.wav_name})
+                        await websocket.send(message)
+
+
+
+start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
+asyncio.get_event_loop().run_until_complete(start_server)
+asyncio.get_event_loop().run_forever()
\ No newline at end of file
diff --git a/funasr/runtime/websocket/CMakeLists.txt b/funasr/runtime/websocket/CMakeLists.txt
new file mode 100644
index 0000000..07d96d9
--- /dev/null
+++ b/funasr/runtime/websocket/CMakeLists.txt
@@ -0,0 +1,64 @@
+cmake_minimum_required(VERSION 3.10)
+
+project(FunASRWebscoket) 
+
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
+
+option(ENABLE_WEBSOCKET "Whether to build websocket server" ON)
+ 
+if(ENABLE_WEBSOCKET)
+  cmake_policy(SET CMP0135 NEW)
+
+  include(FetchContent)
+  FetchContent_Declare(websocketpp
+  GIT_REPOSITORY https://github.com/zaphoyd/websocketpp.git
+    GIT_TAG 0.8.2
+    SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/websocket
+    )
+  
+  FetchContent_MakeAvailable(websocketpp)
+  include_directories(${PROJECT_SOURCE_DIR}/third_party/websocket)
+   
+
+  FetchContent_Declare(asio
+     URL   https://github.com/chriskohlhoff/asio/archive/refs/tags/asio-1-24-0.tar.gz
+   SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/asio
+  )
+  
+  FetchContent_MakeAvailable(asio)
+  include_directories(${PROJECT_SOURCE_DIR}/third_party/asio/asio/include)
+ 
+  FetchContent_Declare(json
+     URL   https://github.com/nlohmann/json/archive/refs/tags/v3.11.2.tar.gz
+   SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/json
+  )
+  
+  FetchContent_MakeAvailable(json)
+  include_directories(${PROJECT_SOURCE_DIR}/third_party/json/include)
+ 
+ 
+
+endif()
+
+# Link against the onnxruntime libraries
+link_directories(${ONNXRUNTIME_DIR}/lib)
+
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/include/)
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp/include/)
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank)
+
+add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/yaml-cpp yaml-cpp)
+add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/kaldi-native-fbank/kaldi-native-fbank/csrc csrc)
+add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/src src)
+
+include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog)
+set(BUILD_TESTING OFF)
+add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog glog)
+ 
+
+add_executable(websocketmain "websocketmain.cpp" "websocketsrv.cpp")
+add_executable(websocketclient "websocketclient.cpp")
+
+target_link_libraries(websocketclient PUBLIC funasr)
+target_link_libraries(websocketmain PUBLIC funasr)
diff --git a/funasr/runtime/websocket/readme.md b/funasr/runtime/websocket/readme.md
new file mode 100644
index 0000000..6ff3e50
--- /dev/null
+++ b/funasr/runtime/websocket/readme.md
@@ -0,0 +1,108 @@
+# Service with websocket-cpp
+
+## Export the model
+### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)
+
+```shell
+# pip3 install torch torchaudio
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+# pip install -U modelscope funasr -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+
+### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)
+
+```shell
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True
+```
+
+## Building for Linux/Unix
+
+### Download onnxruntime
+```shell
+# download an appropriate onnxruntime from https://github.com/microsoft/onnxruntime/releases/tag/v1.14.0
+# here we get a copy of onnxruntime for linux 64
+wget https://github.com/microsoft/onnxruntime/releases/download/v1.14.0/onnxruntime-linux-x64-1.14.0.tgz
+tar -zxvf onnxruntime-linux-x64-1.14.0.tgz
+```
+
+### Install openblas
+```shell
+sudo apt-get install libopenblas-dev #ubuntu
+# sudo yum -y install openblas-devel #centos
+```
+
+### Build runtime
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR/funasr/runtime/websocket
+mkdir build && cd build
+cmake  -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=/path/to/onnxruntime-linux-x64-1.14.0
+make
+```
+## Run the websocket server
+
+```shell
+cd bin
+websocketmain  [--model_thread_num <int>] [--decoder_thread_num
+                        <int>] [--io_thread_num <int>] [--port <int>]
+                        [--listen_ip <string>] [--wav-scp <string>]
+                        [--wav-path <string>] [--punc-config <string>]
+                        [--punc-model <string>] --am-config <string>
+                        --am-cmvn <string> --am-model <string>
+                        [--vad-config <string>] [--vad-cmvn <string>]
+                        [--vad-model <string>] [--] [--version] [-h]
+Where:
+   --wav-scp <string>
+     wave scp path
+   --wav-path <string>
+     wave file path
+
+   --punc-config <string>
+     punc config path
+   --punc-model <string>
+     punc model path
+
+   --am-config <string>
+     (required)  am config path
+   --am-cmvn <string>
+     (required)  am cmvn path
+   --am-model <string>
+     (required)  am model path
+
+   --vad-config <string>
+     vad config path
+   --vad-cmvn <string>
+     vad cmvn path
+   --vad-model <string>
+     vad model path
+   --decoder_thread_num <int>
+     number of threads for decoder
+   --io_thread_num <int>
+     number of threads for network io
+  
+   Required: --am-config <string> --am-cmvn <string> --am-model <string> 
+   If use vad, please add: [--vad-config <string>] [--vad-cmvn <string>] [--vad-model <string>]
+   If use punc, please add: [--punc-config <string>] [--punc-model <string>] 
+example:
+   websocketmain --am-config /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml --am-model /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.onnx --am-cmvn /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn
+```
+
+## Run websocket client test
+
+```shell
+Usage: websocketclient server_ip port wav_path threads_num
+
+example:
+
+websocketclient 127.0.0.1 8889 funasr/runtime/websocket/test.pcm.wav 64
+
+result json, example like:
+{"text":"一二三四五六七八九十一二三四五六七八九十"}
+```
+
+
+## Acknowledge
+1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We acknowledge [zhaoming](https://github.com/zhaomingwork/FunASR/tree/add-offline-websocket-srv/funasr/runtime/websocket) for contributing the websocket(cpp-api).
+
+
diff --git a/funasr/runtime/websocket/websocketclient.cpp b/funasr/runtime/websocket/websocketclient.cpp
new file mode 100644
index 0000000..9ef1d5e
--- /dev/null
+++ b/funasr/runtime/websocket/websocketclient.cpp
@@ -0,0 +1,221 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// client for websocket, support multiple threads
+// Usage: websocketclient server_ip port wav_path threads_num
+
+#define ASIO_STANDALONE 1
+#include <websocketpp/client.hpp>
+#include <websocketpp/common/thread.hpp>
+#include <websocketpp/config/asio_no_tls_client.hpp>
+
+#include "audio.h"
+
+/**
+ * Define a semi-cross platform helper method that waits/sleeps for a bit.
+ */
+void wait_a_bit() {
+#ifdef WIN32
+  Sleep(1000);
+#else
+  sleep(1);
+#endif
+}
+typedef websocketpp::config::asio_client::message_type::ptr message_ptr;
+
+class websocket_client {
+ public:
+  typedef websocketpp::client<websocketpp::config::asio_client> client;
+  typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
+
+  websocket_client() : m_open(false), m_done(false) {
+    // set up access channels to only log interesting things
+    m_client.clear_access_channels(websocketpp::log::alevel::all);
+    m_client.set_access_channels(websocketpp::log::alevel::connect);
+    m_client.set_access_channels(websocketpp::log::alevel::disconnect);
+    m_client.set_access_channels(websocketpp::log::alevel::app);
+
+    // Initialize the Asio transport policy
+    m_client.init_asio();
+
+    // Bind the handlers we are using
+    using websocketpp::lib::bind;
+    using websocketpp::lib::placeholders::_1;
+    m_client.set_open_handler(bind(&websocket_client::on_open, this, _1));
+    m_client.set_close_handler(bind(&websocket_client::on_close, this, _1));
+    m_client.set_close_handler(bind(&websocket_client::on_close, this, _1));
+
+    m_client.set_message_handler(
+        [this](websocketpp::connection_hdl hdl, message_ptr msg) {
+          on_message(hdl, msg);
+        });
+
+    m_client.set_fail_handler(bind(&websocket_client::on_fail, this, _1));
+    m_client.clear_access_channels(websocketpp::log::alevel::all);
+  }
+  void on_message(websocketpp::connection_hdl hdl, message_ptr msg) {
+    const std::string& payload = msg->get_payload();
+    switch (msg->get_opcode()) {
+      case websocketpp::frame::opcode::text:
+        std::cout << "on_message=" << payload << std::endl;
+    }
+  }
+  // This method will block until the connection is complete
+  void run(const std::string& uri, const std::string& wav_path) {
+    // Create a new connection to the given URI
+    websocketpp::lib::error_code ec;
+    client::connection_ptr con = m_client.get_connection(uri, ec);
+    if (ec) {
+      m_client.get_alog().write(websocketpp::log::alevel::app,
+                                "Get Connection Error: " + ec.message());
+      return;
+    }
+    this->wav_path = std::move(wav_path);
+    // Grab a handle for this connection so we can talk to it in a thread
+    // safe manner after the event loop starts.
+    m_hdl = con->get_handle();
+
+    // Queue the connection. No DNS queries or network connections will be
+    // made until the io_service event loop is run.
+    m_client.connect(con);
+
+    // Create a thread to run the ASIO io_service event loop
+    websocketpp::lib::thread asio_thread(&client::run, &m_client);
+
+    send_wav_data();
+    asio_thread.join();
+  }
+
+  // The open handler will signal that we are ready to start sending data
+  void on_open(websocketpp::connection_hdl) {
+    m_client.get_alog().write(websocketpp::log::alevel::app,
+                              "Connection opened, starting data!");
+
+    scoped_lock guard(m_lock);
+    m_open = true;
+  }
+
+  // The close handler will signal that we should stop sending data
+  void on_close(websocketpp::connection_hdl) {
+    m_client.get_alog().write(websocketpp::log::alevel::app,
+                              "Connection closed, stopping data!");
+
+    scoped_lock guard(m_lock);
+    m_done = true;
+  }
+
+  // The fail handler will signal that we should stop sending data
+  void on_fail(websocketpp::connection_hdl) {
+    m_client.get_alog().write(websocketpp::log::alevel::app,
+                              "Connection failed, stopping data!");
+
+    scoped_lock guard(m_lock);
+    m_done = true;
+  }
+  // Load the wav file, wait for the connection to open, then stream the audio
+  // to the server as 16-bit PCM binary frames followed by a "Done" text frame.
+  void send_wav_data() {
+    Audio audio(1);
+    int32_t sampling_rate = 16000;
+
+    if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate)) {
+      std::cout << "error in load wav" << std::endl;
+      // Nothing will ever be sent, so stop the io loop; otherwise run() would
+      // block forever in asio_thread.join() on an idle connection.
+      m_client.stop();
+      return;
+    }
+
+    // Wait until the connection is open. m_open/m_done are written by the
+    // handlers on the io thread, so they are only read under m_lock.
+    while (true) {
+      bool opened = false;
+      {
+        scoped_lock guard(m_lock);
+        // Failed or closed before it opened: there is nobody to send to.
+        if (m_done) {
+          return;
+        }
+        opened = m_open;
+      }
+      if (opened) {
+        break;
+      }
+      // If the connection hasn't been opened yet wait a bit and retry
+      std::cout << "wait.." << opened << std::endl;
+      wait_a_bit();
+    }
+
+    float* buff = nullptr;
+    int len = 0;
+    int flag = 0;
+    websocketpp::lib::error_code ec;
+    // Heap buffer instead of a variable-length array: VLAs are not standard
+    // C++ and a long segment could overflow the stack.
+    std::vector<short> pcm;
+    // fetch wav data use asr engine api
+    while (audio.Fetch(buff, len, flag) > 0) {
+      // convert float -1,1 to short -32768,32767
+      pcm.resize(len);
+      for (int i = 0; i < len; ++i) {
+        pcm[i] = (short)(buff[i] * 32767);
+      }
+      // send data to server
+      m_client.send(m_hdl, pcm.data(), pcm.size() * sizeof(short),
+                    websocketpp::frame::opcode::binary, ec);
+      std::cout << "sended data len=" << len * sizeof(short) << std::endl;
+      // The most likely error that we will get is that the connection is
+      // not in the right state. Usually this means we tried to send a
+      // message to a connection that was closed or in the process of
+      // closing. While many errors here can be easily recovered from,
+      // in this simple example, we'll stop the data loop.
+      if (ec) {
+        m_client.get_alog().write(websocketpp::log::alevel::app,
+                                  "Send Error: " + ec.message());
+        break;
+      }
+
+      wait_a_bit();
+    }
+
+    // Tell the server that all audio has been sent so it can start decoding.
+    m_client.send(m_hdl, "Done", websocketpp::frame::opcode::text, ec);
+    wait_a_bit();
+  }
+
+ private:
+  client m_client;
+  websocketpp::connection_hdl m_hdl;
+  websocketpp::lib::mutex m_lock;
+  std::string wav_path;
+  bool m_open;
+  bool m_done;
+};
+
+int main(int argc, char* argv[]) {
+  if (argc < 5) {
+    printf("Usage: %s server_ip port wav_path threads_num\n", argv[0]);
+    exit(-1);
+  }
+  std::string server_ip = argv[1];
+  std::string port = argv[2];
+  std::string wav_path = argv[3];
+  int threads_num = atoi(argv[4]);
+  std::vector<websocketpp::lib::thread> client_threads;
+
+  std::string uri = "ws://" + server_ip + ":" + port;
+
+  for (size_t i = 0; i < threads_num; i++) {
+    client_threads.emplace_back([uri, wav_path]() {
+      websocket_client c;
+      c.run(uri, wav_path);
+    });
+  }
+
+  for (auto& t : client_threads) {
+    t.join();
+  }
+}
\ No newline at end of file
diff --git a/funasr/runtime/websocket/websocketmain.cpp b/funasr/runtime/websocket/websocketmain.cpp
new file mode 100644
index 0000000..24e4269
--- /dev/null
+++ b/funasr/runtime/websocket/websocketmain.cpp
@@ -0,0 +1,157 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// io server
+// Usage:websocketmain  [--model_thread_num <int>] [--decoder_thread_num
+//                        <int>] [--io_thread_num <int>] [--port <int>]
+//                        [--listen_ip <string>] [--wav-scp <string>]
+//                        [--wav-path <string>] [--punc-config <string>]
+//                        [--punc-model <string>] --am-config <string>
+//                        --am-cmvn <string> --am-model <string>
+//                        [--vad-config <string>] [--vad-cmvn <string>]
+//                        [--vad-model <string>] [--] [--version] [-h]
+#include "websocketsrv.h"
+
+using namespace std;
+void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key,
+              std::map<std::string, std::string>& model_path) {
+  if (value_arg.isSet()) {
+    model_path.insert({key, value_arg.getValue()});
+    LOG(INFO) << key << " : " << value_arg.getValue();
+  }
+}
+int main(int argc, char* argv[]) {
+  try {
+    google::InitGoogleLogging(argv[0]);
+    FLAGS_logtostderr = true;
+
+    TCLAP::CmdLine cmd("websocketmain", ' ', "1.0");
+    TCLAP::ValueArg<std::string> vad_model("", VAD_MODEL_PATH, "vad model path",
+                                           false, "", "string");
+    TCLAP::ValueArg<std::string> vad_cmvn("", VAD_CMVN_PATH, "vad cmvn path",
+                                          false, "", "string");
+    TCLAP::ValueArg<std::string> vad_config(
+        "", VAD_CONFIG_PATH, "vad config path", false, "", "string");
+
+    TCLAP::ValueArg<std::string> am_model("", AM_MODEL_PATH, "am model path",
+                                          true, "", "string");
+    TCLAP::ValueArg<std::string> am_cmvn("", AM_CMVN_PATH, "am cmvn path", true,
+                                         "", "string");
+    TCLAP::ValueArg<std::string> am_config("", AM_CONFIG_PATH, "am config path",
+                                           true, "", "string");
+
+    TCLAP::ValueArg<std::string> punc_model(
+        "", PUNC_MODEL_PATH, "punc model path", false, "", "string");
+    TCLAP::ValueArg<std::string> punc_config(
+        "", PUNC_CONFIG_PATH, "punc config path", false, "", "string");
+
+    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false,
+                                          "", "string");
+    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false,
+                                         "", "string");
+
+    TCLAP::ValueArg<std::string> listen_ip("", "listen_ip", "listen_ip", false,
+                                           "0.0.0.0", "string");
+    TCLAP::ValueArg<int> port("", "port", "port", false, 8889, "int");
+    TCLAP::ValueArg<int> io_thread_num("", "io_thread_num", "io_thread_num",
+                                       false, 8, "int");
+    TCLAP::ValueArg<int> decoder_thread_num(
+        "", "decoder_thread_num", "decoder_thread_num", false, 8, "int");
+    TCLAP::ValueArg<int> model_thread_num("", "model_thread_num",
+                                          "model_thread_num", false, 1, "int");
+
+    cmd.add(vad_model);
+    cmd.add(vad_cmvn);
+    cmd.add(vad_config);
+    cmd.add(am_model);
+    cmd.add(am_cmvn);
+    cmd.add(am_config);
+    cmd.add(punc_model);
+    cmd.add(punc_config);
+    cmd.add(wav_path);
+    cmd.add(wav_scp);
+    cmd.add(listen_ip);
+    cmd.add(port);
+    cmd.add(io_thread_num);
+    cmd.add(decoder_thread_num);
+    cmd.add(model_thread_num);
+    cmd.parse(argc, argv);
+
+    std::map<std::string, std::string> model_path;
+    GetValue(vad_model, VAD_MODEL_PATH, model_path);
+    GetValue(vad_cmvn, VAD_CMVN_PATH, model_path);
+    GetValue(vad_config, VAD_CONFIG_PATH, model_path);
+    GetValue(am_model, AM_MODEL_PATH, model_path);
+    GetValue(am_cmvn, AM_CMVN_PATH, model_path);
+    GetValue(am_config, AM_CONFIG_PATH, model_path);
+    GetValue(punc_model, PUNC_MODEL_PATH, model_path);
+    GetValue(punc_config, PUNC_CONFIG_PATH, model_path);
+    GetValue(wav_path, WAV_PATH, model_path);
+    GetValue(wav_scp, WAV_SCP, model_path);
+
+
+    std::string s_listen_ip = listen_ip.getValue();
+    int s_port = port.getValue();
+    int s_io_thread_num = io_thread_num.getValue();
+    int s_decoder_thread_num = decoder_thread_num.getValue();
+
+    int s_model_thread_num = model_thread_num.getValue();
+
+ 
+    asio::io_context io_decoder;  // context for decoding
+
+    std::vector<std::thread> decoder_threads;
+
+    auto conn_guard = asio::make_work_guard(
+        io_decoder);  // make sure threads can wait in the queue
+
+    // create threads pool
+    for (int32_t i = 0; i < s_decoder_thread_num; ++i) {
+      decoder_threads.emplace_back([&io_decoder]() { io_decoder.run(); });
+    }
+
+    server server_;       // server for websocket
+    server_.init_asio();  // init asio
+    server_.set_reuse_addr(
+        true);  // reuse address as we create multiple threads
+
+    // listen on port for accept
+    server_.listen(asio::ip::address::from_string(s_listen_ip), s_port);
+
+    WebSocketServer websocket_srv(io_decoder,
+                                  &server_);  // websocket server for asr engine
+    websocket_srv.initAsr(model_path, s_model_thread_num);  // init asr model
+    std::cout << "asr model init finished. listen on port:" << s_port
+              << std::endl;
+
+    // Start the ASIO network io_service run loop
+    if (s_io_thread_num == 1) {
+      server_.run();
+    } else {
+      typedef websocketpp::lib::shared_ptr<websocketpp::lib::thread> thread_ptr;
+      std::vector<thread_ptr> ts;
+      // create threads for io network
+      for (size_t i = 0; i < s_io_thread_num; i++) {
+        ts.push_back(websocketpp::lib::make_shared<websocketpp::lib::thread>(
+            &server::run, &server_));
+      }
+      // wait for threads
+      for (size_t i = 0; i < s_io_thread_num; i++) {
+        ts[i]->join();
+      }
+    }
+
+    // wait for threads
+    for (auto& t : decoder_threads) {
+      t.join();
+    }
+
+  } catch (std::exception const& e) {
+    std::cerr << "Error: " << e.what() << std::endl;
+  }
+
+  return 0;
+}
\ No newline at end of file
diff --git a/funasr/runtime/websocket/websocketsrv.cpp b/funasr/runtime/websocket/websocketsrv.cpp
new file mode 100644
index 0000000..7e54210
--- /dev/null
+++ b/funasr/runtime/websocket/websocketsrv.cpp
@@ -0,0 +1,158 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// websocket server for asr engine
+// take some ideas from https://github.com/k2-fsa/sherpa-onnx
+// online-websocket-server-impl.cc, thanks. The websocket server has two threads
+// pools, one for handle network data and one for asr decoder.
+// now only support offline engine.
+
+#include "websocketsrv.h"
+
+#include <thread>
+#include <utility>
+#include <vector>
+
+// feed buffer to asr engine for decoder and send the json result to the client
+void WebSocketServer::do_decoder(const std::vector<char>& buffer,
+                                 websocketpp::connection_hdl& hdl) {
+  try {
+    if (!buffer.empty()) {
+      // feed data to asr engine
+      FUNASR_RESULT Result = FunASRRecogPCMBuffer(
+          asr_hanlde, buffer.data(), buffer.size(), 16000, RASR_NONE, NULL);
+      // Guard the cast: a null result must not be dereferenced (empty text is
+      // sent instead). NOTE(review): Result is never released here - confirm
+      // whether libfunasrapi.h offers a call to free it.
+      std::string asr_result;
+      if (Result != NULL) {
+        asr_result = ((FUNASR_RECOG_RESULT*)Result)->msg;  // get decode result
+      }
+
+      websocketpp::lib::error_code ec;
+      nlohmann::json jsonresult;        // result json
+      jsonresult["text"] = asr_result;  // put result in 'text'
+
+      // send the json to client
+      server_->send(hdl, jsonresult.dump(), websocketpp::frame::opcode::text,
+                    ec);
+
+      std::cout << "buffer.size=" << buffer.size()
+                << ",result json=" << jsonresult.dump() << std::endl;
+      if (!isonline) {
+        //  close the client if it is not online asr
+        server_->close(hdl, websocketpp::close::status::normal, "DONE", ec);
+      }
+    }
+
+  } catch (std::exception const& e) {
+    std::cerr << "Error: " << e.what() << std::endl;
+  }
+}
+
+void WebSocketServer::on_open(websocketpp::connection_hdl hdl) {
+  scoped_lock guard(m_lock);     // for threads safty
+  check_and_clean_connection();  // remove closed connection
+  sample_map.emplace(
+      hdl, std::make_shared<std::vector<char>>());  // put a new data vector for
+                                                    // new connection
+  std::cout << "on_open, active connections: " << sample_map.size()
+            << std::endl;
+}
+
+void WebSocketServer::on_close(websocketpp::connection_hdl hdl) {
+  scoped_lock guard(m_lock);
+  sample_map.erase(hdl);  // remove data vector when  connection is closed
+  std::cout << "on_close, active connections: " << sample_map.size()
+            << std::endl;
+}
+
+// remove closed connection
+void WebSocketServer::check_and_clean_connection() {
+  // Drop every entry whose connection is gone or no longer open. on_open
+  // calls this with m_lock held. The error_code overload of get_con_from_hdl
+  // is used because the throwing overload raises on an expired handle.
+  for (auto iter = sample_map.begin(); iter != sample_map.end();) {
+    websocketpp::lib::error_code ec;
+    server::connection_ptr con = server_->get_con_from_hdl(iter->first, ec);
+    if (ec || !con ||
+        con->get_state() != websocketpp::session::state::open) {
+      iter = sample_map.erase(iter);
+      std::cout << "remove one connection " << std::endl;
+    } else {
+      ++iter;
+    }
+  }
+}
+void WebSocketServer::on_message(websocketpp::connection_hdl hdl,
+                                 message_ptr msg) {
+  unique_lock lock(m_lock);
+  // find the sample data vector according to one connection
+  std::shared_ptr<std::vector<char>> sample_data_p = nullptr;
+
+  auto it = sample_map.find(hdl);
+  if (it != sample_map.end()) {
+    sample_data_p = it->second;
+  }
+  lock.unlock();
+  if (sample_data_p == nullptr) {
+    std::cout << "error when fetch sample data vector" << std::endl;
+    return;
+  }
+
+  const std::string& payload = msg->get_payload();  // get msg type
+
+  switch (msg->get_opcode()) {
+    case websocketpp::frame::opcode::text:
+      if (payload == "Done") {
+        std::cout << "client done" << std::endl;
+
+        if (isonline) {
+          // do_close(ws);
+        } else {
+          // for offline, send all receive data to decoder engine
+          asio::post(io_decoder_, std::bind(&WebSocketServer::do_decoder, this,
+                                            std::move(*(sample_data_p.get())),
+                                            std::move(hdl)));
+        }
+      }
+      break;
+    case websocketpp::frame::opcode::binary: {
+      // received binary data
+      const auto* pcm_data = static_cast<const char*>(payload.data());
+      int32_t num_samples = payload.size();
+
+      if (isonline) {
+        // if online TODO(zhaoming) still not done
+        std::vector<char> s(pcm_data, pcm_data + num_samples);
+        asio::post(io_decoder_, std::bind(&WebSocketServer::do_decoder, this,
+                                          std::move(s), std::move(hdl)));
+      } else {
+        // for offline, we add receive data to end of the sample data vector
+        sample_data_p->insert(sample_data_p->end(), pcm_data,
+                              pcm_data + num_samples);
+      }
+
+      break;
+    }
+    default:
+      break;
+  }
+}
+
+// init asr model
+void WebSocketServer::initAsr(std::map<std::string, std::string>& model_path,
+                              int thread_num) {
+  try {
+    // init model with api
+
+    asr_hanlde = FunASRInit(model_path, thread_num);
+    std::cout << "model ready" << std::endl;
+
+  } catch (const std::exception& e) {
+    std::cout << e.what() << std::endl;
+  }
+}
diff --git a/funasr/runtime/websocket/websocketsrv.h b/funasr/runtime/websocket/websocketsrv.h
new file mode 100644
index 0000000..2d0c7bd
--- /dev/null
+++ b/funasr/runtime/websocket/websocketsrv.h
@@ -0,0 +1,93 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// websocket server for asr engine
+// take some ideas from https://github.com/k2-fsa/sherpa-onnx
+// online-websocket-server-impl.cc, thanks. The websocket server has two threads
+// pools, one for handle network data and one for asr decoder.
+// now only support offline engine.
+
+#ifndef WEBSOCKETSRV_SERVER_H_
+#define WEBSOCKETSRV_SERVER_H_
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+#define ASIO_STANDALONE 1  // not boost
+#include <glog/logging.h>
+
+#include <fstream>
+#include <functional>
+#include <websocketpp/common/thread.hpp>
+#include <websocketpp/config/asio_no_tls.hpp>
+#include <websocketpp/server.hpp>
+
+#include "asio.hpp"
+#include "com-define.h"
+#include "libfunasrapi.h"
+#include "nlohmann/json.hpp"
+#include "tclap/CmdLine.h"
+typedef websocketpp::server<websocketpp::config::asio> server;
+typedef server::message_ptr message_ptr;
+using websocketpp::lib::bind;
+using websocketpp::lib::placeholders::_1;
+using websocketpp::lib::placeholders::_2;
+typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
+typedef websocketpp::lib::unique_lock<websocketpp::lib::mutex> unique_lock;
+
+typedef struct {
+  std::string msg;
+  float snippet_time;
+} FUNASR_RECOG_RESULT;
+
+class WebSocketServer {
+ public:
+  WebSocketServer(asio::io_context& io_decoder, server* server_)
+      : io_decoder_(io_decoder), server_(server_) {
+    // set message handle
+    server_->set_message_handler(
+        [this](websocketpp::connection_hdl hdl, message_ptr msg) {
+          on_message(hdl, msg);
+        });
+    // set open handle
+    server_->set_open_handler(
+        [this](websocketpp::connection_hdl hdl) { on_open(hdl); });
+    // set close handle
+    server_->set_close_handler(
+        [this](websocketpp::connection_hdl hdl) { on_close(hdl); });
+    // begin accept
+    server_->start_accept();
+    // not print log
+    server_->clear_access_channels(websocketpp::log::alevel::all);
+  }
+  void do_decoder(const std::vector<char>& buffer,
+                  websocketpp::connection_hdl& hdl);
+
+  void initAsr(std::map<std::string, std::string>& model_path, int thread_num);
+  void on_message(websocketpp::connection_hdl hdl, message_ptr msg);
+  void on_open(websocketpp::connection_hdl hdl);
+  void on_close(websocketpp::connection_hdl hdl);
+
+ private:
+  void check_and_clean_connection();
+  asio::io_context& io_decoder_;  // threads for asr decoder
+  // std::ofstream fout;
+  FUNASR_HANDLE asr_hanlde;  // asr engine handle
+  bool isonline = false;  // online or offline engine, now only support offline
+  server* server_;        // websocket server
+
+  // use map to keep the received samples data from one connection in offline
+  // engine. For the online engine, a data struct is needed (TODO)
+  std::map<websocketpp::connection_hdl, std::shared_ptr<std::vector<char>>,
+           std::owner_less<websocketpp::connection_hdl>>
+      sample_map;
+  websocketpp::lib::mutex m_lock;  // mutex for sample_map
+};
+
+#endif  // WEBSOCKETSRV_SERVER_H_
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index 3d2004c..31057f9 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -549,6 +549,12 @@
             help="The number of gradient accumulation",
         )
         group.add_argument(
+            "--bias_grad_times",
+            type=float,
+            default=1.0,
+            help="To scale the gradient of contextual related params",
+        )
+        group.add_argument(
             "--no_forward_run",
             type=str2bool,
             default=False,
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index d52c9c3..4d10092 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -42,6 +42,7 @@
 from funasr.models.joint_net.joint_network import JointNetwork
 from funasr.models.e2e_asr import ESPnetASRModel
 from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerOnline, ParaformerBert, BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
 from funasr.models.e2e_tp import TimestampPredictor
 from funasr.models.e2e_asr_mfcca import MFCCA
 from funasr.models.e2e_uni_asr import UniASR
@@ -128,6 +129,7 @@
         paraformer_bert=ParaformerBert,
         bicif_paraformer=BiCifParaformer,
         contextual_paraformer=ContextualParaformer,
+        neatcontextual_paraformer=NeatContextualParaformer,
         mfcca=MFCCA,
         timestamp_prediction=TimestampPredictor,
     ),
@@ -1647,7 +1649,6 @@
             normalize = None
 
         # 4. Encoder
-
         if getattr(args, "encoder", None) is not None:
             encoder_class = encoder_choices.get_class(args.encoder)
             encoder = encoder_class(input_size, **args.encoder_conf)
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index e9b18cd..b54f777 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -120,6 +120,6 @@
     if ignore_init_mismatch:
         src_state = filter_state_dict(dst_state, src_state)
 
-    logging.info("Loaded src_state keys: {}".format(src_state.keys()))
+    # logging.info("Loaded src_state keys: {}".format(src_state.keys()))
     dst_state.update(src_state)
     obj.load_state_dict(dst_state)
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 7c187e9..a40f031 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -95,6 +95,7 @@
     use_pai: bool
     oss_bucket: Union[oss2.Bucket, None]
     batch_interval: int
+    bias_grad_times: float
 
 class Trainer:
     """Trainer having a optimizer.
@@ -546,8 +547,11 @@
         no_forward_run = options.no_forward_run
         ngpu = options.ngpu
         use_wandb = options.use_wandb
+        bias_grad_times = options.bias_grad_times
         distributed = distributed_option.distributed
 
+        if bias_grad_times != 1.0:
+            logging.warning("Using bias_grad_times: {} for gradient scaling".format(bias_grad_times))
         if log_interval is None:
             try:
                 log_interval = max(len(iterator) // 20, 10)
@@ -690,6 +694,16 @@
                         scale_factor=0.55,
                     )
 
+                # for contextual training
+                if bias_grad_times != 1.0:
+                    # contextual related parameter names
+                    cr_pnames = ["bias_encoder", "bias_embed", "decoder.bias_decoder", "decoder.bias_output"]
+                    for name, param in model.named_parameters():
+                        for cr_pname in cr_pnames:
+                            if cr_pname in name:
+                                param.grad *= bias_grad_times
+                                continue
+
                 # compute the gradient norm to check if it is normal or not
                 grad_norm = torch.nn.utils.clip_grad_norm_(
                     model.parameters(),
diff --git a/funasr/version.txt b/funasr/version.txt
index 17b2ccd..f905682 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.4.3
+0.4.7

--
Gitblit v1.9.1