From 1d1ef01b4e23630a99a3be7e9d1dce9550a793e9 Mon Sep 17 00:00:00 2001
From: yhliang <68215459+yhliang-aslp@users.noreply.github.com>
Date: Thu, 11 May 2023 16:26:24 +0800
Subject: [PATCH] Merge branch 'main' into dev_smohan

---
 funasr/runtime/grpc/Readme.md                                                                                                      |   49 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py                                 |    4 
 docs/model_zoo/modelscope_models.md                                                                                                |  126 +
 funasr/runtime/onnxruntime/src/tokenizer.h                                                                                         |    4 
 funasr/bin/vad_inference.py                                                                                                        |   13 
 funasr/runtime/websocket/readme.md                                                                                                 |   59 
 tests/test_asr_inference_pipeline.py                                                                                               |   16 
 docs/m2met2_cn/_build/html/_sources/简介.md.txt                                                                                      |   29 
 funasr/runtime/onnxruntime/src/resample.cpp                                                                                        |    2 
 funasr/runtime/onnxruntime/src/audio.cpp                                                                                           |   42 
 funasr/models/encoder/conformer_encoder.py                                                                                         |    4 
 docs/m2met2/_build/doctrees/environment.pickle                                                                                     |    0 
 funasr/runtime/python/websocket/README.md                                                                                          |  107 
 funasr/runtime/grpc/paraformer-server.cc                                                                                           |   50 
 docs/m2met2_cn/_build/html/genindex.html                                                                                           |    8 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh                                                                       |    2 
 docs/m2met2/Contact.md                                                                                                             |    4 
 docs/academic_recipe/vad_recipe.md                                                                                                 |  129 -
 docs/m2met2/_build/html/Baseline.html                                                                                              |   35 
 funasr/utils/timestamp_tools.py                                                                                                    |   49 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md                         |    2 
 funasr/bin/build_trainer.py                                                                                                        |    3 
 docs/m2met2/_build/html/Dataset.html                                                                                               |    8 
 funasr/runtime/python/websocket/ws_server_2pass.py                                                                                 |  182 +
 funasr/models/decoder/rnnt_decoder.py                                                                                              |   12 
 funasr/runtime/onnxruntime/src/fsmn-vad.h                                                                                          |   11 
 docs/m2met2_cn/_build/html/组委会.html                                                                                                |    8 
 docs/reference/papers.md                                                                                                           |    0 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh                          |  103 +
 funasr/runtime/websocket/websocketsrv.cpp                                                                                          |    6 
 docs/academic_recipe/lm_recipe.md                                                                                                  |  128 -
 funasr/bin/asr_inference_rnnt.py                                                                                                   |   19 
 docs/m2met2_cn/_build/html/_sources/联系方式.md.txt                                                                                    |    6 
 docs/m2met2/_build/doctrees/Organizers.doctree                                                                                     |    0 
 docs/m2met2_cn/简介.md                                                                                                               |   29 
 docs/modelscope_pipeline/quick_start.md                                                                                            |    2 
 docs/m2met2_cn/_build/doctrees/基线.doctree                                                                                          |    0 
 funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp                                                                         |   66 
 docs/m2met2_cn/_build/doctrees/赛道设置与评估.doctree                                                                                     |    0 
 funasr/runtime/onnxruntime/src/offline-stream.cpp                                                                                  |   64 
 docs/m2met2_cn/_build/html/searchindex.js                                                                                          |    2 
 funasr/models/e2e_asr_contextual_paraformer.py                                                                                     |  372 +++
 funasr/runtime/onnxruntime/src/paraformer.cpp                                                                                      |   80 
 funasr/runtime/python/websocket/parse_args.py                                                                                      |    7 
 funasr/runtime/onnxruntime/src/alignedmem.cpp                                                                                      |    3 
 funasr/torch_utils/load_pretrained_model.py                                                                                        |    2 
 docs/m2met2/_build/html/Track_setting_and_evaluation.html                                                                          |    8 
 funasr/runtime/onnxruntime/include/offline-stream.h                                                                                |   30 
 docs/m2met2_cn/_build/html/search.html                                                                                             |    8 
 egs_modelscope/speaker_verification/TEMPLATE/README.md                                                                             |    4 
 egs/alimeeting/sa-asr/asr_local.sh                                                                                                 |  146 -
 funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp                                                                         |  152 +
 funasr/train/trainer.py                                                                                                            |   14 
 docs/m2met2_cn/联系方式.md                                                                                                             |    6 
 funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py                                                                          |    6 
 egs_modelscope/vad/TEMPLATE/README.md                                                                                              |   12 
 funasr/runtime/onnxruntime/src/tensor.h                                                                                            |    4 
 funasr/bin/punctuation_infer_vadrealtime.py                                                                                        |    4 
 funasr/runtime/onnxruntime/src/predefine-coe.h                                                                                     |    3 
 funasr/runtime/onnxruntime/include/com-define.h                                                                                    |   42 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py                                                                         |    2 
 docs/m2met2/_build/html/_sources/Baseline.md.txt                                                                                   |   27 
 funasr/datasets/large_datasets/dataset.py                                                                                          |   37 
 egs/alimeeting/sa-asr/local/download_pretrained_model_from_modelscope.py                                                           |    7 
 docs/m2met2_cn/_build/html/基线.html                                                                                                 |   35 
 docs/installation/installation.md                                                                                                  |    0 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh                       |  105 +
 funasr/runtime/websocket/websocketmain.cpp                                                                                         |   98 
 docs/m2met2/_build/html/_images/qrcode.png                                                                                         |    0 
 funasr/modules/nets_utils.py                                                                                                       |   35 
 .gitignore                                                                                                                         |    4 
 docs/m2met2_cn/_build/doctrees/简介.doctree                                                                                          |    0 
 funasr/runtime/onnxruntime/src/tokenizer.cpp                                                                                       |    7 
 egs/alimeeting/sa-asr/README.md                                                                                                    |    6 
 funasr/datasets/large_datasets/utils/padding.py                                                                                    |   58 
 docs/m2met2/conf.py                                                                                                                |    2 
 funasr/runtime/onnxruntime/src/commonfunc.h                                                                                        |    9 
 docs/academic_recipe/punc_recipe.md                                                                                                |  129 -
 docs/m2met2/Baseline.md                                                                                                            |   27 
 docs/m2met2_cn/images/qrcode.png                                                                                                   |    0 
 docs/m2met2/_build/doctrees/Baseline.doctree                                                                                       |    0 
 docs/reference/application.md                                                                                                      |    0 
 funasr/runtime/python/websocket/ws_server_offline.py                                                                               |  150 +
 docs/README.md                                                                                                                     |   19 
 funasr/runtime/onnxruntime/src/util.cpp                                                                                            |   12 
 funasr/runtime/onnxruntime/include/funasrruntime.h                                                                                 |   98 +
 docs/installation/docker.md                                                                                                        |    0 
 docs/reference/FQA.md                                                                                                              |    0 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py |   40 
 funasr/runtime/onnxruntime/src/punc-model.cpp                                                                                      |   22 
 docs/m2met2/_build/html/_images/baseline_result.png                                                                                |    0 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py                                                                  |    2 
 docs/modelscope_pipeline/itn_pipeline.md                                                                                           |   63 
 docs/m2met2_cn/_build/html/objects.inv                                                                                             |    0 
 funasr/runtime/websocket/CMakeLists.txt                                                                                            |    2 
 funasr/bin/vad_inference_online.py                                                                                                 |    5 
 egs_modelscope/asr/TEMPLATE/README.md                                                                                              |   10 
 docs/m2met2_cn/赛道设置与评估.md                                                                                                          |    2 
 docs/m2met2/_build/doctrees/Introduction.doctree                                                                                   |    0 
 funasr/runtime/onnxruntime/src/ct-transformer.cpp                                                                                  |   11 
 funasr/runtime/onnxruntime/readme.md                                                                                               |  134 +
 funasr/runtime/onnxruntime/src/model.cpp                                                                                           |   17 
 docs/m2met2_cn/_build/doctrees/联系方式.doctree                                                                                        |    0 
 funasr/runtime/onnxruntime/src/online-feature.h                                                                                    |    7 
 egs_modelscope/vad/TEMPLATE/infer.py                                                                                               |    4 
 docs/m2met2_cn/index.rst                                                                                                           |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md                                                                       |    2 
 funasr/runtime/grpc/paraformer-server.h                                                                                            |    2 
 docs/m2met2/Organizers.md                                                                                                          |    2 
 funasr/bin/asr_inference_paraformer.py                                                                                             |    3 
 docs/m2met2_cn/_build/doctrees/environment.pickle                                                                                  |    0 
 funasr/runtime/onnxruntime/src/precomp.h                                                                                           |   11 
 docs/m2met2_cn/_build/html/联系方式.html                                                                                               |   17 
 funasr/runtime/onnxruntime/include/vad-model.h                                                                                     |   29 
 funasr/tasks/abs_task.py                                                                                                           |    6 
 docs/m2met2/_build/html/objects.inv                                                                                                |    2 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md                      |    2 
 funasr/runtime/onnxruntime/src/e2e-vad.h                                                                                           |   22 
 funasr/runtime/python/grpc/proto/paraformer.proto                                                                                  |   14 
 docs/m2met2_cn/_build/html/index.html                                                                                              |   12 
 docs/academic_recipe/sd_recipe.md                                                                                                  |  129 -
 funasr/runtime/onnxruntime/src/vad-model.cpp                                                                                       |   28 
 docs/m2met2/_build/html/Rules.html                                                                                                 |    8 
 docs/index.rst                                                                                                                     |   17 
 funasr/runtime/onnxruntime/include/model.h                                                                                         |    8 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py                            |    2 
 funasr/runtime/websocket/websocketsrv.h                                                                                            |    2 
 egs/alimeeting/sa-asr/run.sh                                                                                                       |    8 
 funasr/runtime/onnxruntime/src/ct-transformer.h                                                                                    |    6 
 egs_modelscope/speaker_diarization/TEMPLATE/README.md                                                                              |    4 
 docs/reference/build_task.md                                                                                                       |    0 
 funasr/runtime/onnxruntime/include/audio.h                                                                                         |    9 
 funasr/bin/asr_inference_paraformer_streaming.py                                                                                   |   24 
 docs/m2met2_cn/_build/html/赛道设置与评估.html                                                                                            |   10 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py                                                                        |    2 
 funasr/models/e2e_asr_transducer.py                                                                                                |    8 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md                             |    2 
 docs/m2met2_cn/_build/html/规则.html                                                                                                 |    8 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py                    |   37 
 docs/m2met2_cn/conf.py                                                                                                             |    2 
 docs/m2met2/_build/html/genindex.html                                                                                              |    8 
 funasr/runtime/websocket/websocketclient.cpp                                                                                       |    2 
 funasr/runtime/onnxruntime/src/paraformer.h                                                                                        |   24 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh                                                                        |    2 
 docs/m2met2_cn/_build/html/_images/qrcode.png                                                                                      |    0 
 funasr/runtime/onnxruntime/include/punc-model.h                                                                                    |   20 
 docs/m2met2/images/baseline_result.png                                                                                             |    0 
 docs/m2met2/_build/html/Organizers.html                                                                                            |   10 
 docs/m2met2_cn/_build/html/_sources/index.rst.txt                                                                                  |    4 
 docs/m2met2/Introduction.md                                                                                                        |   20 
 funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp                                                                             |   87 
 funasr/runtime/onnxruntime/src/CMakeLists.txt                                                                                      |    7 
 docs/m2met2_cn/基线.md                                                                                                               |   27 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py                              |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md                             |    2 
 docs/m2met2/_build/html/_sources/Organizers.md.txt                                                                                 |    2 
 funasr/runtime/onnxruntime/src/fsmn-vad.cpp                                                                                        |   15 
 funasr/runtime/onnxruntime/src/resample.h                                                                                          |    5 
 funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp                                                                        |   98 +
 docs/m2met2_cn/_build/doctrees/index.doctree                                                                                       |    0 
 docs/m2met2/_build/html/Contact.html                                                                                               |   15 
 docs/m2met2_cn/_build/html/_images/baseline_result.png                                                                             |    0 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py                                                                       |    2 
 docs/m2met2/_build/html/_sources/Introduction.md.txt                                                                               |   20 
 docs/m2met2/_build/html/index.html                                                                                                 |    8 
 docs/m2met2/_build/doctrees/Contact.doctree                                                                                        |    0 
 funasr/runtime/onnxruntime/src/util.h                                                                                              |    5 
 README.md                                                                                                                          |   16 
 docs/m2met2/images/qrcode.png                                                                                                      |    0 
 docs/m2met2_cn/_build/html/简介.html                                                                                                 |   36 
 funasr/runtime/onnxruntime/src/alignedmem.h                                                                                        |    2 
 docs/m2met2_cn/_build/html/_sources/基线.md.txt                                                                                      |   27 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py                              |    2 
 funasr/datasets/large_datasets/utils/hotword_utils.py                                                                              |   32 
 funasr/datasets/large_datasets/utils/tokenize.py                                                                                   |    8 
 funasr/runtime/onnxruntime/src/vocab.h                                                                                             |    2 
 docs/m2met2/_build/html/_sources/Contact.md.txt                                                                                    |    4 
 docs/m2met2_cn/_build/html/_sources/赛道设置与评估.md.txt                                                                                 |    2 
 docs/academic_recipe/sv_recipe.md                                                                                                  |  129 -
 funasr/version.txt                                                                                                                 |    2 
 docs/m2met2_cn/images/baseline_result.png                                                                                          |    0 
 egs_modelscope/vad/TEMPLATE/infer.sh                                                                                               |    2 
 funasr/runtime/onnxruntime/src/funasrruntime.cpp                                                                                   |  349 +++
 docs/m2met2/_build/html/search.html                                                                                                |    8 
 funasr/runtime/python/websocket/ws_server_online.py                                                                                |   51 
 docs/m2met2_cn/_build/html/.buildinfo                                                                                              |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh                              |  103 +
 egs_modelscope/tp/TEMPLATE/README.md                                                                                               |    4 
 funasr/runtime/onnxruntime/src/online-feature.cpp                                                                                  |    4 
 docs/m2met2/_build/html/searchindex.js                                                                                             |    2 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py                                                                        |    2 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md                                                                      |    2 
 funasr/runtime/python/onnxruntime/setup.py                                                                                         |    2 
 docs/m2met2/_build/html/.buildinfo                                                                                                 |    2 
 docs/m2met2/_build/html/Introduction.html                                                                                          |   28 
 docs/m2met2_cn/_build/html/数据集.html                                                                                                |    8 
 funasr/tasks/asr.py                                                                                                                |    5 
 egs_modelscope/punctuation/TEMPLATE/README.md                                                                                      |    6 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh                              |  103 +
 docs/model_zoo/huggingface_models.md                                                                                               |    0 
 /dev/null                                                                                                                          |  210 --
 funasr/runtime/python/websocket/ws_client.py                                                                                       |  127 +
 funasr/modules/repeat.py                                                                                                           |    4 
 funasr/runtime/onnxruntime/src/vocab.cpp                                                                                           |    3 
 204 files changed, 3,725 insertions(+), 1,863 deletions(-)

diff --git a/.gitignore b/.gitignore
index 33b8c39..58bee36 100644
--- a/.gitignore
+++ b/.gitignore
@@ -16,4 +16,6 @@
 .egg*
 dist
 build
-funasr.egg-info
\ No newline at end of file
+funasr.egg-info
+docs/_build
+modelscope
\ No newline at end of file
diff --git a/README.md b/README.md
index 414eb9b..e9c6ef9 100644
--- a/README.md
+++ b/README.md
@@ -13,22 +13,22 @@
 | [**Highlights**](#highlights)
 | [**Installation**](#installation)
 | [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
-| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
+| [**Tutorial_CN**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
 | [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
 | [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
-| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/modelscope_models.md)
+| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
 | [**Contact**](#contact)
 | [**M2MET2.0 Challenge**](https://github.com/alibaba-damo-academy/FunASR#multi-channel-multi-party-meeting-transcription-20-m2met20-challenge)
 
 ## What's new: 
-### Multi-Channel Multi-Party Meeting Transcription 2.0 (M2MET2.0) Challenge
-We are pleased to announce that the M2MeT2.0 challenge will be held in the near future. The baseline system is conducted on FunASR and is provided as a receipe of AliMeeting corpus. For more details you can see the guidence of M2MET2.0 ([CN](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)/[EN](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)).
+### Multi-Channel Multi-Party Meeting Transcription 2.0 (M2MeT2.0) Challenge
+We are pleased to announce that the M2MeT2.0 challenge has been accepted as an ASRU 2023 challenge special session. Registration is now open. The baseline system is built on FunASR and is provided as a recipe for the AliMeeting corpus. For more details, see the M2MeT2.0 guidelines ([CN](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)/[EN](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)).
 ### Release notes
 For the release notes, please ref to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
 
 ## Highlights
 - FunASR supports speech recognition(ASR), Multi-talker ASR, Voice Activity Detection(VAD), Punctuation Restoration, Language Models, Speaker Verification and Speaker diarization.   
-- We have released large number of academic and industrial pretrained models on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), ref to [Model Zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html)
+- We have released a large number of academic and industrial pretrained models on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition); refer to the [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
 - The pretrained model [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) obtains the best performance on many tasks in [SpeechIO leaderboard](https://github.com/SpeechColab/Leaderboard)
 - FunASR supplies a easy-to-use pipeline to finetune pretrained models from [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)
 - Compared to [Espnet](https://github.com/espnet/espnet) framework, the training speed of large-scale datasets in FunASR is much faster owning to the optimized dataloader.
@@ -60,12 +60,8 @@
 # pip install -U modelscope -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html -i https://mirror.sjtu.edu.cn/pypi/web/simple
 ```
 
-For more details, please ref to [installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)
+For more details, please refer to [installation](https://alibaba-damo-academy.github.io/FunASR/en/installation/installation.html)
 
-[//]: # ()
-[//]: # (## Usage)
-
-[//]: # (For users who are new to FunASR and ModelScope, please refer to FunASR Docs&#40;[CN]&#40;https://alibaba-damo-academy.github.io/FunASR/cn/index.html&#41; / [EN]&#40;https://alibaba-damo-academy.github.io/FunASR/en/index.html&#41;&#41;)
 
 ## Contact
 
diff --git a/docs/README.md b/docs/README.md
new file mode 100644
index 0000000..4e16b04
--- /dev/null
+++ b/docs/README.md
@@ -0,0 +1,27 @@
+# FunASR document generation
+
+## Generate HTML
+For convenience, we explain how to generate the HTML documentation locally.
+
+First, install the following packages, which are required for building the HTML:
+```sh
+conda activate funasr
+pip install requests sphinx nbsphinx sphinx_markdown_tables sphinx_rtd_theme recommonmark
+```
+
+Then you can generate the HTML:
+
+```sh
+cd docs
+make html
+```
+
+The generated files are placed in the `FunASR/docs/_build` directory. To browse the FunASR documentation, simply open `_build/html/index.html` in your browser.
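+
+If you want to browse the generated docs over HTTP (for example, from a remote machine), a minimal sketch using Python's built-in static file server (this assumes a Python 3 interpreter is available; the port 8000 is arbitrary):
+
+```sh
+cd _build/html
+# serve the built HTML at http://localhost:8000 (Ctrl-C to stop)
+python -m http.server 8000
+```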
\ No newline at end of file
diff --git a/docs/academic_recipe/lm_recipe.md b/docs/academic_recipe/lm_recipe.md
index f82a6fe..730e27c 100644
--- a/docs/academic_recipe/lm_recipe.md
+++ b/docs/academic_recipe/lm_recipe.md
@@ -1,129 +1,3 @@
 # Speech Recognition
-Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
 
-## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
-- `CUDA_VISIBLE_DEVICES`: visible gpu list
-- `gpu_num`: the number of GPUs used for training
-- `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
-- `data_aishell`: the raw path of AISHELL-1 dataset
-- `feats_dir`: the path for saving processed data
-- `nj`: the number of jobs for data preparation
-- `speed_perturb`: the range of speech perturbed
-- `exp_dir`: the path for saving experimental results
-- `tag`: the suffix of experimental result directory
-
-## Stage 0: Data preparation
-This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
-BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
-BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
-...
-```
-These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
-
-## Stage 1: Feature Generation
-This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
-
-## Stage 2: Dictionary Preparation
-This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-一
-丁
-...
-龚
-龟
-<unk>
-```
-* `<blank>`: indicates the blank token for CTC
-* `<s>`: indicates the start-of-sentence token
-* `</s>`: indicates the end-of-sentence token
-* `<unk>`: indicates the out-of-vocabulary token
-
-## Stage 3: Training
-This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
-
-* DDP Training
-
-We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
-
-* DataLoader
-
-We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it. 
-
-* Configuration
-
-The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
-
-* Training Steps
-
-We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
-
-* Tensorboard
-
-Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## Stage 4: Decoding
-This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model. 
-
-* Mode Selection
-
-As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
-
-* Configuration
-
-We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0<ctc_weight<1.0` indicates hybrid CTC-attention decoding.
-
-* CPU/GPU Decoding
-
-We support CPU and GPU decoding in FunASR. For CPU decoding, you should set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, you should set `gpu_inference=True`. You should also set `gpuid_list` to indicate which GPUs are used for decoding and `njobs` to indicate the number of decoding jobs on each GPU.
-
-* Performance
-
-We adopt `CER` to verify the performance. The results are in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`, namely `text.cer` and `text.cer.txt`. `text.cer` saves the comparison between the recognized text and the reference text while `text.cer.txt` saves the final `CER` result. The following is an example of `text.cer`:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref:    构 建 良 好 的 旅 游 市 场 环 境
-res:    构 建 良 好 的 旅 游 市 场 环 境
-...
-```
-
+Undo
diff --git a/docs/academic_recipe/punc_recipe.md b/docs/academic_recipe/punc_recipe.md
index 0306cd3..e9f79bb 100644
--- a/docs/academic_recipe/punc_recipe.md
+++ b/docs/academic_recipe/punc_recipe.md
@@ -1,129 +1,2 @@
 # Punctuation Restoration
-Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
-
-## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
-- `CUDA_VISIBLE_DEVICES`: visible gpu list
-- `gpu_num`: the number of GPUs used for training
-- `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
-- `data_aishell`: the raw path of AISHELL-1 dataset
-- `feats_dir`: the path for saving processed data
-- `nj`: the number of jobs for data preparation
-- `speed_perturb`: the range of speech perturbed
-- `exp_dir`: the path for saving experimental results
-- `tag`: the suffix of experimental result directory
-
-## Stage 0: Data preparation
-This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
-BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
-BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
-...
-```
-These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
-
-## Stage 1: Feature Generation
-This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
-
-## Stage 2: Dictionary Preparation
-This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-一
-丁
-...
-龚
-龟
-<unk>
-```
-* `<blank>`: indicates the blank token for CTC
-* `<s>`: indicates the start-of-sentence token
-* `</s>`: indicates the end-of-sentence token
-* `<unk>`: indicates the out-of-vocabulary token
-
-## Stage 3: Training
-This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
-
-* DDP Training
-
-We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
-
-* DataLoader
-
-We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it. 
-
-* Configuration
-
-The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
-
-* Training Steps
-
-We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
-
-* Tensorboard
-
-Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## Stage 4: Decoding
-This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model. 
-
-* Mode Selection
-
-As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
-
-* Configuration
-
-We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0<ctc_weight<1.0` indicates hybrid CTC-attention decoding.
-
-* CPU/GPU Decoding
-
-We support CPU and GPU decoding in FunASR. For CPU decoding, you should set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, you should set `gpu_inference=True`. You should also set `gpuid_list` to indicate which GPUs are used for decoding and `njobs` to indicate the number of decoding jobs on each GPU.
-
-* Performance
-
-We adopt `CER` to verify the performance. The results are in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`, namely `text.cer` and `text.cer.txt`. `text.cer` saves the comparison between the recognized text and the reference text while `text.cer.txt` saves the final `CER` result. The following is an example of `text.cer`:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref:    构 建 良 好 的 旅 游 市 场 环 境
-res:    构 建 良 好 的 旅 游 市 场 环 境
-...
-```
-
+Undo
\ No newline at end of file
diff --git a/docs/academic_recipe/sd_recipe.md b/docs/academic_recipe/sd_recipe.md
index 90eb4b3..8b38d7b 100644
--- a/docs/academic_recipe/sd_recipe.md
+++ b/docs/academic_recipe/sd_recipe.md
@@ -1,129 +1,2 @@
 # Speaker Diarization
-Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
-
-## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
-- `CUDA_VISIBLE_DEVICES`: visible gpu list
-- `gpu_num`: the number of GPUs used for training
-- `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
-- `data_aishell`: the raw path of AISHELL-1 dataset
-- `feats_dir`: the path for saving processed data
-- `nj`: the number of jobs for data preparation
-- `speed_perturb`: the range of speech perturbed
-- `exp_dir`: the path for saving experimental results
-- `tag`: the suffix of experimental result directory
-
-## Stage 0: Data preparation
-This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
-BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
-BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
-...
-```
-These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
-
-## Stage 1: Feature Generation
-This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
-
-## Stage 2: Dictionary Preparation
-This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-一
-丁
-...
-龚
-龟
-<unk>
-```
-* `<blank>`: indicates the blank token for CTC
-* `<s>`: indicates the start-of-sentence token
-* `</s>`: indicates the end-of-sentence token
-* `<unk>`: indicates the out-of-vocabulary token
-
-## Stage 3: Training
-This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
-
-* DDP Training
-
-We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
-
-* DataLoader
-
-We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it. 
-
-* Configuration
-
-The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
-
-* Training Steps
-
-We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
-
-* Tensorboard
-
-Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## Stage 4: Decoding
-This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model. 
-
-* Mode Selection
-
-As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
-
-* Configuration
-
-We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0<ctc_weight<1.0` indicates hybrid CTC-attention decoding.
-
-* CPU/GPU Decoding
-
-We support CPU and GPU decoding in FunASR. For CPU decoding, you should set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, you should set `gpu_inference=True`. You should also set `gpuid_list` to indicate which GPUs are used for decoding and `njobs` to indicate the number of decoding jobs on each GPU.
-
-* Performance
-
-We adopt `CER` to measure the performance. The results, namely `text.cer` and `text.cer.txt`, are saved in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`. `text.cer` records the alignment between the recognized text and the reference text, while `text.cer.txt` records the final `CER`. The following is an example of `text.cer`:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref:    构 建 良 好 的 旅 游 市 场 环 境
-res:    构 建 良 好 的 旅 游 市 场 环 境
-...
-```
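-
-Each per-utterance line in `text.cer` reports the standard edit-distance counts, from which the `CER` follows directly. A minimal sketch, assuming the `ins/del/sub/nwords` fields shown above:
-```python
-def cer(ins: int, dele: int, sub: int, nwords: int) -> float:
-    """CER = (insertions + deletions + substitutions) / reference length."""
-    return 100.0 * (ins + dele + sub) / nwords
-
-print(f"cer={cer(0, 0, 0, 11):.2f}%")  # the BAC009S0764W0213 line above: 0.00%
-```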
-
+Undo
diff --git a/docs/academic_recipe/sv_recipe.md b/docs/academic_recipe/sv_recipe.md
index 0eebe3d..7fe493b 100644
--- a/docs/academic_recipe/sv_recipe.md
+++ b/docs/academic_recipe/sv_recipe.md
@@ -1,129 +1,2 @@
 # Speaker Verification
-Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. Following this example, users can similarly employ other datasets (such as AISHELL-2) to train other models (such as conformer or transformer).
-
-## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on the AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding on either CPU or GPU. Before introducing each stage in detail, we first explain several parameters that should be set by users.
-- `CUDA_VISIBLE_DEVICES`: visible gpu list
-- `gpu_num`: the number of GPUs used for training
-- `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
-- `data_aishell`: the raw path of AISHELL-1 dataset
-- `feats_dir`: the path for saving processed data
-- `nj`: the number of jobs for data preparation
-- `speed_perturb`: the range of speed perturbation applied to the speech
-- `exp_dir`: the path for saving experimental results
-- `tag`: the suffix of experimental result directory
-
-## Stage 0: Data preparation
-This stage processes the raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`, where `xxx` is one of `train/dev/test`. We assume users have already downloaded the AISHELL-1 dataset; if not, the data can be downloaded [here](https://www.openslr.org/33/) before setting `$data_aishell`. Examples of `wav.scp` and `text` are as follows:
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
-BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
-BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
-...
-```
-Both files have two columns: the first is the wav id and the second is the corresponding wav path (in `wav.scp`) or label tokens (in `text`). A minimal parsing sketch follows.
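-
-A minimal sketch of reading such two-column files (plain Python, not a FunASR API); the first whitespace-separated field is the utterance id and the rest is the value:
-```python
-def read_scp(path):
-    """Parse a Kaldi-style two-column file into an id -> value dict."""
-    table = {}
-    with open(path, encoding="utf-8") as f:
-        for line in f:
-            if not line.strip():
-                continue
-            utt_id, value = line.strip().split(maxsplit=1)
-            table[utt_id] = value
-    return table
-
-# wavs = read_scp("wav.scp")   # values are wav paths
-# texts = read_scp("text")     # values are space-separated label tokens
-```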
-
-## Stage 1: Feature Generation
-This stage extracts FBank features from `wav.scp` and applies speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` is as follows:
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-Note that the samples in this file have already been shuffled randomly. The file contains two columns: the first is the wav id and the second is the Kaldi ark feature path. In addition, `speech_shape` and `text_shape` are generated in this stage, recording the speech feature shape and the text length of each sample. Examples are shown below:
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-Both files have two columns: the first is the wav id and the second is the corresponding speech feature shape or text length.
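-
-A minimal sketch of how one shape entry decodes (an illustration; a batch sampler would use such lengths to group utterances of similar size):
-```python
-speech_shape_entry = "BAC009S0002W0122_sp0.9 665,80"
-utt_id, shape = speech_shape_entry.split()
-frames, feat_dim = map(int, shape.split(","))  # 665 frames of 80-dim FBank
-print(utt_id, frames, feat_dim)
-```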
-
-## Stage 2: Dictionary Preparation
-This stage builds the dictionary, which maps label characters to integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-一
-丁
-...
-龙
-龚
-<unk>
-```
-* `<blank>`: indicates the blank token for CTC
-* `<s>`: indicates the start-of-sentence token
-* `</s>`: indicates the end-of-sentence token
-* `<unk>`: indicates the out-of-vocabulary token
-
-## Stage 3: Training
-This stage trains the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on the validation set are averaged to produce a stronger model, which is then used for decoding.
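-
-Checkpoint averaging is an element-wise mean over the saved parameters. A minimal PyTorch sketch (assuming each checkpoint is a plain `state_dict`; FunASR's own averaging script may differ in detail):
-```python
-import torch
-
-def average_checkpoints(paths):
-    """Element-wise mean of the parameters of several checkpoints."""
-    avg = None
-    for p in paths:
-        state = torch.load(p, map_location="cpu")
-        if avg is None:
-            avg = {k: v.clone().float() for k, v in state.items()}
-        else:
-            for k in avg:
-                avg[k] += state[k].float()
-    return {k: v / len(paths) for k, v in avg.items()}
-```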
-
-* DDP Training
-
-We support DistributedDataParallel (DDP) training; details can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, set `gpu_num` to a value greater than 1. For example, with `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, the GPUs with IDs 0, 1 and 5 will be used for training.
-
-* DataLoader
-
-We support an optional iterable-style DataLoader based on [PyTorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large datasets; users can set `dataset_type=large` to enable it.
-
-* Configuration
-
-Training parameters, including the model, optimization and dataset settings, can be set via a YAML file in the `conf` directory. Users can also set parameters directly in the `run.sh` recipe. Avoid setting the same parameter in both the YAML file and the recipe.
-
-* Training Steps
-
-We support two parameters to limit the training length, namely `max_epoch` and `max_update`. `max_epoch` sets the total number of training epochs, while `max_update` sets the total number of training steps. If both are specified, training stops as soon as either limit is reached.
-
-* Tensorboard
-
-Users can monitor the loss, learning rate, etc. with TensorBoard by running the following command:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## Stage 4: Decoding
-This stage generates the recognition results and calculates the `CER` to measure the performance of the trained model.
-
-* Mode Selection
-
-Since FunASR supports paraformer, uniasr, conformer and other models, the `mode` parameter should be set to `asr`, `paraformer` or `uniasr` according to the trained model.
-
-* Configuration
-
-We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, selected by `ctc_weight` in a YAML file in the `conf` directory: `ctc_weight=1.0` gives pure CTC decoding, `ctc_weight=0.0` gives pure attention decoding, and `0.0<ctc_weight<1.0` gives hybrid CTC-attention decoding.
-
-* CPU/GPU Decoding
-
-We support CPU and GPU decoding in FunASR. For CPU decoding, set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, set `gpu_inference=True`, set `gpuid_list` to indicate which GPUs are used for decoding, and set `njob` to indicate the number of decoding jobs on each GPU.
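-
-A minimal sketch of how these settings translate into a list of decoding jobs (illustration only):
-```python
-gpu_inference = True
-gpuid_list = [0, 1]  # GPUs used for decoding
-njob = 4             # jobs per GPU here; the total CPU jobs when gpu_inference=False
-
-if gpu_inference:
-    jobs = [(gpu, j) for gpu in gpuid_list for j in range(njob)]
-else:
-    jobs = [(None, j) for j in range(njob)]
-print(len(jobs), "decoding jobs")
-```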
-
-* Performance
-
-We adopt `CER` to measure the performance. The results, namely `text.cer` and `text.cer.txt`, are saved in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`. `text.cer` records the alignment between the recognized text and the reference text, while `text.cer.txt` records the final `CER`. The following is an example of `text.cer`:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref:    构 建 良 好 的 旅 游 市 场 环 境
-res:    构 建 良 好 的 旅 游 市 场 环 境
-...
-```
-
+Undo
diff --git a/docs/academic_recipe/vad_recipe.md b/docs/academic_recipe/vad_recipe.md
index 6aa7532..0216bc3 100644
--- a/docs/academic_recipe/vad_recipe.md
+++ b/docs/academic_recipe/vad_recipe.md
@@ -1,129 +1,2 @@
 # Voice Activity Detection
-Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. Following this example, users can similarly employ other datasets (such as AISHELL-2) to train other models (such as conformer or transformer).
-
-## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on the AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding on either CPU or GPU. Before introducing each stage in detail, we first explain several parameters that should be set by users.
-- `CUDA_VISIBLE_DEVICES`: visible gpu list
-- `gpu_num`: the number of GPUs used for training
-- `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
-- `data_aishell`: the raw path of AISHELL-1 dataset
-- `feats_dir`: the path for saving processed data
-- `nj`: the number of jobs for data preparation
-- `speed_perturb`: the range of speed perturbation applied to the speech
-- `exp_dir`: the path for saving experimental results
-- `tag`: the suffix of experimental result directory
-
-## Stage 0: Data preparation
-This stage processes the raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`, where `xxx` is one of `train/dev/test`. We assume users have already downloaded the AISHELL-1 dataset; if not, the data can be downloaded [here](https://www.openslr.org/33/) before setting `$data_aishell`. Examples of `wav.scp` and `text` are as follows:
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
-BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
-BAC009S0002W0124 自 六 月 底 呼 和 浩 特 市 率 先 宣 布 取 消 限 购 后
-...
-```
-Both files have two columns: the first is the wav id and the second is the corresponding wav path (in `wav.scp`) or label tokens (in `text`).
-
-## Stage 1: Feature Generation
-This stage extracts FBank features from `wav.scp` and applies speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` is as follows:
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-Note that the samples in this file have already been shuffled randomly. The file contains two columns: the first is the wav id and the second is the Kaldi ark feature path. In addition, `speech_shape` and `text_shape` are generated in this stage, recording the speech feature shape and the text length of each sample. Examples are shown below:
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-Both files have two columns: the first is the wav id and the second is the corresponding speech feature shape or text length.
-
-## Stage 2: Dictionary Preparation
-This stage builds the dictionary, which maps label characters to integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-一
-丁
-...
-龙
-龚
-<unk>
-```
-* `<blank>`: indicates the blank token for CTC
-* `<s>`: indicates the start-of-sentence token
-* `</s>`: indicates the end-of-sentence token
-* `<unk>`: indicates the out-of-vocabulary token
-
-## Stage 3: Training
-This stage trains the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on the validation set are averaged to produce a stronger model, which is then used for decoding.
-
-* DDP Training
-
-We support DistributedDataParallel (DDP) training; details can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, set `gpu_num` to a value greater than 1. For example, with `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, the GPUs with IDs 0, 1 and 5 will be used for training.
-
-* DataLoader
-
-We support an optional iterable-style DataLoader based on [PyTorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large datasets; users can set `dataset_type=large` to enable it.
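-
-The idea behind the iterable-style loader is to stream samples instead of indexing the whole corpus up front. A minimal PyTorch sketch of that pattern (shown with `IterableDataset` for brevity rather than the DataPipes API itself):
-```python
-from torch.utils.data import DataLoader, IterableDataset
-
-class ScpStream(IterableDataset):
-    """Stream (id, value) pairs from a Kaldi-style scp file."""
-    def __init__(self, scp_path):
-        self.scp_path = scp_path
-
-    def __iter__(self):
-        with open(self.scp_path, encoding="utf-8") as f:
-            for line in f:
-                utt_id, value = line.strip().split(maxsplit=1)
-                yield utt_id, value  # feature loading would happen here
-
-# loader = DataLoader(ScpStream("feats.scp"), batch_size=16)
-```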
-
-* Configuration
-
-Training parameters, including the model, optimization and dataset settings, can be set via a YAML file in the `conf` directory. Users can also set parameters directly in the `run.sh` recipe. Avoid setting the same parameter in both the YAML file and the recipe.
-
-* Training Steps
-
-We support two parameters to limit the training length, namely `max_epoch` and `max_update`. `max_epoch` sets the total number of training epochs, while `max_update` sets the total number of training steps. If both are specified, training stops as soon as either limit is reached.
-
-* Tensorboard
-
-Users can monitor the loss, learning rate, etc. with TensorBoard by running the following command:
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## Stage 4: Decoding
-This stage generates the recognition results and calculates the `CER` to measure the performance of the trained model.
-
-* Mode Selection
-
-Since FunASR supports paraformer, uniasr, conformer and other models, the `mode` parameter should be set to `asr`, `paraformer` or `uniasr` according to the trained model.
-
-* Configuration
-
-We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, selected by `ctc_weight` in a YAML file in the `conf` directory: `ctc_weight=1.0` gives pure CTC decoding, `ctc_weight=0.0` gives pure attention decoding, and `0.0<ctc_weight<1.0` gives hybrid CTC-attention decoding.
-
-* CPU/GPU Decoding
-
-We support CPU and GPU decoding in FunASR. For CPU decoding, set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, set `gpu_inference=True`, set `gpuid_list` to indicate which GPUs are used for decoding, and set `njob` to indicate the number of decoding jobs on each GPU.
-
-* Performance
-
-We adopt `CER` to measure the performance. The results, namely `text.cer` and `text.cer.txt`, are saved in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`. `text.cer` records the alignment between the recognized text and the reference text, while `text.cer.txt` records the final `CER`. The following is an example of `text.cer`:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref:    构 建 良 好 的 旅 游 市 场 环 境
-res:    构 建 良 好 的 旅 游 市 场 环 境
-...
-```
-
+Undo
diff --git a/docs/index.rst b/docs/index.rst
index b8fcacd..c2656bd 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -17,8 +17,8 @@
    :maxdepth: 1
    :caption: Installation
 
-   ./installation.md
-   ./docker.md
+   ./installation/installation.md
+   ./installation/docker.md
 
 .. toctree::
    :maxdepth: 1
@@ -44,6 +44,7 @@
    ./modelscope_pipeline/tp_pipeline.md
    ./modelscope_pipeline/sv_pipeline.md
    ./modelscope_pipeline/sd_pipeline.md
+   ./modelscope_pipeline/itn_pipeline.md
 
 .. toctree::
    :maxdepth: 1
@@ -56,8 +57,8 @@
    :maxdepth: 1
    :caption: Model Zoo
 
-   ./modelscope_models.md
-   ./huggingface_models.md
+   ./model_zoo/modelscope_models.md
+   ./model_zoo/huggingface_models.md
 
 .. toctree::
    :maxdepth: 1
@@ -85,25 +86,25 @@
    :maxdepth: 1
    :caption: Funasr Library
 
-   ./build_task.md
+   ./reference/build_task.md
 
 .. toctree::
    :maxdepth: 1
    :caption: Papers
 
-   ./papers.md
+   ./reference/papers.md
 
 .. toctree::
    :maxdepth: 1
    :caption: Application
 
-   ./application.md
+   ./reference/application.md
 
 .. toctree::
    :maxdepth: 1
    :caption: FQA
 
-   ./FQA.md
+   ./reference/FQA.md
 
 
 Indices and tables
diff --git a/docs/docker.md b/docs/installation/docker.md
similarity index 100%
rename from docs/docker.md
rename to docs/installation/docker.md
diff --git a/docs/installation.md b/docs/installation/installation.md
similarity index 100%
rename from docs/installation.md
rename to docs/installation/installation.md
diff --git a/docs/m2met2/Baseline.md b/docs/m2met2/Baseline.md
index 6f9609b..4e12162 100644
--- a/docs/m2met2/Baseline.md
+++ b/docs/m2met2/Baseline.md
@@ -1,13 +1,34 @@
 # Baseline
 ## Overview
-We will release an E2E SA-ASR~\cite{kanda21b_interspeech} baseline conducted on [FunASR](https://github.com/alibaba-damo-academy/FunASR) at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.
+We will release an E2E SA-ASR baseline built on [FunASR](https://github.com/alibaba-damo-academy/FunASR) according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also used to extract the speaker embeddings for the speaker profile.
 
 ![model architecture](images/sa_asr_arch.png)
 
 ## Quick start
-#TODO: fill with the README.md of the baseline
+To run the baseline, you first need to install FunASR and ModelScope ([installation guide](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)).  
+There are two startup scripts: `run.sh`, for training and evaluation on the original eval and test sets, and `run_m2met_2023_infer.sh`, for inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)) Challenge.  
+Before running `run.sh`, you must manually download and unpack the [AliMeeting](http://www.openslr.org/119/) corpus and place it in the `./dataset` directory:
+```shell
+dataset
+|—— Eval_Ali_far
+|—— Eval_Ali_near
+|—— Test_Ali_far
+|—— Test_Ali_near
+|—— Train_Ali_far
+|—— Train_Ali_near
+```
+Before running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory, which contains only raw audio. Then place the provided `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.  
+```shell
+data/Test_2023_Ali_far
+|—— wav.scp
+|—— wav_raw.scp
+|—— segments
+|—— utt2spk
+|—— spk2utt
+```
+For more details, see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md).
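+
+As a quick sanity check before launching, the expected files can be verified with a few lines of Python (a convenience sketch, not part of the baseline scripts):
+```python
+from pathlib import Path
+
+for name in ["wav.scp", "wav_raw.scp", "segments", "utt2spk", "spk2utt"]:
+    f = Path("data/Test_2023_Ali_far") / name
+    print(f, "ok" if f.is_file() else "MISSING")
+```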
 
 ## Baseline results
 The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy. 
 
-![baseline result](images/baseline_result.png)
\ No newline at end of file
+![baseline_result](images/baseline_result.png)
\ No newline at end of file
diff --git a/docs/m2met2/Contact.md b/docs/m2met2/Contact.md
index 3097ad7..6aa3bb7 100644
--- a/docs/m2met2/Contact.md
+++ b/docs/m2met2/Contact.md
@@ -1,9 +1,9 @@
 # Contact
-If you have any questions about M2MET2.0 challenge, please contact us by
+If you have any questions about the M2MeT2.0 challenge, please contact us by
 
 - email: [m2met.alimeeting@gmail.com](mailto:m2met.alimeeting@gmail.com)
 
 |                Wechat group                |
 |:------------------------------------------:|
-<!-- | <img src="images/wechat.png" width="300"/> | -->
+| <img src="images/qrcode.png" width="300"/> | 
 
diff --git a/docs/m2met2/Introduction.md b/docs/m2met2/Introduction.md
index eac9eb6..fc7c356 100644
--- a/docs/m2met2/Introduction.md
+++ b/docs/m2met2/Introduction.md
@@ -6,23 +6,23 @@
 
 The ICASSP2022 M2MeT challenge focuses on meeting scenarios, and it comprises two main tasks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying who spoke when in the meeting, while the latter aims to transcribe speech from multiple speakers simultaneously, which poses significant technical difficulties due to overlapping speech and acoustic interferences.
 
-Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attribute automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying "who spoke what at when". To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models' performance and advance the state of the art in this area.
+Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU 2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attributed automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying "who spoke what at when". To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models' performance and advance the state of the art in this area.
 
 ## Timeline (AOE Time)
 - $ April~29, 2023: $ Challenge and registration open.
-- $ May~8, 2023: $ Baseline release.
-- $ May~15, 2023: $ Registration deadline, the due date for participants to join the Challenge.
-- $ June~9, 2023: $ Test data release and leaderboard open.
-- $ June~13, 2023: $ Final submission deadline.
-- $ June~19, 2023: $ Evaluation result and ranking release.
+- $ May~11, 2023: $ Baseline release.
+- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
+- $ June~16, 2023: $ Test data release and leaderboard open.
+- $ June~20, 2023: $ Final submission deadline and leaderboard close.
+- $ June~26, 2023: $ Evaluation result and ranking release.
 - $ July~3, 2023: $ Deadline for paper submission.
 - $ July~10, 2023: $ Deadline for final paper submission.
-- $ December~12\ to\ 16, 2023: $ ASRU Workshop and challenge session
+- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
 
 ## Guidelines
 
-Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 15, 2023. 
+Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 22, 2023. Participants are also welcome to join the [WeChat group](https://alibaba-damo-academy.github.io/FunASR/m2met2/Contact.html) of M2MeT2.0 to keep up with the latest news about the challenge.
 
-[M2MET2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
+[M2MeT2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
 
-Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top three submissions to be included in the ASRU2023 Proceedings. 
+Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top-ranking submissions to be included in the ASRU2023 Proceedings.
diff --git a/docs/m2met2/Organizers.md b/docs/m2met2/Organizers.md
index e16c803..f5a9da2 100644
--- a/docs/m2met2/Organizers.md
+++ b/docs/m2met2/Organizers.md
@@ -1,5 +1,5 @@
 # Organizers
-***Lei Xie, Professor, Northwestern Polytechnical University, China***
+***Lei Xie, Professor, AISHELL foundation, China***
 
 Email: [lxie@nwpu.edu.cn](mailto:lxie@nwpu.edu.cn)
 
diff --git a/docs/m2met2/_build/doctrees/Baseline.doctree b/docs/m2met2/_build/doctrees/Baseline.doctree
index 9fc7c50..f6ea62f 100644
--- a/docs/m2met2/_build/doctrees/Baseline.doctree
+++ b/docs/m2met2/_build/doctrees/Baseline.doctree
Binary files differ
diff --git a/docs/m2met2/_build/doctrees/Contact.doctree b/docs/m2met2/_build/doctrees/Contact.doctree
index e3f579f..0508819 100644
--- a/docs/m2met2/_build/doctrees/Contact.doctree
+++ b/docs/m2met2/_build/doctrees/Contact.doctree
Binary files differ
diff --git a/docs/m2met2/_build/doctrees/Introduction.doctree b/docs/m2met2/_build/doctrees/Introduction.doctree
index 84f1baa..6ffceef 100644
--- a/docs/m2met2/_build/doctrees/Introduction.doctree
+++ b/docs/m2met2/_build/doctrees/Introduction.doctree
Binary files differ
diff --git a/docs/m2met2/_build/doctrees/Organizers.doctree b/docs/m2met2/_build/doctrees/Organizers.doctree
index 0f571a3..7ecfbdf 100644
--- a/docs/m2met2/_build/doctrees/Organizers.doctree
+++ b/docs/m2met2/_build/doctrees/Organizers.doctree
Binary files differ
diff --git a/docs/m2met2/_build/doctrees/environment.pickle b/docs/m2met2/_build/doctrees/environment.pickle
index ea9c740..fe68059 100644
--- a/docs/m2met2/_build/doctrees/environment.pickle
+++ b/docs/m2met2/_build/doctrees/environment.pickle
Binary files differ
diff --git a/docs/m2met2/_build/html/.buildinfo b/docs/m2met2/_build/html/.buildinfo
index d62b4cf..97d32c4 100644
--- a/docs/m2met2/_build/html/.buildinfo
+++ b/docs/m2met2/_build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 9907eab6bf227ca0fc6db297f26919da
+config: a62852d90c3e533904d811bbf85f977d
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/m2met2/_build/html/Baseline.html b/docs/m2met2/_build/html/Baseline.html
index e52d322..62c656c 100644
--- a/docs/m2met2/_build/html/Baseline.html
+++ b/docs/m2met2/_build/html/Baseline.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Baseline &#8212; m2met2  documentation</title>
+    <title>Baseline &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -44,7 +44,7 @@
         <li class="right" >
           <a href="Track_setting_and_evaluation.html" title="Track &amp; Evaluation"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Baseline</a></li> 
       </ul>
     </div>
@@ -55,7 +55,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -126,17 +126,38 @@
 <h1>Baseline<a class="headerlink" href="#baseline" title="Permalink to this heading">¶</a></h1>
 <section id="overview">
 <h2>Overview<a class="headerlink" href="#overview" title="Permalink to this heading">¶</a></h2>
-<p>We will release an E2E SA-ASR~\cite{kanda21b_interspeech} baseline conducted on <a class="reference external" href="https://github.com/alibaba-damo-academy/FunASR">FunASR</a> at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.</p>
+<p>We will release an E2E SA-ASR baseline built on <a class="reference external" href="https://github.com/alibaba-damo-academy/FunASR">FunASR</a> according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also used to extract the speaker embeddings for the speaker profile.</p>
 <p><img alt="model architecture" src="_images/sa_asr_arch.png" /></p>
 </section>
 <section id="quick-start">
 <h2>Quick start<a class="headerlink" href="#quick-start" title="Permalink to this heading">¶</a></h2>
-<p>#TODO: fill with the README.md of the baseline</p>
+<p>To run the baseline, you first need to install FunASR and ModelScope (<a class="reference external" href="https://alibaba-damo-academy.github.io/FunASR/en/installation.html">installation guide</a>).<br />
+There are two startup scripts: <code class="docutils literal notranslate"><span class="pre">run.sh</span></code>, for training and evaluation on the original eval and test sets, and <code class="docutils literal notranslate"><span class="pre">run_m2met_2023_infer.sh</span></code>, for inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 (<a class="reference external" href="https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html">M2MeT2.0</a>) Challenge.<br />
+Before running <code class="docutils literal notranslate"><span class="pre">run.sh</span></code>, you must manually download and unpack the <a class="reference external" href="http://www.openslr.org/119/">AliMeeting</a> corpus and place it in the <code class="docutils literal notranslate"><span class="pre">./dataset</span></code> directory:</p>
+<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>dataset
+<span class="p">|</span>——<span class="w"> </span>Eval_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Eval_Ali_near
+<span class="p">|</span>——<span class="w"> </span>Test_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Test_Ali_near
+<span class="p">|</span>——<span class="w"> </span>Train_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Train_Ali_near
+</pre></div>
+</div>
+<p>Before running <code class="docutils literal notranslate"><span class="pre">run_m2met_2023_infer.sh</span></code>, you need to place the new test set <code class="docutils literal notranslate"><span class="pre">Test_2023_Ali_far</span></code> (to be released after the challenge starts) in the <code class="docutils literal notranslate"><span class="pre">./dataset</span></code> directory, which contains only raw audio. Then place the provided <code class="docutils literal notranslate"><span class="pre">wav.scp</span></code>, <code class="docutils literal notranslate"><span class="pre">wav_raw.scp</span></code>, <code class="docutils literal notranslate"><span class="pre">segments</span></code>, <code class="docutils literal notranslate"><span class="pre">utt2spk</span></code> and <code class="docutils literal notranslate"><span class="pre">spk2utt</span></code> in the <code class="docutils literal notranslate"><span class="pre">./data/Test_2023_Ali_far</span></code> directory.</p>
+<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>data/Test_2023_Ali_far
+<span class="p">|</span>——<span class="w"> </span>wav.scp
+<span class="p">|</span>——<span class="w"> </span>wav_raw.scp
+<span class="p">|</span>——<span class="w"> </span>segments
+<span class="p">|</span>——<span class="w"> </span>utt2spk
+<span class="p">|</span>——<span class="w"> </span>spk2utt
+</pre></div>
+</div>
+<p>For more details, see <a class="reference external" href="https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md">here</a>.</p>
 </section>
 <section id="baseline-results">
 <h2>Baseline results<a class="headerlink" href="#baseline-results" title="Permalink to this heading">¶</a></h2>
 <p>The results of the baseline system are shown in Table 3. The speaker profile adopts the oracle speaker embedding during training. However, due to the lack of oracle speaker label during evaluation, the speaker profile provided by an additional spectral clustering is used. Meanwhile, the results of using the oracle speaker profile on Eval and Test Set are also provided to show the impact of speaker profile accuracy.</p>
-<p><img alt="baseline result" src="_images/baseline_result.png" /></p>
+<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
 </section>
 </section>
 
@@ -170,7 +191,7 @@
         <li class="right" >
           <a href="Track_setting_and_evaluation.html" title="Track &amp; Evaluation"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Baseline</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Contact.html b/docs/m2met2/_build/html/Contact.html
index eafd2d5..f268ef4 100644
--- a/docs/m2met2/_build/html/Contact.html
+++ b/docs/m2met2/_build/html/Contact.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Contact &#8212; m2met2  documentation</title>
+    <title>Contact &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -40,7 +40,7 @@
         <li class="right" >
           <a href="Organizers.html" title="Organizers"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Contact</a></li> 
       </ul>
     </div>
@@ -51,7 +51,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -120,7 +120,7 @@
             
   <section id="contact">
 <h1>Contact<a class="headerlink" href="#contact" title="Permalink to this heading">¶</a></h1>
-<p>If you have any questions about M2MET2.0 challenge, please contact us by</p>
+<p>If you have any questions about the M2MeT2.0 challenge, please contact us by</p>
 <ul class="simple">
 <li><p>email: <a class="reference external" href="mailto:m2met&#46;alimeeting&#37;&#52;&#48;gmail&#46;com">m2met<span>&#46;</span>alimeeting<span>&#64;</span>gmail<span>&#46;</span>com</a></p></li>
 </ul>
@@ -129,8 +129,11 @@
 <tr class="row-odd"><th class="head text-center"><p>Wechat group</p></th>
 </tr>
 </thead>
+<tbody>
+<tr class="row-even"><td class="text-center"><p><a class="reference internal" href="_images/qrcode.png"><img alt="_images/qrcode.png" src="_images/qrcode.png" style="width: 300px;" /></a></p></td>
+</tr>
+</tbody>
 </table>
-<!-- | <img src="images/wechat.png" width="300"/> | -->
 </section>
 
 
@@ -157,7 +160,7 @@
         <li class="right" >
           <a href="Organizers.html" title="Organizers"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Contact</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Dataset.html b/docs/m2met2/_build/html/Dataset.html
index 43bf8a1..f6b2a04 100644
--- a/docs/m2met2/_build/html/Dataset.html
+++ b/docs/m2met2/_build/html/Dataset.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Datasets &#8212; m2met2  documentation</title>
+    <title>Datasets &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="Introduction.html" title="Introduction"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Datasets</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -181,7 +181,7 @@
         <li class="right" >
           <a href="Introduction.html" title="Introduction"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Datasets</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Introduction.html b/docs/m2met2/_build/html/Introduction.html
index 2ddafe2..82394fc 100644
--- a/docs/m2met2/_build/html/Introduction.html
+++ b/docs/m2met2/_build/html/Introduction.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Introduction &#8212; m2met2  documentation</title>
+    <title>Introduction &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="index.html" title="ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Introduction</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -130,27 +130,27 @@
 <p>Automatic speech recognition (ASR) and speaker diarization have made significant strides in recent years, resulting in a surge of speech technology applications across various domains. However, meetings present unique challenges to speech technologies due to their complex acoustic conditions and diverse speaking styles, including overlapping speech, variable numbers of speakers, far-field signals in large conference rooms, and environmental noise and reverberation.</p>
 <p>Over the years, several challenges have been organized to advance the development of meeting transcription, including the Rich Transcription evaluation and Computational Hearing in Multisource Environments (CHIME) challenges. The latest iteration of the CHIME challenge has a particular focus on distant automatic speech recognition and developing systems that can generalize across various array topologies and application scenarios. However, while progress has been made in English meeting transcription, language differences remain a significant barrier to achieving comparable results in non-English languages, such as Mandarin. The Multimodal Information Based Speech Processing (MISP) and Multi-Channel Multi-Party Meeting Transcription (M2MeT) challenges have been instrumental in advancing Mandarin meeting transcription. The MISP challenge seeks to address the problem of audio-visual distant multi-microphone signal processing in everyday home environments, while the M2MeT challenge focuses on tackling the speech overlap issue in offline meeting rooms.</p>
 <p>The ICASSP2022 M2MeT challenge focuses on meeting scenarios, and it comprises two main tasks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying who spoke when in the meeting, while the latter aims to transcribe speech from multiple speakers simultaneously, which poses significant technical difficulties due to overlapping speech and acoustic interferences.</p>
-<p>Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attribute automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying “who spoke what at when”. To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models’ performance and advance the state of the art in this area.</p>
+<p>Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU 2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attributed automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying “who spoke what at when”. To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models’ performance and advance the state of the art in this area.</p>
 </section>
 <section id="timeline-aoe-time">
 <h2>Timeline (AOE Time)<a class="headerlink" href="#timeline-aoe-time" title="Permalink to this heading">¶</a></h2>
 <ul class="simple">
 <li><p><span class="math notranslate nohighlight">\( April~29, 2023: \)</span> Challenge and registration open.</p></li>
-<li><p><span class="math notranslate nohighlight">\( May~8, 2023: \)</span> Baseline release.</p></li>
-<li><p><span class="math notranslate nohighlight">\( May~15, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
-<li><p><span class="math notranslate nohighlight">\( June~9, 2023: \)</span> Test data release and leaderboard open.</p></li>
-<li><p><span class="math notranslate nohighlight">\( June~13, 2023: \)</span> Final submission deadline.</p></li>
-<li><p><span class="math notranslate nohighlight">\( June~19, 2023: \)</span> Evaluation result and ranking release.</p></li>
+<li><p><span class="math notranslate nohighlight">\( May~11, 2023: \)</span> Baseline release.</p></li>
+<li><p><span class="math notranslate nohighlight">\( May~22, 2023: \)</span> Registration deadline, the due date for participants to join the Challenge.</p></li>
+<li><p><span class="math notranslate nohighlight">\( June~16, 2023: \)</span> Test data release and leaderboard open.</p></li>
+<li><p><span class="math notranslate nohighlight">\( June~20, 2023: \)</span> Final submission deadline and leaderboard close.</p></li>
+<li><p><span class="math notranslate nohighlight">\( June~26, 2023: \)</span> Evaluation result and ranking release.</p></li>
 <li><p><span class="math notranslate nohighlight">\( July~3, 2023: \)</span> Deadline for paper submission.</p></li>
 <li><p><span class="math notranslate nohighlight">\( July~10, 2023: \)</span> Deadline for final paper submission.</p></li>
-<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and challenge session</p></li>
+<li><p><span class="math notranslate nohighlight">\( December~12\ to\ 16, 2023: \)</span> ASRU Workshop and Challenge Session.</p></li>
 </ul>
 </section>
 <section id="guidelines">
 <h2>Guidelines<a class="headerlink" href="#guidelines" title="Permalink to this heading">¶</a></h2>
-<p>Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 15, 2023.</p>
-<p><a class="reference external" href="https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link">M2MET2.0 Registration</a></p>
-<p>Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top three submissions to be included in the ASRU2023 Proceedings.</p>
+<p>Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 22, 2023. Participants are also welcome to join the <a class="reference external" href="https://alibaba-damo-academy.github.io/FunASR/m2met2/Contact.html">WeChat group</a> of M2MeT2.0 to keep up with the latest news about the challenge.</p>
+<p><a class="reference external" href="https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link">M2MeT2.0 Registration</a></p>
+<p>Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top-ranking submissions to be included in the ASRU2023 Proceedings.</p>
 </section>
 </section>
 
@@ -184,7 +184,7 @@
         <li class="right" >
           <a href="index.html" title="ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Introduction</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Organizers.html b/docs/m2met2/_build/html/Organizers.html
index 0a8811e..e500019 100644
--- a/docs/m2met2/_build/html/Organizers.html
+++ b/docs/m2met2/_build/html/Organizers.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Organizers &#8212; m2met2  documentation</title>
+    <title>Organizers &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -44,7 +44,7 @@
         <li class="right" >
           <a href="Rules.html" title="Rules"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Organizers</a></li> 
       </ul>
     </div>
@@ -55,7 +55,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -124,7 +124,7 @@
             
   <section id="organizers">
 <h1>Organizers<a class="headerlink" href="#organizers" title="Permalink to this heading">¶</a></h1>
-<p><em><strong>Lei Xie, Professor, Northwestern Polytechnical University, China</strong></em></p>
+<p><em><strong>Lei Xie, Professor, AISHELL foundation, China</strong></em></p>
 <p>Email: <a class="reference external" href="mailto:lxie&#37;&#52;&#48;nwpu&#46;edu&#46;cn">lxie<span>&#64;</span>nwpu<span>&#46;</span>edu<span>&#46;</span>cn</a></p>
 <a class="reference internal image-reference" href="_images/lxie.jpeg"><img alt="lxie" src="_images/lxie.jpeg" style="width: 20%;" /></a>
 <p><em><strong>Kong Aik Lee, Senior Scientist at Institute for Infocomm Research, A*Star, Singapore</strong></em></p>
@@ -180,7 +180,7 @@
         <li class="right" >
           <a href="Rules.html" title="Rules"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Organizers</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Rules.html b/docs/m2met2/_build/html/Rules.html
index 5965115..01f79cb 100644
--- a/docs/m2met2/_build/html/Rules.html
+++ b/docs/m2met2/_build/html/Rules.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Rules &#8212; m2met2  documentation</title>
+    <title>Rules &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -44,7 +44,7 @@
         <li class="right" >
           <a href="Baseline.html" title="Baseline"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Rules</a></li> 
       </ul>
     </div>
@@ -55,7 +55,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -165,7 +165,7 @@
         <li class="right" >
           <a href="Baseline.html" title="Baseline"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Rules</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/Track_setting_and_evaluation.html b/docs/m2met2/_build/html/Track_setting_and_evaluation.html
index 859f444..1cd72d9 100644
--- a/docs/m2met2/_build/html/Track_setting_and_evaluation.html
+++ b/docs/m2met2/_build/html/Track_setting_and_evaluation.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Track &amp; Evaluation &#8212; m2met2  documentation</title>
+    <title>Track &amp; Evaluation &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="Dataset.html" title="Datasets"
              accesskey="P">previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Track &amp; Evaluation</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -180,7 +180,7 @@
         <li class="right" >
           <a href="Dataset.html" title="Datasets"
              >previous</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Track &amp; Evaluation</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/_images/baseline_result.png b/docs/m2met2/_build/html/_images/baseline_result.png
index d51d775..6b76361 100644
--- a/docs/m2met2/_build/html/_images/baseline_result.png
+++ b/docs/m2met2/_build/html/_images/baseline_result.png
Binary files differ
diff --git a/docs/m2met2/_build/html/_images/qrcode.png b/docs/m2met2/_build/html/_images/qrcode.png
new file mode 100644
index 0000000..fc4c349
--- /dev/null
+++ b/docs/m2met2/_build/html/_images/qrcode.png
Binary files differ
diff --git a/docs/m2met2/_build/html/_sources/Baseline.md.txt b/docs/m2met2/_build/html/_sources/Baseline.md.txt
index 6f9609b..4e12162 100644
--- a/docs/m2met2/_build/html/_sources/Baseline.md.txt
+++ b/docs/m2met2/_build/html/_sources/Baseline.md.txt
@@ -1,13 +1,34 @@
 # Baseline
 ## Overview
-We will release an E2E SA-ASR~\cite{kanda21b_interspeech} baseline conducted on [FunASR](https://github.com/alibaba-damo-academy/FunASR) at the time according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also be used to extract the speaker embedding in the speaker profile.
+We will release an E2E SA-ASR baseline built on [FunASR](https://github.com/alibaba-damo-academy/FunASR) according to the timeline. The model architecture is shown in Figure 3. The SpeakerEncoder is initialized with a pre-trained speaker verification model from ModelScope. This speaker verification model is also used to extract the speaker embeddings for the speaker profile.
 
 ![model architecture](images/sa_asr_arch.png)
 
 ## Quick start
-#TODO: fill with the README.md of the baseline
+To run the baseline, you first need to install FunASR and ModelScope ([installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)).  
+There are two entry scripts: `run.sh` trains the baseline and evaluates it on the original Eval and Test sets, while `run_m2met_2023_infer.sh` runs inference on the new test set of the Multi-Channel Multi-Party Meeting Transcription 2.0 ([M2MeT2.0](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)) Challenge.  
+Before running `run.sh`, you must manually download and unpack the [AliMeeting](http://www.openslr.org/119/) corpus and place it in the `./dataset` directory:
+```shell
+dataset
+|—— Eval_Ali_far
+|—— Eval_Ali_near
+|—— Test_Ali_far
+|—— Test_Ali_near
+|—— Train_Ali_far
+|—— Train_Ali_near
+```
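+The layout above can be assembled from the OpenSLR release; as a rough sketch (the archive names below are illustrative, so take the actual file list from the [AliMeeting](http://www.openslr.org/119/) page):
+```shell
+# Illustrative download/unpack loop; confirm the real archive names on OpenSLR
+mkdir -p dataset && cd dataset
+for f in Train_Ali_far Train_Ali_near Eval_Ali Test_Ali; do
+    wget "http://www.openslr.org/resources/119/${f}.tar.gz"
+    tar -xzf "${f}.tar.gz"
+done
+```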
+Before running `run_m2met_2023_infer.sh`, place the new test set `Test_2023_Ali_far` (to be released after the challenge starts; it contains only raw audio) in the `./dataset` directory. Then put the provided `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.  
+```shell
+data/Test_2023_Ali_far
+|—— wav.scp
+|—— wav_raw.scp
+|—— segments
+|—— utt2spk
+|—— spk2utt
+```
+For more details, see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md).
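+
+Putting the pieces together, a rough end-to-end sketch of the workflow (assuming the recipe lives at `egs/alimeeting/sa-asr` as in the README linked above; the stages inside the scripts may differ in your checkout):
+```shell
+# Clone FunASR and enter the SA-ASR recipe referenced above
+git clone https://github.com/alibaba-damo-academy/FunASR.git
+cd FunASR/egs/alimeeting/sa-asr
+
+# With the AliMeeting corpus unpacked under ./dataset:
+bash run.sh                     # train and evaluate on the old Eval/Test sets
+bash run_m2met_2023_infer.sh    # infer on Test_2023_Ali_far once it is available
+```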
 
 ## Baseline results
 The results of the baseline system are shown in Table 3. During training, the speaker profile uses oracle speaker embeddings. However, since oracle speaker labels are unavailable during evaluation, the speaker profile produced by an additional spectral clustering step is used instead. Results with the oracle speaker profile on the Eval and Test sets are also provided to show the impact of speaker-profile accuracy.
 
-![baseline result](images/baseline_result.png)
\ No newline at end of file
+![baseline_result](images/baseline_result.png)
\ No newline at end of file
diff --git a/docs/m2met2/_build/html/_sources/Contact.md.txt b/docs/m2met2/_build/html/_sources/Contact.md.txt
index 3097ad7..6aa3bb7 100644
--- a/docs/m2met2/_build/html/_sources/Contact.md.txt
+++ b/docs/m2met2/_build/html/_sources/Contact.md.txt
@@ -1,9 +1,9 @@
 # Contact
-If you have any questions about M2MET2.0 challenge, please contact us by
+If you have any questions about the M2MeT2.0 challenge, please contact us by
 
 - email: [m2met.alimeeting@gmail.com](mailto:m2met.alimeeting@gmail.com)
 
 |                WeChat group                |
 |:------------------------------------------:|
-<!-- | <img src="images/wechat.png" width="300"/> | -->
+| <img src="images/qrcode.png" width="300"/> | 
 
diff --git a/docs/m2met2/_build/html/_sources/Introduction.md.txt b/docs/m2met2/_build/html/_sources/Introduction.md.txt
index eac9eb6..fc7c356 100644
--- a/docs/m2met2/_build/html/_sources/Introduction.md.txt
+++ b/docs/m2met2/_build/html/_sources/Introduction.md.txt
@@ -6,23 +6,23 @@
 
 The ICASSP2022 M2MeT challenge focuses on meeting scenarios, and it comprises two main tasks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying who spoke when in the meeting, while the latter aims to transcribe speech from multiple speakers simultaneously, which poses significant technical difficulties due to overlapping speech and acoustic interferences.
 
-Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance the current multi-talker ASR system towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attribute automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying "who spoke what at when". To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models' performance and advance the state of the art in this area.
+Building on the success of the previous M2MeT challenge, we are excited to propose the M2MeT2.0 challenge as an ASRU 2023 challenge special session. In the original M2MeT challenge, the evaluation metric was speaker-independent, which meant that the transcription could be determined, but not the corresponding speaker. To address this limitation and further advance current multi-talker ASR systems towards practicality, the M2MeT2.0 challenge proposes the speaker-attributed ASR task with two sub-tracks: fixed and open training conditions. The speaker-attributed automatic speech recognition (ASR) task aims to tackle the practical and challenging problem of identifying "who spoke what, and when". To facilitate reproducible research in this field, we offer a comprehensive overview of the dataset, rules, evaluation metrics, and baseline systems. Furthermore, we will release a carefully curated test set, comprising approximately 10 hours of audio, according to the timeline. The new test set is designed to enable researchers to validate and compare their models' performance and advance the state of the art in this area.
 
 ## Timeline (AOE Time)
 - $ April~29, 2023: $ Challenge and registration open.
-- $ May~8, 2023: $ Baseline release.
-- $ May~15, 2023: $ Registration deadline, the due date for participants to join the Challenge.
-- $ June~9, 2023: $ Test data release and leaderboard open.
-- $ June~13, 2023: $ Final submission deadline.
-- $ June~19, 2023: $ Evaluation result and ranking release.
+- $ May~11, 2023: $ Baseline release.
+- $ May~22, 2023: $ Registration deadline, the due date for participants to join the Challenge.
+- $ June~16, 2023: $ Test data release and leaderboard open.
+- $ June~20, 2023: $ Final submission deadline and leaderboard close.
+- $ June~26, 2023: $ Evaluation result and ranking release.
 - $ July~3, 2023: $ Deadline for paper submission.
 - $ July~10, 2023: $ Deadline for final paper submission.
-- $ December~12\ to\ 16, 2023: $ ASRU Workshop and challenge session
+- $ December~12\ to\ 16, 2023: $ ASRU Workshop and Challenge Session.
 
 ## Guidelines
 
-Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 15, 2023. 
+Interested participants, whether from academia or industry, must register for the challenge by completing the Google form below. The deadline for registration is May 22, 2023. Participants are also welcome to join the M2MeT2.0 [WeChat group](https://alibaba-damo-academy.github.io/FunASR/m2met2/Contact.html) to keep up with the latest news about the challenge.
 
-[M2MET2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
+[M2MeT2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
 
-Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release time, each participant must submit a system description document detailing their approach and methods. The organizer will select the top three submissions to be included in the ASRU2023 Proceedings. 
+Within three working days, the challenge organizer will send email invitations to eligible teams to participate in the challenge. All qualified teams are required to adhere to the challenge rules, which will be published on the challenge page. Prior to the ranking release, each participant must submit a system description document detailing their approach and methods. The organizer will select the top-ranking submissions for inclusion in the ASRU2023 Proceedings.
diff --git a/docs/m2met2/_build/html/_sources/Organizers.md.txt b/docs/m2met2/_build/html/_sources/Organizers.md.txt
index e16c803..f5a9da2 100644
--- a/docs/m2met2/_build/html/_sources/Organizers.md.txt
+++ b/docs/m2met2/_build/html/_sources/Organizers.md.txt
@@ -1,5 +1,5 @@
 # Organizers
-***Lei Xie, Professor, Northwestern Polytechnical University, China***
+***Lei Xie, Professor, AISHELL Foundation, China***
 
 Email: [lxie@nwpu.edu.cn](mailto:lxie@nwpu.edu.cn)
 
diff --git a/docs/m2met2/_build/html/genindex.html b/docs/m2met2/_build/html/genindex.html
index e7e17b6..b331f6f 100644
--- a/docs/m2met2/_build/html/genindex.html
+++ b/docs/m2met2/_build/html/genindex.html
@@ -14,7 +14,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Index &#8212; m2met2  documentation</title>
+    <title>Index &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -35,7 +35,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="#" title="General Index"
              accesskey="I">index</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Index</a></li> 
       </ul>
     </div>
@@ -46,7 +46,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -132,7 +132,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="#" title="General Index"
              >index</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Index</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/index.html b/docs/m2met2/_build/html/index.html
index dcbb8cb..dd2a9cc 100644
--- a/docs/m2met2/_build/html/index.html
+++ b/docs/m2met2/_build/html/index.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0) &#8212; m2met2  documentation</title>
+    <title>ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0) &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -40,7 +40,7 @@
         <li class="right" >
           <a href="Introduction.html" title="Introduction"
              accesskey="N">next</a> |</li>
-        <li class="nav-item nav-item-0"><a href="#">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="#">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)</a></li> 
       </ul>
     </div>
@@ -51,7 +51,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    #" class="text-logo">m2met2  documentation</a>
+    #" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -160,7 +160,7 @@
         <li class="right" >
           <a href="Introduction.html" title="Introduction"
              >next</a> |</li>
-        <li class="nav-item nav-item-0"><a href="#">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="#">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/objects.inv b/docs/m2met2/_build/html/objects.inv
index 393198c..d8a5ea5 100644
--- a/docs/m2met2/_build/html/objects.inv
+++ b/docs/m2met2/_build/html/objects.inv
@@ -1,5 +1,5 @@
 # Sphinx inventory version 2
-# Project: m2met2
+# Project: MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0
 # Version: 
 # The remainder of this file is compressed using zlib.
 [compressed zlib payload omitted; not human-readable]
\ No newline at end of file
diff --git a/docs/m2met2/_build/html/search.html b/docs/m2met2/_build/html/search.html
index 71adf36..f91b51a 100644
--- a/docs/m2met2/_build/html/search.html
+++ b/docs/m2met2/_build/html/search.html
@@ -14,7 +14,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Search &#8212; m2met2  documentation</title>
+    <title>Search &#8212; MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     
@@ -41,7 +41,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="genindex.html" title="General Index"
              accesskey="I">index</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Search</a></li> 
       </ul>
     </div>
@@ -52,7 +52,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  documentation</a>
+    index.html" class="text-logo">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-toc">
     
@@ -149,7 +149,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="genindex.html" title="General Index"
              >index</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  documentation</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">Search</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2/_build/html/searchindex.js b/docs/m2met2/_build/html/searchindex.js
index 54443a0..3387db5 100644
--- a/docs/m2met2/_build/html/searchindex.js
+++ b/docs/m2met2/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["Baseline", "Contact", "Dataset", "Introduction", "Organizers", "Rules", "Track_setting_and_evaluation", "index"], "filenames": ["Baseline.md", "Contact.md", "Dataset.md", "Introduction.md", "Organizers.md", "Rules.md", "Track_setting_and_evaluation.md", "index.rst"], "titles": ["Baseline", "Contact", "Datasets", "Introduction", "Organizers", "Rules", "Track &amp; Evaluation", "ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)"], "terms": {"we": [0, 2, 3, 7], "releas": [0, 2, 3, 6], "an": [0, 2, 3, 6], "e2": 0, "sa": 0, "asr": [0, 3, 7], "cite": 0, "kanda21b_interspeech": 0, "conduct": [0, 2], "funasr": 0, "time": [0, 6], "accord": [0, 3], "timelin": [0, 2], "The": [0, 2, 3, 5, 6], "model": [0, 2, 3, 5, 6], "architectur": 0, "i": [0, 2, 3, 5], "shown": [0, 2], "figur": [0, 6], "3": [0, 2, 3], "speakerencod": 0, "initi": 0, "pre": [0, 6], "train": [0, 3, 5, 7], "speaker": [0, 2, 3, 7], "verif": 0, "from": [0, 2, 3, 5, 6], "modelscop": [0, 6], "thi": [0, 3, 5, 6], "also": [0, 2, 6], "us": [0, 2, 5, 6], "extract": 0, "embed": 0, "profil": 0, "todo": 0, "fill": 0, "readm": 0, "md": 0, "system": [0, 3, 5, 6, 7], "ar": [0, 2, 3, 5, 6, 7], "tabl": [0, 2], "adopt": 0, "oracl": [0, 6], "dure": [0, 2, 6], "howev": [0, 3, 6], "due": [0, 3], "lack": 0, "label": [0, 5, 6], "evalu": [0, 2, 3, 7], "provid": [0, 2, 6, 7], "addit": [0, 6], "spectral": 0, "cluster": 0, "meanwhil": 0, "eval": [0, 2, 5, 6], "test": [0, 2, 3, 5, 6], "set": [0, 2, 3, 5, 6], "show": 0, "impact": 0, "accuraci": [0, 6], "If": [1, 5, 6], "you": 1, "have": [1, 3], "ani": [1, 5, 6], "question": 1, "about": 1, "m2met2": [1, 3], "0": [1, 2, 3], "challeng": [1, 3, 5, 6], "pleas": 1, "u": [1, 2], "email": [1, 3, 4], "m2met": [1, 3, 6, 7], "alimeet": [1, 6], "gmail": 1, "com": [1, 4], "wechat": 1, "group": [1, 2], "In": [2, 3, 5], "fix": [2, 3, 7], "condit": [2, 3, 7], "restrict": 2, "three": [2, 3, 6], "publicli": [2, 6], "avail": [2, 6], "corpora": 2, "name": 2, "aishel": [2, 4, 6], "4": [2, 6], "cn": [2, 4, 6], "celeb": [2, 6], "To": [2, 3, 7], "perform": [2, 3], "new": [2, 3, 6], "call": 2, "2023": [2, 3, 5, 6], "score": [2, 6], "rank": [2, 3, 6], "describ": 2, "contain": [2, 6], "118": 2, "75": 2, "hour": [2, 3, 6], "speech": [2, 3, 6, 7], "total": [2, 6], "divid": [2, 6], "104": 2, "10": [2, 3, 6], "specif": [2, 6], "212": 2, "8": [2, 3], "20": 2, "session": [2, 3, 6, 7], "respect": 2, "each": [2, 3, 6], "consist": [2, 6], "15": [2, 3], "30": 2, "minut": 2, "discuss": 2, "particip": [2, 5, 6], "number": [2, 3, 6], "456": 2, "25": 2, "60": 2, "balanc": 2, "gender": 2, "coverag": 2, "collect": 2, "13": [2, 3], "meet": [2, 3, 6], "venu": 2, "which": [2, 3, 6], "categor": 2, "type": 2, "small": 2, "medium": 2, "larg": [2, 3], "room": [2, 3], "size": 2, "rang": 2, "m": 2, "2": [2, 6], "55": 2, "differ": [2, 3, 6], "give": 2, "varieti": 2, "acoust": [2, 3, 6], "properti": 2, "layout": 2, "paramet": [2, 5], "togeth": 2, "wall": 2, "materi": 2, "cover": 2, "cement": 2, "glass": 2, "etc": 2, "other": 2, "furnish": 2, "includ": [2, 3, 5, 6], "sofa": 2, "tv": 2, "blackboard": 2, "fan": 2, "air": 2, "condition": 2, "plant": 2, "record": [2, 6], "sit": 2, "around": 2, "microphon": [2, 3], "arrai": [2, 3], "place": 2, "natur": 2, "convers": 2, "distanc": 2, "5": 2, "all": [2, 3, 5, 6], "nativ": 2, "chines": 2, "speak": [2, 3], "mandarin": [2, 3], "without": 2, "strong": 2, "accent": 2, "variou": [2, 3], "kind": 2, "indoor": 2, "nois": [2, 3, 5], "limit": [2, 3, 5], "click": 2, 
"keyboard": 2, "door": 2, "open": [2, 3, 7], "close": 2, "bubbl": 2, "made": [2, 3], "For": 2, "both": [2, 6], "requir": [2, 3, 6], "remain": [2, 3], "same": [2, 5], "posit": 2, "There": 2, "overlap": [2, 3], "between": [2, 6], "exampl": 2, "fig": 2, "1": 2, "within": [2, 3], "one": [2, 5], "ensur": 2, "ratio": 2, "select": [2, 3, 5, 6], "topic": 2, "medic": 2, "treatment": 2, "educ": 2, "busi": 2, "organ": [2, 3, 5, 6, 7], "manag": 2, "industri": [2, 3], "product": 2, "daili": 2, "routin": 2, "averag": 2, "42": 2, "27": 2, "34": 2, "76": 2, "more": 2, "A": [2, 4], "distribut": 2, "were": 2, "ident": [2, 6], "compris": [2, 3, 7], "therebi": 2, "share": 2, "similar": 2, "configur": 2, "field": [2, 3, 6], "signal": [2, 3], "headset": 2, "onli": [2, 5, 6], "": [2, 6], "own": 2, "transcrib": [2, 3, 6], "It": [2, 6], "worth": [2, 6], "note": [2, 6], "far": [2, 3], "audio": [2, 3, 6], "synchron": 2, "common": 2, "transcript": [2, 3, 5, 6], "prepar": 2, "textgrid": 2, "format": 2, "inform": [2, 3], "durat": 2, "id": 2, "segment": [2, 6], "timestamp": [2, 6], "mention": 2, "abov": 2, "can": [2, 3, 5, 6], "download": 2, "openslr": 2, "via": 2, "follow": [2, 5], "link": 2, "particularli": 2, "baselin": [2, 3, 7], "conveni": 2, "script": 2, "automat": [3, 7], "recognit": [3, 7], "diariz": 3, "signific": 3, "stride": 3, "recent": 3, "year": 3, "result": 3, "surg": 3, "technologi": 3, "applic": 3, "across": 3, "domain": 3, "present": 3, "uniqu": [3, 6], "complex": [3, 5], "divers": 3, "style": 3, "variabl": 3, "confer": 3, "environment": 3, "reverber": [3, 5], "over": 3, "sever": 3, "been": 3, "advanc": [3, 7], "develop": [3, 6], "rich": 3, "comput": [3, 5], "hear": 3, "multisourc": 3, "environ": 3, "chime": 3, "latest": 3, "iter": 3, "ha": 3, "particular": 3, "focu": 3, "distant": 3, "gener": 3, "topologi": 3, "scenario": 3, "while": 3, "progress": 3, "english": 3, "languag": [3, 5], "barrier": 3, "achiev": 3, "compar": 3, "non": 3, "multimod": 3, "base": 3, "process": [3, 6], "misp": 3, "multi": [3, 6], "channel": 3, "parti": [3, 6], "instrument": 3, "seek": 3, "address": 3, "problem": 3, "visual": 3, "everydai": 3, "home": 3, "focus": 3, "tackl": 3, "issu": 3, "offlin": 3, "icassp2022": 3, "two": [3, 5, 7], "main": 3, "task": [3, 6, 7], "former": 3, "involv": [3, 6], "identifi": 3, "who": 3, "spoke": 3, "when": 3, "latter": 3, "aim": 3, "multipl": [3, 6], "simultan": 3, "pose": [3, 6], "technic": 3, "difficulti": 3, "interfer": 3, "build": [3, 6, 7], "success": [3, 7], "previou": 3, "excit": 3, "propos": [3, 7], "asru2023": [3, 7], "special": [3, 5, 7], "origin": [3, 5], "metric": [3, 7], "wa": [3, 6], "independ": 3, "meant": 3, "could": 3, "determin": 3, "correspond": [3, 5], "further": 3, "current": [3, 7], "talker": [3, 7], "toward": 3, "practic": 3, "attribut": [3, 7], "sub": [3, 5, 7], "track": [3, 5, 7], "what": 3, "facilit": [3, 7], "reproduc": [3, 7], "research": [3, 4, 7], "offer": 3, "comprehens": [3, 7], "overview": [3, 7], "dataset": [3, 5, 6, 7], "rule": [3, 7], "furthermor": 3, "carefulli": 3, "curat": 3, "approxim": [3, 6], "design": 3, "enabl": 3, "valid": 3, "state": [3, 6, 7], "art": [3, 7], "area": 3, "april": 3, "29": 3, "registr": 3, "mai": 3, "deadlin": 3, "date": 3, "join": 3, "june": 3, "9": 3, "data": [3, 5, 6], "leaderboard": 3, "final": [3, 5, 6], "submiss": 3, "19": 3, "juli": 3, "paper": [3, 6], "decemb": 3, "12": 3, "16": 3, "asru": 3, "workshop": 3, "interest": 3, "whether": 3, "academia": 3, "must": [3, 5, 6], "regist": 3, "complet": 3, "googl": 3, "form": 3, "below": 
3, "work": 3, "dai": 3, "send": 3, "invit": 3, "elig": [3, 5], "team": 3, "qualifi": 3, "adher": [3, 5], "publish": 3, "page": 3, "prior": 3, "submit": 3, "descript": [3, 6], "document": 3, "detail": [3, 6], "approach": [3, 5], "method": 3, "top": 3, "proceed": 3, "lei": 4, "xie": 4, "professor": 4, "northwestern": 4, "polytechn": 4, "univers": 4, "china": 4, "lxie": 4, "nwpu": 4, "edu": 4, "kong": 4, "aik": 4, "lee": 4, "senior": 4, "scientist": 4, "institut": 4, "infocomm": 4, "star": 4, "singapor": 4, "kongaik": 4, "ieee": 4, "org": 4, "zhiji": 4, "yan": 4, "princip": 4, "engin": 4, "alibaba": 4, "yzj": 4, "inc": 4, "shiliang": 4, "zhang": 4, "sly": 4, "zsl": 4, "yanmin": 4, "qian": 4, "shanghai": 4, "jiao": 4, "tong": 4, "yanminqian": 4, "sjtu": 4, "zhuo": 4, "chen": 4, "appli": 4, "microsoft": 4, "usa": 4, "zhuc": 4, "jian": 4, "wu": 4, "wujian": 4, "hui": 4, "bu": 4, "ceo": 4, "foundat": 4, "buhui": 4, "aishelldata": 4, "should": 5, "augment": 5, "allow": [5, 6], "ad": 5, "speed": 5, "perturb": 5, "tone": 5, "chang": 5, "permit": 5, "purpos": 5, "instead": [5, 6], "util": [5, 6], "tune": 5, "violat": 5, "strictli": [5, 6], "prohibit": [5, 6], "fine": 5, "cpcer": [5, 6], "lower": 5, "judg": 5, "superior": 5, "forc": 5, "align": 5, "obtain": [5, 6], "frame": 5, "level": 5, "classif": 5, "basi": 5, "shallow": 5, "fusion": 5, "end": 5, "e": [5, 6], "g": 5, "la": 5, "rnnt": 5, "transform": [5, 6], "come": 5, "right": 5, "interpret": 5, "belong": 5, "case": 5, "circumst": 5, "coordin": 5, "assign": 6, "illustr": 6, "aishell4": 6, "constrain": 6, "sourc": 6, "addition": 6, "corpu": 6, "soon": 6, "simpl": 6, "voic": 6, "activ": 6, "detect": 6, "vad": 6, "concaten": 6, "minimum": 6, "permut": 6, "charact": 6, "error": 6, "rate": 6, "calcul": 6, "step": 6, "firstli": 6, "refer": 6, "hypothesi": 6, "chronolog": 6, "order": 6, "secondli": 6, "cer": 6, "repeat": 6, "possibl": 6, "lowest": 6, "tthe": 6, "insert": 6, "Ins": 6, "substitut": 6, "delet": 6, "del": 6, "output": 6, "text": 6, "frac": 6, "mathcal": 6, "n_": 6, "100": 6, "where": 6, "usag": 6, "third": 6, "hug": 6, "face": 6, "list": 6, "clearli": 6, "privat": 6, "manual": 6, "simul": 6, "thei": 6, "mandatori": 6, "clear": 6, "scheme": 6, "delight": 7, "introduct": 7, "contact": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"baselin": 0, "overview": [0, 2], "quick": 0, "start": 0, "result": 0, "contact": 1, "dataset": 2, "train": [2, 6], "data": 2, "detail": 2, "alimeet": 2, "corpu": 2, "get": 2, "introduct": 3, "call": 3, "particip": 3, "timelin": 3, "aoe": 3, "time": 3, "guidelin": 3, "organ": 4, "rule": 5, "track": 6, "evalu": 6, "speaker": 6, "attribut": 6, "asr": 6, "metric": 6, "sub": 6, "arrang": 6, "i": 6, "fix": 6, "condit": 6, "ii": 6, "open": 6, "asru": 7, "2023": 7, "multi": 7, "channel": 7, "parti": 7, "meet": 7, "transcript": 7, "challeng": 7, "2": 7, "0": 7, "m2met2": 7, "content": 7}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 57}, "alltitles": {"Baseline": [[0, "baseline"]], "Overview": [[0, "overview"]], "Quick start": [[0, "quick-start"]], "Baseline results": [[0, "baseline-results"]], "Contact": [[1, "contact"]], "Datasets": [[2, "datasets"]], "Overview of training data": [[2, "overview-of-training-data"]], "Detail of AliMeeting corpus": [[2, 
"detail-of-alimeeting-corpus"]], "Get the data": [[2, "get-the-data"]], "Introduction": [[3, "introduction"]], "Call for participation": [[3, "call-for-participation"]], "Timeline(AOE Time)": [[3, "timeline-aoe-time"]], "Guidelines": [[3, "guidelines"]], "Organizers": [[4, "organizers"]], "Rules": [[5, "rules"]], "Track & Evaluation": [[6, "track-evaluation"]], "Speaker-Attributed ASR": [[6, "speaker-attributed-asr"]], "Evaluation metric": [[6, "evaluation-metric"]], "Sub-track arrangement": [[6, "sub-track-arrangement"]], "Sub-track I (Fixed Training Condition):": [[6, "sub-track-i-fixed-training-condition"]], "Sub-track II (Open Training Condition):": [[6, "sub-track-ii-open-training-condition"]], "ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)": [[7, "asru-2023-multi-channel-multi-party-meeting-transcription-challenge-2-0-m2met2-0"]], "Contents:": [[7, null]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["Baseline", "Contact", "Dataset", "Introduction", "Organizers", "Rules", "Track_setting_and_evaluation", "index"], "filenames": ["Baseline.md", "Contact.md", "Dataset.md", "Introduction.md", "Organizers.md", "Rules.md", "Track_setting_and_evaluation.md", "index.rst"], "titles": ["Baseline", "Contact", "Datasets", "Introduction", "Organizers", "Rules", "Track &amp; Evaluation", "ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)"], "terms": {"we": [0, 2, 3, 7], "releas": [0, 2, 3, 6], "an": [0, 2, 3, 6], "e2": 0, "sa": 0, "asr": [0, 3, 7], "conduct": [0, 2], "funasr": 0, "time": [0, 6], "accord": [0, 3], "timelin": [0, 2], "The": [0, 2, 3, 5, 6], "model": [0, 2, 3, 5, 6], "architectur": 0, "i": [0, 2, 3, 5], "shown": [0, 2], "figur": [0, 6], "3": [0, 2, 3], "speakerencod": 0, "initi": 0, "pre": [0, 6], "train": [0, 3, 5, 7], "speaker": [0, 2, 3, 7], "verif": 0, "from": [0, 2, 3, 5, 6], "modelscop": [0, 6], "thi": [0, 3, 5, 6], "also": [0, 2, 3, 6], "us": [0, 2, 5, 6], "extract": 0, "embed": 0, "profil": 0, "To": [0, 2, 3, 7], "run": 0, "first": 0, "you": [0, 1], "need": 0, "instal": 0, "There": [0, 2], "ar": [0, 2, 3, 5, 6, 7], "two": [0, 3, 5, 7], "startup": 0, "script": [0, 2], "sh": 0, "evalu": [0, 2, 3, 7], "old": 0, "eval": [0, 2, 5, 6], "test": [0, 2, 3, 5, 6], "set": [0, 2, 3, 5, 6], "run_m2met_2023_inf": 0, "infer": 0, "new": [0, 2, 3, 6], "multi": [0, 3, 6], "channel": [0, 3], "parti": [0, 3, 6], "meet": [0, 2, 3, 6], "transcript": [0, 2, 3, 5, 6], "2": [0, 2, 6], "0": [0, 1, 2, 3], "m2met2": [0, 1, 3], "challeng": [0, 1, 3, 5, 6], "befor": 0, "must": [0, 3, 5, 6], "manual": [0, 6], "download": [0, 2], "unpack": 0, "alimeet": [0, 1, 6], "corpu": [0, 6], "place": [0, 2], "dataset": [0, 3, 5, 6, 7], "directori": 0, "eval_ali_far": 0, "eval_ali_near": 0, "test_ali_far": 0, "test_ali_near": 0, "train_ali_far": 0, "train_ali_near": 0, "test_2023_ali_far": 0, "after": 0, "which": [0, 2, 3, 6], "contain": [0, 2, 6], "onli": [0, 2, 5, 6], "raw": 0, "audio": [0, 2, 3, 6], "Then": 0, "put": 0, "given": 0, "wav": 0, "scp": 0, "wav_raw": 0, "segment": [0, 2, 6], "utt2spk": 0, "spk2utt": 0, "data": [0, 3, 5, 6], "For": [0, 2], "more": [0, 2], "detail": [0, 3, 6], "can": [0, 2, 3, 5, 6], "see": 0, "here": 0, "system": [0, 3, 5, 6, 7], "tabl": [0, 2], "adopt": 0, "oracl": [0, 6], "dure": [0, 2, 6], "howev": [0, 3, 6], "due": [0, 3], "lack": 0, "label": [0, 5, 6], "provid": [0, 2, 6, 7], "addit": [0, 6], "spectral": 0, "cluster": 0, "meanwhil": 0, "show": 0, "impact": 0, "accuraci": [0, 6], "If": [1, 5, 6], "have": [1, 3], "ani": [1, 5, 6], "question": 1, "about": [1, 3], "pleas": 1, "u": [1, 2], "email": [1, 3, 4], "m2met": [1, 3, 6, 7], "gmail": 1, "com": [1, 4], "wechat": [1, 3], "group": [1, 2, 3], "In": [2, 3, 5], "fix": [2, 3, 7], "condit": [2, 3, 7], "restrict": 2, "three": [2, 3, 6], "publicli": [2, 6], "avail": [2, 6], "corpora": 2, "name": 2, "aishel": [2, 4, 6], "4": [2, 6], "cn": [2, 4, 6], "celeb": [2, 6], "perform": [2, 3], "call": 2, "2023": [2, 3, 5, 6], "score": [2, 6], "rank": [2, 3, 6], "describ": 2, "118": 2, "75": 2, "hour": [2, 3, 6], "speech": [2, 3, 6, 7], "total": [2, 6], "divid": [2, 6], "104": 2, "10": [2, 3, 6], "specif": [2, 6], "212": 2, "8": 2, "20": [2, 3], "session": [2, 3, 6, 7], "respect": 2, "each": [2, 3, 6], "consist": [2, 6], "15": 2, "30": 2, "minut": 2, "discuss": 2, "particip": [2, 5, 6], "number": [2, 3, 6], "456": 2, "25": 2, "60": 2, "balanc": 2, "gender": 2, "coverag": 2, "collect": 2, 
"13": 2, "venu": 2, "categor": 2, "type": 2, "small": 2, "medium": 2, "larg": [2, 3], "room": [2, 3], "size": 2, "rang": 2, "m": 2, "55": 2, "differ": [2, 3, 6], "give": 2, "varieti": 2, "acoust": [2, 3, 6], "properti": 2, "layout": 2, "paramet": [2, 5], "togeth": 2, "wall": 2, "materi": 2, "cover": 2, "cement": 2, "glass": 2, "etc": 2, "other": 2, "furnish": 2, "includ": [2, 3, 5, 6], "sofa": 2, "tv": 2, "blackboard": 2, "fan": 2, "air": 2, "condition": 2, "plant": 2, "record": [2, 6], "sit": 2, "around": 2, "microphon": [2, 3], "arrai": [2, 3], "natur": 2, "convers": 2, "distanc": 2, "5": 2, "all": [2, 3, 5, 6], "nativ": 2, "chines": 2, "speak": [2, 3], "mandarin": [2, 3], "without": 2, "strong": 2, "accent": 2, "variou": [2, 3], "kind": 2, "indoor": 2, "nois": [2, 3, 5], "limit": [2, 3, 5], "click": 2, "keyboard": 2, "door": 2, "open": [2, 3, 7], "close": [2, 3], "bubbl": 2, "made": [2, 3], "both": [2, 6], "requir": [2, 3, 6], "remain": [2, 3], "same": [2, 5], "posit": 2, "overlap": [2, 3], "between": [2, 6], "exampl": 2, "fig": 2, "1": 2, "within": [2, 3], "one": [2, 5], "ensur": 2, "ratio": 2, "select": [2, 3, 5, 6], "topic": 2, "medic": 2, "treatment": 2, "educ": 2, "busi": 2, "organ": [2, 3, 5, 6, 7], "manag": 2, "industri": [2, 3], "product": 2, "daili": 2, "routin": 2, "averag": 2, "42": 2, "27": 2, "34": 2, "76": 2, "A": [2, 4], "distribut": 2, "were": 2, "ident": [2, 6], "compris": [2, 3, 7], "therebi": 2, "share": 2, "similar": 2, "configur": 2, "field": [2, 3, 6], "signal": [2, 3], "headset": 2, "": [2, 6], "own": 2, "transcrib": [2, 3, 6], "It": [2, 6], "worth": [2, 6], "note": [2, 6], "far": [2, 3], "synchron": 2, "common": 2, "prepar": 2, "textgrid": 2, "format": 2, "inform": [2, 3], "durat": 2, "id": 2, "timestamp": [2, 6], "mention": 2, "abov": 2, "openslr": 2, "via": 2, "follow": [2, 5], "link": 2, "particularli": 2, "baselin": [2, 3, 7], "conveni": 2, "automat": [3, 7], "recognit": [3, 7], "diariz": 3, "signific": 3, "stride": 3, "recent": 3, "year": 3, "result": 3, "surg": 3, "technologi": 3, "applic": 3, "across": 3, "domain": 3, "present": 3, "uniqu": [3, 6], "complex": [3, 5], "divers": 3, "style": 3, "variabl": 3, "confer": 3, "environment": 3, "reverber": [3, 5], "over": 3, "sever": 3, "been": 3, "advanc": [3, 7], "develop": [3, 6], "rich": 3, "comput": [3, 5], "hear": 3, "multisourc": 3, "environ": 3, "chime": 3, "latest": 3, "iter": 3, "ha": 3, "particular": 3, "focu": 3, "distant": 3, "gener": 3, "topologi": 3, "scenario": 3, "while": 3, "progress": 3, "english": 3, "languag": [3, 5], "barrier": 3, "achiev": 3, "compar": 3, "non": 3, "multimod": 3, "base": 3, "process": [3, 6], "misp": 3, "instrument": 3, "seek": 3, "address": 3, "problem": 3, "visual": 3, "everydai": 3, "home": 3, "focus": 3, "tackl": 3, "issu": 3, "offlin": 3, "icassp2022": 3, "main": 3, "task": [3, 6, 7], "former": 3, "involv": [3, 6], "identifi": 3, "who": 3, "spoke": 3, "when": 3, "latter": 3, "aim": 3, "multipl": [3, 6], "simultan": 3, "pose": [3, 6], "technic": 3, "difficulti": 3, "interfer": 3, "build": [3, 6, 7], "success": [3, 7], "previou": 3, "excit": 3, "propos": [3, 7], "asru": 3, "special": [3, 5, 7], "origin": [3, 5], "metric": [3, 7], "wa": [3, 6], "independ": 3, "meant": 3, "could": 3, "determin": 3, "correspond": [3, 5], "further": 3, "current": [3, 7], "talker": [3, 7], "toward": 3, "practic": 3, "attribut": [3, 7], "sub": [3, 5, 7], "track": [3, 5, 7], "what": 3, "facilit": [3, 7], "reproduc": [3, 7], "research": [3, 4, 7], "offer": 3, "comprehens": [3, 7], "overview": [3, 
7], "rule": [3, 7], "furthermor": 3, "carefulli": 3, "curat": 3, "approxim": [3, 6], "design": 3, "enabl": 3, "valid": 3, "state": [3, 6, 7], "art": [3, 7], "area": 3, "april": 3, "29": 3, "registr": 3, "mai": 3, "11": 3, "22": 3, "deadlin": 3, "date": 3, "join": 3, "june": 3, "16": 3, "leaderboard": 3, "final": [3, 5, 6], "submiss": 3, "leaderboar": 3, "26": 3, "juli": 3, "paper": [3, 6], "decemb": 3, "12": 3, "workshop": 3, "interest": 3, "whether": 3, "academia": 3, "regist": 3, "complet": 3, "googl": 3, "form": 3, "below": 3, "welcom": 3, "keep": 3, "up": 3, "updat": 3, "work": 3, "dai": 3, "send": 3, "invit": 3, "elig": [3, 5], "team": 3, "qualifi": 3, "adher": [3, 5], "publish": 3, "page": 3, "prior": 3, "submit": 3, "descript": [3, 6], "document": 3, "approach": [3, 5], "method": 3, "top": 3, "asru2023": [3, 7], "proceed": 3, "lei": 4, "xie": 4, "professor": 4, "foundat": 4, "china": 4, "lxie": 4, "nwpu": 4, "edu": 4, "kong": 4, "aik": 4, "lee": 4, "senior": 4, "scientist": 4, "institut": 4, "infocomm": 4, "star": 4, "singapor": 4, "kongaik": 4, "ieee": 4, "org": 4, "zhiji": 4, "yan": 4, "princip": 4, "engin": 4, "alibaba": 4, "yzj": 4, "inc": 4, "shiliang": 4, "zhang": 4, "sly": 4, "zsl": 4, "yanmin": 4, "qian": 4, "shanghai": 4, "jiao": 4, "tong": 4, "univers": 4, "yanminqian": 4, "sjtu": 4, "zhuo": 4, "chen": 4, "appli": 4, "microsoft": 4, "usa": 4, "zhuc": 4, "jian": 4, "wu": 4, "wujian": 4, "hui": 4, "bu": 4, "ceo": 4, "buhui": 4, "aishelldata": 4, "should": 5, "augment": 5, "allow": [5, 6], "ad": 5, "speed": 5, "perturb": 5, "tone": 5, "chang": 5, "permit": 5, "purpos": 5, "instead": [5, 6], "util": [5, 6], "tune": 5, "violat": 5, "strictli": [5, 6], "prohibit": [5, 6], "fine": 5, "cpcer": [5, 6], "lower": 5, "judg": 5, "superior": 5, "forc": 5, "align": 5, "obtain": [5, 6], "frame": 5, "level": 5, "classif": 5, "basi": 5, "shallow": 5, "fusion": 5, "end": 5, "e": [5, 6], "g": 5, "la": 5, "rnnt": 5, "transform": [5, 6], "come": 5, "right": 5, "interpret": 5, "belong": 5, "case": 5, "circumst": 5, "coordin": 5, "assign": 6, "illustr": 6, "aishell4": 6, "constrain": 6, "sourc": 6, "addition": 6, "soon": 6, "simpl": 6, "voic": 6, "activ": 6, "detect": 6, "vad": 6, "concaten": 6, "minimum": 6, "permut": 6, "charact": 6, "error": 6, "rate": 6, "calcul": 6, "step": 6, "firstli": 6, "refer": 6, "hypothesi": 6, "chronolog": 6, "order": 6, "secondli": 6, "cer": 6, "repeat": 6, "possibl": 6, "lowest": 6, "tthe": 6, "insert": 6, "Ins": 6, "substitut": 6, "delet": 6, "del": 6, "output": 6, "text": 6, "frac": 6, "mathcal": 6, "n_": 6, "100": 6, "where": 6, "usag": 6, "third": 6, "hug": 6, "face": 6, "list": 6, "clearli": 6, "privat": 6, "simul": 6, "thei": 6, "mandatori": 6, "clear": 6, "scheme": 6, "delight": 7, "introduct": 7, "contact": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"baselin": 0, "overview": [0, 2], "quick": 0, "start": 0, "result": 0, "contact": 1, "dataset": 2, "train": [2, 6], "data": 2, "detail": 2, "alimeet": 2, "corpu": 2, "get": 2, "introduct": 3, "call": 3, "particip": 3, "timelin": 3, "aoe": 3, "time": 3, "guidelin": 3, "organ": 4, "rule": 5, "track": 6, "evalu": 6, "speaker": 6, "attribut": 6, "asr": 6, "metric": 6, "sub": 6, "arrang": 6, "i": 6, "fix": 6, "condit": 6, "ii": 6, "open": 6, "asru": 7, "2023": 7, "multi": 7, "channel": 7, "parti": 7, "meet": 7, "transcript": 7, "challeng": 7, "2": 7, "0": 7, "m2met2": 7, "content": 7}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, 
"sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 57}, "alltitles": {"Baseline": [[0, "baseline"]], "Overview": [[0, "overview"]], "Quick start": [[0, "quick-start"]], "Baseline results": [[0, "baseline-results"]], "Contact": [[1, "contact"]], "Datasets": [[2, "datasets"]], "Overview of training data": [[2, "overview-of-training-data"]], "Detail of AliMeeting corpus": [[2, "detail-of-alimeeting-corpus"]], "Get the data": [[2, "get-the-data"]], "Introduction": [[3, "introduction"]], "Call for participation": [[3, "call-for-participation"]], "Timeline(AOE Time)": [[3, "timeline-aoe-time"]], "Guidelines": [[3, "guidelines"]], "Organizers": [[4, "organizers"]], "Rules": [[5, "rules"]], "Track & Evaluation": [[6, "track-evaluation"]], "Speaker-Attributed ASR": [[6, "speaker-attributed-asr"]], "Evaluation metric": [[6, "evaluation-metric"]], "Sub-track arrangement": [[6, "sub-track-arrangement"]], "Sub-track I (Fixed Training Condition):": [[6, "sub-track-i-fixed-training-condition"]], "Sub-track II (Open Training Condition):": [[6, "sub-track-ii-open-training-condition"]], "ASRU 2023 MULTI-CHANNEL MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0 (M2MeT2.0)": [[7, "asru-2023-multi-channel-multi-party-meeting-transcription-challenge-2-0-m2met2-0"]], "Contents:": [[7, null]]}, "indexentries": {}})
\ No newline at end of file
diff --git a/docs/m2met2/conf.py b/docs/m2met2/conf.py
index 19b93f5..9b2a9b5 100644
--- a/docs/m2met2/conf.py
+++ b/docs/m2met2/conf.py
@@ -7,7 +7,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = 'm2met2'
+project = 'MULTI-PARTY MEETING TRANSCRIPTION CHALLENGE 2.0'
 copyright = '2023, Speech Lab, Alibaba Group; ASLP Group, Northwestern Polytechnical University'
 author = 'Speech Lab, Alibaba Group; Audio, Speech and Language Processing Group, Northwestern Polytechnical University'
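Since `project` feeds the HTML `<title>` strings seen throughout the hunks above, the built pages under `_build/html` must be regenerated for the change to show up. A minimal rebuild sketch (assuming Sphinx is installed; source and output paths are taken from this patch):
```shell
# Rebuild the English and Chinese sites so the new project name lands in the titles
sphinx-build -b html docs/m2met2 docs/m2met2/_build/html
sphinx-build -b html docs/m2met2_cn docs/m2met2_cn/_build/html
```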
 
diff --git a/docs/m2met2/images/baseline_result.png b/docs/m2met2/images/baseline_result.png
index d51d775..6b76361 100644
--- a/docs/m2met2/images/baseline_result.png
+++ b/docs/m2met2/images/baseline_result.png
Binary files differ
diff --git a/docs/m2met2/images/qrcode.png b/docs/m2met2/images/qrcode.png
new file mode 100644
index 0000000..fc4c349
--- /dev/null
+++ b/docs/m2met2/images/qrcode.png
Binary files differ
diff --git a/docs/m2met2_cn/_build/doctrees/environment.pickle b/docs/m2met2_cn/_build/doctrees/environment.pickle
index fb92f83..8426df6 100644
--- a/docs/m2met2_cn/_build/doctrees/environment.pickle
+++ b/docs/m2met2_cn/_build/doctrees/environment.pickle
Binary files differ
diff --git a/docs/m2met2_cn/_build/doctrees/index.doctree b/docs/m2met2_cn/_build/doctrees/index.doctree
index 77742ac..1677b8b 100644
--- a/docs/m2met2_cn/_build/doctrees/index.doctree
+++ b/docs/m2met2_cn/_build/doctrees/index.doctree
Binary files differ
diff --git "a/docs/m2met2_cn/_build/doctrees/\345\237\272\347\272\277.doctree" "b/docs/m2met2_cn/_build/doctrees/\345\237\272\347\272\277.doctree"
index f88f67d..e9e895c 100644
--- "a/docs/m2met2_cn/_build/doctrees/\345\237\272\347\272\277.doctree"
+++ "b/docs/m2met2_cn/_build/doctrees/\345\237\272\347\272\277.doctree"
Binary files differ
diff --git "a/docs/m2met2_cn/_build/doctrees/\347\256\200\344\273\213.doctree" "b/docs/m2met2_cn/_build/doctrees/\347\256\200\344\273\213.doctree"
index 373ee34..595b41e 100644
--- "a/docs/m2met2_cn/_build/doctrees/\347\256\200\344\273\213.doctree"
+++ "b/docs/m2met2_cn/_build/doctrees/\347\256\200\344\273\213.doctree"
Binary files differ
diff --git "a/docs/m2met2_cn/_build/doctrees/\350\201\224\347\263\273\346\226\271\345\274\217.doctree" "b/docs/m2met2_cn/_build/doctrees/\350\201\224\347\263\273\346\226\271\345\274\217.doctree"
index df477c8..6b8208a 100644
--- "a/docs/m2met2_cn/_build/doctrees/\350\201\224\347\263\273\346\226\271\345\274\217.doctree"
+++ "b/docs/m2met2_cn/_build/doctrees/\350\201\224\347\263\273\346\226\271\345\274\217.doctree"
Binary files differ
diff --git "a/docs/m2met2_cn/_build/doctrees/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.doctree" "b/docs/m2met2_cn/_build/doctrees/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.doctree"
index fa83a0c..c6be4ae 100644
--- "a/docs/m2met2_cn/_build/doctrees/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.doctree"
+++ "b/docs/m2met2_cn/_build/doctrees/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.doctree"
Binary files differ
diff --git a/docs/m2met2_cn/_build/html/.buildinfo b/docs/m2met2_cn/_build/html/.buildinfo
index 6b8368e..35632ee 100644
--- a/docs/m2met2_cn/_build/html/.buildinfo
+++ b/docs/m2met2_cn/_build/html/.buildinfo
@@ -1,4 +1,4 @@
 # Sphinx build info version 1
 # This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
-config: 5462207d1656a9ae4ca43c2890d094be
+config: 06d9c1d4093817b45b9d4df7ab350eaf
 tags: 645f666f9bcd5a90fca523b33c5a78b7
diff --git a/docs/m2met2_cn/_build/html/_images/baseline_result.png b/docs/m2met2_cn/_build/html/_images/baseline_result.png
index d51d775..6b76361 100644
--- a/docs/m2met2_cn/_build/html/_images/baseline_result.png
+++ b/docs/m2met2_cn/_build/html/_images/baseline_result.png
Binary files differ
diff --git a/docs/m2met2_cn/_build/html/_images/qrcode.png b/docs/m2met2_cn/_build/html/_images/qrcode.png
new file mode 100644
index 0000000..fc4c349
--- /dev/null
+++ b/docs/m2met2_cn/_build/html/_images/qrcode.png
Binary files differ
diff --git a/docs/m2met2_cn/_build/html/_sources/index.rst.txt b/docs/m2met2_cn/_build/html/_sources/index.rst.txt
index c089b36..3d9f241 100644
--- a/docs/m2met2_cn/_build/html/_sources/index.rst.txt
+++ b/docs/m2met2_cn/_build/html/_sources/index.rst.txt
@@ -5,8 +5,8 @@
 
 ASRU 2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0
 ==================================================================================
-Building on the success of the previous M2MET challenge, we will continue to host the M2MET2.0 challenge at ASRU2023.
-To push current multi-talker speech recognition systems toward practical use, the M2MET2.0 challenge will be evaluated in a speaker-attributed manner and will set up two sub-tracks: fixed (constrained-data) and open (unconstrained-data) training conditions.
+Building on the success of the previous M2MeT challenge, we will continue to host the M2MeT2.0 challenge at ASRU2023.
+To push current multi-talker speech recognition systems toward practical use, the M2MeT2.0 challenge will be evaluated in a speaker-attributed manner and will set up two sub-tracks: fixed (constrained-data) and open (unconstrained-data) training conditions.
 We provide a detailed introduction to the dataset, rules, baseline system and evaluation methods to further promote research in multi-speaker speech recognition.
 
 .. toctree::
diff --git "a/docs/m2met2_cn/_build/html/_sources/\345\237\272\347\272\277.md.txt" "b/docs/m2met2_cn/_build/html/_sources/\345\237\272\347\272\277.md.txt"
index e4d02f7..e8fc32c 100644
--- "a/docs/m2met2_cn/_build/html/_sources/\345\237\272\347\272\277.md.txt"
+++ "b/docs/m2met2_cn/_build/html/_sources/\345\237\272\347\272\277.md.txt"
@@ -5,8 +5,29 @@
 ![model architecture](images/sa_asr_arch.png)
 
 ## Quick start
-#TODO: fill with the README.md of the baseline
-
+First, you need to install FunASR and ModelScope ([installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)).  
+The baseline comes with two scripts: `run.sh` trains the baseline system and evaluates it on the M2MeT Eval and Test sets, while `run_m2met_2023_infer.sh` runs inference on the brand-new test set of this challenge and also produces files in the final submission format.
+Before running `run.sh`, you need to download and unpack the [AliMeeting](http://www.openslr.org/119/) corpus yourself and place it in the `./dataset` directory:
+```shell
+dataset
+|—— Eval_Ali_far
+|—— Eval_Ali_near
+|—— Test_Ali_far
+|—— Test_Ali_near
+|—— Train_Ali_far
+|—— Train_Ali_near
+```
+Before running `run_m2met_2023_infer.sh`, place the test set `Test_2023_Ali_far` (audio only, to be released on June 16) in the `./dataset` directory. Then put the organizer-provided `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.
+```shell
+data/Test_2023_Ali_far
+|—— wav.scp
+|—— wav_raw.scp
+|—— segments
+|—— utt2spk
+|—— spk2utt
+```
+For more details on the baseline system, see [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md).
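+
+For orientation, the five files above follow the standard Kaldi-style data layout consumed by FunASR recipes; the line formats below are a sketch with invented IDs for illustration:
+```shell
+# Hypothetical example lines (IDs are made up; the real ones ship with the release)
+head -n 1 data/Test_2023_Ali_far/wav.scp
+#   R0001_M0001 dataset/Test_2023_Ali_far/audio/R0001_M0001.wav
+head -n 1 data/Test_2023_Ali_far/segments
+#   R0001_M0001-0001 R0001_M0001 0.52 3.87
+head -n 1 data/Test_2023_Ali_far/utt2spk
+#   R0001_M0001-0001 SPK001
+```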
 ## Baseline results
 The results of the baseline system are shown in Table 3. During training, the speaker profile uses oracle speaker embeddings. However, since oracle speaker labels are unavailable during evaluation, speaker features produced by an additional spectral clustering step are used instead. We also provide results with the oracle speaker profile on the Eval and Test sets to show the impact of speaker-profile accuracy.
-![baseline result](images/baseline_result.png)
\ No newline at end of file
+
+![baseline_result](images/baseline_result.png)
\ No newline at end of file
diff --git "a/docs/m2met2_cn/_build/html/_sources/\347\256\200\344\273\213.md.txt" "b/docs/m2met2_cn/_build/html/_sources/\347\256\200\344\273\213.md.txt"
index 52df97d..be456ff 100644
--- "a/docs/m2met2_cn/_build/html/_sources/\347\256\200\344\273\213.md.txt"
+++ "b/docs/m2met2_cn/_build/html/_sources/\347\256\200\344\273\213.md.txt"
@@ -1,32 +1,33 @@
 # Introduction
 ## Challenge introduction
+
 Recent advances in speech processing technologies such as automatic speech recognition (ASR) and speaker diarization have spurred a wide range of intelligent speech applications. However, meeting scenarios remain one of the most challenging tasks due to complex acoustic conditions and diverse speaking styles, including overlapping speech, varying numbers of speakers, far-field signals in large conference rooms, and environmental noise and reverberation.
 
 To advance meeting-scenario speech recognition, many related challenges have been held, such as the Rich Transcription evaluation and the CHIME (Computational Hearing in Multisource Environments) challenge. The latest CHIME challenge focuses on distant automatic speech recognition and on developing systems that generalize across arrays with different topologies and across application scenarios, but differences between languages have limited progress on non-English meeting transcription. The MISP (Multimodal Information Based Speech Processing) and M2MeT (Multi-Channel Multi-Party Meeting Transcription) challenges have contributed to advancing Mandarin meeting-scenario speech recognition: MISP focuses on audio-visual multimodal approaches to far-field multi-microphone signal processing in everyday home environments, while M2MeT focuses on the overlapping-speech problem of offline meeting-room transcription.
 
-The ASSP2022 M2MeT challenge focuses on meeting scenarios and comprises two tracks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying "who spoke when", while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.
+The ICASSP2022 M2MeT challenge focuses on meeting scenarios and comprises two tracks: speaker diarization and multi-speaker automatic speech recognition. The former involves identifying "who spoke when", while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.
 
-Building on the success of the previous M2MET challenge, we will continue to host the M2MET2.0 challenge at ASRU2023. In the previous M2MET challenge, the evaluation metric was speaker-independent: we could only obtain the recognized text, not the corresponding speaker.
-To address this limitation and push current multi-talker speech recognition systems toward practical use, the M2MET2.0 challenge will be evaluated in a speaker-attributed manner and will set up two sub-tracks: fixed (constrained-data) and open (unconstrained-data) training conditions. By attributing speech to specific speakers, this task aims to improve the accuracy and applicability of multi-talker ASR systems in real-world environments.
+Building on the success of the previous M2MeT challenge, we will continue to host the M2MeT2.0 challenge at ASRU 2023. In the previous M2MeT challenge, the evaluation metric was speaker-independent: we could only obtain the recognized text, not the corresponding speaker.
+To address this limitation and push current multi-talker speech recognition systems toward practical use, the M2MeT2.0 challenge will be evaluated in a speaker-attributed manner and will set up two sub-tracks: fixed (constrained-data) and open (unconstrained-data) training conditions. By attributing speech to specific speakers, this task aims to improve the accuracy and applicability of multi-talker ASR systems in real-world environments.
 We provide a detailed introduction to the dataset, rules, baseline system and evaluation methods to further promote research in multi-speaker speech recognition. In addition, we will release a brand-new test set of approximately 10 hours of audio according to the timeline.
 
 
 ## Timeline (AOE time)
 
 - $ 2023.4.29: $ Registration opens
-- $ 2023.5.8: $ Baseline release
-- $ 2023.5.15: $ Registration deadline
-- $ 2023.6.9: $ Test data release
-- $ 2023.6.13: $ Final submission deadline
-- $ 2023.6.19: $ Evaluation results and ranking release
-- $ 2023.7.3: $ Paper submission deadline
-- $ 2023.7.10: $ Final paper submission deadline
-- $ 2023.12.12: $ ASRU Workshop & challenge session
+- $ 2023.5.11: $ Baseline release
+- $ 2023.5.22: $ Registration deadline
+- $ 2023.6.16: $ Test data release, leaderboard opens
+- $ 2023.6.20: $ Final submission deadline, leaderboard closes
+- $ 2023.6.26: $ Evaluation results and ranking release
+- $ 2023.7.3: $ Paper submission deadline (submit via the official ASRU2023 system and select the challenge session)
+- $ 2023.7.10: $ Final paper submission deadline (submit via the official ASRU2023 system and select the challenge session)
+- $ 2023.12.12: $ ASRU Workshop & Challenge Session
 
 ## Registration
 
-Interested participants from academia and industry should fill in the Google form below on or before May 15, 2023:
+Interested participants from academia and industry should fill in the Google form below on or before May 22, 2023. Participants are also welcome to join the official WeChat group (https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/%E8%81%94%E7%B3%BB%E6%96%B9%E5%BC%8F.html) to communicate and get the latest news about the challenge:
 
-[M2MET2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
+[M2MeT2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
 
-The organizer will notify eligible teams by email within three working days; teams must follow the challenge rules to be published on the challenge website. Before the ranking release, each participant must submit a system description document detailing the methods and models used. The organizer will select the top three teams for inclusion in the ASRU2023 Proceedings.
\ No newline at end of file
+The organizer will notify eligible teams by email within three working days; teams must follow the challenge rules to be published on the challenge website. Before the ranking release, each participant must submit a system description document detailing the methods and models used. The organizer will include the top-ranking teams in the ASRU2023 Proceedings.
\ No newline at end of file
diff --git "a/docs/m2met2_cn/_build/html/_sources/\350\201\224\347\263\273\346\226\271\345\274\217.md.txt" "b/docs/m2met2_cn/_build/html/_sources/\350\201\224\347\263\273\346\226\271\345\274\217.md.txt"
index 5c65ca0..fd8f9a4 100644
--- "a/docs/m2met2_cn/_build/html/_sources/\350\201\224\347\263\273\346\226\271\345\274\217.md.txt"
+++ "b/docs/m2met2_cn/_build/html/_sources/\350\201\224\347\263\273\346\226\271\345\274\217.md.txt"
@@ -1,9 +1,9 @@
 # Contact
-If you have any questions about the M2MET2.0 challenge, feel free to contact us via:
+If you have any questions about the M2MeT2.0 challenge, feel free to contact us via:
 
 - Email: [m2met.alimeeting@gmail.com](mailto:m2met.alimeeting@gmail.com)
 
-|              M2MET2.0绔炶禌瀹樻柟寰俊缇�         |
+|              M2MeT2.0绔炶禌瀹樻柟寰俊缇�         |
 |:------------------------------------------:|
-<!-- | <img src="images/wechat.png" width="300"/> | -->
+| <img src="images/qrcode.png" width="300"/> |
 
diff --git "a/docs/m2met2_cn/_build/html/_sources/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md.txt" "b/docs/m2met2_cn/_build/html/_sources/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md.txt"
index 94a6236..ccfbdf3 100644
--- "a/docs/m2met2_cn/_build/html/_sources/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md.txt"
+++ "b/docs/m2met2_cn/_build/html/_sources/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md.txt"
@@ -1,6 +1,6 @@
 # Track Setting & Evaluation
 ## Speaker-attributed ASR
-The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed ASR task and the multi-speaker ASR task. In this challenge, the AliMeeting, Aishell4 and Cn-Celeb datasets can serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation and test sets, and in M2MET2.0 it can be used for training and evaluation. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking the challenge. Notably, for the Test-2023 set the organizer will no longer provide near-field headset audio, transcriptions or oracle timestamps; instead, segments containing multiple speakers, obtainable with a simple VAD model, will be provided.
+The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed ASR task and the multi-speaker ASR task. In this challenge, the AliMeeting, Aishell4 and Cn-Celeb datasets can serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation and test sets, and in M2MeT2.0 it can be used for training and evaluation. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking the challenge. Notably, for the Test-2023 set the organizer will no longer provide near-field headset audio, transcriptions or oracle timestamps; instead, segments containing multiple speakers, obtainable with a simple VAD model, will be provided.
 
 ![task difference](images/task_diff.png)
 
diff --git a/docs/m2met2_cn/_build/html/genindex.html b/docs/m2met2_cn/_build/html/genindex.html
index 5558bcf..1eee622 100644
--- a/docs/m2met2_cn/_build/html/genindex.html
+++ b/docs/m2met2_cn/_build/html/genindex.html
@@ -14,7 +14,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Index &#8212; m2met2 documentation</title>
+    <title>Index &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -36,7 +36,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="#" title="鎬荤储寮�"
              accesskey="I">绱㈠紩</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绱㈠紩</a></li> 
       </ul>
     </div>
@@ -47,7 +47,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -133,7 +133,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="#" title="鎬荤储寮�"
              >绱㈠紩</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绱㈠紩</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2_cn/_build/html/index.html b/docs/m2met2_cn/_build/html/index.html
index fbc2fce..b7672cf 100644
--- a/docs/m2met2_cn/_build/html/index.html
+++ b/docs/m2met2_cn/_build/html/index.html
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>ASRU 2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0 &#8212; m2met2 documentation</title>
+    <title>ASRU 2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0 &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -41,7 +41,7 @@
         <li class="right" >
           <a href="%E7%AE%80%E4%BB%8B.html" title="绠�浠�"
              accesskey="N">涓嬩竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="#">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="#">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">ASRU 2023 澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬 2.0</a></li> 
       </ul>
     </div>
@@ -52,7 +52,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    #" class="text-logo">m2met2  鏂囨。</a>
+    #" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -121,8 +121,8 @@
             
   <section id="asru-2023-2-0">
 <h1>ASRU 2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0<a class="headerlink" href="#asru-2023-2-0" title="Permalink to this headline">¶</a></h1>
-<p>Building on the success of the previous M2MET challenge, we will continue with the M2MET2.0 challenge at ASRU2023.
-To push current multi-speaker speech recognition systems toward practical use, the M2MET2.0 challenge evaluates the speaker-attributed task and sets up two sub-tracks with fixed and open training data.
+<p>Building on the success of the previous M2MeT challenge, we will continue with the M2MeT2.0 challenge at ASRU2023.
+To push current multi-speaker speech recognition systems toward practical use, the M2MeT2.0 challenge evaluates the speaker-attributed task and sets up two sub-tracks with fixed and open training data.
 We describe the dataset, rules, baseline system, and evaluation methods in detail to further promote research on speaker-attributed speech recognition.</p>
 <div class="toctree-wrapper compound">
 <p class="caption" role="heading"><span class="caption-text">Contents:</span></p>
@@ -161,7 +161,7 @@
         <li class="right" >
           <a href="%E7%AE%80%E4%BB%8B.html" title="绠�浠�"
              >涓嬩竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="#">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="#">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">ASRU 2023 澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬 2.0</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2_cn/_build/html/objects.inv b/docs/m2met2_cn/_build/html/objects.inv
index 89341d3..d846652 100644
--- a/docs/m2met2_cn/_build/html/objects.inv
+++ b/docs/m2met2_cn/_build/html/objects.inv
Binary files differ
diff --git a/docs/m2met2_cn/_build/html/search.html b/docs/m2met2_cn/_build/html/search.html
index 4fe0684..ca234a0 100644
--- a/docs/m2met2_cn/_build/html/search.html
+++ b/docs/m2met2_cn/_build/html/search.html
@@ -14,7 +14,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Search &#8212; m2met2 documentation</title>
+    <title>Search &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     
@@ -42,7 +42,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="genindex.html" title="鎬荤储寮�"
              accesskey="I">绱㈠紩</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鎼滅储</a></li> 
       </ul>
     </div>
@@ -53,7 +53,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-toc">
     
@@ -149,7 +149,7 @@
         <li class="right" style="margin-right: 10px">
           <a href="genindex.html" title="鎬荤储寮�"
              >绱㈠紩</a></li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鎼滅储</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2_cn/_build/html/searchindex.js b/docs/m2met2_cn/_build/html/searchindex.js
index c9fe167..2e211ff 100644
--- a/docs/m2met2_cn/_build/html/searchindex.js
+++ b/docs/m2met2_cn/_build/html/searchindex.js
@@ -1 +1 @@
-Search.setIndex({"docnames": ["index", "\u57fa\u7ebf", "\u6570\u636e\u96c6", "\u7b80\u4ecb", "\u7ec4\u59d4\u4f1a", "\u8054\u7cfb\u65b9\u5f0f", "\u89c4\u5219", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30"], "filenames": ["index.rst", "\u57fa\u7ebf.md", "\u6570\u636e\u96c6.md", "\u7b80\u4ecb.md", "\u7ec4\u59d4\u4f1a.md", "\u8054\u7cfb\u65b9\u5f0f.md", "\u89c4\u5219.md", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30.md"], "titles": ["ASRU 2023 \u591a\u901a\u9053\u591a\u65b9\u4f1a\u8bae\u8f6c\u5f55\u6311\u6218 2.0", "\u57fa\u7ebf", "\u6570\u636e\u96c6", "\u7b80\u4ecb", "\u7ec4\u59d4\u4f1a", "\u8054\u7cfb\u65b9\u5f0f", "\u7ade\u8d5b\u89c4\u5219", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30"], "terms": {"m2met": [0, 3, 5, 7], "asru2023": [0, 3], "m2met2": [0, 3, 5, 7], "funasr": 1, "sa": 1, "asr": [1, 3, 7], "speakerencod": 1, "modelscop": [1, 7], "todo": 1, "fill": 1, "with": 1, "the": 1, "readm": 1, "md": 1, "of": 1, "baselin": [1, 2], "aishel": [2, 7], "cn": [2, 4, 7], "celeb": [2, 7], "test": [2, 6, 7], "2023": [2, 3, 6, 7], "118": 2, "75": 2, "104": 2, "train": 2, "eval": [2, 6], "10": [2, 3, 7], "212": 2, "15": [2, 3], "30": 2, "456": 2, "25": 2, "13": [2, 3], "55": 2, "42": 2, "27": 2, "34": 2, "76": 2, "20": 2, "textgrid": 2, "id": 2, "openslr": 2, "automat": 3, "speech": 3, "recognit": 3, "speaker": 3, "diariz": 3, "rich": 3, "transcript": 3, "evalu": 3, "chime": 3, "comput": 3, "hear": 3, "in": 3, "multisourc": 3, "environ": 3, "misp": 3, "multimod": 3, "inform": 3, "base": 3, "process": 3, "multi": 3, "channel": 3, "parti": 3, "meet": 3, "assp2022": 3, "29": 3, "19": 3, "12": 3, "asru": 3, "workshop": 3, "challeng": 3, "session": 3, "lxie": 4, "nwpu": 4, "edu": 4, "kong": 4, "aik": 4, "lee": 4, "star": 4, "kongaik": 4, "ieee": 4, "org": 4, "zhiji": 4, "yzj": 4, "alibaba": 4, "inc": 4, "com": [4, 5], "sli": 4, "zsl": 4, "yanminqian": 4, "sjtu": 4, "zhuc": 4, "microsoft": 4, "wujian": 4, "ceo": 4, "buhui": 4, "aishelldata": 4, "alimeet": [5, 7], "gmail": 5, "cpcer": [6, 7], "las": 6, "rnnt": 6, "transform": 6, "aishell4": 7, "vad": 7, "cer": 7, "ins": 7, "sub": 7, "del": 7, "text": 7, "frac": 7, "mathcal": 7, "n_": 7, "total": 7, "time": 7, "100": 7, "hug": 7, "face": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"asru": 0, "2023": 0, "alimeet": 2, "aoe": 3}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 57}, "alltitles": {"ASRU 2023 \u591a\u901a\u9053\u591a\u65b9\u4f1a\u8bae\u8f6c\u5f55\u6311\u6218 2.0": [[0, "asru-2023-2-0"]], "\u76ee\u5f55:": [[0, null]], "\u57fa\u7ebf": [[1, "id1"]], "\u57fa\u7ebf\u6982\u8ff0": [[1, "id2"]], "\u5feb\u901f\u5f00\u59cb": [[1, "id3"]], "\u57fa\u7ebf\u7ed3\u679c": [[1, "id4"]], "\u6570\u636e\u96c6": [[2, "id1"]], "\u6570\u636e\u96c6\u6982\u8ff0": [[2, "id2"]], "Alimeeting\u6570\u636e\u96c6\u4ecb\u7ecd": [[2, "alimeeting"]], "\u83b7\u53d6\u6570\u636e": [[2, "id3"]], "\u7b80\u4ecb": [[3, "id1"]], "\u7ade\u8d5b\u4ecb\u7ecd": [[3, "id2"]], "\u65f6\u95f4\u5b89\u6392(AOE\u65f6\u95f4)": [[3, "aoe"]], "\u7ade\u8d5b\u62a5\u540d": [[3, "id3"]], "\u7ec4\u59d4\u4f1a": [[4, "id1"]], "\u8054\u7cfb\u65b9\u5f0f": [[5, "id1"]], "\u7ade\u8d5b\u89c4\u5219": [[6, "id1"]], "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30": [[7, "id1"]], "\u8bf4\u8bdd\u4eba\u76f8\u5173\u7684\u8bed\u97f3\u8bc6\u522b": [[7, "id2"]], 
"\u8bc4\u4f30\u65b9\u6cd5": [[7, "id3"]], "\u5b50\u8d5b\u9053\u8bbe\u7f6e": [[7, "id4"]], "\u5b50\u8d5b\u9053\u4e00 (\u9650\u5b9a\u8bad\u7ec3\u6570\u636e):": [[7, "id5"]], "\u5b50\u8d5b\u9053\u4e8c (\u5f00\u653e\u8bad\u7ec3\u6570\u636e):": [[7, "id6"]]}, "indexentries": {}})
\ No newline at end of file
+Search.setIndex({"docnames": ["index", "\u57fa\u7ebf", "\u6570\u636e\u96c6", "\u7b80\u4ecb", "\u7ec4\u59d4\u4f1a", "\u8054\u7cfb\u65b9\u5f0f", "\u89c4\u5219", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30"], "filenames": ["index.rst", "\u57fa\u7ebf.md", "\u6570\u636e\u96c6.md", "\u7b80\u4ecb.md", "\u7ec4\u59d4\u4f1a.md", "\u8054\u7cfb\u65b9\u5f0f.md", "\u89c4\u5219.md", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30.md"], "titles": ["ASRU 2023 \u591a\u901a\u9053\u591a\u65b9\u4f1a\u8bae\u8f6c\u5f55\u6311\u6218 2.0", "\u57fa\u7ebf", "\u6570\u636e\u96c6", "\u7b80\u4ecb", "\u7ec4\u59d4\u4f1a", "\u8054\u7cfb\u65b9\u5f0f", "\u7ade\u8d5b\u89c4\u5219", "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30"], "terms": {"m2met": [0, 1, 3, 5, 7], "asru2023": [0, 3], "m2met2": [0, 3, 5, 7], "funasr": 1, "sa": 1, "asr": [1, 3, 7], "speakerencod": 1, "modelscop": [1, 7], "instal": 1, "run": 1, "sh": 1, "run_m2met_2023_inf": 1, "alimeet": [1, 5, 7], "dataset": 1, "eval_ali_far": 1, "eval_ali_near": 1, "test_ali_far": 1, "test_ali_near": 1, "train_ali_far": 1, "train_ali_near": 1, "test_2023_ali_far": 1, "16": [1, 3], "wav": 1, "scp": 1, "wav_raw": 1, "segment": 1, "utt2spk": 1, "spk2utt": 1, "data": 1, "aishel": [2, 7], "cn": [2, 4, 7], "celeb": [2, 7], "test": [2, 6, 7], "2023": [2, 3, 6, 7], "118": 2, "75": 2, "104": 2, "train": 2, "eval": [2, 6], "10": [2, 3, 7], "212": 2, "15": 2, "30": 2, "456": 2, "25": 2, "13": 2, "55": 2, "42": 2, "27": 2, "34": 2, "76": 2, "20": [2, 3], "textgrid": 2, "id": 2, "openslr": 2, "baselin": 2, "automat": 3, "speech": 3, "recognit": 3, "speaker": 3, "diariz": 3, "rich": 3, "transcript": 3, "evalu": 3, "chime": 3, "comput": 3, "hear": 3, "in": 3, "multisourc": 3, "environ": 3, "misp": 3, "multimod": 3, "inform": 3, "base": 3, "process": 3, "multi": 3, "channel": 3, "parti": 3, "meet": 3, "iassp2022": 3, "asru": 3, "29": 3, "11": 3, "22": 3, "26": 3, "session": 3, "12": 3, "workshop": 3, "challeng": 3, "lxie": 4, "nwpu": 4, "edu": 4, "kong": 4, "aik": 4, "lee": 4, "star": 4, "kongaik": 4, "ieee": 4, "org": 4, "zhiji": 4, "yzj": 4, "alibaba": 4, "inc": 4, "com": [4, 5], "sli": 4, "zsl": 4, "yanminqian": 4, "sjtu": 4, "zhuc": 4, "microsoft": 4, "wujian": 4, "ceo": 4, "buhui": 4, "aishelldata": 4, "gmail": 5, "cpcer": [6, 7], "las": 6, "rnnt": 6, "transform": 6, "aishell4": 7, "vad": 7, "cer": 7, "ins": 7, "sub": 7, "del": 7, "text": 7, "frac": 7, "mathcal": 7, "n_": 7, "total": 7, "time": 7, "100": 7, "hug": 7, "face": 7}, "objects": {}, "objtypes": {}, "objnames": {}, "titleterms": {"asru": 0, "2023": 0, "alimeet": 2, "aoe": 3}, "envversion": {"sphinx.domains.c": 2, "sphinx.domains.changeset": 1, "sphinx.domains.citation": 1, "sphinx.domains.cpp": 8, "sphinx.domains.index": 1, "sphinx.domains.javascript": 2, "sphinx.domains.math": 2, "sphinx.domains.python": 3, "sphinx.domains.rst": 2, "sphinx.domains.std": 2, "sphinx": 57}, "alltitles": {"ASRU 2023 \u591a\u901a\u9053\u591a\u65b9\u4f1a\u8bae\u8f6c\u5f55\u6311\u6218 2.0": [[0, "asru-2023-2-0"]], "\u76ee\u5f55:": [[0, null]], "\u57fa\u7ebf": [[1, "id1"]], "\u57fa\u7ebf\u6982\u8ff0": [[1, "id2"]], "\u5feb\u901f\u5f00\u59cb": [[1, "id3"]], "\u57fa\u7ebf\u7ed3\u679c": [[1, "id4"]], "\u6570\u636e\u96c6": [[2, "id1"]], "\u6570\u636e\u96c6\u6982\u8ff0": [[2, "id2"]], "Alimeeting\u6570\u636e\u96c6\u4ecb\u7ecd": [[2, "alimeeting"]], "\u83b7\u53d6\u6570\u636e": [[2, "id3"]], "\u7b80\u4ecb": [[3, "id1"]], "\u7ade\u8d5b\u4ecb\u7ecd": [[3, "id2"]], "\u65f6\u95f4\u5b89\u6392(AOE\u65f6\u95f4)": [[3, "aoe"]], "\u7ade\u8d5b\u62a5\u540d": [[3, "id3"]], 
"\u7ec4\u59d4\u4f1a": [[4, "id1"]], "\u8054\u7cfb\u65b9\u5f0f": [[5, "id1"]], "\u7ade\u8d5b\u89c4\u5219": [[6, "id1"]], "\u8d5b\u9053\u8bbe\u7f6e\u4e0e\u8bc4\u4f30": [[7, "id1"]], "\u8bf4\u8bdd\u4eba\u76f8\u5173\u7684\u8bed\u97f3\u8bc6\u522b": [[7, "id2"]], "\u8bc4\u4f30\u65b9\u6cd5": [[7, "id3"]], "\u5b50\u8d5b\u9053\u8bbe\u7f6e": [[7, "id4"]], "\u5b50\u8d5b\u9053\u4e00 (\u9650\u5b9a\u8bad\u7ec3\u6570\u636e):": [[7, "id5"]], "\u5b50\u8d5b\u9053\u4e8c (\u5f00\u653e\u8bad\u7ec3\u6570\u636e):": [[7, "id6"]]}, "indexentries": {}})
\ No newline at end of file
diff --git "a/docs/m2met2_cn/_build/html/\345\237\272\347\272\277.html" "b/docs/m2met2_cn/_build/html/\345\237\272\347\272\277.html"
index f28043e..f1afb2d 100644
--- "a/docs/m2met2_cn/_build/html/\345\237\272\347\272\277.html"
+++ "b/docs/m2met2_cn/_build/html/\345\237\272\347\272\277.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Baseline &#8212; m2met2 documentation</title>
+    <title>Baseline &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="%E8%B5%9B%E9%81%93%E8%AE%BE%E7%BD%AE%E4%B8%8E%E8%AF%84%E4%BC%B0.html" title="璧涢亾璁剧疆涓庤瘎浼�"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鍩虹嚎</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -132,12 +132,33 @@
 </section>
 <section id="id3">
 <h2>Quick Start<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h2>
-<p>#TODO: fill with the README.md of the baseline</p>
+<p>First, install FunASR and ModelScope (<a class="reference external" href="https://alibaba-damo-academy.github.io/FunASR/en/installation.html">installation</a>).<br />
+The baseline provides two scripts: <code class="docutils literal notranslate"><span class="pre">run.sh</span></code> trains the baseline system and evaluates it on the M2MeT Eval and Test sets, while <code class="docutils literal notranslate"><span class="pre">run_m2met_2023_infer.sh</span></code> decodes the new test set pre-released for this challenge and generates files in the final submission format.
+Before running <code class="docutils literal notranslate"><span class="pre">run.sh</span></code>, download and extract the <a class="reference external" href="http://www.openslr.org/119/">AliMeeting</a> dataset into the <code class="docutils literal notranslate"><span class="pre">./dataset</span></code> directory:</p>
+<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>dataset
+<span class="p">|</span>——<span class="w"> </span>Eval_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Eval_Ali_near
+<span class="p">|</span>——<span class="w"> </span>Test_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Test_Ali_near
+<span class="p">|</span>——<span class="w"> </span>Train_Ali_far
+<span class="p">|</span>——<span class="w"> </span>Train_Ali_near
+</pre></div>
+</div>
+<p>Before running <code class="docutils literal notranslate"><span class="pre">run_m2met_2023_infer.sh</span></code>, place the test set <code class="docutils literal notranslate"><span class="pre">Test_2023_Ali_far</span></code> (audio only, to be released on June 16) under the <code class="docutils literal notranslate"><span class="pre">./dataset</span></code> directory, then put the organizer-provided <code class="docutils literal notranslate"><span class="pre">wav.scp</span></code>, <code class="docutils literal notranslate"><span class="pre">wav_raw.scp</span></code>, <code class="docutils literal notranslate"><span class="pre">segments</span></code>, <code class="docutils literal notranslate"><span class="pre">utt2spk</span></code>, and <code class="docutils literal notranslate"><span class="pre">spk2utt</span></code> under the <code class="docutils literal notranslate"><span class="pre">./data/Test_2023_Ali_far</span></code> directory.</p>
+<div class="highlight-shell notranslate"><div class="highlight"><pre><span></span>data/Test_2023_Ali_far
+<span class="p">|</span>——<span class="w"> </span>wav.scp
+<span class="p">|</span>——<span class="w"> </span>wav_raw.scp
+<span class="p">|</span>——<span class="w"> </span>segments
+<span class="p">|</span>——<span class="w"> </span>utt2spk
+<span class="p">|</span>——<span class="w"> </span>spk2utt
+</pre></div>
+</div>
+<p>See <a class="reference external" href="https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md">here</a> for more details on the baseline system.</p>
 </section>
 <section id="id4">
 <h2>Baseline Results<a class="headerlink" href="#id4" title="Permalink to this headline">¶</a></h2>
-<p>Table 3 shows the results of the baseline system. During training, the speaker profiles use oracle speaker embeddings. Since oracle speaker labels are unavailable during evaluation, speaker features produced by an additional spectral clustering step are used instead. We also report results with oracle speaker profiles on the Eval and Test sets to show the impact of speaker-profile accuracy.
-<img alt="baseline result" src="_images/baseline_result.png" /></p>
+<p>Table 3 shows the results of the baseline system. During training, the speaker profiles use oracle speaker embeddings. Since oracle speaker labels are unavailable during evaluation, speaker features produced by an additional spectral clustering step are used instead. We also report results with oracle speaker profiles on the Eval and Test sets to show the impact of speaker-profile accuracy.</p>
+<p><img alt="baseline_result" src="_images/baseline_result.png" /></p>
 </section>
 </section>
 
@@ -171,7 +192,7 @@
         <li class="right" >
           <a href="%E8%B5%9B%E9%81%93%E8%AE%BE%E7%BD%AE%E4%B8%8E%E8%AF%84%E4%BC%B0.html" title="璧涢亾璁剧疆涓庤瘎浼�"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鍩虹嚎</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\346\225\260\346\215\256\351\233\206.html" "b/docs/m2met2_cn/_build/html/\346\225\260\346\215\256\351\233\206.html"
index ddefcc1..016c58f 100644
--- "a/docs/m2met2_cn/_build/html/\346\225\260\346\215\256\351\233\206.html"
+++ "b/docs/m2met2_cn/_build/html/\346\225\260\346\215\256\351\233\206.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Dataset &#8212; m2met2 documentation</title>
+    <title>Dataset &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="%E7%AE%80%E4%BB%8B.html" title="绠�浠�"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鏁版嵁闆�</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -180,7 +180,7 @@
         <li class="right" >
           <a href="%E7%AE%80%E4%BB%8B.html" title="绠�浠�"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鏁版嵁闆�</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\347\256\200\344\273\213.html" "b/docs/m2met2_cn/_build/html/\347\256\200\344\273\213.html"
index f1da18e..1f9d560 100644
--- "a/docs/m2met2_cn/_build/html/\347\256\200\344\273\213.html"
+++ "b/docs/m2met2_cn/_build/html/\347\256\200\344\273\213.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Introduction &#8212; m2met2 documentation</title>
+    <title>Introduction &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -46,7 +46,7 @@
         <li class="right" >
           <a href="index.html" title="ASRU 2023 澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬 2.0"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绠�浠�</a></li> 
       </ul>
     </div>
@@ -57,7 +57,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -130,30 +130,30 @@
 <h2>Challenge Introduction<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h2>
 <p>Recent advances in speech processing technologies such as automatic speech recognition (ASR) and speaker diarization have enabled a wide range of intelligent speech applications. Meeting scenarios, however, remain highly challenging for these technologies because of complex acoustic conditions and diverse speaking styles, including overlapping speech, varying numbers of speakers, far-field signals in large conference rooms, and environmental noise and reverberation.</p>
 <p>To advance meeting-scenario speech recognition, many related challenges have been held, such as the Rich Transcription evaluation and the CHiME (Computational Hearing in Multisource Environments) challenge. The latest CHiME challenge focuses on distant automatic speech recognition and on developing systems that generalize across arrays with different topologies and across application scenarios. However, differences between languages have limited progress on non-English meeting transcription. The MISP (Multimodal Information Based Speech Processing) and M2MeT (Multi-Channel Multi-Party Meeting Transcription) challenges have contributed to advancing Mandarin meeting-scenario speech recognition: MISP focuses on audio-visual multimodal processing of distant multi-microphone signals in everyday home environments, while M2MeT focuses on the overlapping-speech problem of meeting transcription in offline meeting rooms.</p>
-<p>The ASSP2022 M2MeT challenge focused on the meeting scenario, with two tracks: speaker diarization and multi-speaker automatic speech recognition. The former addresses recognizing &#8220;who spoke when&#8221;, while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.</p>
-<p>Building on the success of the previous M2MET challenge, we will continue with the M2MET2.0 challenge at ASRU2023. In the previous M2MET challenge, the evaluation metric was speaker-independent: only the recognized text was obtained, without determining the corresponding speaker.
-To address this limitation and push current multi-speaker speech recognition systems toward practical use, the M2MET2.0 challenge will be evaluated on the speaker-attributed task and sets up two sub-tracks with fixed and open training data. By attributing speech to specific speakers, the task aims to improve the accuracy and applicability of multi-speaker ASR systems in real-world environments.
+<p>The ICASSP2022 M2MeT challenge focused on the meeting scenario, with two tracks: speaker diarization and multi-speaker automatic speech recognition. The former addresses recognizing &#8220;who spoke when&#8221;, while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.</p>
+<p>Building on the success of the previous M2MeT challenge, we will continue with the M2MeT2.0 challenge at ASRU 2023. In the previous M2MeT challenge, the evaluation metric was speaker-independent: only the recognized text was obtained, without determining the corresponding speaker.
+To address this limitation and push current multi-speaker speech recognition systems toward practical use, the M2MeT2.0 challenge will be evaluated on the speaker-attributed task and sets up two sub-tracks with fixed and open training data. By attributing speech to specific speakers, the task aims to improve the accuracy and applicability of multi-speaker ASR systems in real-world environments.
 We describe the dataset, rules, baseline system, and evaluation methods in detail to further promote research on speaker-attributed speech recognition. In addition, a brand-new test set containing roughly 10 hours of audio will be released according to the schedule.</p>
 </section>
 <section id="aoe">
 <h2>Schedule (AOE time)<a class="headerlink" href="#aoe" title="Permalink to this headline">¶</a></h2>
 <ul class="simple">
 <li><p><span class="math notranslate nohighlight">\( 2023.4.29: \)</span> Registration opens</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.5.8: \)</span> Baseline release</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.5.15: \)</span> Registration deadline</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.6.9: \)</span> Test set release</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.6.13: \)</span> Final result submission deadline</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.6.19: \)</span> Evaluation results and rankings released</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.7.3: \)</span> Paper submission deadline</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.7.10: \)</span> Camera-ready paper submission deadline</p></li>
-<li><p><span class="math notranslate nohighlight">\( 2023.12.12: \)</span> ASRU Workshop &amp; challenge session</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.5.11: \)</span> Baseline release</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.5.22: \)</span> Registration deadline</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.6.16: \)</span> Test set release; leaderboard opens</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.6.20: \)</span> Final result submission deadline; leaderboard closes</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.6.26: \)</span> Evaluation results and rankings released</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.7.3: \)</span> Paper submission deadline (submit through the official ASRU2023 system and select the challenge session)</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.7.10: \)</span> Camera-ready paper submission deadline (submit through the official ASRU2023 system and select the challenge session)</p></li>
+<li><p><span class="math notranslate nohighlight">\( 2023.12.12: \)</span> ASRU Workshop &amp; Challenge Session</p></li>
 </ul>
 </section>
 <section id="id3">
 <h2>Challenge Registration<a class="headerlink" href="#id3" title="Permalink to this headline">¶</a></h2>
-<p>Prospective participants from both academia and industry should complete the Google form below by May 15, 2023:</p>
-<p><a class="reference external" href="https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link">M2MET2.0 Registration</a></p>
-<p>The organizers will notify qualified teams by email within 3 working days, and teams must follow the challenge rules published on the challenge website. Before the rankings are released, every participant must submit a system description detailing the methods and models used. The organizers will select the top three teams for inclusion in the ASRU2023 proceedings.</p>
+<p>Prospective participants from both academia and industry should complete the Google form below by May 22, 2023. Participants are also welcome to join the <a class="reference external" href="https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/%E8%81%94%E7%B3%BB%E6%96%B9%E5%BC%8F.html">official WeChat group</a> to discuss the challenge and receive the latest news:</p>
+<p><a class="reference external" href="https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link">M2MeT2.0 Registration</a></p>
+<p>The organizers will notify qualified teams by email within 3 working days, and teams must follow the challenge rules published on the challenge website. Before the rankings are released, every participant must submit a system description detailing the methods and models used. The organizers will include the top-ranked teams in the ASRU2023 proceedings.</p>
 </section>
 </section>
 
@@ -187,7 +187,7 @@
         <li class="right" >
           <a href="index.html" title="ASRU 2023 澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬 2.0"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绠�浠�</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\347\273\204\345\247\224\344\274\232.html" "b/docs/m2met2_cn/_build/html/\347\273\204\345\247\224\344\274\232.html"
index ddf93bb..e39465f 100644
--- "a/docs/m2met2_cn/_build/html/\347\273\204\345\247\224\344\274\232.html"
+++ "b/docs/m2met2_cn/_build/html/\347\273\204\345\247\224\344\274\232.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>缁勫浼� &#8212; m2met2  鏂囨。</title>
+    <title>缁勫浼� &#8212; 澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="%E8%A7%84%E5%88%99.html" title="绔炶禌瑙勫垯"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">缁勫浼�</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -188,7 +188,7 @@
         <li class="right" >
           <a href="%E8%A7%84%E5%88%99.html" title="绔炶禌瑙勫垯"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">缁勫浼�</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\350\201\224\347\263\273\346\226\271\345\274\217.html" "b/docs/m2met2_cn/_build/html/\350\201\224\347\263\273\346\226\271\345\274\217.html"
index 249e5dd..fc060e8 100644
--- "a/docs/m2met2_cn/_build/html/\350\201\224\347\263\273\346\226\271\345\274\217.html"
+++ "b/docs/m2met2_cn/_build/html/\350\201\224\347\263\273\346\226\271\345\274\217.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Contact &#8212; m2met2 documentation</title>
+    <title>Contact &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -41,7 +41,7 @@
         <li class="right" >
           <a href="%E7%BB%84%E5%A7%94%E4%BC%9A.html" title="缁勫浼�"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鑱旂郴鏂瑰紡</a></li> 
       </ul>
     </div>
@@ -52,7 +52,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -121,17 +121,20 @@
             
   <section id="id1">
 <h1>Contact<a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h1>
-<p>If you have any questions about the M2MET2.0 challenge, feel free to contact us via:</p>
+<p>If you have any questions about the M2MeT2.0 challenge, feel free to contact us via:</p>
 <ul class="simple">
 <li><p>Email: <a class="reference external" href="mailto:m2met&#46;alimeeting&#37;&#52;&#48;gmail&#46;com">m2met<span>&#46;</span>alimeeting<span>&#64;</span>gmail<span>&#46;</span>com</a></p></li>
 </ul>
 <table class="docutils align-default">
 <thead>
-<tr class="row-odd"><th class="head text-center"><p>M2MET2.0绔炶禌瀹樻柟寰俊缇�</p></th>
+<tr class="row-odd"><th class="head text-center"><p>M2MeT2.0绔炶禌瀹樻柟寰俊缇�</p></th>
 </tr>
 </thead>
+<tbody>
+<tr class="row-even"><td class="text-center"><p><a class="reference internal" href="_images/qrcode.png"><img alt="_images/qrcode.png" src="_images/qrcode.png" style="width: 300px;" /></a></p></td>
+</tr>
+</tbody>
 </table>
-<!-- | <img src="images/wechat.png" width="300"/> | -->
 </section>
 
 
@@ -158,7 +161,7 @@
         <li class="right" >
           <a href="%E7%BB%84%E5%A7%94%E4%BC%9A.html" title="缁勫浼�"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">鑱旂郴鏂瑰紡</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\350\247\204\345\210\231.html" "b/docs/m2met2_cn/_build/html/\350\247\204\345\210\231.html"
index 5186ebd..7d54533 100644
--- "a/docs/m2met2_cn/_build/html/\350\247\204\345\210\231.html"
+++ "b/docs/m2met2_cn/_build/html/\350\247\204\345\210\231.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Challenge Rules &#8212; m2met2 documentation</title>
+    <title>Challenge Rules &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -45,7 +45,7 @@
         <li class="right" >
           <a href="%E5%9F%BA%E7%BA%BF.html" title="鍩虹嚎"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绔炶禌瑙勫垯</a></li> 
       </ul>
     </div>
@@ -56,7 +56,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -166,7 +166,7 @@
         <li class="right" >
           <a href="%E5%9F%BA%E7%BA%BF.html" title="鍩虹嚎"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">绔炶禌瑙勫垯</a></li> 
       </ul>
     </div>
diff --git "a/docs/m2met2_cn/_build/html/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.html" "b/docs/m2met2_cn/_build/html/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.html"
index 072ea54..c9a15f9 100644
--- "a/docs/m2met2_cn/_build/html/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.html"
+++ "b/docs/m2met2_cn/_build/html/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.html"
@@ -15,7 +15,7 @@
   <link rel="stylesheet" type="text/css" href="_static/css/bootstrap-theme.min.css" />
   <meta name="viewport" content="width=device-width, initial-scale=1.0">
   
-    <title>Track Setup and Evaluation &#8212; m2met2 documentation</title>
+    <title>Track Setup and Evaluation &#8212; Multi-Channel Multi-Party Meeting Transcription Challenge 2.0</title>
     <link rel="stylesheet" type="text/css" href="_static/pygments.css" />
     <link rel="stylesheet" type="text/css" href="_static/guzzle.css" />
     <script data-url_root="./" id="documentation_options" src="_static/documentation_options.js"></script>
@@ -46,7 +46,7 @@
         <li class="right" >
           <a href="%E6%95%B0%E6%8D%AE%E9%9B%86.html" title="鏁版嵁闆�"
              accesskey="P">涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">璧涢亾璁剧疆涓庤瘎浼�</a></li> 
       </ul>
     </div>
@@ -57,7 +57,7 @@
       </div>
   <div id="left-column">
     <div class="sphinxsidebar"><a href="
-    index.html" class="text-logo">m2met2  鏂囨。</a>
+    index.html" class="text-logo">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a>
 <div class="sidebar-block">
   <div class="sidebar-wrapper">
     <div id="main-search">
@@ -128,7 +128,7 @@
 <h1>Track Setup and Evaluation<a class="headerlink" href="#id1" title="Permalink to this headline">¶</a></h1>
 <section id="id2">
 <h2>Speaker-Attributed Speech Recognition<a class="headerlink" href="#id2" title="Permalink to this headline">¶</a></h2>
-<p>The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed and the multi-speaker speech recognition task. In this challenge, the AliMeeting, Aishell4, and Cn-Celeb datasets serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation, and test sets, which may be used for training and evaluation in M2MET2.0. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking in the challenge. Note that for the Test-2023 set, the organizers will no longer provide near-field headset audio, transcriptions, or ground-truth timestamps; instead, multi-speaker segments obtainable with a simple VAD model will be provided.</p>
+<p>The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed and the multi-speaker speech recognition task. In this challenge, the AliMeeting, Aishell4, and Cn-Celeb datasets serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation, and test sets, which may be used for training and evaluation in M2MeT2.0. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking in the challenge. Note that for the Test-2023 set, the organizers will no longer provide near-field headset audio, transcriptions, or ground-truth timestamps; instead, multi-speaker segments obtainable with a simple VAD model will be provided.</p>
 <p><img alt="task difference" src="_images/task_diff.png" /></p>
 </section>
 <section id="id3">
@@ -181,7 +181,7 @@
         <li class="right" >
           <a href="%E6%95%B0%E6%8D%AE%E9%9B%86.html" title="鏁版嵁闆�"
              >涓婁竴椤�</a> |</li>
-        <li class="nav-item nav-item-0"><a href="index.html">m2met2  鏂囨。</a> &#187;</li>
+        <li class="nav-item nav-item-0"><a href="index.html">澶氶�氶亾澶氭柟浼氳杞綍鎸戞垬2.0</a> &#187;</li>
         <li class="nav-item nav-item-this"><a href="">璧涢亾璁剧疆涓庤瘎浼�</a></li> 
       </ul>
     </div>
diff --git a/docs/m2met2_cn/conf.py b/docs/m2met2_cn/conf.py
index da3a332..b6300d1 100644
--- a/docs/m2met2_cn/conf.py
+++ b/docs/m2met2_cn/conf.py
@@ -7,7 +7,7 @@
 # -- Project information -----------------------------------------------------
 # https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
 
-project = 'm2met2'
+project = 'Multi-Channel Multi-Party Meeting Transcription Challenge 2.0'
 copyright = '2023, Speech Lab, Alibaba Group; ASLP Group, Northwestern Polytechnical University'
 author = 'Speech Lab, Alibaba Group; Audio, Speech and Language Processing Group, Northwestern Polytechnical University'
 
diff --git a/docs/m2met2_cn/images/baseline_result.png b/docs/m2met2_cn/images/baseline_result.png
index d51d775..6b76361 100644
--- a/docs/m2met2_cn/images/baseline_result.png
+++ b/docs/m2met2_cn/images/baseline_result.png
Binary files differ
diff --git a/docs/m2met2_cn/images/qrcode.png b/docs/m2met2_cn/images/qrcode.png
new file mode 100644
index 0000000..fc4c349
--- /dev/null
+++ b/docs/m2met2_cn/images/qrcode.png
Binary files differ
diff --git a/docs/m2met2_cn/index.rst b/docs/m2met2_cn/index.rst
index c089b36..3d9f241 100644
--- a/docs/m2met2_cn/index.rst
+++ b/docs/m2met2_cn/index.rst
@@ -5,8 +5,8 @@
 
 ASRU 2023 Multi-Channel Multi-Party Meeting Transcription Challenge 2.0
 ==================================================================================
-Building on the success of the previous M2MET challenge, we will continue with the M2MET2.0 challenge at ASRU2023.
-To push current multi-speaker speech recognition systems toward practical use, the M2MET2.0 challenge evaluates the speaker-attributed task and sets up two sub-tracks with fixed and open training data.
+Building on the success of the previous M2MeT challenge, we will continue with the M2MeT2.0 challenge at ASRU2023.
+To push current multi-speaker speech recognition systems toward practical use, the M2MeT2.0 challenge evaluates the speaker-attributed task and sets up two sub-tracks with fixed and open training data.
 We describe the dataset, rules, baseline system, and evaluation methods in detail to further promote research on speaker-attributed speech recognition.
 
 .. toctree::
diff --git "a/docs/m2met2_cn/\345\237\272\347\272\277.md" "b/docs/m2met2_cn/\345\237\272\347\272\277.md"
index e4d02f7..e8fc32c 100644
--- "a/docs/m2met2_cn/\345\237\272\347\272\277.md"
+++ "b/docs/m2met2_cn/\345\237\272\347\272\277.md"
@@ -5,8 +5,29 @@
 ![model architecture](images/sa_asr_arch.png)
 
 ## Quick Start
-#TODO: fill with the README.md of the baseline
-
+First, install FunASR and ModelScope ([installation](https://alibaba-damo-academy.github.io/FunASR/en/installation.html)).  
+The baseline provides two scripts: `run.sh` trains the baseline system and evaluates it on the M2MeT Eval and Test sets, while `run_m2met_2023_infer.sh` decodes the new test set pre-released for this challenge and generates files in the final submission format.
+Before running `run.sh`, download and extract the [AliMeeting](http://www.openslr.org/119/) dataset into the `./dataset` directory:
+```shell
+dataset
+|—— Eval_Ali_far
+|—— Eval_Ali_near
+|—— Test_Ali_far
+|—— Test_Ali_near
+|—— Train_Ali_far
+|—— Train_Ali_near
+```
+Before running `run_m2met_2023_infer.sh`, place the test set `Test_2023_Ali_far` (audio only, to be released on June 16) under the `./dataset` directory, then put the organizer-provided `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk`, and `spk2utt` under the `./data/Test_2023_Ali_far` directory:
+```shell
+data/Test_2023_Ali_far
+|—— wav.scp
+|—— wav_raw.scp
+|—— segments
+|—— utt2spk
+|—— spk2utt
+```
+See [here](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs/alimeeting/sa-asr/README.md) for more details on the baseline system.
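+
+For orientation, the end-to-end flow might look like the following minimal sketch (a sketch only, assuming both scripts are launched from the recipe root with the data laid out as above; the stages and flags inside the scripts are not shown here):
+```shell
+# train the SA-ASR baseline and score it on the AliMeeting Eval/Test sets
+bash run.sh
+
+# decode the challenge test set and emit results in the submission format
+bash run_m2met_2023_infer.sh
+```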
 ## Baseline Results
 Table 3 shows the results of the baseline system. During training, the speaker profiles use oracle speaker embeddings. Since oracle speaker labels are unavailable during evaluation, speaker features produced by an additional spectral clustering step are used instead. We also report results with oracle speaker profiles on the Eval and Test sets to show the impact of speaker-profile accuracy.
-![baseline result](images/baseline_result.png)
\ No newline at end of file
+
+![baseline_result](images/baseline_result.png)
\ No newline at end of file
diff --git "a/docs/m2met2_cn/\347\256\200\344\273\213.md" "b/docs/m2met2_cn/\347\256\200\344\273\213.md"
index 52df97d..be456ff 100644
--- "a/docs/m2met2_cn/\347\256\200\344\273\213.md"
+++ "b/docs/m2met2_cn/\347\256\200\344\273\213.md"
@@ -1,32 +1,33 @@
 # Introduction
 ## Challenge Introduction
+
 璇煶璇嗗埆锛圓utomatic Speech Recognition锛夈�佽璇濅汉鏃ュ織锛圫peaker Diarization锛夌瓑璇煶澶勭悊鎶�鏈殑鏈�鏂板彂灞曟縺鍙戜簡浼楀鏅鸿兘璇煶鐨勫箍娉涘簲鐢ㄣ�傜劧鑰屼細璁満鏅敱浜庡叾澶嶆潅鐨勫0瀛︽潯浠跺拰涓嶅悓鐨勮璇濋鏍硷紝鍖呮嫭閲嶅彔鐨勮璇濄�佷笉鍚屾暟閲忕殑鍙戣█鑰呫�佸ぇ浼氳瀹ょ殑杩滃満淇″彿浠ュ強鐜鍣0鍜屾贩鍝嶏紝浠嶇劧灞炰簬涓�椤规瀬鍏锋寫鎴樻�х殑浠诲姟銆�
 
 To advance meeting-scenario speech recognition, many related challenges have been held, such as the Rich Transcription evaluation and the CHiME (Computational Hearing in Multisource Environments) challenge. The latest CHiME challenge focuses on distant automatic speech recognition and on developing systems that generalize across arrays with different topologies and across application scenarios. However, differences between languages have limited progress on non-English meeting transcription. The MISP (Multimodal Information Based Speech Processing) and M2MeT (Multi-Channel Multi-Party Meeting Transcription) challenges have contributed to advancing Mandarin meeting-scenario speech recognition: MISP focuses on audio-visual multimodal processing of distant multi-microphone signals in everyday home environments, while M2MeT focuses on the overlapping-speech problem of meeting transcription in offline meeting rooms.
 
-The ASSP2022 M2MeT challenge focused on the meeting scenario, with two tracks: speaker diarization and multi-speaker automatic speech recognition. The former addresses recognizing "who spoke when", while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.
+The ICASSP2022 M2MeT challenge focused on the meeting scenario, with two tracks: speaker diarization and multi-speaker automatic speech recognition. The former addresses recognizing "who spoke when", while the latter aims to recognize speech from multiple speakers simultaneously; overlapping speech and various noises pose great technical difficulties.
 
-Building on the success of the previous M2MET challenge, we will continue with the M2MET2.0 challenge at ASRU2023. In the previous M2MET challenge, the evaluation metric was speaker-independent: only the recognized text was obtained, without determining the corresponding speaker.
-To address this limitation and push current multi-speaker speech recognition systems toward practical use, the M2MET2.0 challenge will be evaluated on the speaker-attributed task and sets up two sub-tracks with fixed and open training data. By attributing speech to specific speakers, the task aims to improve the accuracy and applicability of multi-speaker ASR systems in real-world environments.
+Building on the success of the previous M2MeT challenge, we will continue with the M2MeT2.0 challenge at ASRU 2023. In the previous M2MeT challenge, the evaluation metric was speaker-independent: only the recognized text was obtained, without determining the corresponding speaker.
+To address this limitation and push current multi-speaker speech recognition systems toward practical use, the M2MeT2.0 challenge will be evaluated on the speaker-attributed task and sets up two sub-tracks with fixed and open training data. By attributing speech to specific speakers, the task aims to improve the accuracy and applicability of multi-speaker ASR systems in real-world environments.
 We describe the dataset, rules, baseline system, and evaluation methods in detail to further promote research on speaker-attributed speech recognition. In addition, a brand-new test set containing roughly 10 hours of audio will be released according to the schedule.
 
 
 ## Schedule (AOE time)
 
 - $ 2023.4.29: $ Registration opens
-- $ 2023.5.8: $ Baseline release
-- $ 2023.5.15: $ Registration deadline
-- $ 2023.6.9: $ Test set release
-- $ 2023.6.13: $ Final result submission deadline
-- $ 2023.6.19: $ Evaluation results and rankings released
-- $ 2023.7.3: $ Paper submission deadline
-- $ 2023.7.10: $ Camera-ready paper submission deadline
-- $ 2023.12.12: $ ASRU Workshop & challenge session
+- $ 2023.5.11: $ Baseline release
+- $ 2023.5.22: $ Registration deadline
+- $ 2023.6.16: $ Test set release; leaderboard opens
+- $ 2023.6.20: $ Final result submission deadline; leaderboard closes
+- $ 2023.6.26: $ Evaluation results and rankings released
+- $ 2023.7.3: $ Paper submission deadline (submit through the official ASRU2023 system and select the challenge session)
+- $ 2023.7.10: $ Camera-ready paper submission deadline (submit through the official ASRU2023 system and select the challenge session)
+- $ 2023.12.12: $ ASRU Workshop & Challenge Session
 
 ## Challenge Registration
 
-Prospective participants from both academia and industry should complete the Google form below by May 15, 2023:
+Prospective participants from both academia and industry should complete the Google form below by May 22, 2023. Participants are also welcome to join the [official WeChat group](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/%E8%81%94%E7%B3%BB%E6%96%B9%E5%BC%8F.html) to discuss the challenge and receive the latest news:
 
-[M2MET2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
+[M2MeT2.0 Registration](https://docs.google.com/forms/d/e/1FAIpQLSf77T9vAl7Ym-u5g8gXu18SBofoWRaFShBo26Ym0-HDxHW9PQ/viewform?usp=sf_link)
 
-The organizers will notify qualified teams by email within 3 working days, and teams must follow the challenge rules published on the challenge website. Before the rankings are released, every participant must submit a system description detailing the methods and models used. The organizers will select the top three teams for inclusion in the ASRU2023 proceedings.
\ No newline at end of file
+The organizers will notify qualified teams by email within 3 working days, and teams must follow the challenge rules published on the challenge website. Before the rankings are released, every participant must submit a system description detailing the methods and models used. The organizers will include the top-ranked teams in the ASRU2023 proceedings.
\ No newline at end of file
diff --git "a/docs/m2met2_cn/\350\201\224\347\263\273\346\226\271\345\274\217.md" "b/docs/m2met2_cn/\350\201\224\347\263\273\346\226\271\345\274\217.md"
index 5c65ca0..fd8f9a4 100644
--- "a/docs/m2met2_cn/\350\201\224\347\263\273\346\226\271\345\274\217.md"
+++ "b/docs/m2met2_cn/\350\201\224\347\263\273\346\226\271\345\274\217.md"
@@ -1,9 +1,9 @@
 # Contact
-If you have any questions about the M2MET2.0 challenge, feel free to contact us via:
+If you have any questions about the M2MeT2.0 challenge, feel free to contact us via:
 
 - Email: [m2met.alimeeting@gmail.com](mailto:m2met.alimeeting@gmail.com)
 
-|        Official M2MET2.0 WeChat group        |
+|        Official M2MeT2.0 WeChat group        |
 |:------------------------------------------:|
-<!-- | <img src="images/wechat.png" width="300"/> | -->
+| <img src="images/qrcode.png" width="300"/> |
 
diff --git "a/docs/m2met2_cn/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md" "b/docs/m2met2_cn/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md"
index 94a6236..ccfbdf3 100644
--- "a/docs/m2met2_cn/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md"
+++ "b/docs/m2met2_cn/\350\265\233\351\201\223\350\256\276\347\275\256\344\270\216\350\257\204\344\274\260.md"
@@ -1,6 +1,6 @@
 # Track Setup and Evaluation
 ## Speaker-Attributed Speech Recognition
-The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed and the multi-speaker speech recognition task. In this challenge, the AliMeeting, Aishell4, and Cn-Celeb datasets serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation, and test sets, which may be used for training and evaluation in M2MET2.0. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking in the challenge. Note that for the Test-2023 set, the organizers will no longer provide near-field headset audio, transcriptions, or ground-truth timestamps; instead, multi-speaker segments obtainable with a simple VAD model will be provided.
+The speaker-attributed ASR task requires recognizing each speaker's speech from overlapping audio and assigning a speaker label to the recognized content. Figure 2 illustrates the main difference between the speaker-attributed and the multi-speaker speech recognition task. In this challenge, the AliMeeting, Aishell4, and Cn-Celeb datasets serve as the constrained data sources. The AliMeeting dataset used in the M2MeT challenge contains training, evaluation, and test sets, which may be used for training and evaluation in M2MeT2.0. In addition, a new Test-2023 set containing about 10 hours of meeting data will be released according to the schedule and used for scoring and ranking in the challenge. Note that for the Test-2023 set, the organizers will no longer provide near-field headset audio, transcriptions, or ground-truth timestamps; instead, multi-speaker segments obtainable with a simple VAD model will be provided.
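+
+For orientation, the ranking metric referenced in these evaluation docs is the concatenated minimum-permutation character error rate (cpCER); a minimal LaTeX sketch of the usual formulation (an assumption to verify against the full evaluation page: per-speaker references and hypotheses are concatenated, and the speaker permutation $\pi$ is chosen to minimize the error) is:
+
+$$ \text{cpCER} = \min_{\pi} \frac{N_{Ins} + N_{Sub} + N_{Del}}{N_{Total}} \times 100\% $$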
 
 ![task difference](images/task_diff.png)
 
diff --git a/docs/huggingface_models.md b/docs/model_zoo/huggingface_models.md
similarity index 100%
rename from docs/huggingface_models.md
rename to docs/model_zoo/huggingface_models.md
diff --git a/docs/model_zoo/modelscope_models.md b/docs/model_zoo/modelscope_models.md
new file mode 100644
index 0000000..1b7f475
--- /dev/null
+++ b/docs/model_zoo/modelscope_models.md
@@ -0,0 +1,126 @@
+# Pretrained Models on ModelScope
+
+## Model License
+-  Apache License 2.0
+
+## Model Zoo
+Here we provide several pretrained models trained on different datasets. The details of the models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
+
+### Speech Recognition Models
+#### Paraformer Models
+
+|                                                                     Model Name                                                                     | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:--------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+|        [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Paraformer-large-long](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which could deal with input wav of arbitrary length                                                                             |
+| [Paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which supports hotword customization based on incentive enhancement, improving the recall and precision of hotwords             |
+|              [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)              | CN & EN  | Alibaba Speech Data (50000hours) |    8358    |    68M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+|           [Paraformer-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)           | CN & EN  | Alibaba Speech Data (50000hours) |    8404    |    68M    |     Online     | Which could deal with streaming input                                                                                           |
+|  [Paraformer-large-online](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Online     | Which could deal with streaming input                                                                                                    |
+|       [Paraformer-tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary)       |    CN    |  Alibaba Speech Data (200hours)  |    544     |   5.2M    |    Offline     | Lightweight Paraformer model which supports Mandarin command words recognition                                                  |
+|                   [Paraformer-aishell](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary)                   |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
+|       [ParaformerBert-aishell](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)       |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
+|        [Paraformer-aishell2](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)         |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
+|    [ParaformerBert-aishell2](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)     |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
+
+
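+As a minimal sketch of fetching one of these checkpoints for local use (assuming ModelScope model repos can be cloned over git with git-lfs, which should be verified on the model page):
+
+```shell
+# pull the Paraformer-large model repo from ModelScope; weights are tracked via git-lfs
+git lfs install
+git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
+```
+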
+#### UniASR Models
+
+|                                                                    Model Name                                                                     |    Language     |           Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:-------------------------------------------------------------------------------------------------------------------------------------------------:|:---------------:|:---------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+|             [UniASR](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)             |     CN & EN     | Alibaba Speech Data (60000 hours) |    8358    |   100M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|      [UniASR-large](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary)       |     CN & EN     | Alibaba Speech Data (60000 hours) |    8358    |   220M    |    Offline     | UniASR streaming & offline unifying model                                                                                       |
+|          [UniASR English](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/summary)           |       EN        | Alibaba Speech Data (10000 hours) |    1080    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|          [UniASR Russian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/summary)           |       RU        | Alibaba Speech Data (5000 hours)  |    1664    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|           [UniASR Japanese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/summary)           |       JA        | Alibaba Speech Data (5000 hours)  |    5977    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|           [UniASR Korean](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/summary)           |       KO        | Alibaba Speech Data (2000 hours)  |    6400    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+| [UniASR Cantonese (CHS)](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/summary) | Cantonese (CHS) | Alibaba Speech Data (5000 hours)  |    1468    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|         [UniASR Indonesian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/summary)         |       ID        | Alibaba Speech Data (1000 hours)  |    1067    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|           [UniASR Vietnamese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary)           |       VI        | Alibaba Speech Data (1000 hours)  |    1001    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|          [UniASR Spanish](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/summary)           |       ES        | Alibaba Speech Data (1000 hours)  |    3445    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|         [UniASR Portuguese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/summary)         |       PT        | Alibaba Speech Data (1000 hours)  |    1617    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|           [UniASR French](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary)           |       FR        | Alibaba Speech Data (1000 hours)  |    3472    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|           [UniASR German](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary)           |       GE        | Alibaba Speech Data (1000 hours)  |    3690    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|            [UniASR Persian](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary)             |       FA        | Alibaba Speech Data (1000 hours)  |    1257    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|                [UniASR Burmese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary)                 |       MY        | Alibaba Speech Data (1000 hours)  |    696     |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|                [UniASR Hebrew](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary)                 |       HE        | Alibaba Speech Data (1000 hours)  |    1085    |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+|              [UniASR Urdu](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary)                      |       UR        | Alibaba Speech Data (1000 hours)  |    877     |    95M    |     Online     | UniASR streaming & offline unifying model                                                                                       |
+
+
+
+#### Conformer Models
+
+|                                                       Model Name                                                       | Language |     Training Data     | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:----------------------------------------------------------------------------------------------------------------------:|:--------:|:---------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)   |   CN     |  AISHELL (178hours)   |    4234    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Conformer](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)   |   CN     | AISHELL-2 (1000hours) |    5212    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr-en-16k-vocab4199-pytorch/summary)   |   EN     | Alibaba Speech Data (10000hours) |    4199    |    220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
+
+
+#### RNN-T Models
+
+### Multi-talker Speech Recognition Models
+
+#### MFCCA Models
+
+|                                                  Model Name                                                   | Language |               Training Data                | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
+|:-------------------------------------------------------------------------------------------------------------:|:--------:|:------------------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
+| [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)    |   CN     | AliMeeting, AISHELL-4, Simudata (917hours)   |     4950   |    45M    |    Offline     | Duration of input wav <= 20s, number of input channels <= 8 |
+
+
+
+### Voice Activity Detection Models
+
+|                                           Model Name                                           |        Training Data         | Parameters | Sampling Rate | Notes |
+|:----------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:-------------:|:------|
+| [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) | Alibaba Speech Data (5000hours) |    0.4M    |     16000     |       |
+|   [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-8k-common/summary)        | Alibaba Speech Data (5000hours) |    0.4M    |     8000      |       |
+
+### Punctuation Restoration Models
+
+|                                                         Model Name                                                         |        Training Data         | Parameters | Vocab Size| Offline/Online | Notes |
+|:--------------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:--------------:|:------|
+|      [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)      | Alibaba Text Data |    70M     |    272727     |    Offline     |   offline punctuation model    |
+| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary)      | Alibaba Text Data |    70M     |    272727     |     Online     |  online punctuation model     |
+
+### Language Models
+
+|                                                       Model Name                                                       |        Training Data         | Parameters | Vocab Size | Notes |
+|:----------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:------|
+| [Transformer](https://www.modelscope.cn/models/damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/summary)      | Alibaba Speech Data (?hours) |    57M     |    8404    |       |
+
+### Speaker Verification Models
+
+|                                                  Model Name                                                   |   Training Data   | Parameters | Number Speaker | Notes |
+|:-------------------------------------------------------------------------------------------------------------:|:-----------------:|:----------:|:----------:|:------|
+| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours)  |   17.5M    |    3465    |    Xvector, speaker verification, Chinese   |
+| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) |    61M     |    6135    |   Xvector, speaker verification, English    |
+
+### Speaker Diarization Models
+
+|                                                    Model Name                                                    |    Training Data    | Parameters | Notes |
+|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
+| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/summary) | AliMeeting (120 hours) |   40.5M    |    Speaker diarization with speaker profiles and recordings, Chinese |
+| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/summary)    |  CallHome (60 hours)  |     12M     |    Speaker diarization with speaker profiles and recordings, English   |
+
+### Timestamp Prediction Models
+
+|                                                    Model Name                                     |  Language  |    Training Data    | Parameters | Notes |
+|:--------------------------------------------------------------------------------------------------:|:--------------:|:-------------------:|:----------:|:------|
+| [TP-Aligner](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) | CN | Alibaba Speech Data (50000hours) |   37.8M    |    Timestamp prediction, Mandarin, medium-sized model |
+
+### Inverse Text Normalization (ITN) Models
+
+|                                                    Model Name                                                    | Language | Parameters | Notes                    |
+|:----------------------------------------------------------------------------------------------------------------:|:--------:|:----------:|:-------------------------|
+| [English](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-en/summary) |    EN    |   1.54M    | ITN, ASR post-processing |
+| [Russian](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ru/summary) |    RU    |   17.79M   | ITN, ASR post-processing |
+| [Japanese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ja/summary) |    JA    |    6.8M    | ITN, ASR post-processing |
+| [Korean](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ko/summary) |    KO    |   1.28M    | ITN, ASR post-processing |
+| [Indonesian](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-id/summary) |    ID    |   2.06M    | ITN, ASR post-processing |
+| [Vietnamese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-vi/summary) |    VI    |   0.92M    | ITN, ASR post-processing |
+| [Tagalog](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-tl/summary) |    TL    |    0.65M     | ITN, ASR post-processing |
+| [Spanish](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-es/summary) |    ES    |   1.32M    | ITN, ASR post-processing |
+| [Portuguese](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-pt/summary) |    PT    |   1.28M    | ITN, ASR post-processing |
+| [French](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-fr/summary) |    FR    |   4.39M    | ITN, ASR post-processing |
+| [German](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-de/summary)|    GE    |   3.95M    | ITN, ASR post-processing |
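+
+Any model name in the tables above can be passed as the `model` argument of a modelscope pipeline, with `task` matching the model type. A minimal sketch for one of the ITN models listed above (the expected output is indicative):
+
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# English ITN model from the table above; the task constant must match the model type
+itn_pipeline = pipeline(
+    task=Tasks.inverse_text_processing,
+    model='damo/speech_inverse_text_processing_fun-text-processing-itn-en')
+print(itn_pipeline(text_in='one hundred and twenty three'))  # expected: 123
+```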
diff --git a/docs/modelscope_models.md b/docs/modelscope_models.md
deleted file mode 100644
index 5f94a09..0000000
--- a/docs/modelscope_models.md
+++ /dev/null
@@ -1,94 +0,0 @@
-# Pretrained Models on ModelScope
-
-## Model License
--  Apache License 2.0
-
-## Model Zoo
-Here we provided several pretrained models on different datasets. The details of models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
-
-### Speech Recognition Models
-#### Paraformer Models
-
-|                                                                     Model Name                                                                     | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:--------------------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-|        [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-| [Paraformer-large-long](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which could deal with arbitrary length input wav                                                                                 |
-| [Paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8404    |   220M    |    Offline     | Which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords. |
-|              [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)              | CN & EN  | Alibaba Speech Data (50000hours) |    8358    |    68M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-|          [Paraformer-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)           | CN & EN  | Alibaba Speech Data (50000hours) |    8404    |    68M    |     Online     | Which could deal with streaming input                                                                                           |
-|       [Paraformer-tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary)       |    CN    |  Alibaba Speech Data (200hours)  |    544     |   5.2M    |    Offline     | Lightweight Paraformer model which supports Mandarin command words recognition                                                  |
-|                   [Paraformer-aishell](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary)                   |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
-|       [ParaformerBert-aishell](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)       |    CN    |        AISHELL (178hours)        |    4234    |    43M    |    Offline     |                                                                                                                                 |
-|        [Paraformer-aishell2](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)         |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
-|    [ParaformerBert-aishell2](https://www.modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)     |    CN    |      AISHELL-2 (1000hours)       |    5212    |    64M    |    Offline     |                                                                                                                                 |
-
-
-#### UniASR Models
-
-|                                                               Model Name                                                               | Language |          Training Data           | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:--------------------------------------------------------------------------------------------------------------------------------------:|:--------:|:--------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-|       [UniASR](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary)        | CN & EN  | Alibaba Speech Data (60000hours) |    8358    |   100M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-| [UniASR-large](https://modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) | CN & EN  | Alibaba Speech Data (60000hours) |    8358    |   220M    |    Offline     | UniASR streaming offline unifying models                                                                                                    |
-|           [UniASR Burmese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary)           | Burmese  |  Alibaba Speech Data (? hours)   |    696     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-|           [UniASR Hebrew](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary)           |  Hebrew  |  Alibaba Speech Data (? hours)   |    1085    |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-|       [UniASR Urdu](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary)                  |   Urdu   |  Alibaba Speech Data (? hours)   |    877     |    95M    |     Online     | UniASR streaming offline unifying models                                                                                                    |
-
-#### Conformer Models
-
-|                                                       Model Name                                                       | Language |     Training Data     | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:----------------------------------------------------------------------------------------------------------------------:|:--------:|:---------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-| [Conformer](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary)   |   CN     |  AISHELL (178hours)   |    4234    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-| [Conformer](https://www.modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary)   |   CN     | AISHELL-2 (1000hours) |    5212    |    44M    |    Offline     | Duration of input wav <= 20s                                                                                                    |
-
-
-#### RNN-T Models
-
-### Multi-talker Speech Recognition Models
-
-#### MFCCA Models
-
-|                                                  Model Name                                                   | Language |               Training Data                | Vocab Size | Parameter | Offline/Online | Notes                                                                                                                           |
-|:-------------------------------------------------------------------------------------------------------------:|:--------:|:------------------------------------------:|:----------:|:---------:|:--------------:|:--------------------------------------------------------------------------------------------------------------------------------|
-| [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)    |   CN     | AliMeeting、AISHELL-4、Simudata (917hours)   |     4950   |    45M    |    Offline     | Duration of input wav <= 20s, channel of input wav <= 8 channel |
-
-
-
-### Voice Activity Detection Models
-
-|                                           Model Name                                           |        Training Data         | Parameters | Sampling Rate | Notes |
-|:----------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:-------------:|:------|
-| [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) | Alibaba Speech Data (5000hours) |    0.4M    |     16000     |       |
-|   [FSMN-VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-8k-common/summary)        | Alibaba Speech Data (5000hours) |    0.4M    |     8000      |       |
-
-### Punctuation Restoration Models
-
-|                                                         Model Name                                                         |        Training Data         | Parameters | Vocab Size| Offline/Online | Notes |
-|:--------------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:--------------:|:------|
-|      [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary)      | Alibaba Text Data |    70M     |    272727     |    Offline     |   offline punctuation model    |
-| [CT-Transformer](https://modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727/summary)      | Alibaba Text Data |    70M     |    272727     |     Online     |  online punctuation model     |
-
-### Language Models
-
-|                                                       Model Name                                                       |        Training Data         | Parameters | Vocab Size | Notes |
-|:----------------------------------------------------------------------------------------------------------------------:|:----------------------------:|:----------:|:----------:|:------|
-| [Transformer](https://www.modelscope.cn/models/damo/speech_transformer_lm_zh-cn-common-vocab8404-pytorch/summary)      | Alibaba Speech Data (?hours) |    57M     |    8404    |       |
-
-### Speaker Verification Models
-
-|                                                  Model Name                                                   |   Training Data   | Parameters | Number Speaker | Notes |
-|:-------------------------------------------------------------------------------------------------------------:|:-----------------:|:----------:|:----------:|:------|
-| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours)  |   17.5M    |    3465    |    Xvector, speaker verification, Chinese   |
-| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) |    61M     |    6135    |   Xvector, speaker verification, English    |
-
-### Speaker Diarization Models
-
-|                                                    Model Name                                                    |    Training Data    | Parameters | Notes |
-|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
-| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/summary) | AliMeeting (120 hours) |   40.5M    |    Speaker diarization, profiles and records, Chinese |
-| [SOND](https://www.modelscope.cn/models/damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch/summary)    |  CallHome (60 hours)  |     12M     |    Speaker diarization, profiles and records, English   |
-
-### Timestamp Prediction Models
-
-|                                                    Model Name                                     |  Language  |    Training Data    | Parameters | Notes |
-|:--------------------------------------------------------------------------------------------------:|:--------------:|:-------------------:|:----------:|:------|
-| [TP-Aligner](https://modelscope.cn/models/damo/speech_timestamp_prediction-v1-16k-offline/summary) | CN | Alibaba Speech Data (50000hours) |   37.8M    |    Timestamp prediction, Mandarin, middle size |
diff --git a/docs/modelscope_pipeline/itn_pipeline.md b/docs/modelscope_pipeline/itn_pipeline.md
new file mode 100644
index 0000000..2336842
--- /dev/null
+++ b/docs/modelscope_pipeline/itn_pipeline.md
@@ -0,0 +1,63 @@
+# Inverse Text Normalization (ITN)
+
+> **Note**: 
+> The modelscope pipeline supports inference with all the models in [model zoo](https://modelscope.cn/models?page=1&tasks=inverse-text-processing&type=audio). Here we take the Japanese ITN model as an example to demonstrate the usage.
+
+## Inference
+
+### Quick start
+#### [Japanese ITN model](https://modelscope.cn/models/damo/speech_inverse_text_processing_fun-text-processing-itn-ja/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+itn_inference_pipeline = pipeline(
+    task=Tasks.inverse_text_processing,
+    model='damo/speech_inverse_text_processing_fun-text-processing-itn-ja',
+    model_revision=None)
+
+itn_result = itn_inference_pipeline(text_in='百二十三')
+print(itn_result)
+# 123
+```
+- Read text data directly.
+```python
+itn_result = itn_inference_pipeline(text_in='一九九九年に誕生した同商品にちなみ、約三十年前、二十四歳の頃の幸四郎の写真を公開。')
+# 1999年に誕生した同商品にちなみ、約30年前、24歳の頃の幸四郎の写真を公開。
+```
+- Text stored via url, for example: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt
+```python
+itn_result = itn_inference_pipeline(text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt')
+```
+
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/tree/main/fun_text_processing/inverse_text_normalization)
+
+### API-reference
+#### Define pipeline
+- `task`: `Tasks.inverse_text_processing`
+- `model`: model name in [model zoo](https://modelscope.cn/models?page=1&tasks=inverse-text-processing&type=audio), or model path in local disk
+- `output_dir`: `None` (Default), the output path of results if set
+- `model_revision`: `None` (Default), setting the model version
+
+#### Infer pipeline
+- `text_in`: the input to decode, which could be:
+  - text bytes, `e.g.`: "一九九九年に誕生した同商品にちなみ、約三十年前、二十四歳の頃の幸四郎の写真を公開。"
+  - text file, `e.g.`: https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt
+  In the `text file` input case, `output_dir` must be set to save the output results (see the sketch below)
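+
+A minimal sketch of the `text file` case (model name and url as used above; `./itn_results/` is an arbitrary local path):
+
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# with file input, results are written under output_dir rather than only returned
+itn_inference_pipeline = pipeline(
+    task=Tasks.inverse_text_processing,
+    model='damo/speech_inverse_text_processing_fun-text-processing-itn-ja',
+    output_dir='./itn_results/')
+itn_inference_pipeline(text_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_text/ja_itn_example.txt')
+```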
+
+## Modify Your Own ITN Model
+The rule-based ITN code is open-sourced in [FunTextProcessing](https://github.com/alibaba-damo-academy/FunASR/tree/main/fun_text_processing); users can modify the grammar rules for different languages on their own. Taking Japanese as an example, users can add their own whitelist entries in ```FunASR/fun_text_processing/inverse_text_normalization/ja/data/whitelist.tsv``` (a sketch follows below). After modifying the grammar rules, users can export and evaluate their own ITN models in a local directory.
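+
+As a hedged illustration of extending the whitelist programmatically (the exact column convention here is an assumption; check the existing rows in `whitelist.tsv` before relying on it):
+
+```python
+# append a hypothetical custom mapping to the Japanese whitelist
+# (assumed format: one tab-separated pair per line; verify against existing entries)
+entry = "パーセント\t%\n"  # hypothetical spoken-form / written-form pair
+with open("FunASR/fun_text_processing/inverse_text_normalization/ja/data/whitelist.tsv",
+          "a", encoding="utf-8") as f:
+    f.write(entry)
+```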
+
+### Export ITN Model
+Export the ITN model via ```FunASR/fun_text_processing/inverse_text_normalization/export_models.py```. An example of exporting an ITN model to a local folder is shown below.
+```shell
+cd FunASR/fun_text_processing/inverse_text_normalization/
+python export_models.py --language ja --export_dir ./itn_models/
+```
+
+### Evaluate ITN Model
+Users can evaluate their own ITN model in a local directory via ```FunASR/fun_text_processing/inverse_text_normalization/inverse_normalize.py```. Here is an example:
+```shell
+cd FunASR/fun_text_processing/inverse_text_normalization/
+python inverse_normalize.py --input_file ja_itn_example.txt --cache_dir ./itn_models/ --output_file output.txt --language=ja
+```
\ No newline at end of file
diff --git a/docs/modelscope_pipeline/quick_start.md b/docs/modelscope_pipeline/quick_start.md
index 436fb1d..7e35e91 100644
--- a/docs/modelscope_pipeline/quick_start.md
+++ b/docs/modelscope_pipeline/quick_start.md
@@ -1,7 +1,7 @@
 # Quick Start
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take typic model as example to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take a typical model as an example to demonstrate the usage.
 
 
 ## Inference with pipeline
diff --git a/docs/FQA.md b/docs/reference/FQA.md
similarity index 100%
rename from docs/FQA.md
rename to docs/reference/FQA.md
diff --git a/docs/application.md b/docs/reference/application.md
similarity index 100%
rename from docs/application.md
rename to docs/reference/application.md
diff --git a/docs/build_task.md b/docs/reference/build_task.md
similarity index 100%
rename from docs/build_task.md
rename to docs/reference/build_task.md
diff --git a/docs/papers.md b/docs/reference/papers.md
similarity index 100%
rename from docs/papers.md
rename to docs/reference/papers.md
diff --git a/egs/alimeeting/sa-asr/README.md b/egs/alimeeting/sa-asr/README.md
index 5731c39..951670b 100644
--- a/egs/alimeeting/sa-asr/README.md
+++ b/egs/alimeeting/sa-asr/README.md
@@ -19,7 +19,7 @@
 stage 7 - 9: Language model training (Optional).
 stage 10 - 11: ASR training (SA-ASR requires loading the pre-trained ASR model).
 stage 12: SA-ASR training.
-stage 13 - 18: Inference and evaluation.
+stage 13 - 16: Inference and evaluation.
 ```
 Before running `run_m2met_2023_infer.sh`, you need to place the new test set `Test_2023_Ali_far` (to be released after the challenge starts) in the `./dataset` directory, which contains only raw audios. Then put the given `wav.scp`, `wav_raw.scp`, `segments`, `utt2spk` and `spk2utt` in the `./data/Test_2023_Ali_far` directory.  
 ```shell
@@ -37,6 +37,10 @@
 stage 3: Inference.
 stage 4: Generation of SA-ASR results required for final submission.
 ```
+
+The baseline model is available on [ModelScope](https://www.modelscope.cn/models/damo/speech_saasr_asr-zh-cn-16k-alimeeting/summary).
+After generating the stats of the AliMeeting corpus (stage 10 in `run.sh`), you can set `infer_with_pretrained_model=true` in `run.sh` to run inference with our official baseline model released on ModelScope without any training (see the sketch below).
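+
+For reference, the helper script `local/download_pretrained_model_from_modelscope.py` added in this recipe fetches the model with modelscope's `snapshot_download`; a minimal sketch of the same call (revision pinned as in the script):
+
+```python
+from modelscope.hub.snapshot_download import snapshot_download
+
+# download the official SA-ASR baseline into a local experiment directory
+model_dir = snapshot_download("damo/speech_saasr_asr-zh-cn-16k-alimeeting",
+                              cache_dir="./exp", revision="1.0.0")
+print(model_dir)
+```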
+
 # Format of Final Submission
 Finally, you need to submit a file called `text_spk_merge` with the following format:
 ```shell
diff --git a/egs/alimeeting/sa-asr/asr_local.sh b/egs/alimeeting/sa-asr/asr_local.sh
index 389bb01..30401b9 100755
--- a/egs/alimeeting/sa-asr/asr_local.sh
+++ b/egs/alimeeting/sa-asr/asr_local.sh
@@ -107,8 +107,8 @@
                                       # inference_asr_model=valid.acc.best.pth
                                       # inference_asr_model=valid.loss.ave.pth
 inference_sa_asr_model=valid.acc_spk.ave.pb
-download_model= # Download a model from Model Zoo and use it for decoding.
-
+infer_with_pretrained_model=false   # Use pretrained model for decoding
+download_sa_asr_model=          # Download the SA-ASR model from ModelScope and use it for decoding.
 # [Task dependent] Set the datadir name created by local/data.sh
 train_set=       # Name of training set.
 valid_set=       # Name of validation set used for monitoring/tuning network training.
@@ -203,7 +203,8 @@
                           # Note that it will overwrite args in inference config.
     --inference_lm        # Language modle path for decoding (default="${inference_lm}").
     --inference_asr_model # ASR model path for decoding (default="${inference_asr_model}").
-    --download_model      # Download a model from Model Zoo and use it for decoding (default="${download_model}").
+    --infer_with_pretrained_model      # Use pretrained model for decoding (default="${infer_with_pretrained_model}").
+    --download_sa_asr_model           # Download the SA-ASR model from ModelScope and use it for decoding (default="${download_sa_asr_model}").
 
     # [Task dependent] Set the datadir name created by local/data.sh
     --train_set     # Name of training set (required).
@@ -304,6 +305,9 @@
     lm_token_type="${token_type}"
 fi
 
+if ${infer_with_pretrained_model}; then
+    skip_train=true
+fi
 
 # Set tag for naming of model directory
 if [ -z "${asr_tag}" ]; then
@@ -1220,122 +1224,20 @@
     log "Skip the training stages"
 fi
 
+if ${infer_with_pretrained_model}; then
+    log "Use ${download_sa_asr_model} for decoding and evaluation"
+    sa_asr_exp="${expdir}/${download_sa_asr_model}"
+    mkdir -p "${sa_asr_exp}"
+
+
+    python local/download_pretrained_model_from_modelscope.py $download_sa_asr_model ${expdir}
+    inference_sa_asr_model="model.pb"
+    inference_config=${sa_asr_exp}/decoding.yaml
+fi
 
 if ! "${skip_eval}"; then
     if [ ${stage} -le 13 ] && [ ${stop_stage} -ge 13 ]; then
-        log "Stage 13: Decoding multi-talker ASR: training_dir=${asr_exp}"
-
-        if ${gpu_inference}; then
-            _cmd="${cuda_cmd}"
-            inference_nj=$[${ngpu}*${njob_infer}]
-            _ngpu=1
-
-        else
-            _cmd="${decode_cmd}"
-            inference_nj=$inference_nj
-            _ngpu=0
-        fi
-
-        _opts=
-        if [ -n "${inference_config}" ]; then
-            _opts+="--config ${inference_config} "
-        fi
-        if "${use_lm}"; then
-            if "${use_word_lm}"; then
-                _opts+="--word_lm_train_config ${lm_exp}/config.yaml "
-                _opts+="--word_lm_file ${lm_exp}/${inference_lm} "
-            else
-                _opts+="--lm_train_config ${lm_exp}/config.yaml "
-                _opts+="--lm_file ${lm_exp}/${inference_lm} "
-            fi
-        fi
-
-        # 2. Generate run.sh
-        log "Generate '${asr_exp}/${inference_tag}/run.sh'. You can resume the process from stage 13 using this script"
-        mkdir -p "${asr_exp}/${inference_tag}"; echo "${run_args} --stage 13 \"\$@\"; exit \$?" > "${asr_exp}/${inference_tag}/run.sh"; chmod +x "${asr_exp}/${inference_tag}/run.sh"
-
-        for dset in ${test_sets}; do
-            _data="${data_feats}/${dset}"
-            _dir="${asr_exp}/${inference_tag}/${dset}"
-            _logdir="${_dir}/logdir"
-            mkdir -p "${_logdir}"
-
-            _feats_type="$(<${_data}/feats_type)"
-            if [ "${_feats_type}" = raw ]; then
-                _scp=wav.scp
-                if [[ "${audio_format}" == *ark* ]]; then
-                    _type=kaldi_ark
-                else
-                    _type=sound
-                fi
-            else
-                _scp=feats.scp
-                _type=kaldi_ark
-            fi
-
-            # 1. Split the key file
-            key_file=${_data}/${_scp}
-            split_scps=""
-            _nj=$(min "${inference_nj}" "$(<${key_file} wc -l)")
-            echo $_nj
-            for n in $(seq "${_nj}"); do
-                split_scps+=" ${_logdir}/keys.${n}.scp"
-            done
-            # shellcheck disable=SC2086
-            utils/split_scp.pl "${key_file}" ${split_scps}
-
-            # 2. Submit decoding jobs
-            log "Decoding started... log: '${_logdir}/asr_inference.*.log'"
-            
-            ${_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
-                python -m funasr.bin.asr_inference_launch \
-                    --batch_size 1 \
-                    --mc True   \
-                    --nbest 1   \
-                    --ngpu "${_ngpu}" \
-                    --njob ${njob_infer} \
-                    --gpuid_list ${device} \
-                    --data_path_and_name_and_type "${_data}/${_scp},speech,${_type}" \
-                    --key_file "${_logdir}"/keys.JOB.scp \
-                    --asr_train_config "${asr_exp}"/config.yaml \
-                    --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
-                    --output_dir "${_logdir}"/output.JOB \
-                    --mode asr \
-                    ${_opts}
-
-            # 3. Concatenates the output files from each jobs
-            for f in token token_int score text; do
-                for i in $(seq "${_nj}"); do
-                    cat "${_logdir}/output.${i}/1best_recog/${f}"
-                done | LC_ALL=C sort -k1 >"${_dir}/${f}"
-            done
-        done
-    fi
-
-
-    if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
-        log "Stage 14: Scoring multi-talker ASR"
-
-        for dset in ${test_sets}; do
-            _data="${data_feats}/${dset}"
-            _dir="${asr_exp}/${inference_tag}/${dset}"
-
-            sed 's/\$//g' ${_data}/text > ${_data}/text_nosrc
-            sed 's/\$//g' ${_dir}/text > ${_dir}/text_nosrc
-
-            python utils/proce_text.py ${_data}/text_nosrc ${_data}/text.proc
-            python utils/proce_text.py ${_dir}/text_nosrc ${_dir}/text.proc
-
-            python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
-            tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
-            cat ${_dir}/text.cer.txt
-            
-        done
-
-    fi
-
-    if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
-        log "Stage 15: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
+        log "Stage 13: Decoding SA-ASR (oracle profile): training_dir=${sa_asr_exp}"
 
         if ${gpu_inference}; then
             _cmd="${cuda_cmd}"
@@ -1426,8 +1328,8 @@
         done
     fi
 
-    if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
-        log "Stage 16: Scoring SA-ASR (oracle profile)"
+    if [ ${stage} -le 14 ] && [ ${stop_stage} -ge 14 ]; then
+        log "Stage 14: Scoring SA-ASR (oracle profile)"
 
         for dset in ${test_sets}; do
             _data="${data_feats}/${dset}"
@@ -1454,8 +1356,8 @@
 
     fi
 
-    if [ ${stage} -le 17 ] && [ ${stop_stage} -ge 17 ]; then
-        log "Stage 17: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
+    if [ ${stage} -le 15 ] && [ ${stop_stage} -ge 15 ]; then
+        log "Stage 15: Decoding SA-ASR (cluster profile): training_dir=${sa_asr_exp}"
 
         if ${gpu_inference}; then
             _cmd="${cuda_cmd}"
@@ -1545,8 +1447,8 @@
         done
     fi
 
-    if [ ${stage} -le 18 ] && [ ${stop_stage} -ge 18 ]; then
-        log "Stage 18: Scoring SA-ASR (cluster profile)"
+    if [ ${stage} -le 16 ] && [ ${stop_stage} -ge 16 ]; then
+        log "Stage 16: Scoring SA-ASR (cluster profile)"
 
         for dset in ${test_sets}; do
             _data="${data_feats}/${dset}"
diff --git a/egs/alimeeting/sa-asr/local/download_pretrained_model_from_modelscope.py b/egs/alimeeting/sa-asr/local/download_pretrained_model_from_modelscope.py
new file mode 100644
index 0000000..b4b5412
--- /dev/null
+++ b/egs/alimeeting/sa-asr/local/download_pretrained_model_from_modelscope.py
@@ -0,0 +1,7 @@
+from modelscope.hub.snapshot_download import snapshot_download
+import sys
+
+if __name__ == "__main__":
+    # usage: python download_pretrained_model_from_modelscope.py <model_tag> <local_model_dir>
+    model_tag = sys.argv[1]
+    local_model_dir = sys.argv[2]
+    # download the pinned revision of the model into the local cache directory
+    model_dir = snapshot_download(model_tag, cache_dir=local_model_dir, revision='1.0.0')
\ No newline at end of file
diff --git a/egs/alimeeting/sa-asr/run.sh b/egs/alimeeting/sa-asr/run.sh
index e5297b8..2869164 100755
--- a/egs/alimeeting/sa-asr/run.sh
+++ b/egs/alimeeting/sa-asr/run.sh
@@ -8,8 +8,8 @@
 ngpu=4
 device="0,1,2,3"
 
-stage=1
-stop_stage=18
+stage=12
+stop_stage=13
 
 
 train_set=Train_Ali_far
@@ -18,6 +18,8 @@
 asr_config=conf/train_asr_conformer.yaml
 sa_asr_config=conf/train_sa_asr_conformer.yaml
 inference_config=conf/decode_asr_rnn.yaml
+infer_with_pretrained_model=true
+download_sa_asr_model="damo/speech_saasr_asr-zh-cn-16k-alimeeting"
 
 lm_config=conf/train_lm_transformer.yaml
 use_lm=false
@@ -29,6 +31,8 @@
     --stop_stage ${stop_stage}                         \
     --gpu_inference true    \
     --njob_infer 4    \
+    --infer_with_pretrained_model ${infer_with_pretrained_model} \
+    --download_sa_asr_model $download_sa_asr_model \
     --asr_exp exp/asr_train_multispeaker_conformer_raw_zh_char_data_alimeeting \
     --sa_asr_exp exp/sa_asr_train_conformer_raw_zh_char_data_alimeeting \
     --asr_stats_dir exp/asr_stats_multispeaker_conformer_raw_zh_char_data_alimeeting \
diff --git a/egs_modelscope/asr/TEMPLATE/README.md b/egs_modelscope/asr/TEMPLATE/README.md
index 30ae8c9..7ff04eb 100644
--- a/egs_modelscope/asr/TEMPLATE/README.md
+++ b/egs_modelscope/asr/TEMPLATE/README.md
@@ -1,7 +1,7 @@
 # Speech Recognition
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take the typic models as examples to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the typical models as examples to demonstrate the usage.
 
 ## Inference
 
@@ -44,7 +44,7 @@
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
 
 #### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
-There are three decoding mode for UniASR model(`fast`、`normal`、`offline`), for more model detailes, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
 ```python
 decoding_model = "fast" # "fast"銆�"normal"銆�"offline"
 inference_pipeline = pipeline(
@@ -61,7 +61,7 @@
 Undo
 
 #### [MFCCA Model](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
-For more model detailes, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+For more model details, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -79,7 +79,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.auto_speech_recognition`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
@@ -103,7 +103,7 @@
 FunASR also offer recipes [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
 #### Settings of `infer.sh`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
 - `output_dir`: output dir of the recognition results
 - `batch_size`: `64` (Default), batch size of inference on gpu
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
index 92088a2..bb55ab5 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
@@ -1 +1 @@
-../TEMPLATE/README.md
\ No newline at end of file
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py
new file mode 100644
index 0000000..9d08923
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/finetune.py
@@ -0,0 +1,37 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        model_revision="v1.0.2",
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = modelscope_args(model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404", data_path="./data")
+    params.output_dir = "./checkpoint"              # 妯″瀷淇濆瓨璺緞
+    params.data_path = "./example_data/"            # 鏁版嵁璺緞
+    params.dataset_type = "large"                   # finetune contextual paraformer妯″瀷鍙兘浣跨敤large dataset
+    params.batch_bins = 200000                      # batch size锛屽鏋渄ataset_type="small"锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛屽鏋渄ataset_type="large"锛宐atch_bins鍗曚綅涓烘绉掞紝
+    params.max_epoch = 20                           # 鏈�澶ц缁冭疆鏁�
+    params.lr = 0.0002                              # 璁剧疆瀛︿範鐜�
+
+    modelscope_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
deleted file mode 120000
index 0b3b38b..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
+++ /dev/null
@@ -1 +0,0 @@
-../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
new file mode 100644
index 0000000..6325626
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=10    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+hotword_txt=None
+
+. utils/parse_options.sh || exit 1;
+
+if ${gpu_inference} == "true"; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --hotword_txt ${hotword_txt} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
new file mode 100644
index 0000000..97e9fce
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
@@ -0,0 +1,40 @@
+import os
+import tempfile
+import codecs
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.msdatasets import MsDataset
+
+if __name__ == '__main__':
+    param_dict = dict()
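+    # 'hotword' accepts a local path or a URL to a plain-text file, one hotword per line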
+    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"
+
+    output_dir = "./output"
+    batch_size = 1
+
+    # AISHELL-1 hotword test set from ModelScope (only a 'test' split is provided)
+    ds_dict = MsDataset.load(dataset_name='speech_asr_aishell1_hotwords_testsets', namespace='speech_asr')
+    # mkdtemp creates the directory and keeps it; TemporaryDirectory().name would
+    # delete the directory again as soon as the wrapper object is garbage-collected
+    work_dir = tempfile.mkdtemp()
+    wav_file_path = os.path.join(work_dir, "wav.scp")
+
+    # write a kaldi-style wav.scp ("<utt_id> <wav_path>" per line) for the first 50 utterances
+    counter = 0
+    with codecs.open(wav_file_path, 'w') as fout:
+        for line in ds_dict:
+            counter += 1
+            wav = line["Audio:FILE"]
+            idx = wav.split("/")[-1].split(".")[0]
+            fout.write(idx + " " + wav + "\n")
+            if counter == 50:
+                break
+    audio_in = wav_file_path
+
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+        output_dir=output_dir,
+        batch_size=batch_size,
+        param_dict=param_dict)
+
+    rec_result = inference_pipeline(audio_in=audio_in)
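+    # rec_result carries the recognized text; with output_dir set, results are also written to disk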
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
similarity index 96%
rename from egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
rename to egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
index 4fd4cdf..b566454 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
@@ -34,6 +34,6 @@
     rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                     param_dict=param_dict)
     if len(rec_result) != 0:
-        final_result += rec_result['text'][0]
+        final_result += rec_result['text'] + " "
         print(rec_result)
 print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 92088a2..bb55ab5 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -1 +1 @@
-../TEMPLATE/README.md
\ No newline at end of file
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
deleted file mode 120000
index 0b3b38b..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
+++ /dev/null
@@ -1 +0,0 @@
-../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
new file mode 100644
index 0000000..ef49d7a
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # comma-separated GPU ids to use for decoding
+njob=64    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
index 92088a2..bb55ab5 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
@@ -1 +1 @@
-../TEMPLATE/README.md
\ No newline at end of file
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
index f05fbbb..128fc31 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -1 +1 @@
-../TEMPLATE/infer.py
\ No newline at end of file
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
deleted file mode 120000
index 0b3b38b..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
+++ /dev/null
@@ -1 +0,0 @@
-../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
new file mode 100644
index 0000000..207bbdf
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # comma-separated GPU ids to use for decoding
+njob=64    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
index 92088a2..bb55ab5 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
@@ -1 +1 @@
-../TEMPLATE/README.md
\ No newline at end of file
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
index f05fbbb..128fc31 120000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -1 +1 @@
-../TEMPLATE/infer.py
\ No newline at end of file
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
deleted file mode 120000
index 0b3b38b..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
+++ /dev/null
@@ -1 +0,0 @@
-../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
new file mode 100644
index 0000000..4b59bc1
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
@@ -0,0 +1,103 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # comma-separated GPU ids to use for decoding
+njob=64    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
index 0066c7b..6672bbf 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -34,6 +34,6 @@
     rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                     param_dict=param_dict)
     if len(rec_result) != 0:
-        final_result += rec_result['text'][0]
+        final_result += rec_result['text'] + " "
         print(rec_result)
-print(final_result)
+print(final_result.strip())
diff --git a/egs_modelscope/punctuation/TEMPLATE/README.md b/egs_modelscope/punctuation/TEMPLATE/README.md
index dfbe044..08814ea 100644
--- a/egs_modelscope/punctuation/TEMPLATE/README.md
+++ b/egs_modelscope/punctuation/TEMPLATE/README.md
@@ -1,7 +1,7 @@
 # Punctuation Restoration
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take the model of the punctuation model of CT-Transformer as example to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the CT-Transformer punctuation model as an example to demonstrate the usage.
 
 ## Inference
 
@@ -55,7 +55,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.punctuation`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `output_dir`: `None` (Default), the output path of results if set
 - `model_revision`: `None` (Default), setting the model version
@@ -71,7 +71,7 @@
 FunASR also offers recipes [egs_modelscope/punctuation/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punctuation/TEMPLATE/infer.sh) for decoding with multi-threaded CPUs or multiple GPUs. It is an offline recipe and only supports offline models.
 
 #### Settings of `infer.sh`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `data_dir`: the dataset dir needs to include `punc.txt`
 - `output_dir`: output dir of the recognition results
 - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
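For reference, here is a minimal Python sketch of the pipeline these options configure. The model id and the `text_in` keyword are assumptions based on the conventions used in other FunASR recipes, not part of this patch:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# minimal sketch; model id and text_in keyword are assumed from recipe conventions
inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
    output_dir="./results",
)
rec_result = inference_pipeline(text_in="我们都是木头人不会讲话不会动")
print(rec_result)
```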
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md
deleted file mode 120000
index bb55ab5..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md
+++ /dev/null
@@ -1 +0,0 @@
-../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
deleted file mode 120000
index 128fc31..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
+++ /dev/null
@@ -1 +0,0 @@
-../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh
deleted file mode 120000
index 5e59f18..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh
+++ /dev/null
@@ -1 +0,0 @@
-../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/TEMPLATE/README.md b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
index 99c9b59..ba179ed 100644
--- a/egs_modelscope/speaker_diarization/TEMPLATE/README.md
+++ b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
@@ -2,7 +2,7 @@
 
 > **Note**: 
 > The modelscope pipeline supports all the models in 
-[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) 
+[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) 
 for inference and finetuning. Here we take the xvector_sv model as an example to demonstrate the usage.
 
 ## Inference with pipeline
@@ -40,7 +40,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.speaker_diarization`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `output_dir`: `None` (Default), the output path of results if set
 - `batch_size`: `1` (Default), batch size when decoding
diff --git a/egs_modelscope/speaker_verification/TEMPLATE/README.md b/egs_modelscope/speaker_verification/TEMPLATE/README.md
index f7b64ce..d6736e3 100644
--- a/egs_modelscope/speaker_verification/TEMPLATE/README.md
+++ b/egs_modelscope/speaker_verification/TEMPLATE/README.md
@@ -2,7 +2,7 @@
 
 > **Note**: 
 > The modelscope pipeline supports all the models in 
-[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) 
+[model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) 
 for inference and finetuning. Here we take the xvector_sv model as an example to demonstrate the usage.
 
 ## Inference with pipeline
@@ -50,7 +50,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.speaker_verification`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `output_dir`: `None` (Default), the output path of results if set
 - `batch_size`: `1` (Default), batch size when decoding
diff --git a/egs_modelscope/tp/TEMPLATE/README.md b/egs_modelscope/tp/TEMPLATE/README.md
index 62c35d8..7cc8508 100644
--- a/egs_modelscope/tp/TEMPLATE/README.md
+++ b/egs_modelscope/tp/TEMPLATE/README.md
@@ -26,7 +26,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.speech_timestamp`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
@@ -62,7 +62,7 @@
 FunASR also offers recipes [egs_modelscope/tp/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/tp/TEMPLATE/infer.sh) for decoding with multi-threaded CPUs or multiple GPUs.
 
 #### Settings of `infer.sh`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `data_dir`: the dataset dir **must** include `wav.scp` and `text.txt`
 - `output_dir`: output dir of the recognition results
 - `batch_size`: `64` (Default), batch size of inference on gpu
diff --git a/egs_modelscope/vad/TEMPLATE/README.md b/egs_modelscope/vad/TEMPLATE/README.md
index 503b9bf..0ad9fb3 100644
--- a/egs_modelscope/vad/TEMPLATE/README.md
+++ b/egs_modelscope/vad/TEMPLATE/README.md
@@ -1,7 +1,7 @@
 # Voice Activity Detection
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take the model of FSMN-VAD as example to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the FSMN-VAD model as an example to demonstrate the usage.
 
 ## Inference
 
@@ -46,7 +46,7 @@
 ### API-reference
 #### Define pipeline
 - `task`: `Tasks.voice_activity_detection`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
 - `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
 - `output_dir`: `None` (Default), the output path of results if set
@@ -70,7 +70,7 @@
 FunASR also offers recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) for decoding with multi-threaded CPUs or multiple GPUs.
 
 #### Settings of `infer.sh`
-- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
 - `data_dir`: the dataset dir needs to include `wav.scp`
 - `output_dir`: output dir of the recognition results
 - `batch_size`: `64` (Default), batch size of inference on gpu
@@ -83,7 +83,7 @@
 #### Decode with multi GPUs:
 ```shell
     bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
     --batch_size 1 \
@@ -93,11 +93,11 @@
 #### Decode with multi-thread CPUs:
 ```shell
     bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 1
+    --njob 64
 ```
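For comparison with the shell recipes above, a minimal Python sketch of the same VAD pipeline; it mirrors the `demo.py` updated later in this patch, and the audio URL is the FunASR example file used elsewhere in this repository:

```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.voice_activity_detection,
    model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
    output_dir="./results",
    batch_size=1,
)
# the result lists the detected speech segments as [start_ms, end_ms] pairs
segments = inference_pipeline(
    audio_in="https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav")
print(segments)
```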
 
 ## Finetune with pipeline
diff --git a/egs_modelscope/vad/TEMPLATE/infer.py b/egs_modelscope/vad/TEMPLATE/infer.py
index 3d9ee55..f49ab4b 100644
--- a/egs_modelscope/vad/TEMPLATE/infer.py
+++ b/egs_modelscope/vad/TEMPLATE/infer.py
@@ -16,10 +16,10 @@
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
-    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--model', type=str, default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
     parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
     parser.add_argument('--output_dir', type=str, default="./results/")
-    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--batch_size', type=int, default=1)
     parser.add_argument('--gpuid', type=str, default="0")
     args = parser.parse_args()
     modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/infer.sh b/egs_modelscope/vad/TEMPLATE/infer.sh
index 7dc0387..0651c98 100644
--- a/egs_modelscope/vad/TEMPLATE/infer.sh
+++ b/egs_modelscope/vad/TEMPLATE/infer.sh
@@ -9,7 +9,7 @@
 model="damo/speech_fsmn_vad_zh-cn-16k-common"
 data_dir="./data/test"
 output_dir="./results"
-batch_size=64
+batch_size=1
 gpu_inference=true    # whether to perform gpu decoding
 gpuid_list="0,1"    # comma-separated GPU ids to use for decoding
 njob=64    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
index bb55ab5..92088a2 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
@@ -1 +1 @@
-../../TEMPLATE/README.md
\ No newline at end of file
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
index bbc16c5..eded5ed 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
@@ -7,7 +7,7 @@
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
-        model_revision='v1.2.0',
+        model_revision=None,
         output_dir=output_dir,
         batch_size=1,
     )
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
index 128fc31..f05fbbb 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
@@ -1 +1 @@
-../../TEMPLATE/infer.py
\ No newline at end of file
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
index 5e59f18..0b3b38b 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
@@ -1 +1 @@
-../../TEMPLATE/infer.sh
\ No newline at end of file
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
index bb55ab5..92088a2 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
@@ -1 +1 @@
-../../TEMPLATE/README.md
\ No newline at end of file
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
index 84863d0..33be505 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
@@ -7,7 +7,7 @@
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",
-        model_revision='v1.2.0',
+        model_revision=None,
         output_dir=output_dir,
         batch_size=1,
     )
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
index 5b67da7..ec5c502 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
@@ -11,7 +11,7 @@
     inference_pipeline = pipeline(
         task=Tasks.voice_activity_detection,
         model="damo/speech_fsmn_vad_zh-cn-8k-common",
-        model_revision='v1.2.0',
+        model_revision=None,
         output_dir=output_dir,
         batch_size=1,
         mode='online',
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
index 128fc31..f05fbbb 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
@@ -1 +1 @@
-../../TEMPLATE/infer.py
\ No newline at end of file
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
index 5e59f18..0b3b38b 120000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
@@ -1 +1 @@
-../../TEMPLATE/infer.sh
\ No newline at end of file
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 5546c92..5335860 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -41,6 +41,7 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
 from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
 from funasr.utils.timestamp_tools import ts_prediction_lfr6_standard
 from funasr.bin.tp_inference import SpeechText2Timestamp
@@ -236,7 +237,7 @@
         pre_token_length = pre_token_length.round().long()
         if torch.max(pre_token_length) < 1:
             return []
-        if not isinstance(self.asr_model, ContextualParaformer):
+        if not isinstance(self.asr_model, ContextualParaformer) and not isinstance(self.asr_model, NeatContextualParaformer):
             if self.hotword_list:
                 logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
             decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index bf5590c..4f04d02 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -239,7 +239,7 @@
                         feats_len = torch.tensor([feats_chunk2.shape[1]])
                         results_chunk2 = self.infer(feats_chunk2, feats_len, cache)
 
-                        return ["".join(results_chunk1 + results_chunk2)]
+                        return [" ".join(results_chunk1 + results_chunk2)]
 
                 results = self.infer(feats, feats_len, cache)
 
@@ -299,12 +299,9 @@
 
                 # Change integer-ids to tokens
                 token = self.converter.ids2tokens(token_int)
+                token = " ".join(token)
 
-                if self.tokenizer is not None:
-                    text = self.tokenizer.tokens2text(token)
-                else:
-                    text = None
-                results.append(text)
+                results.append(token)
 
         # assert check_return_type(results)
         return results
@@ -555,13 +552,13 @@
                 input_lens = torch.tensor([stride_size])
                 asr_result = speech2text(cache, raw_inputs[:, sample_offset: sample_offset + stride_size], input_lens)
                 if len(asr_result) != 0: 
-                    final_result += asr_result[0]
-            item = {'key': "utt", 'value': [final_result]}
+                    final_result += " ".join(asr_result) + " "
+            item = {'key': "utt", 'value': final_result.strip()}
         else:
             input_lens = torch.tensor([raw_inputs.shape[1]])
             cache["encoder"]["is_final"] = is_final
             asr_result = speech2text(cache, raw_inputs, input_lens)
-            item = {'key': "utt", 'value': asr_result}
+            item = {'key': "utt", 'value': " ".join(asr_result)}
 
         asr_result_list.append(item)
         if is_final:
@@ -750,12 +747,3 @@
 if __name__ == "__main__":
     main()
 
-    # from modelscope.pipelines import pipeline
-    # from modelscope.utils.constant import Tasks
-    #
-    # inference_16k_pipline = pipeline(
-    #     task=Tasks.auto_speech_recognition,
-    #     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
-    #
-    # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
-    # print(rec_result)
diff --git a/funasr/bin/asr_inference_rnnt.py b/funasr/bin/asr_inference_rnnt.py
index d964643..bd36907 100644
--- a/funasr/bin/asr_inference_rnnt.py
+++ b/funasr/bin/asr_inference_rnnt.py
@@ -188,18 +188,15 @@
         self.frontend = frontend
         self.window_size = self.chunk_size + self.right_context
         
-        self._ctx = self.asr_model.encoder.get_encoder_input_size(
-            self.window_size
-        )
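+        # the chunk-context bookkeeping below only applies to streaming inference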
+        if self.streaming:
+            self._ctx = self.asr_model.encoder.get_encoder_input_size(
+                self.window_size
+            )
        
-        #self.last_chunk_length = (
-        #    self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
-        #) * self.hop_length
-
-        self.last_chunk_length = (
-            self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
-        )
-        self.reset_inference_cache()
+            self.last_chunk_length = (
+                self.asr_model.encoder.embed.min_frame_length + self.right_context + 1
+            )
+            self.reset_inference_cache()
 
     def reset_inference_cache(self) -> None:
         """Reset Speech2Text parameters."""
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index 94f7262..5c30fdb 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -83,7 +83,8 @@
         finetune_configs = yaml.safe_load(f)
         # set data_types
         if dataset_type == "large":
-            finetune_configs["dataset_conf"]["data_types"] = "sound,text"
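+            # only fill in the default when the finetune config does not already specify data_types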
+            if 'data_types' not in finetune_configs['dataset_conf']:
+                finetune_configs["dataset_conf"]["data_types"] = "sound,text"
     finetune_configs = update_dct(configs, finetune_configs)
     for key, value in finetune_configs.items():
         if hasattr(args, key):
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punctuation_infer_vadrealtime.py
index b2db1bf..0dc01f5 100644
--- a/funasr/bin/punctuation_infer_vadrealtime.py
+++ b/funasr/bin/punctuation_infer_vadrealtime.py
@@ -61,7 +61,7 @@
             text_name="text",
             non_linguistic_symbols=train_args.non_linguistic_symbols,
         )
-        print("start decoding!!!")
+        
 
     @torch.no_grad()
     def __call__(self, text: Union[list, str], cache: list, split_size=20):
@@ -70,7 +70,7 @@
         else:
             precache = ""
             cache = []
-        data = {"text": precache + text}
+        data = {"text": precache + " " + text}
         result = self.preprocessor(data=data, uid="12938712838719")
         split_text = self.preprocessor.pop_split_text_data(result)
         mini_sentences = split_to_mini_sentence(split_text, split_size)
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 387b622..5fbd844 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -274,8 +274,7 @@
     assert check_argument_types()
     if batch_size > 1:
         raise NotImplementedError("batch decoding is not implemented")
-    if ngpu > 1:
-        raise NotImplementedError("only single GPU decoding is supported")
+
 
     logging.basicConfig(
         level=log_level,
@@ -286,7 +285,7 @@
         device = "cuda"
     else:
         device = "cpu"
-
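+        # fall back to batch size 1 when decoding on CPU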
+        batch_size = 1
     # 1. Set random-seed
     set_all_random_seed(seed)
 
@@ -352,7 +351,6 @@
                 item = {'key': keys[i], 'value': results[i]}
                 vad_results.append(item)
                 if writer is not None:
-                    results[i] = json.loads(results[i])
                     ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
@@ -377,10 +375,7 @@
         **kwargs,
 ):
     assert check_argument_types()
-    if batch_size > 1:
-        raise NotImplementedError("batch decoding is not implemented")
-    if ngpu > 1:
-        raise NotImplementedError("only single GPU decoding is supported")
+
 
     logging.basicConfig(
         level=log_level,
@@ -391,6 +386,7 @@
         device = "cuda"
     else:
         device = "cpu"
+        batch_size = 1
 
     # 1. Set random-seed
     set_all_random_seed(seed)
@@ -466,7 +462,6 @@
                         item = {'key': keys[i], 'value': results[i]}
                         vad_results.append(item)
                         if writer is not None:
-                            results[i] = json.loads(results[i])
                             ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
index 4d02620..a363309 100644
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -156,8 +156,6 @@
     
     if batch_size > 1:
         raise NotImplementedError("batch decoding is not implemented")
-    if ngpu > 1:
-        raise NotImplementedError("only single GPU decoding is supported")
 
     logging.basicConfig(
         level=log_level,
@@ -168,7 +166,7 @@
         device = "cuda"
     else:
         device = "cpu"
-
+        batch_size = 1
     # 1. Set random-seed
     set_all_random_seed(seed)
 
@@ -243,7 +241,6 @@
                         item = {'key': keys[i], 'value': results[i]}
                         vad_results.append(item)
                         if writer is not None:
-                            results[i] = json.loads(results[i])
                             ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index b0e1b8f..8c224d8 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -101,7 +101,7 @@
                 if data_type == "kaldi_ark":
                     ark_reader = ReadHelper('ark:{}'.format(data_file))
                     reader_list.append(ark_reader)
-                elif data_type == "text" or data_type == "sound":
+                elif data_type == "text" or data_type == "sound" or data_type == 'text_hotword':
                     text_reader = open(data_file, "r")
                     reader_list.append(text_reader)
                 elif data_type == "none":
@@ -131,6 +131,13 @@
                         sample_dict["sampling_rate"] = sampling_rate
                         if data_name == "speech":
                             sample_dict["key"] = key
+                    elif data_type == "text_hotword":
+                        text = item
+                        segs = text.strip().split()
+                        sample_dict[data_name] = segs[1:]
+                        if "key" not in sample_dict:
+                            sample_dict["key"] = segs[0]
+                        sample_dict['hw_tag'] = 1
                     else:
                         text = item
                         segs = text.strip().split()
@@ -167,14 +174,38 @@
     shuffle = conf.get('shuffle', True)
     data_names = conf.get("data_names", "speech,text")
     data_types = conf.get("data_types", "kaldi_ark,text")
-    dataset = AudioDataset(scp_lists, data_names, data_types, frontend_conf=frontend_conf, shuffle=shuffle, mode=mode)
+
+    pre_hwfile = conf.get("pre_hwlist", None)
+    pre_prob = conf.get("pre_prob", 0)  # unused yet
+
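+    # hotword sampling options, passed through to sample_hotword() at tokenization time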
+    hw_config = {"sample_rate": conf.get("sample_rate", 0.6),
+                 "double_rate": conf.get("double_rate", 0.1),
+                 "hotword_min_length": conf.get("hotword_min_length", 2),
+                 "hotword_max_length": conf.get("hotword_max_length", 8),
+                 "pre_prob": conf.get("pre_prob", 0.0)}
+
+    if pre_hwfile is not None:
+        pre_hwlist = []
+        with open(pre_hwfile, 'r') as fin:
+            for line in fin.readlines():
+                pre_hwlist.append(line.strip())
+    else:
+        pre_hwlist = None
+
+    dataset = AudioDataset(scp_lists, 
+                           data_names, 
+                           data_types, 
+                           frontend_conf=frontend_conf, 
+                           shuffle=shuffle, 
+                           mode=mode, 
+                           )
 
     filter_conf = conf.get('filter_conf', {})
     filter_fn = partial(filter, **filter_conf)
     dataset = FilterIterDataPipe(dataset, fn=filter_fn)
 
     if "text" in data_names:
-        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer}
+        vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
         tokenize_fn = partial(tokenize, **vocab)
         dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
 
diff --git a/funasr/datasets/large_datasets/utils/hotword_utils.py b/funasr/datasets/large_datasets/utils/hotword_utils.py
new file mode 100644
index 0000000..fccfea6
--- /dev/null
+++ b/funasr/datasets/large_datasets/utils/hotword_utils.py
@@ -0,0 +1,32 @@
+import random
+
+def sample_hotword(length, 
+                   hotword_min_length, 
+                   hotword_max_length,
+                   sample_rate,
+                   double_rate,
+                   pre_prob,
+                   pre_index=None):
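+    # returns [-1] when no hotword is drawn, [start, end] for one hotword, or
+    # [start1, end1, start2, end2] for two (token indices, inclusive)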
+    if length < hotword_min_length:
+        return [-1]
+    if random.random() < sample_rate:
+        if pre_prob > 0 and random.random() < pre_prob and pre_index is not None:
+            return pre_index
+        if length == hotword_min_length:
+            return [0, length-1]
+        elif random.random() < double_rate and length > hotword_max_length + hotword_min_length + 2:
+            # sample two hotwords in a sentence
+            _max_hw_length = min(hotword_max_length, length // 2)
+            # first hotword
+            start1 = random.randint(0, length // 3)
+            end1 = random.randint(start1 + hotword_min_length - 1, start1 + _max_hw_length - 1)
+            # second hotword
+            start2 = random.randint(end1 + 1, length - hotword_min_length)
+            end2 = random.randint(min(length-1, start2+hotword_min_length-1), min(length-1, start2+hotword_max_length-1))
+            return [start1, end1, start2, end2]
+        else:  # single hotword
+            start = random.randint(0, length - hotword_min_length)
+            end = random.randint(min(length-1, start+hotword_min_length-1), min(length-1, start+hotword_max_length-1))
+            return [start, end]
+    else:
+        return [-1]
\ No newline at end of file
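A quick illustration of the sampler's contract, with the keyword values mirroring the defaults read in `dataset.py` above:

```python
import random
from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword

random.seed(0)
for length in (1, 5, 20):
    # [-1] means no hotword was drawn; otherwise inclusive token indices are
    # returned as [start, end] or [start1, end1, start2, end2]
    print(length, sample_hotword(length,
                                 hotword_min_length=2,
                                 hotword_max_length=8,
                                 sample_rate=0.6,
                                 double_rate=0.1,
                                 pre_prob=0.0))
```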
diff --git a/funasr/datasets/large_datasets/utils/padding.py b/funasr/datasets/large_datasets/utils/padding.py
index e0feac6..20ba7a3 100644
--- a/funasr/datasets/large_datasets/utils/padding.py
+++ b/funasr/datasets/large_datasets/utils/padding.py
@@ -13,15 +13,16 @@
     batch = {}
     data_names = data[0].keys()
     for data_name in data_names:
-        if data_name == "key" or data_name =="sampling_rate":
+        if data_name == "key" or data_name == "sampling_rate":
             continue
         else:
-            if data[0][data_name].dtype.kind == "i":
-                pad_value = int_pad_value
-                tensor_type = torch.int64
-            else:
-                pad_value = float_pad_value
-                tensor_type = torch.float32
+            if data_name != 'hotword_indxs':
+                if data[0][data_name].dtype.kind == "i":
+                    pad_value = int_pad_value
+                    tensor_type = torch.int64
+                else:
+                    pad_value = float_pad_value
+                    tensor_type = torch.float32
+            else:
+                # hotword_indxs is a plain python list (it has no .dtype); pad it
+                # explicitly as int64 with the integer pad value so the -1 sentinel
+                # checks below also treat padding as "no hotword"
+                pad_value = int_pad_value
+                tensor_type = torch.int64
 
             tensor_list = [torch.tensor(np.copy(d[data_name]), dtype=tensor_type) for d in data]
             tensor_lengths = torch.tensor([len(d[data_name]) for d in data], dtype=torch.int32)
@@ -31,4 +32,47 @@
             batch[data_name] = tensor_pad
             batch[data_name + "_lengths"] = tensor_lengths
 
+    # DHA, EAHC NOT INCLUDED
+    if "hotword_indxs" in batch:
+        # if hotword indxs in batch
+        # use it to slice hotwords out
+        hotword_list = []
+        hotword_lengths = []
+        text = batch['text']
+        text_lengths = batch['text_lengths']
+        hotword_indxs = batch['hotword_indxs']
+        num_hw = sum([int(i) for i in batch['hotword_indxs_lengths'] if i != 1]) // 2
+        B, t1 = text.shape
+        t1 += 1  # TODO: make this a parameter matching predictor_bias
+        ideal_attn = torch.zeros(B, t1, num_hw+1)
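+        # ideal_attn[b, t, j] = 1 when token t of sample b belongs to the j-th
+        # sampled hotword; the extra last column collects tokens outside any hotword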
+        nth_hw = 0
+        for b, (hotword_indx, one_text, length) in enumerate(zip(hotword_indxs, text, text_lengths)):
+            ideal_attn[b][:,-1] = 1
+            if hotword_indx[0] != -1:
+                start, end = int(hotword_indx[0]), int(hotword_indx[1])
+                hotword = one_text[start: end+1]
+                hotword_list.append(hotword)
+                hotword_lengths.append(end-start+1)
+                ideal_attn[b][start:end+1, nth_hw] = 1
+                ideal_attn[b][start:end+1, -1] = 0
+                nth_hw += 1
+                if len(hotword_indx) == 4 and hotword_indx[2] != -1:
+                    # the second hotword if exist
+                    start, end = int(hotword_indx[2]), int(hotword_indx[3])
+                    hotword_list.append(one_text[start: end+1])
+                    hotword_lengths.append(end-start+1)
+                    ideal_attn[b][start:end+1, nth_hw] = 1  # the second hotword gets the next free column
+                    ideal_attn[b][start:end+1, -1] = 0
+                    nth_hw += 1
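+        # always append a dummy one-token hotword so hotword_pad is non-empty
+        # even when no sample in the batch drew a hotword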
+        hotword_list.append(torch.tensor([1]))
+        hotword_lengths.append(1)
+        hotword_pad = pad_sequence(hotword_list,
+                                batch_first=True,
+                                padding_value=0)
+        batch["hotword_pad"] = hotword_pad
+        batch["hotword_lengths"] = torch.tensor(hotword_lengths, dtype=torch.int32)
+        batch['ideal_attn'] = ideal_attn
+        del batch['hotword_indxs']
+        del batch['hotword_indxs_lengths']
+
     return keys, batch
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index 0d2fd84..f0f0c66 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -1,6 +1,7 @@
 #!/usr/bin/env python
 import re
 import numpy as np
+from funasr.datasets.large_datasets.utils.hotword_utils import sample_hotword
 
 def forward_segment(text, seg_dict):
     word_list = []
@@ -38,7 +39,8 @@
              vocab=None,
              seg_dict=None,
              punc_dict=None,
-             bpe_tokenizer=None):
+             bpe_tokenizer=None,
+             hw_config=None):
     assert "text" in data
     assert isinstance(vocab, dict)
     text = data["text"]
@@ -53,6 +55,10 @@
         text = seg_tokenize(text, seg_dict)
 
     length = len(text)
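+    # 'hw_tag' marks samples read via the text_hotword data type; draw hotword
+    # spans for them here and drop the tag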
+    if 'hw_tag' in data:
+        hotword_indxs = sample_hotword(length, **hw_config)
+        data['hotword_indxs'] = hotword_indxs
+        del data['hw_tag']
     for i in range(length):
         x = text[i]
         if i == length-1 and "punc" in data and x.startswith("vad:"):
diff --git a/funasr/models/decoder/rnnt_decoder.py b/funasr/models/decoder/rnnt_decoder.py
index 5401ab2..a0fe9ea 100644
--- a/funasr/models/decoder/rnnt_decoder.py
+++ b/funasr/models/decoder/rnnt_decoder.py
@@ -33,6 +33,7 @@
         dropout_rate: float = 0.0,
         embed_dropout_rate: float = 0.0,
         embed_pad: int = 0,
+        use_embed_mask: bool = False,
     ) -> None:
         """Construct a RNNDecoder object."""
         super().__init__()
@@ -66,6 +67,15 @@
 
         self.device = next(self.parameters()).device
         self.score_cache = {}
+
+        self.use_embed_mask = use_embed_mask
+        if self.use_embed_mask:
+            self._embed_mask = SpecAug(
+                time_mask_width_range=3,
+                num_time_mask=4,
+                apply_freq_mask=False,
+                apply_time_warp=False
+            )
     
     def forward(
         self,
@@ -88,6 +98,8 @@
             states = self.init_state(labels.size(0))
 
         dec_embed = self.dropout_embed(self.embed(labels))
+        if self.use_embed_mask and self.training:
+            dec_embed = self._embed_mask(dec_embed, label_lens)[0]
         dec_out, states = self.rnn_forward(dec_embed, states)
         return dec_out
 
diff --git a/funasr/models/e2e_asr_contextual_paraformer.py b/funasr/models/e2e_asr_contextual_paraformer.py
new file mode 100644
index 0000000..dc820db
--- /dev/null
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@@ -0,0 +1,372 @@
+import logging
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import numpy as np
+
+import torch
+from typeguard import check_argument_types
+
+from funasr.layers.abs_normalize import AbsNormalize
+from funasr.models.ctc import CTC
+from funasr.models.decoder.abs_decoder import AbsDecoder
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.models.frontend.abs_frontend import AbsFrontend
+from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
+from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
+from funasr.models.specaug.abs_specaug import AbsSpecAug
+from funasr.modules.add_sos_eos import add_sos_eos
+from funasr.modules.nets_utils import make_pad_mask, pad_list
+from funasr.modules.nets_utils import th_accuracy
+from funasr.torch_utils.device_funcs import force_gatherable
+from funasr.models.e2e_asr_paraformer import Paraformer
+
+
+if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+    from torch.cuda.amp import autocast
+else:
+    # Nothing to do if torch<1.6.0
+    @contextmanager
+    def autocast(enabled=True):
+        yield
+
+
+class NeatContextualParaformer(Paraformer):
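+    """Paraformer variant that biases decoding toward a user-provided hotword list via a bias encoder."""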
+    def __init__(
+        self,
+        vocab_size: int,
+        token_list: Union[Tuple[str, ...], List[str]],
+        frontend: Optional[AbsFrontend],
+        specaug: Optional[AbsSpecAug],
+        normalize: Optional[AbsNormalize],
+        preencoder: Optional[AbsPreEncoder],
+        encoder: AbsEncoder,
+        postencoder: Optional[AbsPostEncoder],
+        decoder: AbsDecoder,
+        ctc: CTC,
+        ctc_weight: float = 0.5,
+        interctc_weight: float = 0.0,
+        ignore_id: int = -1,
+        blank_id: int = 0,
+        sos: int = 1,
+        eos: int = 2,
+        lsm_weight: float = 0.0,
+        length_normalized_loss: bool = False,
+        report_cer: bool = True,
+        report_wer: bool = True,
+        sym_space: str = "<space>",
+        sym_blank: str = "<blank>",
+        extract_feats_in_collect_stats: bool = True,
+        predictor = None,
+        predictor_weight: float = 0.0,
+        predictor_bias: int = 0,
+        sampling_ratio: float = 0.2,
+        target_buffer_length: int = -1,
+        inner_dim: int = 256, 
+        bias_encoder_type: str = 'lstm',
+        use_decoder_embedding: bool = False,
+        crit_attn_weight: float = 0.0,
+        crit_attn_smooth: float = 0.0,
+        bias_encoder_dropout_rate: float = 0.0,
+    ):
+        assert check_argument_types()
+        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
+        assert 0.0 <= interctc_weight < 1.0, interctc_weight
+
+        super().__init__(
+            vocab_size=vocab_size,
+            token_list=token_list,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            ctc_weight=ctc_weight,
+            interctc_weight=interctc_weight,
+            ignore_id=ignore_id,
+            blank_id=blank_id,
+            sos=sos,
+            eos=eos,
+            lsm_weight=lsm_weight,
+            length_normalized_loss=length_normalized_loss,
+            report_cer=report_cer,
+            report_wer=report_wer,
+            sym_space=sym_space,
+            sym_blank=sym_blank,
+            extract_feats_in_collect_stats=extract_feats_in_collect_stats,
+            predictor=predictor,
+            predictor_weight=predictor_weight,
+            predictor_bias=predictor_bias,
+            sampling_ratio=sampling_ratio,
+        )
+
+        if bias_encoder_type == 'lstm':
+            logging.warning("enable bias encoder sampling and contextual training")
+            self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=bias_encoder_dropout_rate)
+            self.bias_embed = torch.nn.Embedding(vocab_size, inner_dim)
+        elif bias_encoder_type == 'mean':
+            logging.warning("enable bias encoder sampling and contextual training")
+            self.bias_embed = torch.nn.Embedding(vocab_size, inner_dim)
+        else:
+            logging.error("Unsupport bias encoder type: {}".format(bias_encoder_type))
+
+        self.target_buffer_length = target_buffer_length
+        if self.target_buffer_length > 0:
+            self.hotword_buffer = None
+            self.length_record = []
+            self.current_buffer_length = 0
+        self.use_decoder_embedding = use_decoder_embedding
+        self.crit_attn_weight = crit_attn_weight
+        if self.crit_attn_weight > 0:
+            self.attn_loss = torch.nn.L1Loss()
+        self.crit_attn_smooth = crit_attn_smooth
+
+    def forward(
+            self,
+            speech: torch.Tensor,
+            speech_lengths: torch.Tensor,
+            text: torch.Tensor,
+            text_lengths: torch.Tensor,
+            hotword_pad: torch.Tensor,
+            hotword_lengths: torch.Tensor,
+            ideal_attn: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        """Frontend + Encoder + Decoder + Calc loss
+
+        Args:
+            speech: (Batch, Length, ...)
+            speech_lengths: (Batch, )
+            text: (Batch, Length)
+            text_lengths: (Batch,)
+            hotword_pad: (NumHotwords, Length) padded hotword token ids
+            hotword_lengths: (NumHotwords,)
+            ideal_attn: ideal bias-attention targets over the hotword list
+        """
+        assert text_lengths.dim() == 1, text_lengths.shape
+        # Check that batch_size is unified
+        assert (
+                speech.shape[0]
+                == speech_lengths.shape[0]
+                == text.shape[0]
+                == text_lengths.shape[0]
+        ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+        batch_size = speech.shape[0]
+        self.step_cur += 1
+        # for data-parallel
+        text = text[:, : text_lengths.max()]
+        speech = speech[:, :speech_lengths.max()]
+
+        # 1. Encoder
+        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):
+            intermediate_outs = encoder_out[1]
+            encoder_out = encoder_out[0]
+
+        loss_att, acc_att, cer_att, wer_att = None, None, None, None
+        loss_ctc, cer_ctc = None, None
+        loss_pre = None
+        loss_ideal = None
+
+        stats = dict()
+
+        # 1. CTC branch
+        if self.ctc_weight != 0.0:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+            # Collect CTC branch stats
+            stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+            stats["cer_ctc"] = cer_ctc
+
+        # Intermediate CTC (optional)
+        loss_interctc = 0.0
+        if self.interctc_weight != 0.0 and intermediate_outs is not None:
+            for layer_idx, intermediate_out in intermediate_outs:
+                # we assume intermediate_out has the same length & padding
+                # as those of encoder_out
+                loss_ic, cer_ic = self._calc_ctc_loss(
+                    intermediate_out, encoder_out_lens, text, text_lengths
+                )
+                loss_interctc = loss_interctc + loss_ic
+
+                # Collect Intermediate CTC stats
+                stats["loss_interctc_layer{}".format(layer_idx)] = (
+                    loss_ic.detach() if loss_ic is not None else None
+                )
+                stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+            loss_interctc = loss_interctc / len(intermediate_outs)
+
+            # calculate whole encoder loss
+            loss_ctc = (1 - self.interctc_weight) * loss_ctc + self.interctc_weight * loss_interctc
+
+        # 2b. Attention decoder branch
+        if self.ctc_weight != 1.0:
+            loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+            )
+
+        # 3. CTC-Att loss definition
+        if self.ctc_weight == 0.0:
+            loss = loss_att + loss_pre * self.predictor_weight
+        elif self.ctc_weight == 1.0:
+            loss = loss_ctc
+        else:
+            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
+
+        if loss_ideal is not None:
+            loss = loss + loss_ideal * self.crit_attn_weight
+            stats["loss_ideal"] = loss_ideal.detach().cpu()
+
+        # Collect Attn branch stats
+        stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+        stats["acc"] = acc_att
+        stats["cer"] = cer_att
+        stats["wer"] = wer_att
+        stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
+
+        stats["loss"] = torch.clone(loss.detach())
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+    
+    def _calc_att_clas_loss(
+            self,
+            encoder_out: torch.Tensor,
+            encoder_out_lens: torch.Tensor,
+            ys_pad: torch.Tensor,
+            ys_pad_lens: torch.Tensor,
+            hotword_pad: torch.Tensor,
+            hotword_lengths: torch.Tensor,
+            ideal_attn: torch.Tensor,
+    ):
+        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+            encoder_out.device)
+        if self.predictor_bias == 1:
+            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
+            ys_pad_lens = ys_pad_lens + self.predictor_bias
+        pre_acoustic_embeds, pre_token_length, _, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask,
+                                                                                  ignore_id=self.ignore_id)
+
+        # -1. bias encoder
+        if self.use_decoder_embedding:
+            hw_embed = self.decoder.embed(hotword_pad)
+        else:
+            hw_embed = self.bias_embed(hotword_pad)
+        hw_embed, (_, _) = self.bias_encoder(hw_embed)
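+        # take the LSTM output at each hotword's last token as its fixed-size embedding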
+        _ind = np.arange(0, hotword_pad.shape[0]).tolist()
+        selected = hw_embed[_ind, [i-1 for i in hotword_lengths.detach().cpu().tolist()]]
+        contextual_info = selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device)
+
+        # 0. sampler
+        decoder_out_1st = None
+        if self.sampling_ratio > 0.0:
+            if self.step_cur < 2:
+                logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+            sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
+                                                           pre_acoustic_embeds, contextual_info)
+        else:
+            if self.step_cur < 2:
+                logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+            sematic_embeds = pre_acoustic_embeds
+
+        # 1. Forward decoder
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
+        ) 
+        decoder_out, _ = decoder_outs[0], decoder_outs[1]
+        # NOTE: the ideal-attention loss is disabled for now; kept for reference:
+        '''
+        if self.crit_attn_weight > 0 and attn.shape[-1] > 1:
+            ideal_attn = ideal_attn + self.crit_attn_smooth / (self.crit_attn_smooth + 1.0)
+            attn_non_blank = attn[:,:,:,:-1]
+            ideal_attn_non_blank = ideal_attn[:,:,:-1]
+            loss_ideal = self.attn_loss(attn_non_blank.max(1)[0], ideal_attn_non_blank.to(attn.device))
+        else:
+            loss_ideal = None
+        '''
+        loss_ideal = None
+
+        if decoder_out_1st is None:
+            decoder_out_1st = decoder_out
+        # 2. Compute attention loss
+        loss_att = self.criterion_att(decoder_out, ys_pad)
+        acc_att = th_accuracy(
+            decoder_out_1st.view(-1, self.vocab_size),
+            ys_pad,
+            ignore_label=self.ignore_id,
+        )
+        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
+
+        # Compute cer/wer using attention-decoder
+        if self.training or self.error_calculator is None:
+            cer_att, wer_att = None, None
+        else:
+            ys_hat = decoder_out_1st.argmax(dim=-1)
+            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+
+        return loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal
+    
+    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, contextual_info):
+
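+        # Sampler: decode once without gradients, count the wrongly predicted tokens,
+        # then replace a proportional number of random positions in the acoustic
+        # embeddings with ground-truth token embeddings (controlled by sampling_ratio).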
+        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
+        ys_pad = ys_pad * tgt_mask[:, :, 0]
+        if self.share_embedding:
+            ys_pad_embed = self.decoder.output_layer.weight[ys_pad]
+        else:
+            ys_pad_embed = self.decoder.embed(ys_pad)
+        with torch.no_grad():
+            decoder_outs = self.decoder(
+                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, contextual_info=contextual_info
+            )
+            decoder_out, _ = decoder_outs[0], decoder_outs[1]
+            pred_tokens = decoder_out.argmax(-1)
+            nonpad_positions = ys_pad.ne(self.ignore_id)
+            seq_lens = (nonpad_positions).sum(1)
+            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
+            input_mask = torch.ones_like(nonpad_positions)
+            bsz, seq_len = ys_pad.size()
+            for li in range(bsz):
+                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
+                if target_num > 0:
+                    input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].to(pre_acoustic_embeds.device), value=0)
+            input_mask = input_mask.eq(1)
+            input_mask = input_mask.masked_fill(~nonpad_positions, False)
+            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
+
+        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
+            input_mask_expand_dim, 0)
+        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
+
+    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
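+        # encode the hotword list (or a single dummy placeholder when none is given) and bias the decoder with it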
+        if hw_list is None:
+            hw_list = [torch.Tensor([1]).long().to(encoder_out.device)]  # empty hotword list
+            hw_list_pad = pad_list(hw_list, 0)
+            if self.use_decoder_embedding:
+                hw_embed = self.decoder.embed(hw_list_pad)
+            else:
+                hw_embed = self.bias_embed(hw_list_pad)
+            hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
+        else:
+            hw_lengths = [len(i) for i in hw_list]
+            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
+            if self.use_decoder_embedding:
+                hw_embed = self.decoder.embed(hw_list_pad)
+            else:
+                hw_embed = self.bias_embed(hw_list_pad)
+            hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
+                                                            enforce_sorted=False)
+            _, (h_n, _) = self.bias_encoder(hw_embed)
+            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
+        
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed
+        )
+        decoder_out = decoder_outs[0]
+        decoder_out = torch.log_softmax(decoder_out, dim=-1)
+        return decoder_out, ys_pad_lens
diff --git a/funasr/models/e2e_asr_transducer.py b/funasr/models/e2e_asr_transducer.py
index f8ba0f0..a5aaa6c 100644
--- a/funasr/models/e2e_asr_transducer.py
+++ b/funasr/models/e2e_asr_transducer.py
@@ -12,7 +12,7 @@
 from funasr.models.specaug.abs_specaug import AbsSpecAug
 from funasr.models.decoder.rnnt_decoder import RNNTDecoder
 from funasr.models.decoder.abs_decoder import AbsDecoder as AbsAttDecoder
-from funasr.models.encoder.conformer_encoder import ConformerChunkEncoder as Encoder
+from funasr.models.encoder.abs_encoder import AbsEncoder
 from funasr.models.joint_net.joint_network import JointNetwork
 from funasr.modules.nets_utils import get_transducer_task_io
 from funasr.layers.abs_normalize import AbsNormalize
@@ -62,7 +62,7 @@
         frontend: Optional[AbsFrontend],
         specaug: Optional[AbsSpecAug],
         normalize: Optional[AbsNormalize],
-        encoder: Encoder,
+        encoder: AbsEncoder,
         decoder: RNNTDecoder,
         joint_network: JointNetwork,
         att_decoder: Optional[AbsAttDecoder] = None,
@@ -286,7 +286,7 @@
                 feats, feats_lengths = self.normalize(feats, feats_lengths)
 
         # 4. Forward encoder
-        encoder_out, encoder_out_lens = self.encoder(feats, feats_lengths)
+        encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
 
         assert encoder_out.size(0) == speech.size(0), (
             encoder_out.size(),
@@ -515,7 +515,7 @@
         frontend: Optional[AbsFrontend],
         specaug: Optional[AbsSpecAug],
         normalize: Optional[AbsNormalize],
-        encoder: Encoder,
+        encoder: AbsEncoder,
         decoder: RNNTDecoder,
         joint_network: JointNetwork,
         att_decoder: Optional[AbsAttDecoder] = None,
diff --git a/funasr/models/encoder/conformer_encoder.py b/funasr/models/encoder/conformer_encoder.py
index 9777cee..434f2a4 100644
--- a/funasr/models/encoder/conformer_encoder.py
+++ b/funasr/models/encoder/conformer_encoder.py
@@ -307,7 +307,7 @@
         feed_forward: torch.nn.Module,
         feed_forward_macaron: torch.nn.Module,
         conv_mod: torch.nn.Module,
-        norm_class: torch.nn.Module = torch.nn.LayerNorm,
+        norm_class: torch.nn.Module = LayerNorm,
         norm_args: Dict = {},
         dropout_rate: float = 0.0,
     ) -> None:
@@ -1145,7 +1145,7 @@
             x = x[:,::self.time_reduction_factor,:]
             olens = torch.floor_divide(olens-1, self.time_reduction_factor) + 1
 
-        return x, olens
+        return x, olens, None
 
     def simu_chunk_forward(
         self,
diff --git a/funasr/modules/nets_utils.py b/funasr/modules/nets_utils.py
index 10df124..397a5c4 100644
--- a/funasr/modules/nets_utils.py
+++ b/funasr/modules/nets_utils.py
@@ -485,14 +485,39 @@
         new_k = k.replace(old_prefix, new_prefix)
         state_dict[new_k] = v
 
-
 class Swish(torch.nn.Module):
-    """Construct an Swish object."""
+    """Swish activation definition.
 
-    def forward(self, x):
-        """Return Swich activation function."""
-        return x * torch.sigmoid(x)
+    Swish(x) = (beta * x) * sigmoid(x),
+        where beta = 1 gives the standard Swish activation.
 
+    References:
+        https://arxiv.org/abs/2108.12943 / https://arxiv.org/abs/1710.05941v1.
+        E-swish variant: https://arxiv.org/abs/1801.07145.
+
+    Args:
+        beta: Beta parameter for E-Swish
+            (beta > 1 enables E-Swish; otherwise standard Swish is used).
+        use_builtin: Whether to use the built-in PyTorch implementation (SiLU) if available.
+
+    """
+
+    def __init__(self, beta: float = 1.0, use_builtin: bool = False) -> None:
+        super().__init__()
+
+        self.beta = beta
+
+        if beta > 1:
+            self.swish = lambda x: (self.beta * x) * torch.sigmoid(x)
+        else:
+            if use_builtin:
+                self.swish = torch.nn.SiLU()
+            else:
+                self.swish = lambda x: x * torch.sigmoid(x)
+
+    def forward(self, x: torch.Tensor) -> torch.Tensor:
+        """Forward computation."""
+        return self.swish(x)
 
 def get_activation(act):
     """Return activation function."""
diff --git a/funasr/modules/repeat.py b/funasr/modules/repeat.py
index 2b2dac8..ff1e182 100644
--- a/funasr/modules/repeat.py
+++ b/funasr/modules/repeat.py
@@ -7,7 +7,7 @@
 """Repeat the same layer definition."""
 
 from typing import Dict, List, Optional
-
+from funasr.modules.layer_norm import LayerNorm
 import torch
 
 
@@ -48,7 +48,7 @@
         self,
         block_list: List[torch.nn.Module],
         output_size: int,
-        norm_class: torch.nn.Module = torch.nn.LayerNorm,
+        norm_class: torch.nn.Module = LayerNorm,
     ) -> None:
         """Construct a MultiBlocks object."""
         super().__init__()
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
index 4499441..71bb035 100644
--- a/funasr/runtime/grpc/Readme.md
+++ b/funasr/runtime/grpc/Readme.md
@@ -37,39 +37,32 @@
 
 ### Start grpc paraformer server
 ```
-./cmake/build/paraformer-server     --port-id <string> [--punc-config
-                                    <string>] [--punc-model <string>]
-                                    --am-config <string> --am-cmvn <string>
-                                    --am-model <string> [--vad-config
-                                    <string>] [--vad-cmvn <string>]
-                                    [--vad-model <string>] [--] [--version]
-                                    [-h]
+
+./cmake/build/paraformer-server   --port-id <string> [--punc-quant <string>]
+                                  [--punc-dir <string>] [--vad-quant <string>]
+                                  [--vad-dir <string>] [--quantize <string>]
+                                  --model-dir <string> [--] [--version] [-h]
 Where:
    --port-id <string>
      (required)  port id
+   --model-dir <string>
+     (required)  the asr model path, which contains model.onnx, config.yaml, am.mvn
+   --quantize <string>
+     false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead
 
-   --am-config <string>
-     (required)  am config path
-   --am-cmvn <string>
-     (required)  am cmvn path
-   --am-model <string>
-     (required)  am model path
+   --vad-dir <string>
+     the vad model path, which contains model.onnx, vad.yaml, vad.mvn
+   --vad-quant <string>
+     false (default): load model.onnx from vad_dir; if set to true, load model_quant.onnx instead
 
-   --punc-config <string>
-     punc config path
-   --punc-model <string>
-     punc model path
-
-   --vad-config <string>
-     vad config path
-   --vad-cmvn <string>
-     vad cmvn path
-   --vad-model <string>
-     vad model path
-
-   Required: --port-id <string> --am-config <string> --am-cmvn <string> --am-model <string> 
-   If use vad, please add: [--vad-config <string>] [--vad-cmvn <string>] [--vad-model <string>]
-   If use punc, please add: [--punc-config <string>] [--punc-model <string>] 
+   --punc-dir <string>
+     the punc model path, which contains model.onnx, punc.yaml
+   --punc-quant <string>
+     false (default): load model.onnx from punc_dir; if set to true, load model_quant.onnx instead
+  
+   Required: --port-id <string> --model-dir <string>
+   If using VAD, also add: --vad-dir <string>
+   If using punctuation restoration, also add: --punc-dir <string>
 ```
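+
+For example (the port number and model directory paths below are illustrative):
+```
+./cmake/build/paraformer-server \
+    --port-id   10100 \
+    --model-dir ./asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
+    --vad-dir   ./asrmodel/speech_fsmn_vad_zh-cn-16k-common-pytorch \
+    --punc-dir  ./asrmodel/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
+```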
 
 ## For the client
diff --git a/funasr/runtime/grpc/paraformer-server.cc b/funasr/runtime/grpc/paraformer-server.cc
index 31333c9..734dadc 100644
--- a/funasr/runtime/grpc/paraformer-server.cc
+++ b/funasr/runtime/grpc/paraformer-server.cc
@@ -31,7 +31,7 @@
 using paraformer::ASR;
 
 ASRServicer::ASRServicer(std::map<std::string, std::string>& model_path) {
-    AsrHanlde=FunASRInit(model_path, 1);
+    AsrHanlde=FunOfflineInit(model_path, 1);
     std::cout << "ASRServicer init" << std::endl;
     init_flag = 0;
 }
@@ -137,7 +137,7 @@
                     stream->Write(res);
                 }
                 else {
-                    FUNASR_RESULT Result= FunASRRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, 16000, RASR_NONE, NULL);
+                    FUNASR_RESULT Result= FunOfflineInferBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL, 16000);
                     std::string asr_result = ((FUNASR_RECOG_RESULT*)Result)->msg;
 
                     auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
@@ -204,38 +204,30 @@
     FLAGS_logtostderr = true;
 
     TCLAP::CmdLine cmd("paraformer-server", ' ', "1.0");
-    TCLAP::ValueArg<std::string> vad_model("", VAD_MODEL_PATH, "vad model path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_cmvn("", VAD_CMVN_PATH, "vad cmvn path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_config("", VAD_CONFIG_PATH, "vad config path", false, "", "string");
+    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the asr model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string");
+    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir", false, "false", "string");
+    TCLAP::ValueArg<std::string>    vad_dir("", VAD_DIR, "the vad model path, which contains model.onnx, vad.yaml, vad.mvn", false, "", "string");
+    TCLAP::ValueArg<std::string>    vad_quant("", VAD_QUANT, "false (Default), load the model of model.onnx in vad_dir. If set true, load the model of model_quant.onnx in vad_dir", false, "false", "string");
+    TCLAP::ValueArg<std::string>    punc_dir("", PUNC_DIR, "the punc model path, which contains model.onnx, punc.yaml", false, "", "string");
+    TCLAP::ValueArg<std::string>    punc_quant("", PUNC_QUANT, "false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir", false, "false", "string");
+    TCLAP::ValueArg<std::string>    port_id("", PORT_ID, "port id", true, "", "string");
 
-    TCLAP::ValueArg<std::string> am_model("", AM_MODEL_PATH, "am model path", true, "", "string");
-    TCLAP::ValueArg<std::string> am_cmvn("", AM_CMVN_PATH, "am cmvn path", true, "", "string");
-    TCLAP::ValueArg<std::string> am_config("", AM_CONFIG_PATH, "am config path", true, "", "string");
-
-    TCLAP::ValueArg<std::string> punc_model("", PUNC_MODEL_PATH, "punc model path", false, "", "string");
-    TCLAP::ValueArg<std::string> punc_config("", PUNC_CONFIG_PATH, "punc config path", false, "", "string");
-    TCLAP::ValueArg<std::string> port_id("", PORT_ID, "port id", true, "", "string");
-
-    cmd.add(vad_model);
-    cmd.add(vad_cmvn);
-    cmd.add(vad_config);
-    cmd.add(am_model);
-    cmd.add(am_cmvn);
-    cmd.add(am_config);
-    cmd.add(punc_model);
-    cmd.add(punc_config);
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(vad_dir);
+    cmd.add(vad_quant);
+    cmd.add(punc_dir);
+    cmd.add(punc_quant);
     cmd.add(port_id);
     cmd.parse(argc, argv);
 
     std::map<std::string, std::string> model_path;
-    GetValue(vad_model, VAD_MODEL_PATH, model_path);
-    GetValue(vad_cmvn, VAD_CMVN_PATH, model_path);
-    GetValue(vad_config, VAD_CONFIG_PATH, model_path);
-    GetValue(am_model, AM_MODEL_PATH, model_path);
-    GetValue(am_cmvn, AM_CMVN_PATH, model_path);
-    GetValue(am_config, AM_CONFIG_PATH, model_path);
-    GetValue(punc_model, PUNC_MODEL_PATH, model_path);
-    GetValue(punc_config, PUNC_CONFIG_PATH, model_path);
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(vad_dir, VAD_DIR, model_path);
+    GetValue(vad_quant, VAD_QUANT, model_path);
+    GetValue(punc_dir, PUNC_DIR, model_path);
+    GetValue(punc_quant, PUNC_QUANT, model_path);
     GetValue(port_id, PORT_ID, model_path);
 
     RunServer(model_path);
diff --git a/funasr/runtime/grpc/paraformer-server.h b/funasr/runtime/grpc/paraformer-server.h
index 108e3b6..760ea2a 100644
--- a/funasr/runtime/grpc/paraformer-server.h
+++ b/funasr/runtime/grpc/paraformer-server.h
@@ -15,7 +15,7 @@
 #include <chrono>
 
 #include "paraformer.grpc.pb.h"
-#include "libfunasrapi.h"
+#include "funasrruntime.h"
 
 
 using grpc::Server;
diff --git a/funasr/runtime/onnxruntime/include/audio.h b/funasr/runtime/onnxruntime/include/audio.h
index ab9f420..1eabd3e 100644
--- a/funasr/runtime/onnxruntime/include/audio.h
+++ b/funasr/runtime/onnxruntime/include/audio.h
@@ -1,16 +1,17 @@
-
 #ifndef AUDIO_H
 #define AUDIO_H
 
 #include <queue>
 #include <stdint.h>
-#include "model.h"
+#include "vad-model.h"
+#include "offline-stream.h"
 
 #ifndef WAV_HEADER_SIZE
 #define WAV_HEADER_SIZE 44
 #endif
 
 using namespace std;
+namespace funasr {
 
 class AudioFrame {
   private:
@@ -54,9 +55,11 @@
     int FetchChunck(float *&dout, int len);
     int Fetch(float *&dout, int &len, int &flag);
     void Padding();
-    void Split(Model* recog_obj);
+    void Split(OfflineStream* offline_stream);
+    void Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments);
     float GetTimeLen();
     int GetQueueSize() { return (int)frame_queue.size(); }
 };
 
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/include/com-define.h b/funasr/runtime/onnxruntime/include/com-define.h
index 9b7b212..7a6345b 100644
--- a/funasr/runtime/onnxruntime/include/com-define.h
+++ b/funasr/runtime/onnxruntime/include/com-define.h
@@ -1,7 +1,6 @@
+#pragma once 
 
-#ifndef COMDEFINE_H
-#define COMDEFINE_H
-
+namespace funasr {
 #define S_BEGIN  0
 #define S_MIDDLE 1
 #define S_END    2
@@ -12,19 +11,36 @@
 #define MODEL_SAMPLE_RATE 16000
 #endif
 
-// model path
-#define VAD_MODEL_PATH "vad-model"
-#define VAD_CMVN_PATH "vad-cmvn"
-#define VAD_CONFIG_PATH "vad-config"
-#define AM_MODEL_PATH "am-model"
-#define AM_CMVN_PATH "am-cmvn"
-#define AM_CONFIG_PATH "am-config"
-#define PUNC_MODEL_PATH "punc-model"
-#define PUNC_CONFIG_PATH "punc-config"
+// parser option
+#define MODEL_DIR "model-dir"
+#define VAD_DIR "vad-dir"
+#define PUNC_DIR "punc-dir"
+#define QUANTIZE "quantize"
+#define VAD_QUANT "vad-quant"
+#define PUNC_QUANT "punc-quant"
+
 #define WAV_PATH "wav-path"
 #define WAV_SCP "wav-scp"
+#define TXT_PATH "txt-path"
 #define THREAD_NUM "thread-num"
 #define PORT_ID "port-id"
+
+// #define VAD_MODEL_PATH "vad-model"
+// #define VAD_CMVN_PATH "vad-cmvn"
+// #define VAD_CONFIG_PATH "vad-config"
+// #define AM_MODEL_PATH "am-model"
+// #define AM_CMVN_PATH "am-cmvn"
+// #define AM_CONFIG_PATH "am-config"
+// #define PUNC_MODEL_PATH "punc-model"
+// #define PUNC_CONFIG_PATH "punc-config"
+
+#define MODEL_NAME "model.onnx"
+#define QUANT_MODEL_NAME "model_quant.onnx"
+#define VAD_CMVN_NAME "vad.mvn"
+#define VAD_CONFIG_NAME "vad.yaml"
+#define AM_CMVN_NAME "am.mvn"
+#define AM_CONFIG_NAME "config.yaml"
+#define PUNC_CONFIG_NAME "punc.yaml"
 
 // vad
 #ifndef VAD_SILENCE_DURATION
@@ -60,4 +76,4 @@
 #define DUN_INDEX 5
 #define CACHE_POP_TRIGGER_LIMIT   200
 
-#endif
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/include/funasrruntime.h b/funasr/runtime/onnxruntime/include/funasrruntime.h
new file mode 100644
index 0000000..5cfdb47
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/funasrruntime.h
@@ -0,0 +1,98 @@
+#pragma once
+#include <map>
+#include <vector>
+
+#ifdef WIN32
+#ifdef _FUNASR_API_EXPORT
+#define  _FUNASRAPI __declspec(dllexport)
+#else
+#define  _FUNASRAPI __declspec(dllimport)
+#endif
+#else
+#define _FUNASRAPI
+#endif
+
+#ifndef _WIN32
+#define FUNASR_CALLBCK_PREFIX __attribute__((__stdcall__))
+#else
+#define FUNASR_CALLBCK_PREFIX __stdcall
+#endif
+
+#ifdef __cplusplus 
+
+extern "C" {
+#endif
+
+typedef void* FUNASR_HANDLE;
+typedef void* FUNASR_RESULT;
+typedef unsigned char FUNASR_BOOL;
+
+#define FUNASR_TRUE 1
+#define FUNASR_FALSE 0
+#define QM_DEFAULT_THREAD_NUM  4
+
+typedef enum
+{
+ RASR_NONE=-1,
+ RASRM_CTC_GREEDY_SEARCH=0,
+ RASRM_CTC_RPEFIX_BEAM_SEARCH = 1,
+ RASRM_ATTENSION_RESCORING = 2,
+}FUNASR_MODE;
+
+typedef enum {
+	FUNASR_MODEL_PADDLE = 0,
+	FUNASR_MODEL_PADDLE_2 = 1,
+	FUNASR_MODEL_K2 = 2,
+	FUNASR_MODEL_PARAFORMER = 3,
+}FUNASR_MODEL_TYPE;
+
+typedef enum
+{
+ FSMN_VAD_OFFLINE=0,
+ FSMN_VAD_ONLINE = 1,
+}FSMN_VAD_MODE;
+
+typedef void (* QM_CALLBACK)(int cur_step, int n_total); // n_total: total steps; cur_step: Current Step.
+	
+// ASR
+_FUNASRAPI FUNASR_HANDLE  	FunASRInit(std::map<std::string, std::string>& model_path, int thread_num);
+// buffer
+_FUNASRAPI FUNASR_RESULT	FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+// file, support wav & pcm
+_FUNASRAPI FUNASR_RESULT	FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+
+_FUNASRAPI const char*	FunASRGetResult(FUNASR_RESULT result,int n_index);
+_FUNASRAPI const int	FunASRGetRetNumber(FUNASR_RESULT result);
+_FUNASRAPI void			FunASRFreeResult(FUNASR_RESULT result);
+_FUNASRAPI void			FunASRUninit(FUNASR_HANDLE handle);
+_FUNASRAPI const float	FunASRGetRetSnippetTime(FUNASR_RESULT result);
+
+// VAD
+_FUNASRAPI FUNASR_HANDLE  	FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode=FSMN_VAD_OFFLINE);
+// buffer
+_FUNASRAPI FUNASR_RESULT	FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+// file, support wav & pcm
+_FUNASRAPI FUNASR_RESULT	FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+
+_FUNASRAPI std::vector<std::vector<int>>*	FsmnVadGetResult(FUNASR_RESULT result,int n_index);
+_FUNASRAPI void			 	FsmnVadFreeResult(FUNASR_RESULT result);
+_FUNASRAPI void				FsmnVadUninit(FUNASR_HANDLE handle);
+_FUNASRAPI const float		FsmnVadGetRetSnippetTime(FUNASR_RESULT result);
+
+// PUNC
+_FUNASRAPI FUNASR_HANDLE  		CTTransformerInit(std::map<std::string, std::string>& model_path, int thread_num);
+_FUNASRAPI const std::string	CTTransformerInfer(FUNASR_HANDLE handle, const char* sz_sentence, FUNASR_MODE mode, QM_CALLBACK fn_callback);
+_FUNASRAPI void					CTTransformerUninit(FUNASR_HANDLE handle);
+
+//OfflineStream
+_FUNASRAPI FUNASR_HANDLE  	FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num);
+// buffer
+_FUNASRAPI FUNASR_RESULT	FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+// file, support wav & pcm
+_FUNASRAPI FUNASR_RESULT	FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+_FUNASRAPI void				FunOfflineUninit(FUNASR_HANDLE handle);
+
+#ifdef __cplusplus 
+
+}
+#endif
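For reference, a minimal sketch of driving this offline API (illustrative only: the model directory path is hypothetical, the map keys follow com-define.h, and error handling is omitted):

```cpp
#include <iostream>
#include <map>
#include <string>

#include "funasrruntime.h"

int main() {
    // The directory must contain model.onnx, config.yaml and am.mvn (see com-define.h).
    std::map<std::string, std::string> model_path;
    model_path["model-dir"] = "/path/to/asr_model_dir";  // hypothetical path
    model_path["quantize"]  = "false";                   // "true" would load model_quant.onnx

    FUNASR_HANDLE handle = FunOfflineInit(model_path, 1);  // 1 decoding thread
    FUNASR_RESULT result = FunOfflineInfer(handle, "asr_example.wav", RASR_NONE, NULL, 16000);
    if (result) {
        std::cout << FunASRGetResult(result, 0) << std::endl;
        FunASRFreeResult(result);
    }
    FunOfflineUninit(handle);
    return 0;
}
```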
diff --git a/funasr/runtime/onnxruntime/include/libfunasrapi.h b/funasr/runtime/onnxruntime/include/libfunasrapi.h
deleted file mode 100644
index f65efcc..0000000
--- a/funasr/runtime/onnxruntime/include/libfunasrapi.h
+++ /dev/null
@@ -1,75 +0,0 @@
-#pragma once
-#include <map>
-
-#ifdef WIN32
-#ifdef _FUNASR_API_EXPORT
-#define  _FUNASRAPI __declspec(dllexport)
-#else
-#define  _FUNASRAPI __declspec(dllimport)
-#endif
-#else
-#define _FUNASRAPI
-#endif
-
-#ifndef _WIN32
-#define FUNASR_CALLBCK_PREFIX __attribute__((__stdcall__))
-#else
-#define FUNASR_CALLBCK_PREFIX __stdcall
-#endif
-
-#ifdef __cplusplus 
-
-extern "C" {
-#endif
-
-typedef void* FUNASR_HANDLE;
-typedef void* FUNASR_RESULT;
-typedef unsigned char FUNASR_BOOL;
-
-#define FUNASR_TRUE 1
-#define FUNASR_FALSE 0
-#define QM_DEFAULT_THREAD_NUM  4
-
-typedef enum
-{
- RASR_NONE=-1,
- RASRM_CTC_GREEDY_SEARCH=0,
- RASRM_CTC_RPEFIX_BEAM_SEARCH = 1,
- RASRM_ATTENSION_RESCORING = 2,
-}FUNASR_MODE;
-
-typedef enum {
-	FUNASR_MODEL_PADDLE = 0,
-	FUNASR_MODEL_PADDLE_2 = 1,
-	FUNASR_MODEL_K2 = 2,
-	FUNASR_MODEL_PARAFORMER = 3,
-}FUNASR_MODEL_TYPE;
-
-typedef void (* QM_CALLBACK)(int cur_step, int n_total); // n_total: total steps; cur_step: Current Step.
-	
-// // ASR
-_FUNASRAPI FUNASR_HANDLE  FunASRInit(std::map<std::string, std::string>& model_path, int thread_num);
-
-_FUNASRAPI FUNASR_RESULT	FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-
-_FUNASRAPI const char*	FunASRGetResult(FUNASR_RESULT result,int n_index);
-_FUNASRAPI const int	FunASRGetRetNumber(FUNASR_RESULT result);
-_FUNASRAPI void			FunASRFreeResult(FUNASR_RESULT result);
-_FUNASRAPI void			FunASRUninit(FUNASR_HANDLE handle);
-_FUNASRAPI const float	FunASRGetRetSnippetTime(FUNASR_RESULT result);
-
-// VAD
-_FUNASRAPI FUNASR_HANDLE  FunVadInit(std::map<std::string, std::string>& model_path, int thread_num);
-
-_FUNASRAPI FUNASR_RESULT	FunASRVadBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRVadPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRVadPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-_FUNASRAPI FUNASR_RESULT	FunASRVadFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback);
-
-#ifdef __cplusplus 
-
-}
-#endif
diff --git a/funasr/runtime/onnxruntime/include/model.h b/funasr/runtime/onnxruntime/include/model.h
index 4b4b582..44bd022 100644
--- a/funasr/runtime/onnxruntime/include/model.h
+++ b/funasr/runtime/onnxruntime/include/model.h
@@ -4,19 +4,17 @@
 
 #include <string>
 #include <map>
-
+namespace funasr {
 class Model {
   public:
     virtual ~Model(){};
     virtual void Reset() = 0;
+    virtual void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num)=0;
     virtual std::string ForwardChunk(float *din, int len, int flag) = 0;
     virtual std::string Forward(float *din, int len, int flag) = 0;
     virtual std::string Rescoring() = 0;
-    virtual std::vector<std::vector<int>> VadSeg(std::vector<float>& pcm_data)=0;
-    virtual std::string AddPunc(const char* sz_input)=0;
-    virtual bool UseVad() =0;
-    virtual bool UsePunc() =0; 
 };
 
 Model *CreateModel(std::map<std::string, std::string>& model_path,int thread_num=1);
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/include/offline-stream.h b/funasr/runtime/onnxruntime/include/offline-stream.h
new file mode 100644
index 0000000..a9ce88e
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/offline-stream.h
@@ -0,0 +1,30 @@
+#ifndef OFFLINE_STREAM_H
+#define OFFLINE_STREAM_H
+
+#include <memory>
+#include <string>
+#include <map>
+#include "model.h"
+#include "punc-model.h"
+#include "vad-model.h"
+
+namespace funasr {
+class OfflineStream {
+  public:
+    OfflineStream(std::map<std::string, std::string>& model_path, int thread_num);
+    ~OfflineStream(){};
+
+    std::unique_ptr<VadModel> vad_handle;
+    std::unique_ptr<Model> asr_handle;
+    std::unique_ptr<PuncModel> punc_handle;
+    bool UseVad(){return use_vad;};
+    bool UsePunc(){return use_punc;}; 
+    
+  private:
+    bool use_vad=false;
+    bool use_punc=false;
+};
+
+OfflineStream *CreateOfflineStream(std::map<std::string, std::string>& model_path, int thread_num=1);
+} // namespace funasr
+#endif
diff --git a/funasr/runtime/onnxruntime/include/punc-model.h b/funasr/runtime/onnxruntime/include/punc-model.h
new file mode 100644
index 0000000..da7ff60
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/punc-model.h
@@ -0,0 +1,20 @@
+
+#ifndef PUNC_MODEL_H
+#define PUNC_MODEL_H
+
+#include <string>
+#include <map>
+#include <vector>
+
+namespace funasr {
+class PuncModel {
+  public:
+    virtual ~PuncModel(){};
+	  virtual void InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num)=0;
+	  virtual std::vector<int>  Infer(std::vector<int32_t> input_data)=0;
+	  virtual std::string AddPunc(const char* sz_input)=0;
+};
+
+PuncModel *CreatePuncModel(std::map<std::string, std::string>& model_path, int thread_num);
+} // namespace funasr
+#endif
diff --git a/funasr/runtime/onnxruntime/include/vad-model.h b/funasr/runtime/onnxruntime/include/vad-model.h
new file mode 100644
index 0000000..e37bd97
--- /dev/null
+++ b/funasr/runtime/onnxruntime/include/vad-model.h
@@ -0,0 +1,29 @@
+
+#ifndef VAD_MODEL_H
+#define VAD_MODEL_H
+
+#include <string>
+#include <map>
+#include <vector>
+
+namespace funasr {
+class VadModel {
+  public:
+    virtual ~VadModel(){};
+    virtual void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num)=0;
+    virtual std::vector<std::vector<int>> Infer(const std::vector<float> &waves)=0;
+    virtual void ReadModel(const char* vad_model)=0;
+    virtual void LoadConfigFromYaml(const char* filename)=0;
+    virtual void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
+                    const std::vector<float> &waves)=0;
+    virtual void LfrCmvn(std::vector<std::vector<float>> &vad_feats)=0;
+    virtual void Forward(
+            const std::vector<std::vector<float>> &chunk_feats,
+            std::vector<std::vector<float>> *out_prob)=0;
+    virtual void LoadCmvn(const char *filename)=0;
+    virtual void InitCache()=0;
+};
+
+VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode);
+} // namespace funasr
+#endif
diff --git a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md
index 436c7df..3e34a67 100644
--- a/funasr/runtime/onnxruntime/readme.md
+++ b/funasr/runtime/onnxruntime/readme.md
@@ -41,41 +41,113 @@
 ```
 ## Run the demo
 
+### funasr-onnx-offline
 ```shell
-./funasr-onnx-offline     [--wav-scp <string>] [--wav-path <string>]
-                          [--punc-config <string>] [--punc-model <string>]
-                          --am-config <string> --am-cmvn <string>
-                          --am-model <string> [--vad-config <string>]
-                          [--vad-cmvn <string>] [--vad-model <string>] [--]
-                          [--version] [-h]
+./funasr-onnx-offline     --model-dir <string> [--quantize <string>]
+                          [--vad-dir <string>] [--vad-quant <string>]
+                          [--punc-dir <string>] [--punc-quant <string>]
+                          --wav-path <string> [--] [--version] [-h]
 Where:
-   --wav-scp <string>
-     wave scp path
+   --model-dir <string>
+     (required)  the asr model path, which contains model.onnx, config.yaml, am.mvn
+   --quantize <string>
+     false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead
+
+   --vad-dir <string>
+     the vad model path, which contains model.onnx, vad.yaml, vad.mvn
+   --vad-quant <string>
+     false (default): load model.onnx from vad_dir; if set to true, load model_quant.onnx instead
+
+   --punc-dir <string>
+     the punc model path, which contains model.onnx, punc.yaml
+   --punc-quant <string>
+     false (default): load model.onnx from punc_dir; if set to true, load model_quant.onnx instead
+
    --wav-path <string>
-     wave file path
-
-   --punc-config <string>
-     punc config path
-   --punc-model <string>
-     punc model path
-
-   --am-config <string>
-     (required)  am config path
-   --am-cmvn <string>
-     (required)  am cmvn path
-   --am-model <string>
-     (required)  am model path
-
-   --vad-config <string>
-     vad config path
-   --vad-cmvn <string>
-     vad cmvn path
-   --vad-model <string>
-     vad model path
+     (required)  the input could be: 
+      wav_path, e.g.: asr_example.wav;
+      pcm_path, e.g.: asr_example.pcm; 
+      wav.scp, kaldi style wav list (wav_id \t wav_path)
   
-   Required: --am-config <string> --am-cmvn <string> --am-model <string> 
-   If use vad, please add: [--vad-config <string>] [--vad-cmvn <string>] [--vad-model <string>]
-   If use punc, please add: [--punc-config <string>] [--punc-model <string>] 
+   Required: --model-dir <string> --wav-path <string>
+   If using VAD, also add: --vad-dir <string>
+   If using punctuation restoration, also add: --punc-dir <string>
+
+For example:
+./funasr-onnx-offline \
+    --model-dir    ./asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
+    --quantize  true \
+    --vad-dir   ./asrmodel/speech_fsmn_vad_zh-cn-16k-common-pytorch \
+    --punc-dir  ./asrmodel/punc_ct-transformer_zh-cn-common-vocab272727-pytorch \
+    --wav-path    ./vad_example.wav
+```
+
+### funasr-onnx-offline-vad
+```shell
+./funasr-onnx-offline-vad     --model-dir <string> [--quantize <string>]
+                              --wav-path <string> [--] [--version] [-h]
+Where:
+   --model-dir <string>
+     (required)  the vad model path, which contains model.onnx, vad.yaml, vad.mvn
+   --quantize <string>
+     false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead
+   --wav-path <string>
+     (required)  the input could be: 
+      wav_path, e.g.: asr_example.wav;
+      pcm_path, e.g.: asr_example.pcm; 
+      wav.scp, kaldi style wav list (wav_id \t wav_path)
+
+   Required: --model-dir <string> --wav-path <string>
+
+For example:
+./funasr-onnx-offline-vad \
+    --model-dir   ./asrmodel/speech_fsmn_vad_zh-cn-16k-common-pytorch \
+    --wav-path    ./vad_example.wav
+```
+
+### funasr-onnx-offline-punc
+```shell
+./funasr-onnx-offline-punc    --model-dir <string> [--quantize <string>]
+                              --txt-path <string> [--] [--version] [-h]
+Where:
+   --model-dir <string>
+     (required)  the punc model path, which contains model.onnx, punc.yaml
+   --quantize <string>
+     false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead
+   --txt-path <string>
+     (required)  txt file path, one sentence per line
+
+   Required: --model-dir <string> --txt-path <string>
+
+For example:
+./funasr-onnx-offline-punc \
+    --model-dir  ./asrmodel/punc_ct-transformer_zh-cn-common-vocab272727-pytorch \
+    --txt-path   ./punc_example.txt
+```
+### funasr-onnx-offline-rtf
+```shell
+./funasr-onnx-offline-rtf     --model-dir <string> [--quantize <string>]
+                              --wav-path <string> --thread-num <int32_t>
+                              [--] [--version] [-h]
+Where:
+   --thread-num <int32_t>
+     (required)  multi-thread num for rtf
+   --model-dir <string>
+     (required)  the model path, which contains model.onnx, config.yaml, am.mvn
+   --quantize <string>
+     false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead
+   --wav-path <string>
+     (required)  the input could be: 
+      wav_path, e.g.: asr_example.wav;
+      pcm_path, e.g.: asr_example.pcm; 
+      wav.scp, kaldi style wav list (wav_id \t wav_path)
+
+For example:
+./funasr-onnx-offline-rtf \
+    --model-dir    ./asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
+    --quantize  true \
+    --wav-path     ./aishell1_test.scp  \
+    --thread-num 32
 ```
 
 ## Acknowledge
diff --git a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt
index d33c540..341a16a 100644
--- a/funasr/runtime/onnxruntime/src/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt
@@ -26,6 +26,11 @@
 target_link_libraries(funasr PUBLIC onnxruntime ${EXTRA_LIBS})
 
 add_executable(funasr-onnx-offline "funasr-onnx-offline.cpp")
+add_executable(funasr-onnx-offline-vad "funasr-onnx-offline-vad.cpp")
+add_executable(funasr-onnx-offline-punc "funasr-onnx-offline-punc.cpp")
 add_executable(funasr-onnx-offline-rtf "funasr-onnx-offline-rtf.cpp")
 target_link_libraries(funasr-onnx-offline PUBLIC funasr)
-target_link_libraries(funasr-onnx-offline-rtf PUBLIC funasr)
\ No newline at end of file
+target_link_libraries(funasr-onnx-offline-vad PUBLIC funasr)
+target_link_libraries(funasr-onnx-offline-punc PUBLIC funasr)
+target_link_libraries(funasr-onnx-offline-rtf PUBLIC funasr)
+
diff --git a/funasr/runtime/onnxruntime/src/alignedmem.cpp b/funasr/runtime/onnxruntime/src/alignedmem.cpp
index d3e4b82..9c7d323 100644
--- a/funasr/runtime/onnxruntime/src/alignedmem.cpp
+++ b/funasr/runtime/onnxruntime/src/alignedmem.cpp
@@ -1,4 +1,6 @@
 #include "precomp.h"
+
+namespace funasr {
 void *AlignedMalloc(size_t alignment, size_t required_bytes)
 {
     void *p1;  // original block
@@ -16,3 +18,4 @@
 {
     free(((void **)p)[-1]);
 }
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/alignedmem.h b/funasr/runtime/onnxruntime/src/alignedmem.h
index e2b640a..e4b9a78 100644
--- a/funasr/runtime/onnxruntime/src/alignedmem.h
+++ b/funasr/runtime/onnxruntime/src/alignedmem.h
@@ -2,7 +2,9 @@
 #ifndef ALIGNEDMEM_H
 #define ALIGNEDMEM_H
 
+namespace funasr {
 extern void *AlignedMalloc(size_t alignment, size_t required_bytes);
 extern void AlignedFree(void *p);
 
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 8f46a4f..6d63d67 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -11,6 +11,7 @@
 
 using namespace std;
 
+namespace funasr {
 // see http://soundfile.sapp.org/doc/WaveFormat/
 // Note: We assume little endian here
 struct WaveHeader {
@@ -235,6 +236,24 @@
     is.read(reinterpret_cast<char *>(&header), sizeof(header));
     if(!is){
         LOG(ERROR) << "Failed to read " << filename;
+        return false;
+    }
+
+    if (!header.Validate()) {
+        return false;
+    }
+
+    header.SeekToDataChunk(is);
+    if (!is) {
         return false;
     }
     
@@ -496,7 +515,7 @@
     delete frame;
 }
 
-void Audio::Split(Model* recog_obj)
+void Audio::Split(OfflineStream* offline_stream)
 {
     AudioFrame *frame;
 
@@ -507,7 +526,7 @@
     frame = NULL;
 
     std::vector<float> pcm_data(speech_data, speech_data+sp_len);
-    vector<std::vector<int>> vad_segments = recog_obj->VadSeg(pcm_data);
+    vector<std::vector<int>> vad_segments = (offline_stream->vad_handle)->Infer(pcm_data);
     int seg_sample = MODEL_SAMPLE_RATE/1000;
     for(vector<int> segment:vad_segments)
     {
@@ -519,4 +538,21 @@
         frame_queue.push(frame);
         frame = NULL;
     }
-}
\ No newline at end of file
+}
+
+
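+// Run VAD over the buffered audio and return the detected segment boundaries via vad_segments.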
+void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments)
+{
+    AudioFrame *frame;
+
+    frame = frame_queue.front();
+    frame_queue.pop();
+    int sp_len = frame->GetLen();
+    delete frame;
+    frame = NULL;
+
+    std::vector<float> pcm_data(speech_data, speech_data+sp_len);
+    vad_segments = vad_obj->Infer(pcm_data);
+}
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/commonfunc.h b/funasr/runtime/onnxruntime/src/commonfunc.h
index fbbda74..d0882c6 100644
--- a/funasr/runtime/onnxruntime/src/commonfunc.h
+++ b/funasr/runtime/onnxruntime/src/commonfunc.h
@@ -1,10 +1,18 @@
 #pragma once 
 #include <algorithm>
+
+namespace funasr {
 typedef struct
 {
     std::string msg;
     float  snippet_time;
 }FUNASR_RECOG_RESULT;
+
+typedef struct
+{
+    std::vector<std::vector<int>>* segments;
+    float  snippet_time;
+}FUNASR_VAD_RESULT;
 
 
 #ifdef _WIN32
@@ -52,3 +60,4 @@
 inline static size_t Argmax(ForwardIterator first, ForwardIterator last) {
     return std::distance(first, std::max_element(first, last));
 }
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/ct-transformer.cpp b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
index ecde636..38a5a70 100644
--- a/funasr/runtime/onnxruntime/src/ct-transformer.cpp
+++ b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
@@ -5,6 +5,7 @@
 
 #include "precomp.h"
 
+namespace funasr {
 CTTransformer::CTTransformer()
 :env_(ORT_LOGGING_LEVEL_ERROR, ""),session_options{}
 {
@@ -54,7 +55,7 @@
     int nTotalBatch = ceil((float)InputData.size() / TOKEN_LEN);
     int nCurBatch = -1;
     int nSentEnd = -1, nLastCommaIndex = -1;
-    vector<int64_t> RemainIDs; // 
+    vector<int32_t> RemainIDs; // 
     vector<string> RemainStr; //
     vector<int> NewPunctuation; //
     vector<string> NewString; //
@@ -64,7 +65,7 @@
     for (size_t i = 0; i < InputData.size(); i += TOKEN_LEN)
     {
         nDiff = (i + TOKEN_LEN) < InputData.size() ? (0) : (i + TOKEN_LEN - InputData.size());
-        vector<int64_t> InputIDs(InputData.begin() + i, InputData.begin() + i + TOKEN_LEN - nDiff);
+        vector<int32_t> InputIDs(InputData.begin() + i, InputData.begin() + i + TOKEN_LEN - nDiff);
         vector<string> InputStr(strOut.begin() + i, strOut.begin() + i + TOKEN_LEN - nDiff);
         InputIDs.insert(InputIDs.begin(), RemainIDs.begin(), RemainIDs.end()); // RemainIDs+InputIDs;
         InputStr.insert(InputStr.begin(), RemainStr.begin(), RemainStr.end()); // RemainStr+InputStr;
@@ -141,12 +142,13 @@
     return strResult;
 }
 
-vector<int> CTTransformer::Infer(vector<int64_t> input_data)
+vector<int> CTTransformer::Infer(vector<int32_t> input_data)
 {
     Ort::MemoryInfo m_memoryInfo = Ort::MemoryInfo::CreateCpu(OrtArenaAllocator, OrtMemTypeDefault);
     vector<int> punction;
     std::array<int64_t, 2> input_shape_{ 1, (int64_t)input_data.size()};
-    Ort::Value onnx_input = Ort::Value::CreateTensor<int64_t>(m_memoryInfo,
+    Ort::Value onnx_input = Ort::Value::CreateTensor<int32_t>(
+        m_memoryInfo,
         input_data.data(),
         input_data.size(),
         input_shape_.data(),
@@ -185,3 +187,4 @@
     return punction;
 }
 
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/ct-transformer.h b/funasr/runtime/onnxruntime/src/ct-transformer.h
index d965bb3..49ed1b7 100644
--- a/funasr/runtime/onnxruntime/src/ct-transformer.h
+++ b/funasr/runtime/onnxruntime/src/ct-transformer.h
@@ -5,7 +5,8 @@
 
 #pragma once 
 
-class CTTransformer {
+namespace funasr {
+class CTTransformer : public PuncModel {
 /**
  * Author: Speech Lab of DAMO Academy, Alibaba Group
  * CT-Transformer: Controllable time-delay transformer for real-time punctuation prediction and disfluency detection
@@ -27,6 +28,7 @@
 	CTTransformer();
 	void InitPunc(const std::string &punc_model, const std::string &punc_config, int thread_num);
 	~CTTransformer();
-	vector<int>  Infer(vector<int64_t> input_data);
+	vector<int>  Infer(vector<int32_t> input_data);
 	string AddPunc(const char* sz_input);
 };
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/e2e-vad.h b/funasr/runtime/onnxruntime/src/e2e-vad.h
index 02bae62..5ece1f8 100644
--- a/funasr/runtime/onnxruntime/src/e2e-vad.h
+++ b/funasr/runtime/onnxruntime/src/e2e-vad.h
@@ -1,8 +1,10 @@
 /**
  * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  * MIT License  (https://opensource.org/licenses/MIT)
- * Collaborators: zhuzizyf(China Telecom Shanghai)
+ * Contributed by zhuzizyf (China Telecom).
 */
+
+#pragma once 
 
 #include <utility>
 #include <vector>
@@ -14,7 +16,7 @@
 #include <numeric>
 #include <cassert>
 
-
+namespace funasr {
 enum class VadStateMachine {
     kVadInStateStartPointNotDetected = 1,
     kVadInStateInSpeechSegment = 2,
@@ -441,7 +443,7 @@
         } else {
           data_buf_all_size += waveform.size();
         }
-        for (int offset = 0; offset < waveform.size() - frame_sample_length + 1; offset += frame_shift_length) {
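+        // iterate only over offsets that leave room for a full frame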
+        for (int offset = 0; offset + frame_sample_length - 1 < waveform.size(); offset += frame_shift_length) {
             float sum = 0.0;
             for (int i = 0; i < frame_sample_length; i++) {
                 sum += waveform[offset + i] * waveform[offset + i];
@@ -492,7 +494,7 @@
         if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
             std::cout << "warning\n";
         }
-        int out_pos = (int) cur_seg.buffer.size();
+
         int data_to_pop;
         if (end_point_is_sent_end) {
             data_to_pop = expected_sample_number;
@@ -505,14 +507,7 @@
             expected_sample_number = data_buf_size;
         }
         cur_seg.doa = 0;
-        for (int sample_cpy_out = 0; sample_cpy_out < data_to_pop; sample_cpy_out++) {
-            cur_seg.buffer.push_back(data_buf.back());
-            out_pos++;
-        }
-        for (int sample_cpy_out = data_to_pop; sample_cpy_out < expected_sample_number; sample_cpy_out++) {
-            cur_seg.buffer.push_back(data_buf.back());
-            out_pos++;
-        }
+        
         if (cur_seg.end_ms != start_frm * vad_opts.frame_in_ms) {
             std::cout << "Something wrong with the VAD algorithm\n";
         }
@@ -787,5 +782,4 @@
 
 };
 
-
-
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
index fbb682b..f061534 100644
--- a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
@@ -6,8 +6,9 @@
 #include <fstream>
 #include "precomp.h"
 
-void FsmnVad::InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config) {
-    session_options_.SetIntraOpNumThreads(1);
+namespace funasr {
+void FsmnVad::InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num) {
+    session_options_.SetIntraOpNumThreads(thread_num);
     session_options_.SetGraphOptimizationLevel(ORT_ENABLE_ALL);
     session_options_.DisableCpuMemArena();
 
@@ -224,7 +225,7 @@
     }
 }
 
-std::vector<std::vector<float>> &FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
+void FsmnVad::LfrCmvn(std::vector<std::vector<float>> &vad_feats) {
 
     std::vector<std::vector<float>> out_feats;
     int T = vad_feats.size();
@@ -263,7 +264,6 @@
         }
     }
     vad_feats = out_feats;
-    return vad_feats;
 }
 
 std::vector<std::vector<int>>
@@ -271,7 +271,7 @@
     std::vector<std::vector<float>> vad_feats;
     std::vector<std::vector<float>> vad_probs;
     FbankKaldi(vad_sample_rate_, vad_feats, waves);
-    vad_feats = LfrCmvn(vad_feats);
+    LfrCmvn(vad_feats);
     Forward(vad_feats, &vad_probs);
 
     E2EVadModel vad_scorer = E2EVadModel();
@@ -296,5 +296,10 @@
 void FsmnVad::Test() {
 }
 
+FsmnVad::~FsmnVad() {
+}
+
 FsmnVad::FsmnVad():env_(ORT_LOGGING_LEVEL_ERROR, ""),session_options_{} {
 }
+
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.h b/funasr/runtime/onnxruntime/src/fsmn-vad.h
index 1d5f68c..3d183f8 100644
--- a/funasr/runtime/onnxruntime/src/fsmn-vad.h
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad.h
@@ -8,7 +8,8 @@
 
 #include "precomp.h"
 
-class FsmnVad {
+namespace funasr {
+class FsmnVad : public VadModel {
 /**
  * Author: Speech Lab of DAMO Academy, Alibaba Group
  * Deep-FSMN for Large Vocabulary Continuous Speech Recognition
@@ -17,9 +18,9 @@
 
 public:
     FsmnVad();
+    ~FsmnVad();
     void Test();
-    void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config);
-
+    void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num);
     std::vector<std::vector<int>> Infer(const std::vector<float> &waves);
     void Reset();
 
@@ -35,7 +36,7 @@
     void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                     const std::vector<float> &waves);
 
-    std::vector<std::vector<float>> &LfrCmvn(std::vector<std::vector<float>> &vad_feats);
+    void LfrCmvn(std::vector<std::vector<float>> &vad_feats);
 
     void Forward(
             const std::vector<std::vector<float>> &chunk_feats,
@@ -63,5 +64,5 @@
     int lfr_n = VAD_LFR_N;
 };
 
-
+} // namespace funasr
 #endif //VAD_SERVER_FSMNVAD_H
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp
new file mode 100644
index 0000000..e18c27e
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp
@@ -0,0 +1,98 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+ * MIT License  (https://opensource.org/licenses/MIT)
+*/
+
+#ifndef _WIN32
+#include <sys/time.h>
+#else
+#include <win_func.h>
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include <glog/logging.h>
+#include "funasrruntime.h"
+#include "tclap/CmdLine.h"
+#include "com-define.h"
+
+using namespace std;
+
+void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
+{
+    if (value_arg.isSet()){
+        model_path.insert({key, value_arg.getValue()});
+        LOG(INFO)<< key << " : " << value_arg.getValue();
+    }
+}
+
+int main(int argc, char *argv[])
+{
+    google::InitGoogleLogging(argv[0]);
+    FLAGS_logtostderr = true;
+
+    TCLAP::CmdLine cmd("funasr-onnx-offline-punc", ' ', "1.0");
+    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the punc model path, which contains model.onnx, punc.yaml", true, "", "string");
+    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
+    TCLAP::ValueArg<std::string> txt_path("", TXT_PATH, "txt file path, one sentence per line", true, "", "string");
+
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(txt_path);
+    cmd.parse(argc, argv);
+
+    std::map<std::string, std::string> model_path;
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(txt_path, TXT_PATH, model_path);
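+
+    // A sample invocation, assuming the flag names defined in com-define.h
+    // (e.g. MODEL_DIR expands to "model-dir" and TXT_PATH to "txt-path"):
+    //   ./funasr-onnx-offline-punc --model-dir <punc_model_dir> --txt-path input.txt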
+
+    struct timeval start, end;
+    gettimeofday(&start, NULL);
+    int thread_num = 1;
+    FUNASR_HANDLE punc_handle=CTTransformerInit(model_path, thread_num);
+
+    if (!punc_handle)
+    {
+        LOG(ERROR) << "FunASR init failed";
+        exit(-1);
+    }
+
+    gettimeofday(&end, NULL);
+    long seconds = (end.tv_sec - start.tv_sec);
+    long model_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+    LOG(INFO) << "Model initialization takes " << (double)model_init_micros / 1000000 << " s";
+
+    // read txt_path
+    vector<string> txt_list;
+
+    if(model_path.find(TXT_PATH)!=model_path.end()){
+        ifstream in(model_path.at(TXT_PATH));
+        if (!in.is_open()) {
+            LOG(ERROR) << "Failed to open file: " << model_path.at(TXT_PATH) ;
+            return 0;
+        }
+        string line;
+        while(getline(in, line))
+        {
+            txt_list.emplace_back(line); 
+        }
+        in.close();
+    }
+    
+    long taking_micros = 0;
+    for(auto& txt_str : txt_list){
+        gettimeofday(&start, NULL);
+        string result=CTTransformerInfer(punc_handle, txt_str.c_str(), RASR_NONE, NULL);
+        gettimeofday(&end, NULL);
+        seconds = (end.tv_sec - start.tv_sec);
+        taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+        LOG(INFO)<<"Results: "<<result;
+    }
+
+    LOG(INFO) << "Model inference takes: " << (double)taking_micros / 1000000 <<" s";
+    CTTransformerUninit(punc_handle);
+    return 0;
+}
+
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp
index 45b6196..6ba65c6 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp
+++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp
@@ -10,7 +10,7 @@
 #endif
 
 #include <glog/logging.h>
-#include "libfunasrapi.h"
+#include "funasrruntime.h"
 #include "tclap/CmdLine.h"
 #include "com-define.h"
 
@@ -39,7 +39,7 @@
     // warm up
     for (size_t i = 0; i < 1; i++)
     {
-        FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL);
+        FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[0].c_str(), RASR_NONE, NULL, 16000);
     }
 
     while (true) {
@@ -50,7 +50,7 @@
         }
 
         gettimeofday(&start, NULL);
-        FUNASR_RESULT result=FunASRRecogFile(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL);
+        FUNASR_RESULT result=FunASRInfer(asr_handle, wav_list[i].c_str(), RASR_NONE, NULL, 16000);
 
         gettimeofday(&end, NULL);
         seconds = (end.tv_sec - start.tv_sec);
@@ -77,6 +77,15 @@
     }
 }
 
+bool is_target_file(const std::string& filename, const std::string target) {
+    std::size_t pos = filename.find_last_of(".");
+    if (pos == std::string::npos) {
+        return false;
+    }
+    std::string extension = filename.substr(pos + 1);
+    return (extension == target);
+}
+
 void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
 {
     if (value_arg.isSet()){
@@ -91,42 +100,22 @@
     FLAGS_logtostderr = true;
 
     TCLAP::CmdLine cmd("funasr-onnx-offline-rtf", ' ', "1.0");
-    TCLAP::ValueArg<std::string> vad_model("", VAD_MODEL_PATH, "vad model path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_cmvn("", VAD_CMVN_PATH, "vad cmvn path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_config("", VAD_CONFIG_PATH, "vad config path", false, "", "string");
+    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string");
+    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
 
-    TCLAP::ValueArg<std::string> am_model("", AM_MODEL_PATH, "am model path", false, "", "string");
-    TCLAP::ValueArg<std::string> am_cmvn("", AM_CMVN_PATH, "am cmvn path", false, "", "string");
-    TCLAP::ValueArg<std::string> am_config("", AM_CONFIG_PATH, "am config path", false, "", "string");
-
-    TCLAP::ValueArg<std::string> punc_model("", PUNC_MODEL_PATH, "punc model path", false, "", "string");
-    TCLAP::ValueArg<std::string> punc_config("", PUNC_CONFIG_PATH, "punc config path", false, "", "string");
-
-    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", true, "", "string");
+    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");
     TCLAP::ValueArg<std::int32_t> thread_num("", THREAD_NUM, "multi-thread num for rtf", true, 0, "int32_t");
 
-    cmd.add(vad_model);
-    cmd.add(vad_cmvn);
-    cmd.add(vad_config);
-    cmd.add(am_model);
-    cmd.add(am_cmvn);
-    cmd.add(am_config);
-    cmd.add(punc_model);
-    cmd.add(punc_config);
-    cmd.add(wav_scp);
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(wav_path);
     cmd.add(thread_num);
     cmd.parse(argc, argv);
 
     std::map<std::string, std::string> model_path;
-    GetValue(vad_model, VAD_MODEL_PATH, model_path);
-    GetValue(vad_cmvn, VAD_CMVN_PATH, model_path);
-    GetValue(vad_config, VAD_CONFIG_PATH, model_path);
-    GetValue(am_model, AM_MODEL_PATH, model_path);
-    GetValue(am_cmvn, AM_CMVN_PATH, model_path);
-    GetValue(am_config, AM_CONFIG_PATH, model_path);
-    GetValue(punc_model, PUNC_MODEL_PATH, model_path);
-    GetValue(punc_config, PUNC_CONFIG_PATH, model_path);
-    GetValue(wav_scp, WAV_SCP, model_path);
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(wav_path, WAV_PATH, model_path);
 
     struct timeval start, end;
     gettimeofday(&start, NULL);
@@ -145,10 +134,14 @@
 
     // read wav_scp
     vector<string> wav_list;
-    if(model_path.find(WAV_SCP)!=model_path.end()){
-        ifstream in(model_path.at(WAV_SCP));
+    string wav_path_ = model_path.at(WAV_PATH);
+    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
+        wav_list.emplace_back(wav_path_);
+    }
+    else if(is_target_file(wav_path_, "scp")){
+        ifstream in(wav_path_);
         if (!in.is_open()) {
-            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP);
+            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
             return 0;
         }
         string line;
@@ -160,6 +153,9 @@
             wav_list.emplace_back(column2); 
         }
         in.close();
+    }else{
+        LOG(ERROR)<<"Please check the wav extension!";
+        exit(-1);
     }
 
     // multi-threaded test
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
new file mode 100644
index 0000000..0f606c6
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
@@ -0,0 +1,152 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+ * MIT License  (https://opensource.org/licenses/MIT)
+*/
+
+#ifndef _WIN32
+#include <sys/time.h>
+#else
+#include <win_func.h>
+#endif
+
+#include <iostream>
+#include <fstream>
+#include <sstream>
+#include <map>
+#include <vector>
+#include <glog/logging.h>
+#include "funasrruntime.h"
+#include "tclap/CmdLine.h"
+#include "com-define.h"
+
+using namespace std;
+
+bool is_target_file(const std::string& filename, const std::string target) {
+    std::size_t pos = filename.find_last_of(".");
+    if (pos == std::string::npos) {
+        return false;
+    }
+    std::string extension = filename.substr(pos + 1);
+    return (extension == target);
+}
+
+void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
+{
+    if (value_arg.isSet()){
+        model_path.insert({key, value_arg.getValue()});
+        LOG(INFO)<< key << " : " << value_arg.getValue();
+    }
+}
+
+void print_segs(vector<vector<int>>* vec) {
+    string seg_out="[";
+    for (int i = 0; i < vec->size(); i++) {
+        vector<int> inner_vec = (*vec)[i];
+        seg_out += "[";
+        for (int j = 0; j < inner_vec.size(); j++) {
+            seg_out += to_string(inner_vec[j]);
+            if (j != inner_vec.size() - 1) {
+                seg_out += ",";
+            }
+        }
+        seg_out += "]";
+        if (i != vec->size() - 1) {
+            seg_out += ",";
+        }
+    }
+    seg_out += "]";
+    LOG(INFO)<<seg_out;
+}
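+
+// print_segs renders the VAD output as a nested list, e.g. [[70,2340],[2620,6200]];
+// by the FSMN-VAD convention each inner pair is a [start_ms, end_ms] speech segment.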
+
+int main(int argc, char *argv[])
+{
+    google::InitGoogleLogging(argv[0]);
+    FLAGS_logtostderr = true;
+
+    TCLAP::CmdLine cmd("funasr-onnx-offline-vad", ' ', "1.0");
+    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the vad model path, which contains model.onnx, vad.yaml, vad.mvn", true, "", "string");
+    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
+
+    TCLAP::ValueArg<std::string>    wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");
+
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(wav_path);
+    cmd.parse(argc, argv);
+
+    std::map<std::string, std::string> model_path;
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(wav_path, WAV_PATH, model_path);
+
+    struct timeval start, end;
+    gettimeofday(&start, NULL);
+    int thread_num = 1;
+    FUNASR_HANDLE vad_handle=FsmnVadInit(model_path, thread_num);
+
+    if (!vad_handle)
+    {
+        LOG(ERROR) << "FunVad init failed";
+        exit(-1);
+    }
+
+    gettimeofday(&end, NULL);
+    long seconds = (end.tv_sec - start.tv_sec);
+    long model_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+    LOG(INFO) << "Model initialization takes " << (double)model_init_micros / 1000000 << " s";
+
+    // read wav_path
+    vector<string> wav_list;
+    string wav_path_ = model_path.at(WAV_PATH);
+    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
+        wav_list.emplace_back(wav_path_);
+    }
+    else if(is_target_file(wav_path_, "scp")){
+        ifstream in(wav_path_);
+        if (!in.is_open()) {
+            LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
+            return 0;
+        }
+        string line;
+        while(getline(in, line))
+        {
+            istringstream iss(line);
+            string column1, column2;
+            iss >> column1 >> column2;
+            wav_list.emplace_back(column2); 
+        }
+        in.close();
+    }else{
+        LOG(ERROR)<<"Please check the wav extension!";
+        exit(-1);
+    }
+    
+    float snippet_time = 0.0f;
+    long taking_micros = 0;
+    for(auto& wav_file : wav_list){
+        gettimeofday(&start, NULL);
+        FUNASR_RESULT result=FsmnVadInfer(vad_handle, wav_file.c_str(), FSMN_VAD_OFFLINE, NULL, 16000);
+        gettimeofday(&end, NULL);
+        seconds = (end.tv_sec - start.tv_sec);
+        taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+
+        if (result)
+        {
+            vector<std::vector<int>>* vad_segments = FsmnVadGetResult(result, 0);
+            print_segs(vad_segments);
+            snippet_time += FsmnVadGetRetSnippetTime(result);
+            FsmnVadFreeResult(result);
+        }
+        else
+        {
+            LOG(ERROR) << ("No return data!\n");
+        }
+    }
+ 
+    LOG(INFO) << "Audio length: " << (double)snippet_time << " s";
+    LOG(INFO) << "Model inference takes: " << (double)taking_micros / 1000000 <<" s";
+    LOG(INFO) << "Model inference RTF: " << (double)taking_micros/ (snippet_time*1000000);
+    FsmnVadUninit(vad_handle);
+    return 0;
+}
+
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp b/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp
index 2d61bbb..3472925 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp
+++ b/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp
@@ -14,11 +14,20 @@
 #include <sstream>
 #include <map>
 #include <glog/logging.h>
-#include "libfunasrapi.h"
+#include "funasrruntime.h"
 #include "tclap/CmdLine.h"
 #include "com-define.h"
 
 using namespace std;
+
+bool is_target_file(const std::string& filename, const std::string target) {
+    std::size_t pos = filename.find_last_of(".");
+    if (pos == std::string::npos) {
+        return false;
+    }
+    std::string extension = filename.substr(pos + 1);
+    return (extension == target);
+}
 
 void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key, std::map<std::string, std::string>& model_path)
 {
@@ -28,55 +37,43 @@
     }
 }
 
-int main(int argc, char *argv[])
+int main(int argc, char** argv)
 {
     google::InitGoogleLogging(argv[0]);
     FLAGS_logtostderr = true;
 
     TCLAP::CmdLine cmd("funasr-onnx-offline", ' ', "1.0");
-    TCLAP::ValueArg<std::string> vad_model("", VAD_MODEL_PATH, "vad model path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_cmvn("", VAD_CMVN_PATH, "vad cmvn path", false, "", "string");
-    TCLAP::ValueArg<std::string> vad_config("", VAD_CONFIG_PATH, "vad config path", false, "", "string");
+    TCLAP::ValueArg<std::string>    model_dir("", MODEL_DIR, "the asr model path, which contains model.onnx, config.yaml, am.mvn", true, "", "string");
+    TCLAP::ValueArg<std::string>    quantize("", QUANTIZE, "false (default): load model.onnx from model_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
+    TCLAP::ValueArg<std::string>    vad_dir("", VAD_DIR, "the vad model path, which contains model.onnx, vad.yaml, vad.mvn", false, "", "string");
+    TCLAP::ValueArg<std::string>    vad_quant("", VAD_QUANT, "false (default): load model.onnx from vad_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
+    TCLAP::ValueArg<std::string>    punc_dir("", PUNC_DIR, "the punc model path, which contains model.onnx, punc.yaml", false, "", "string");
+    TCLAP::ValueArg<std::string>    punc_quant("", PUNC_QUANT, "false (default): load model.onnx from punc_dir; if set to true, load model_quant.onnx instead", false, "false", "string");
 
-    TCLAP::ValueArg<std::string> am_model("", AM_MODEL_PATH, "am model path", true, "", "string");
-    TCLAP::ValueArg<std::string> am_cmvn("", AM_CMVN_PATH, "am cmvn path", true, "", "string");
-    TCLAP::ValueArg<std::string> am_config("", AM_CONFIG_PATH, "am config path", true, "", "string");
+    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", true, "", "string");
 
-    TCLAP::ValueArg<std::string> punc_model("", PUNC_MODEL_PATH, "punc model path", false, "", "string");
-    TCLAP::ValueArg<std::string> punc_config("", PUNC_CONFIG_PATH, "punc config path", false, "", "string");
-
-    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false, "", "string");
-    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false, "", "string");
-
-    cmd.add(vad_model);
-    cmd.add(vad_cmvn);
-    cmd.add(vad_config);
-    cmd.add(am_model);
-    cmd.add(am_cmvn);
-    cmd.add(am_config);
-    cmd.add(punc_model);
-    cmd.add(punc_config);
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(vad_dir);
+    cmd.add(vad_quant);
+    cmd.add(punc_dir);
+    cmd.add(punc_quant);
     cmd.add(wav_path);
-    cmd.add(wav_scp);
     cmd.parse(argc, argv);
 
     std::map<std::string, std::string> model_path;
-    GetValue(vad_model, VAD_MODEL_PATH, model_path);
-    GetValue(vad_cmvn, VAD_CMVN_PATH, model_path);
-    GetValue(vad_config, VAD_CONFIG_PATH, model_path);
-    GetValue(am_model, AM_MODEL_PATH, model_path);
-    GetValue(am_cmvn, AM_CMVN_PATH, model_path);
-    GetValue(am_config, AM_CONFIG_PATH, model_path);
-    GetValue(punc_model, PUNC_MODEL_PATH, model_path);
-    GetValue(punc_config, PUNC_CONFIG_PATH, model_path);
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(vad_dir, VAD_DIR, model_path);
+    GetValue(vad_quant, VAD_QUANT, model_path);
+    GetValue(punc_dir, PUNC_DIR, model_path);
+    GetValue(punc_quant, PUNC_QUANT, model_path);
     GetValue(wav_path, WAV_PATH, model_path);
-    GetValue(wav_scp, WAV_SCP, model_path);
-
 
     struct timeval start, end;
     gettimeofday(&start, NULL);
     int thread_num = 1;
-    FUNASR_HANDLE asr_hanlde=FunASRInit(model_path, thread_num);
+    FUNASR_HANDLE asr_hanlde=FunOfflineInit(model_path, thread_num);
 
     if (!asr_hanlde)
     {
@@ -89,14 +86,14 @@
     long modle_init_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
     LOG(INFO) << "Model initialization takes " << (double)modle_init_micros / 1000000 << " s";
 
-    // read wav_path and wav_scp
+    // read wav_path
     vector<string> wav_list;
-
-    if(model_path.find(WAV_PATH)!=model_path.end()){
-        wav_list.emplace_back(model_path.at(WAV_PATH));
+    string wav_path_ = model_path.at(WAV_PATH); 
+    if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
+        wav_list.emplace_back(wav_path_);
     }
-    if(model_path.find(WAV_SCP)!=model_path.end()){
-        ifstream in(model_path.at(WAV_SCP));
+    else if(is_target_file(wav_path_, "scp")){
+        ifstream in(wav_path_);
         if (!in.is_open()) {
             LOG(ERROR) << "Failed to open file: " << model_path.at(WAV_SCP) ;
             return 0;
@@ -110,13 +107,16 @@
             wav_list.emplace_back(column2); 
         }
         in.close();
+    }else{
+        LOG(ERROR)<<"Please check the wav extension!";
+        exit(-1);
     }
     
     float snippet_time = 0.0f;
     long taking_micros = 0;
     for(auto& wav_file : wav_list){
         gettimeofday(&start, NULL);
-        FUNASR_RESULT result=FunASRRecogFile(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL);
+        FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, 16000);
         gettimeofday(&end, NULL);
         seconds = (end.tv_sec - start.tv_sec);
         taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
@@ -124,8 +124,7 @@
         if (result)
         {
             string msg = FunASRGetResult(result, 0);
-            setbuf(stdout, NULL);
-            printf("Result: %s \n", msg.c_str());
+            LOG(INFO)<<"Result: "<<msg;
             snippet_time += FunASRGetRetSnippetTime(result);
             FunASRFreeResult(result);
         }
@@ -138,7 +137,7 @@
     LOG(INFO) << "Audio length: " << (double)snippet_time << " s";
     LOG(INFO) << "Model inference takes: " << (double)taking_micros / 1000000 <<" s";
     LOG(INFO) << "Model inference RTF: " << (double)taking_micros/ (snippet_time*1000000);
-    FunASRUninit(asr_hanlde);
+    FunOfflineUninit(asr_hanlde);
     return 0;
 }
 
diff --git a/funasr/runtime/onnxruntime/src/funasrruntime.cpp b/funasr/runtime/onnxruntime/src/funasrruntime.cpp
new file mode 100644
index 0000000..adef504
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/funasrruntime.cpp
@@ -0,0 +1,349 @@
+#include "precomp.h"
+#ifdef __cplusplus 
+
+extern "C" {
+#endif
+
+	// APIs for Init
+	_FUNASRAPI FUNASR_HANDLE  FunASRInit(std::map<std::string, std::string>& model_path, int thread_num)
+	{
+		funasr::Model* mm = funasr::CreateModel(model_path, thread_num);
+		return mm;
+	}
+
+	_FUNASRAPI FUNASR_HANDLE  FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode)
+	{
+		funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num, mode);
+		return mm;
+	}
+
+	_FUNASRAPI FUNASR_HANDLE  CTTransformerInit(std::map<std::string, std::string>& model_path, int thread_num)
+	{
+		funasr::PuncModel* mm = funasr::CreatePuncModel(model_path, thread_num);
+		return mm;
+	}
+
+	_FUNASRAPI FUNASR_HANDLE  FunOfflineInit(std::map<std::string, std::string>& model_path, int thread_num)
+	{
+		funasr::OfflineStream* mm = funasr::CreateOfflineStream(model_path, thread_num);
+		return mm;
+	}
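+
+	// Intended lifecycle for the offline stream, as a sketch (every function
+	// named here is defined further down in this file):
+	//   FUNASR_HANDLE h = FunOfflineInit(model_path, 1);
+	//   FUNASR_RESULT r = FunOfflineInfer(h, "asr_example.wav", RASR_NONE, NULL, 16000);
+	//   const char* text = FunASRGetResult(r, 0);
+	//   FunASRFreeResult(r);
+	//   FunOfflineUninit(h);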
+
+	// APIs for ASR Infer
+	_FUNASRAPI FUNASR_RESULT FunASRInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::Model* recog_obj = (funasr::Model*)handle;
+		if (!recog_obj)
+			return nullptr;
+
+		funasr::Audio audio(1);
+		if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
+			return nullptr;
+
+		float* buff;
+		int len;
+		int flag = 0;
+		funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		int n_step = 0;
+		int n_total = audio.GetQueueSize();
+		while (audio.Fetch(buff, len, flag) > 0) {
+			string msg = recog_obj->Forward(buff, len, flag);
+			p_result->msg += msg;
+			n_step++;
+			if (fn_callback)
+				fn_callback(n_step, n_total);
+		}
+
+		return p_result;
+	}
+
+	_FUNASRAPI FUNASR_RESULT FunASRInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::Model* recog_obj = (funasr::Model*)handle;
+		if (!recog_obj)
+			return nullptr;
+
+		funasr::Audio audio(1);
+		if(funasr::is_target_file(sz_filename, "wav")){
+			int32_t sampling_rate_ = -1;
+			if(!audio.LoadWav(sz_filename, &sampling_rate_))
+				return nullptr;
+		}else if(funasr::is_target_file(sz_filename, "pcm")){
+			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
+				return nullptr;
+		}else{
+			LOG(ERROR)<<"Wrong wav extension";
+			exit(-1);
+		}
+
+		float* buff;
+		int len;
+		int flag = 0;
+		int n_step = 0;
+		int n_total = audio.GetQueueSize();
+		funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		while (audio.Fetch(buff, len, flag) > 0) {
+			string msg = recog_obj->Forward(buff, len, flag);
+			p_result->msg += msg;
+			n_step++;
+			if (fn_callback)
+				fn_callback(n_step, n_total);
+		}
+
+		return p_result;
+	}
+
+	// APIs for VAD Infer
+	_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
+		if (!vad_obj)
+			return nullptr;
+
+		funasr::Audio audio(1);
+		if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
+			return nullptr;
+
+		funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		
+		vector<std::vector<int>> vad_segments;
+		audio.Split(vad_obj, vad_segments);
+		p_result->segments = new vector<std::vector<int>>(vad_segments);
+
+		return p_result;
+	}
+
+	_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
+		if (!vad_obj)
+			return nullptr;
+
+		funasr::Audio audio(1);
+		if(funasr::is_target_file(sz_filename, "wav")){
+			int32_t sampling_rate_ = -1;
+			if(!audio.LoadWav(sz_filename, &sampling_rate_))
+				return nullptr;
+		}else if(funasr::is_target_file(sz_filename, "pcm")){
+			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
+				return nullptr;
+		}else{
+			LOG(ERROR)<<"Wrong wav extension";
+			exit(-1);
+		}
+
+		funasr::FUNASR_VAD_RESULT* p_result = new funasr::FUNASR_VAD_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		
+		vector<std::vector<int>> vad_segments;
+		audio.Split(vad_obj, vad_segments);
+		p_result->segments = new vector<std::vector<int>>(vad_segments);
+
+		return p_result;
+	}
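+
+	// The segments vector is heap-allocated here and owned by the result;
+	// FsmnVadFreeResult releases it, so callers must not delete the pointer
+	// returned by FsmnVadGetResult.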
+
+	// APIs for PUNC Infer
+	_FUNASRAPI const std::string CTTransformerInfer(FUNASR_HANDLE handle, const char* sz_sentence, FUNASR_MODE mode, QM_CALLBACK fn_callback)
+	{
+		funasr::PuncModel* punc_obj = (funasr::PuncModel*)handle;
+		if (!punc_obj)
+			return "";  // returning nullptr would construct std::string from a null pointer (UB)
+
+		string punc_res = punc_obj->AddPunc(sz_sentence);
+		return punc_res;
+	}
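+
+	// Minimal usage sketch (punc_handle obtained from CTTransformerInit):
+	//   string punctuated = CTTransformerInfer(punc_handle, "hello how are you today", RASR_NONE, NULL);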
+
+	// APIs for Offline-stream Infer
+	_FUNASRAPI FUNASR_RESULT FunOfflineInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
+		if (!offline_stream)
+			return nullptr;
+
+		funasr::Audio audio(1);
+		if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
+			return nullptr;
+		if(offline_stream->UseVad()){
+			audio.Split(offline_stream);
+		}
+
+		float* buff;
+		int len;
+		int flag = 0;
+		funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		int n_step = 0;
+		int n_total = audio.GetQueueSize();
+		while (audio.Fetch(buff, len, flag) > 0) {
+			string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
+			p_result->msg += msg;
+			n_step++;
+			if (fn_callback)
+				fn_callback(n_step, n_total);
+		}
+		if(offline_stream->UsePunc()){
+			string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
+			p_result->msg = punc_res;
+		}
+
+		return p_result;
+	}
+
+	_FUNASRAPI FUNASR_RESULT FunOfflineInfer(FUNASR_HANDLE handle, const char* sz_filename, FUNASR_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	{
+		funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
+		if (!offline_stream)
+			return nullptr;
+		
+		funasr::Audio audio(1);
+		if(funasr::is_target_file(sz_filename, "wav")){
+			int32_t sampling_rate_ = -1;
+			if(!audio.LoadWav(sz_filename, &sampling_rate_))
+				return nullptr;
+		}else if(funasr::is_target_file(sz_filename, "pcm")){
+			if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
+				return nullptr;
+		}else{
+			LOG(ERROR)<<"Wrong wav extension";
+			exit(-1);
+		}
+		if(offline_stream->UseVad()){
+			audio.Split(offline_stream);
+		}
+
+		float* buff;
+		int len;
+		int flag = 0;
+		int n_step = 0;
+		int n_total = audio.GetQueueSize();
+		funasr::FUNASR_RECOG_RESULT* p_result = new funasr::FUNASR_RECOG_RESULT;
+		p_result->snippet_time = audio.GetTimeLen();
+		while (audio.Fetch(buff, len, flag) > 0) {
+			string msg = (offline_stream->asr_handle)->Forward(buff, len, flag);
+			p_result->msg+= msg;
+			n_step++;
+			if (fn_callback)
+				fn_callback(n_step, n_total);
+		}
+		if(offline_stream->UsePunc()){
+			string punc_res = (offline_stream->punc_handle)->AddPunc((p_result->msg).c_str());
+			p_result->msg = punc_res;
+		}
+	
+		return p_result;
+	}
+
+	_FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result)
+	{
+		if (!result)
+			return 0;
+
+		return 1;
+	}
+
+	// APIs for GetRetSnippetTime
+	_FUNASRAPI const float FunASRGetRetSnippetTime(FUNASR_RESULT result)
+	{
+		if (!result)
+			return 0.0f;
+
+		return ((funasr::FUNASR_RECOG_RESULT*)result)->snippet_time;
+	}
+
+	_FUNASRAPI const float FsmnVadGetRetSnippetTime(FUNASR_RESULT result)
+	{
+		if (!result)
+			return 0.0f;
+
+		return ((funasr::FUNASR_VAD_RESULT*)result)->snippet_time;
+	}
+
+	// APIs for GetResult
+	_FUNASRAPI const char* FunASRGetResult(FUNASR_RESULT result,int n_index)
+	{
+		funasr::FUNASR_RECOG_RESULT * p_result = (funasr::FUNASR_RECOG_RESULT*)result;
+		if(!p_result)
+			return nullptr;
+
+		return p_result->msg.c_str();
+	}
+
+	_FUNASRAPI vector<std::vector<int>>* FsmnVadGetResult(FUNASR_RESULT result,int n_index)
+	{
+		funasr::FUNASR_VAD_RESULT * p_result = (funasr::FUNASR_VAD_RESULT*)result;
+		if(!p_result)
+			return nullptr;
+
+		return p_result->segments;
+	}
+
+	// APIs for FreeResult
+	_FUNASRAPI void FunASRFreeResult(FUNASR_RESULT result)
+	{
+		if (result)
+		{
+			delete (funasr::FUNASR_RECOG_RESULT*)result;
+		}
+	}
+
+	_FUNASRAPI void FsmnVadFreeResult(FUNASR_RESULT result)
+	{
+		funasr::FUNASR_VAD_RESULT * p_result = (funasr::FUNASR_VAD_RESULT*)result;
+		if (p_result)
+		{
+			if(p_result->segments){
+				delete p_result->segments;
+			}
+			delete p_result;
+		}
+	}
+
+	// APIs for Uninit
+	_FUNASRAPI void FunASRUninit(FUNASR_HANDLE handle)
+	{
+		funasr::Model* recog_obj = (funasr::Model*)handle;
+
+		if (!recog_obj)
+			return;
+
+		delete recog_obj;
+	}
+
+	_FUNASRAPI void FsmnVadUninit(FUNASR_HANDLE handle)
+	{
+		funasr::VadModel* recog_obj = (funasr::VadModel*)handle;
+
+		if (!recog_obj)
+			return;
+
+		delete recog_obj;
+	}
+
+	_FUNASRAPI void CTTransformerUninit(FUNASR_HANDLE handle)
+	{
+		funasr::PuncModel* punc_obj = (funasr::PuncModel*)handle;
+
+		if (!punc_obj)
+			return;
+
+		delete punc_obj;
+	}
+
+	_FUNASRAPI void FunOfflineUninit(FUNASR_HANDLE handle)
+	{
+		funasr::OfflineStream* offline_stream = (funasr::OfflineStream*)handle;
+
+		if (!offline_stream)
+			return;
+
+		delete offline_stream;
+	}
+
+#ifdef __cplusplus 
+
+}
+#endif
+
diff --git a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp b/funasr/runtime/onnxruntime/src/libfunasrapi.cpp
deleted file mode 100644
index 01aa38a..0000000
--- a/funasr/runtime/onnxruntime/src/libfunasrapi.cpp
+++ /dev/null
@@ -1,210 +0,0 @@
-#include "precomp.h"
-#ifdef __cplusplus 
-
-extern "C" {
-#endif
-
-	// APIs for funasr
-	_FUNASRAPI FUNASR_HANDLE  FunASRInit(std::map<std::string, std::string>& model_path, int thread_num)
-	{
-		Model* mm = CreateModel(model_path, thread_num);
-		return mm;
-	}
-
-	_FUNASRAPI FUNASR_HANDLE  FunVadInit(std::map<std::string, std::string>& model_path, int thread_num)
-	{
-		Model* mm = CreateModel(model_path, thread_num);
-		return mm;
-	}
-
-	_FUNASRAPI FUNASR_RESULT FunASRRecogBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FUNASR_MODE mode, QM_CALLBACK fn_callback)
-	{
-		Model* recog_obj = (Model*)handle;
-		if (!recog_obj)
-			return nullptr;
-
-		int32_t sampling_rate = -1;
-		Audio audio(1);
-		if (!audio.LoadWav(sz_buf, n_len, &sampling_rate))
-			return nullptr;
-		if(recog_obj->UseVad()){
-			audio.Split(recog_obj);
-		}
-
-		float* buff;
-		int len;
-		int flag=0;
-		FUNASR_RECOG_RESULT* p_result = new FUNASR_RECOG_RESULT;
-		p_result->snippet_time = audio.GetTimeLen();
-		int n_step = 0;
-		int n_total = audio.GetQueueSize();
-		while (audio.Fetch(buff, len, flag) > 0) {
-			string msg = recog_obj->Forward(buff, len, flag);
-			p_result->msg += msg;
-			n_step++;
-			if (fn_callback)
-				fn_callback(n_step, n_total);
-		}
-		if(recog_obj->UsePunc()){
-			string punc_res = recog_obj->AddPunc((p_result->msg).c_str());
-			p_result->msg = punc_res;
-		}
-
-		return p_result;
-	}
-
-	_FUNASRAPI FUNASR_RESULT FunASRRecogPCMBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
-	{
-		Model* recog_obj = (Model*)handle;
-		if (!recog_obj)
-			return nullptr;
-
-		Audio audio(1);
-		if (!audio.LoadPcmwav(sz_buf, n_len, &sampling_rate))
-			return nullptr;
-		if(recog_obj->UseVad()){
-			audio.Split(recog_obj);
-		}
-
-		float* buff;
-		int len;
-		int flag = 0;
-		FUNASR_RECOG_RESULT* p_result = new FUNASR_RECOG_RESULT;
-		p_result->snippet_time = audio.GetTimeLen();
-		int n_step = 0;
-		int n_total = audio.GetQueueSize();
-		while (audio.Fetch(buff, len, flag) > 0) {
-			string msg = recog_obj->Forward(buff, len, flag);
-			p_result->msg += msg;
-			n_step++;
-			if (fn_callback)
-				fn_callback(n_step, n_total);
-		}
-		if(recog_obj->UsePunc()){
-			string punc_res = recog_obj->AddPunc((p_result->msg).c_str());
-			p_result->msg = punc_res;
-		}
-
-		return p_result;
-	}
-
-	_FUNASRAPI FUNASR_RESULT FunASRRecogPCMFile(FUNASR_HANDLE handle, const char* sz_filename, int sampling_rate, FUNASR_MODE mode, QM_CALLBACK fn_callback)
-	{
-		Model* recog_obj = (Model*)handle;
-		if (!recog_obj)
-			return nullptr;
-
-		Audio audio(1);
-		if (!audio.LoadPcmwav(sz_filename, &sampling_rate))
-			return nullptr;
-		if(recog_obj->UseVad()){
-			audio.Split(recog_obj);
-		}
-
-		float* buff;
-		int len;
-		int flag = 0;
-		FUNASR_RECOG_RESULT* p_result = new FUNASR_RECOG_RESULT;
-		p_result->snippet_time = audio.GetTimeLen();
-		int n_step = 0;
-		int n_total = audio.GetQueueSize();
-		while (audio.Fetch(buff, len, flag) > 0) {
-			string msg = recog_obj->Forward(buff, len, flag);
-			p_result->msg += msg;
-			n_step++;
-			if (fn_callback)
-				fn_callback(n_step, n_total);
-		}
-		if(recog_obj->UsePunc()){
-			string punc_res = recog_obj->AddPunc((p_result->msg).c_str());
-			p_result->msg = punc_res;
-		}
-
-		return p_result;
-	}
-
-	_FUNASRAPI FUNASR_RESULT FunASRRecogFile(FUNASR_HANDLE handle, const char* sz_wavfile, FUNASR_MODE mode, QM_CALLBACK fn_callback)
-	{
-		Model* recog_obj = (Model*)handle;
-		if (!recog_obj)
-			return nullptr;
-		
-		int32_t sampling_rate = -1;
-		Audio audio(1);
-		if(!audio.LoadWav(sz_wavfile, &sampling_rate))
-			return nullptr;
-		if(recog_obj->UseVad()){
-			audio.Split(recog_obj);
-		}
-
-		float* buff;
-		int len;
-		int flag = 0;
-		int n_step = 0;
-		int n_total = audio.GetQueueSize();
-		FUNASR_RECOG_RESULT* p_result = new FUNASR_RECOG_RESULT;
-		p_result->snippet_time = audio.GetTimeLen();
-		while (audio.Fetch(buff, len, flag) > 0) {
-			string msg = recog_obj->Forward(buff, len, flag);
-			p_result->msg+= msg;
-			n_step++;
-			if (fn_callback)
-				fn_callback(n_step, n_total);
-		}
-		if(recog_obj->UsePunc()){
-			string punc_res = recog_obj->AddPunc((p_result->msg).c_str());
-			p_result->msg = punc_res;
-		}
-	
-		return p_result;
-	}
-
-	_FUNASRAPI const int FunASRGetRetNumber(FUNASR_RESULT result)
-	{
-		if (!result)
-			return 0;
-
-		return 1;
-	}
-
-
-	_FUNASRAPI const float FunASRGetRetSnippetTime(FUNASR_RESULT result)
-	{
-		if (!result)
-			return 0.0f;
-
-		return ((FUNASR_RECOG_RESULT*)result)->snippet_time;
-	}
-
-	_FUNASRAPI const char* FunASRGetResult(FUNASR_RESULT result,int n_index)
-	{
-		FUNASR_RECOG_RESULT * p_result = (FUNASR_RECOG_RESULT*)result;
-		if(!p_result)
-			return nullptr;
-
-		return p_result->msg.c_str();
-	}
-
-	_FUNASRAPI void FunASRFreeResult(FUNASR_RESULT result)
-	{
-		if (result)
-		{
-			delete (FUNASR_RECOG_RESULT*)result;
-		}
-	}
-
-	_FUNASRAPI void FunASRUninit(FUNASR_HANDLE handle)
-	{
-		Model* recog_obj = (Model*)handle;
-
-		if (!recog_obj)
-			return;
-
-		delete recog_obj;
-	}
-
-#ifdef __cplusplus 
-
-}
-#endif
-
diff --git a/funasr/runtime/onnxruntime/src/model.cpp b/funasr/runtime/onnxruntime/src/model.cpp
index 52ce7ba..6badde6 100644
--- a/funasr/runtime/onnxruntime/src/model.cpp
+++ b/funasr/runtime/onnxruntime/src/model.cpp
@@ -1,8 +1,23 @@
 #include "precomp.h"
 
+namespace funasr {
 Model *CreateModel(std::map<std::string, std::string>& model_path, int thread_num)
 {
+    string am_model_path;
+    string am_cmvn_path;
+    string am_config_path;
+
+    am_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
+    if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
+        am_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
+    }
+    am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME);
+    am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME);
+
     Model *mm;
-    mm = new paraformer::Paraformer(model_path, thread_num);
+    mm = new Paraformer();
+    mm->InitAsr(am_model_path, am_cmvn_path, am_config_path, thread_num);
     return mm;
 }
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/offline-stream.cpp b/funasr/runtime/onnxruntime/src/offline-stream.cpp
new file mode 100644
index 0000000..8170129
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/offline-stream.cpp
@@ -0,0 +1,64 @@
+#include "precomp.h"
+
+namespace funasr {
+OfflineStream::OfflineStream(std::map<std::string, std::string>& model_path, int thread_num)
+{
+    // VAD model
+    if(model_path.find(VAD_DIR) != model_path.end()){
+        use_vad = true;
+        string vad_model_path;
+        string vad_cmvn_path;
+        string vad_config_path;
+    
+        vad_model_path = PathAppend(model_path.at(VAD_DIR), MODEL_NAME);
+        if(model_path.find(VAD_QUANT) != model_path.end() && model_path.at(VAD_QUANT) == "true"){
+            vad_model_path = PathAppend(model_path.at(VAD_DIR), QUANT_MODEL_NAME);
+        }
+        vad_cmvn_path = PathAppend(model_path.at(VAD_DIR), VAD_CMVN_NAME);
+        vad_config_path = PathAppend(model_path.at(VAD_DIR), VAD_CONFIG_NAME);
+        vad_handle = make_unique<FsmnVad>();
+        vad_handle->InitVad(vad_model_path, vad_cmvn_path, vad_config_path, thread_num);
+    }
+
+    // AM model
+    if(model_path.find(MODEL_DIR) != model_path.end()){
+        string am_model_path;
+        string am_cmvn_path;
+        string am_config_path;
+    
+        am_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
+        if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
+            am_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
+        }
+        am_cmvn_path = PathAppend(model_path.at(MODEL_DIR), AM_CMVN_NAME);
+        am_config_path = PathAppend(model_path.at(MODEL_DIR), AM_CONFIG_NAME);
+
+        asr_handle = make_unique<Paraformer>();
+        asr_handle->InitAsr(am_model_path, am_cmvn_path, am_config_path, thread_num);
+    }
+
+    // PUNC model
+    if(model_path.find(PUNC_DIR) != model_path.end()){
+        use_punc = true;
+        string punc_model_path;
+        string punc_config_path;
+    
+        punc_model_path = PathAppend(model_path.at(PUNC_DIR), MODEL_NAME);
+        if(model_path.find(PUNC_QUANT) != model_path.end() && model_path.at(PUNC_QUANT) == "true"){
+            punc_model_path = PathAppend(model_path.at(PUNC_DIR), QUANT_MODEL_NAME);
+        }
+        punc_config_path = PathAppend(model_path.at(PUNC_DIR), PUNC_CONFIG_NAME);
+
+        punc_handle = make_unique<CTTransformer>();
+        punc_handle->InitPunc(punc_model_path, punc_config_path, thread_num);
+    }
+}
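+
+// The constructor wires up whichever components were configured above: optional
+// FSMN-VAD segmentation, the Paraformer AM, and optional CT-Transformer
+// punctuation; callers probe the combination via UseVad() and UsePunc().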
+
+OfflineStream *CreateOfflineStream(std::map<std::string, std::string>& model_path, int thread_num)
+{
+    OfflineStream *mm;
+    mm = new OfflineStream(model_path, thread_num);
+    return mm;
+}
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/online-feature.cpp b/funasr/runtime/onnxruntime/src/online-feature.cpp
index 3f57e0b..a21589c 100644
--- a/funasr/runtime/onnxruntime/src/online-feature.cpp
+++ b/funasr/runtime/onnxruntime/src/online-feature.cpp
@@ -1,11 +1,13 @@
 /**
  * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  * MIT License  (https://opensource.org/licenses/MIT)
+ * Contributed by zhuzizyf (China Telecom).
 */
 
 #include "online-feature.h"
 #include <utility>
 
+namespace funasr {
 OnlineFeature::OnlineFeature(int sample_rate, knf::FbankOptions fbank_opts, int lfr_m, int lfr_n,
                              std::vector<std::vector<float>> cmvns)
   : sample_rate_(sample_rate),
@@ -131,3 +133,5 @@
   }
 
 }
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/online-feature.h b/funasr/runtime/onnxruntime/src/online-feature.h
index decaaf4..16e6e4b 100644
--- a/funasr/runtime/onnxruntime/src/online-feature.h
+++ b/funasr/runtime/onnxruntime/src/online-feature.h
@@ -1,13 +1,14 @@
 /**
  * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  * MIT License  (https://opensource.org/licenses/MIT)
+ * Contributed by zhuzizyf (China Telecom).
 */
-
+#pragma once 
 #include <vector>
 #include "precomp.h"
 
 using namespace std;
-
+namespace funasr {
 class OnlineFeature {
 
 public:
@@ -53,3 +54,5 @@
   bool input_finished_ = false;
 
 };
+
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/paraformer.cpp b/funasr/runtime/onnxruntime/src/paraformer.cpp
index 136d228..74366a0 100644
--- a/funasr/runtime/onnxruntime/src/paraformer.cpp
+++ b/funasr/runtime/onnxruntime/src/paraformer.cpp
@@ -6,67 +6,14 @@
 #include "precomp.h"
 
 using namespace std;
-using namespace paraformer;
 
-Paraformer::Paraformer(std::map<std::string, std::string>& model_path,int thread_num)
+namespace funasr {
+
+Paraformer::Paraformer()
 :env_(ORT_LOGGING_LEVEL_ERROR, "paraformer"),session_options{}{
-
-    // VAD model
-    if(model_path.find(VAD_MODEL_PATH) != model_path.end()){
-        use_vad = true;
-        string vad_model_path;
-        string vad_cmvn_path;
-        string vad_config_path;
-    
-        try{
-            vad_model_path = model_path.at(VAD_MODEL_PATH);
-            vad_cmvn_path = model_path.at(VAD_CMVN_PATH);
-            vad_config_path = model_path.at(VAD_CONFIG_PATH);
-        }catch(const out_of_range& e){
-            LOG(ERROR) << "Error when read "<< VAD_CMVN_PATH << " or " << VAD_CONFIG_PATH <<" :" << e.what();
-            exit(0);
-        }
-        vad_handle = make_unique<FsmnVad>();
-        vad_handle->InitVad(vad_model_path, vad_cmvn_path, vad_config_path);
-    }
-
-    // AM model
-    if(model_path.find(AM_MODEL_PATH) != model_path.end()){
-        string am_model_path;
-        string am_cmvn_path;
-        string am_config_path;
-    
-        try{
-            am_model_path = model_path.at(AM_MODEL_PATH);
-            am_cmvn_path = model_path.at(AM_CMVN_PATH);
-            am_config_path = model_path.at(AM_CONFIG_PATH);
-        }catch(const out_of_range& e){
-            LOG(ERROR) << "Error when read "<< AM_CONFIG_PATH << " or " << AM_CMVN_PATH <<" :" << e.what();
-            exit(0);
-        }
-        InitAM(am_model_path, am_cmvn_path, am_config_path, thread_num);
-    }
-
-    // PUNC model
-    if(model_path.find(PUNC_MODEL_PATH) != model_path.end()){
-        use_punc = true;
-        string punc_model_path;
-        string punc_config_path;
-    
-        try{
-            punc_model_path = model_path.at(PUNC_MODEL_PATH);
-            punc_config_path = model_path.at(PUNC_CONFIG_PATH);
-        }catch(const out_of_range& e){
-            LOG(ERROR) << "Error when read "<< PUNC_CONFIG_PATH <<" :" << e.what();
-            exit(0);
-        }
-
-        punc_handle = make_unique<CTTransformer>();
-        punc_handle->InitPunc(punc_model_path, punc_config_path, thread_num);
-    }
 }
 
-void Paraformer::InitAM(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){
+void Paraformer::InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num){
     // knf options
     fbank_opts.frame_opts.dither = 0;
     fbank_opts.mel_opts.num_bins = 80;
@@ -118,14 +65,6 @@
 
 void Paraformer::Reset()
 {
-}
-
-vector<std::vector<int>> Paraformer::VadSeg(std::vector<float>& pcm_data){
-    return vad_handle->Infer(pcm_data);
-}
-
-string Paraformer::AddPunc(const char* sz_input){
-    return punc_handle->AddPunc(sz_input);
 }
 
 vector<float> Paraformer::FbankKaldi(float sample_rate, const float* waves, int len) {
@@ -282,7 +221,7 @@
     }
     catch (std::exception const &e)
     {
-        printf(e.what());
+        LOG(ERROR)<<e.what();
     }
 
     return result;
@@ -291,12 +230,13 @@
 string Paraformer::ForwardChunk(float* din, int len, int flag)
 {
 
-    printf("Not Imp!!!!!!\n");
-    return "Hello";
+    LOG(ERROR)<<"Not Imp!!!!!!";
+    return "";
 }
 
 string Paraformer::Rescoring()
 {
-    printf("Not Imp!!!!!!\n");
-    return "Hello";
+    LOG(ERROR)<<"Not Imp!!!!!!";
+    return "";
 }
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/paraformer.h b/funasr/runtime/onnxruntime/src/paraformer.h
index f3eb059..533c16f 100644
--- a/funasr/runtime/onnxruntime/src/paraformer.h
+++ b/funasr/runtime/onnxruntime/src/paraformer.h
@@ -2,16 +2,11 @@
  * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
  * MIT License  (https://opensource.org/licenses/MIT)
 */
-
 #pragma once
-
-
-#ifndef PARAFORMER_MODELIMP_H
-#define PARAFORMER_MODELIMP_H
 
 #include "precomp.h"
 
-namespace paraformer {
+namespace funasr {
 
     class Paraformer : public Model {
     /**
@@ -23,9 +18,6 @@
         //std::unique_ptr<knf::OnlineFbank> fbank_;
         knf::FbankOptions fbank_opts;
 
-        std::unique_ptr<FsmnVad> vad_handle;
-        std::unique_ptr<CTTransformer> punc_handle;
-
         Vocab* vocab;
         vector<float> means_list;
         vector<float> vars_list;
@@ -36,7 +28,6 @@
         void LoadCmvn(const char *filename);
         vector<float> ApplyLfr(const vector<float> &in);
         void ApplyCmvn(vector<float> *v);
-
         string GreedySearch( float* in, int n_len, int64_t token_nums);
 
         std::shared_ptr<Ort::Session> m_session;
@@ -46,23 +37,16 @@
         vector<string> m_strInputNames, m_strOutputNames;
         vector<const char*> m_szInputNames;
         vector<const char*> m_szOutputNames;
-        bool use_vad=false;
-        bool use_punc=false;
 
     public:
-        Paraformer(std::map<std::string, std::string>& model_path, int thread_num=0);
+        Paraformer();
         ~Paraformer();
-        void InitAM(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num);
+        void InitAsr(const std::string &am_model, const std::string &am_cmvn, const std::string &am_config, int thread_num);
         void Reset();
         vector<float> FbankKaldi(float sample_rate, const float* waves, int len);
         string ForwardChunk(float* din, int len, int flag);
         string Forward(float* din, int len, int flag);
         string Rescoring();
-        std::vector<std::vector<int>> VadSeg(std::vector<float>& pcm_data);
-        string AddPunc(const char* sz_input);
-        bool UseVad(){return use_vad;};
-        bool UsePunc(){return use_punc;}; 
     };
 
-} // namespace paraformer
-#endif
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h
index 1630e55..e607dbf 100644
--- a/funasr/runtime/onnxruntime/src/precomp.h
+++ b/funasr/runtime/onnxruntime/src/precomp.h
@@ -30,6 +30,10 @@
 #include "com-define.h"
 #include "commonfunc.h"
 #include "predefine-coe.h"
+#include "model.h"
+#include "vad-model.h"
+#include "punc-model.h"
+#include "offline-stream.h"
 #include "tokenizer.h"
 #include "ct-transformer.h"
 #include "fsmn-vad.h"
@@ -39,9 +43,6 @@
 #include "tensor.h"
 #include "util.h"
 #include "resample.h"
-#include "model.h"
-//#include "vad-model.h"
 #include "paraformer.h"
-#include "libfunasrapi.h"
-
-using namespace paraformer;
+#include "offline-stream.h"
+#include "funasrruntime.h"
diff --git a/funasr/runtime/onnxruntime/src/predefine-coe.h b/funasr/runtime/onnxruntime/src/predefine-coe.h
index 93012d8..17c263f 100644
--- a/funasr/runtime/onnxruntime/src/predefine-coe.h
+++ b/funasr/runtime/onnxruntime/src/predefine-coe.h
@@ -3,6 +3,7 @@
 
 #include <stdint.h>
 
+namespace funasr {
 const int32_t melcoe_hex[] = {
 
     0x3f01050c, 0x3e0afb11, 0x3f5d413c, 0x3f547fd0, 0x3e2e00c1, 0x3f132970,
@@ -590,3 +591,5 @@
     0x39164323, 0x3910f3c6, 0x390bd472, 0x3906e374, 0x39021f2b, 0x38fb0c03,
     0x38f22ce3, 0x38e99e04, 0x38e15c92, 0x38d965ce};
 #endif
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/punc-model.cpp b/funasr/runtime/onnxruntime/src/punc-model.cpp
new file mode 100644
index 0000000..52ba0df
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/punc-model.cpp
@@ -0,0 +1,22 @@
+#include "precomp.h"
+
+namespace funasr {
+PuncModel *CreatePuncModel(std::map<std::string, std::string>& model_path, int thread_num)
+{
+    PuncModel *mm;
+    mm = new CTTransformer();
+
+    string punc_model_path;
+    string punc_config_path;
+
+    punc_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
+    if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
+        punc_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
+    }
+    punc_config_path = PathAppend(model_path.at(MODEL_DIR), PUNC_CONFIG_NAME);
+
+    mm->InitPunc(punc_model_path, punc_config_path, thread_num);
+    return mm;
+}
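+
+// CreatePuncModel follows the same factory shape as CreateModel and
+// CreateVadModel: MODEL_DIR is expected to contain model.onnx (or
+// model_quant.onnx when QUANTIZE is "true") together with punc.yaml.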
+
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/resample.cc b/funasr/runtime/onnxruntime/src/resample.cpp
similarity index 99%
rename from funasr/runtime/onnxruntime/src/resample.cc
rename to funasr/runtime/onnxruntime/src/resample.cpp
index 0238752..9c74dc8 100644
--- a/funasr/runtime/onnxruntime/src/resample.cc
+++ b/funasr/runtime/onnxruntime/src/resample.cpp
@@ -31,6 +31,7 @@
 #include <cstdlib>
 #include <type_traits>
 
+namespace funasr {
 #ifndef M_2PI
 #define M_2PI 6.283185307179586476925286766559005
 #endif
@@ -303,3 +304,4 @@
     }
   }
 }
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/resample.h b/funasr/runtime/onnxruntime/src/resample.h
index b9a283a..5cfc971 100644
--- a/funasr/runtime/onnxruntime/src/resample.h
+++ b/funasr/runtime/onnxruntime/src/resample.h
@@ -21,11 +21,11 @@
  */
 // this file is copied and modified from
 // kaldi/src/feat/resample.h
-
+#pragma once 
 #include <cstdint>
 #include <vector>
 
-
+namespace funasr {
 /*
    We require that the input and output sampling rate be specified as
    integers, as this is an easy way to specify that their ratio be rational.
@@ -135,3 +135,4 @@
   std::vector<float> input_remainder_;  ///< A small trailing part of the
                                         ///< previously seen input signal.
 };
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/tensor.h b/funasr/runtime/onnxruntime/src/tensor.h
index 3b7a633..a2a7bc3 100644
--- a/funasr/runtime/onnxruntime/src/tensor.h
+++ b/funasr/runtime/onnxruntime/src/tensor.h
@@ -5,6 +5,8 @@
 
 using namespace std;
 
+namespace funasr {
+
 template <typename T> class Tensor {
   private:
     void alloc_buff();
@@ -152,4 +154,6 @@
     fwrite(buff, 1, buff_size * sizeof(T), fp);
     fclose(fp);
 }
+
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/src/tokenizer.cpp b/funasr/runtime/onnxruntime/src/tokenizer.cpp
index 5f29b46..a8f6301 100644
--- a/funasr/runtime/onnxruntime/src/tokenizer.cpp
+++ b/funasr/runtime/onnxruntime/src/tokenizer.cpp
@@ -5,12 +5,17 @@
 
 #include "precomp.h"
 
+namespace funasr {
 CTokenizer::CTokenizer(const char* sz_yamlfile):m_ready(false)
 {
 	OpenYaml(sz_yamlfile);
 }
 
 CTokenizer::CTokenizer():m_ready(false)
+{
+}
+
+CTokenizer::~CTokenizer()
 {
 }
 
@@ -216,3 +221,5 @@
 	}
 	id_out= String2Ids(str_out);
 }
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/tokenizer.h b/funasr/runtime/onnxruntime/src/tokenizer.h
index 4ff1809..419791b 100644
--- a/funasr/runtime/onnxruntime/src/tokenizer.h
+++ b/funasr/runtime/onnxruntime/src/tokenizer.h
@@ -6,6 +6,7 @@
 #pragma once
 #include <yaml-cpp/yaml.h>
 
+namespace funasr {
 class CTokenizer {
 private:
 
@@ -17,6 +18,7 @@
 
 	CTokenizer(const char* sz_yamlfile);
 	CTokenizer();
+	~CTokenizer();
 	bool OpenYaml(const char* sz_yamlfile);
 	void ReadYaml(const YAML::Node& node);
 	vector<string> Id2String(vector<int> input);
@@ -30,3 +32,5 @@
 	void Tokenize(const char* str_info, vector<string>& str_out, vector<int>& id_out);
 
 };
+
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/util.cpp b/funasr/runtime/onnxruntime/src/util.cpp
index c5c27af..755913c 100644
--- a/funasr/runtime/onnxruntime/src/util.cpp
+++ b/funasr/runtime/onnxruntime/src/util.cpp
@@ -1,6 +1,7 @@
 
 #include "precomp.h"
 
+namespace funasr {
 float *LoadParams(const char *filename)
 {
 
@@ -178,3 +179,14 @@
         }
     }
 }
+
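+// Returns true if the extension of filename equals target (e.g. "onnx").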
+bool is_target_file(const std::string& filename, const std::string target) {
+    std::size_t pos = filename.find_last_of(".");
+    if (pos == std::string::npos) {
+        return false;
+    }
+    std::string extension = filename.substr(pos + 1);
+    return (extension == target);
+}
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/util.h b/funasr/runtime/onnxruntime/src/util.h
index 6327f7b..8823a32 100644
--- a/funasr/runtime/onnxruntime/src/util.h
+++ b/funasr/runtime/onnxruntime/src/util.h
@@ -1,10 +1,9 @@
-
-
 #ifndef UTIL_H
 #define UTIL_H
 
 using namespace std;
 
+namespace funasr {
 extern float *LoadParams(const char *filename);
 
 extern void SaveDataFile(const char *filename, void *data, uint32_t len);
@@ -26,5 +25,7 @@
 extern void Glu(Tensor<float> *din, Tensor<float> *dout);
 
 string PathAppend(const string &p1, const string &p2);
+bool is_target_file(const std::string& filename, const std::string target);
 
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/src/vad-model.cpp b/funasr/runtime/onnxruntime/src/vad-model.cpp
new file mode 100644
index 0000000..336758f
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/vad-model.cpp
@@ -0,0 +1,28 @@
+#include "precomp.h"
+
+namespace funasr {
+VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode)
+{
+    VadModel *mm = nullptr;
+    if(mode == FSMN_VAD_OFFLINE){
+        mm = new FsmnVad();
+    }else{
+        LOG(ERROR) << "Online fsmn vad is not implemented!";
+        return nullptr;  // avoid calling InitVad on an uninitialized pointer
+    }
+
+    string vad_model_path;
+    string vad_cmvn_path;
+    string vad_config_path;
+
+    vad_model_path = PathAppend(model_path.at(MODEL_DIR), MODEL_NAME);
+    if(model_path.find(QUANTIZE) != model_path.end() && model_path.at(QUANTIZE) == "true"){
+        vad_model_path = PathAppend(model_path.at(MODEL_DIR), QUANT_MODEL_NAME);
+    }
+    vad_cmvn_path = PathAppend(model_path.at(MODEL_DIR), VAD_CMVN_NAME);
+    vad_config_path = PathAppend(model_path.at(MODEL_DIR), VAD_CONFIG_NAME);
+
+    mm->InitVad(vad_model_path, vad_cmvn_path, vad_config_path, thread_num);
+    return mm;
+}
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/vocab.cpp b/funasr/runtime/onnxruntime/src/vocab.cpp
index 53233b3..65af8b6 100644
--- a/funasr/runtime/onnxruntime/src/vocab.cpp
+++ b/funasr/runtime/onnxruntime/src/vocab.cpp
@@ -10,6 +10,7 @@
 
 using namespace std;
 
+namespace funasr {
 Vocab::Vocab(const char *filename)
 {
     ifstream in(filename);
@@ -151,3 +152,5 @@
 {
     return vocab.size();
 }
+
+} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/vocab.h b/funasr/runtime/onnxruntime/src/vocab.h
index a3fdf65..6c4e523 100644
--- a/funasr/runtime/onnxruntime/src/vocab.h
+++ b/funasr/runtime/onnxruntime/src/vocab.h
@@ -7,6 +7,7 @@
 #include <vector>
 using namespace std;
 
+namespace funasr {
 class Vocab {
   private:
     vector<string> vocab;
@@ -22,4 +23,5 @@
     string Vector2StringV2(vector<int> in);
 };
 
+} // namespace funasr
 #endif
diff --git a/funasr/runtime/python/grpc/proto/paraformer.proto b/funasr/runtime/python/grpc/proto/paraformer.proto
index b221ee2..6c336a8 100644
--- a/funasr/runtime/python/grpc/proto/paraformer.proto
+++ b/funasr/runtime/python/grpc/proto/paraformer.proto
@@ -1,19 +1,5 @@
-// Copyright (c) 2021 Ximalaya Speech Team (Xiang Lyu)
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//   http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
 syntax = "proto3";
 
-option java_package = "ex.grpc";
 option objc_class_prefix = "paraformer";
 
 package paraformer;
diff --git a/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py b/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
index aeb91e7..6fd01e4 100644
--- a/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
+++ b/funasr/runtime/python/onnxruntime/funasr_onnx/punc_bin.py
@@ -64,7 +64,7 @@
             mini_sentence = mini_sentences[mini_sentence_i]
             mini_sentence_id = mini_sentences_id[mini_sentence_i]
             mini_sentence = cache_sent + mini_sentence
-            mini_sentence_id = np.array(cache_sent_id + mini_sentence_id, dtype='int64')
+            mini_sentence_id = np.array(cache_sent_id + mini_sentence_id, dtype='int32')
             data = {
                 "text": mini_sentence_id[None,:],
                 "text_lengths": np.array([len(mini_sentence_id)], dtype='int32'),
@@ -148,7 +148,7 @@
         else:
             precache = ""
             cache = []
-        full_text = precache + text
+        full_text = precache + " " + text
         split_text = code_mix_split_words(full_text)
         split_text_id = self.converter.tokens2ids(split_text)
         mini_sentences = split_to_mini_sentence(split_text, split_size)
@@ -166,7 +166,7 @@
             mini_sentence = mini_sentences[mini_sentence_i]
             mini_sentence_id = mini_sentences_id[mini_sentence_i]
             mini_sentence = cache_sent + mini_sentence
-            mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0)
+            mini_sentence_id = np.concatenate((cache_sent_id, mini_sentence_id), axis=0, dtype='int32')
             text_length = len(mini_sentence_id)
             data = {
                 "input": mini_sentence_id[None,:],
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index 06603f0..0b249dd 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -13,7 +13,7 @@
 
 
 MODULE_NAME = 'funasr_onnx'
-VERSION_NUM = '0.0.6'
+VERSION_NUM = '0.0.8'
 
 setuptools.setup(
     name=MODULE_NAME,
diff --git a/funasr/runtime/python/websocket/README.md b/funasr/runtime/python/websocket/README.md
index ee7dca0..7ca5730 100644
--- a/funasr/runtime/python/websocket/README.md
+++ b/funasr/runtime/python/websocket/README.md
@@ -1,6 +1,6 @@
 # Service with websocket-python
 
-This is a demo using funasr pipeline with websocket python-api. 
+This is a demo using the funasr pipeline with the websocket python-api. It supports offline, online, and unified offline/online two-pass (2pass) speech recognition.
 
 ## For the Server
 
@@ -22,24 +22,49 @@
 
 ### Start server
 #### ASR offline server
-
-[//]: # (```shell)
-
-[//]: # (python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-
-[//]: # (```)
-#### ASR streaming server
+##### API-reference
 ```shell
-python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
+python ws_server_offline.py \
+--port [port id] \
+--asr_model [asr model_name] \
+--punc_model [punc model_name] \
+--ngpu [0 or 1] \
+--ncpu [1 or 4]
+```
+##### Usage examples
+```shell
+python ws_server_offline.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+```
+
+#### ASR streaming server
+##### API-reference
+```shell
+python ws_server_online.py \
+--port [port id] \
+--asr_model_online [asr model_name] \
+--ngpu [0 or 1] \
+--ncpu [1 or 4]
+```
+##### Usage examples
+```shell
+python ws_server_online.py --port 10095 --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
 ```
 
 #### ASR offline/online 2pass server
-
-[//]: # (```shell)
-
-[//]: # (python ws_server_online.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
-
-[//]: # (```)
+##### API-reference
+```shell
+python ws_server_2pass.py \
+--port [port id] \
+--asr_model [asr model_name] \
+--asr_model_online [online asr model_name] \
+--punc_model [punc model_name] \
+--ngpu [0 or 1] \
+--ncpu [1 or 4]
+```
+##### Usage examples
+```shell
+python ws_server_2pass.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"  --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
+```
 
 ## For the client
 
@@ -51,17 +76,55 @@
 ```
 
 ### Start client
-#### Recording from mircrophone
+#### API-reference
 ```shell
-# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "127.0.0.1" --port 10095 --chunk_size "5,10,5"
+python ws_client.py \
+--host [ip_address] \
+--port [port id] \
+--chunk_size ["5,10,5"=600ms, "8,8,4"=480ms] \
+--chunk_interval [send stride: chunk_size duration divided by chunk_interval] \
+--words_max_print [max number of words to print] \
+--audio_in [if set, load audio from wav.scp, else record from the microphone] \
+--output_dir [if set, write the results to output_dir] \
+--send_without_sleep [only set for offline]
 ```
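+
+Judging from the examples ("5,10,5"=600ms, "8,8,4"=480ms) and the client's stride computation, the middle element of `chunk_size` counts 60 ms frames, and `--chunk_interval` divides that span into the actual send stride. A minimal sketch of the arithmetic, mirroring the stride formula in ws_client.py:
+
+```python
+# bytes sent per message for 16 kHz, 16-bit PCM input
+chunk_size = [5, 10, 5]
+chunk_interval = 10
+stride_ms = 60 * chunk_size[1] / chunk_interval   # 60.0 ms of audio per message
+stride_bytes = int(stride_ms / 1000 * 16000 * 2)  # 1920 bytes per message
+```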
-#### Loadding from wav.scp(kaldi style)
+#### Usage examples
+##### ASR offline client
+Recording from the microphone
 ```shell
-# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "127.0.0.1" --port 10095 --chunk_size "5,10,5" --audio_in "./data/wav.scp"
+# --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100
+```
+Loading from wav.scp (kaldi style)
+```shell
+# --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100 --audio_in "./data/wav.scp" --send_without_sleep --output_dir "./results"
 ```
 
+##### ASR streaming client
+Recording from the microphone
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --words_max_print 100
+```
+Loading from wav.scp (kaldi style)
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --audio_in "./data/wav.scp" --words_max_print 100 --output_dir "./results"
+```
+
+##### ASR offline/online 2pass client
+Recording from the microphone
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "8,8,4" --words_max_print 10000
+```
+Loading from wav.scp (kaldi style)
+```shell
+# --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
+python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "8,8,4" --audio_in "./data/wav.scp" --words_max_print 10000 --output_dir "./results"
+```
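+
+All traffic is JSON over the websocket. A minimal sketch of one audio message as ws_client.py builds it (field names are taken from the client code in this directory; the PCM chunk is packed as an ISO-8859-1 string):
+
+```python
+import json
+
+data = b"\x00" * 1920  # one 60 ms chunk of 16 kHz, 16-bit silence
+message = json.dumps({
+    "chunk_size": [5, 10, 5],            # [left, current, right] in 60 ms units
+    "chunk_interval": 10,
+    "is_speaking": True,                 # False on the last chunk of an utterance
+    "is_finished": False,                # True once the whole input has been sent
+    "wav_name": "demo",                  # utterance id from wav.scp, or "demo"
+    "audio": data.decode("ISO-8859-1"),  # raw PCM bytes of this chunk
+})
+```
+
+Server replies carry `mode` ("online", "offline", "2pass-online" or "2pass-offline"), `text` and `wav_name`.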
 ## Acknowledge
 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
-2. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service.
+2. We acknowledge [zhaoming](https://github.com/zhaomingwork/FunASR/tree/fix_bug_for_python_websocket) for contributing the websocket service.
+3. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service of the offline model.
diff --git a/funasr/runtime/python/websocket/parse_args.py b/funasr/runtime/python/websocket/parse_args.py
index 2528a76..d170be8 100644
--- a/funasr/runtime/python/websocket/parse_args.py
+++ b/funasr/runtime/python/websocket/parse_args.py
@@ -31,5 +31,10 @@
                     type=int,
                     default=1,
                     help="0 for cpu, 1 for gpu")
+parser.add_argument("--ncpu",
+                    type=int,
+                    default=1,
+                    help="cpu cores")
 
-args = parser.parse_args()
\ No newline at end of file
+args = parser.parse_args()
+print(args)
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/ws_client.py b/funasr/runtime/python/websocket/ws_client.py
index 8bbf103..a4a6d9f 100644
--- a/funasr/runtime/python/websocket/ws_client.py
+++ b/funasr/runtime/python/websocket/ws_client.py
@@ -6,6 +6,13 @@
 # import threading
 import argparse
 import json
+import traceback
+from multiprocessing import Process
+from funasr.fileio.datadir_writer import DatadirWriter
+
+import logging
+
+logging.basicConfig(level=logging.ERROR)
 
 parser = argparse.ArgumentParser()
 parser.add_argument("--host",
@@ -30,15 +37,35 @@
                     type=str,
                     default=None,
                     help="audio_in")
+parser.add_argument("--send_without_sleep",
+                    action="store_true",
+                    default=False,
+                    help="if audio_in is set, send_without_sleep")
+parser.add_argument("--test_thread_num",
+                    type=int,
+                    default=1,
+                    help="test_thread_num")
+parser.add_argument("--words_max_print",
+                    type=int,
+                    default=100,
+                    help="max number of words to print")
+parser.add_argument("--output_dir",
+                    type=str,
+                    default=None,
+                    help="output_dir")
 
 args = parser.parse_args()
 args.chunk_size = [int(x) for x in args.chunk_size.split(",")]
-
+print(args)
 # voices = asyncio.Queue()
 from queue import Queue
 voices = Queue()
 
-# Other functions can send data by calling send(data), for example:
+ibest_writer = None
+if args.output_dir is not None:
+    writer = DatadirWriter(args.output_dir)
+    ibest_writer = writer[f"1best_recog"]
+
 async def record_microphone():
     is_finished = False
     import pyaudio
@@ -65,11 +92,9 @@
         message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "audio": data, "is_speaking": is_speaking, "is_finished": is_finished})
         
         voices.put(message)
-        #print(voices.qsize())
 
         await asyncio.sleep(0.005)
 
-# Other functions can send data by calling send(data), for example:
 async def record_from_scp():
     import wave
     global voices
@@ -81,19 +106,17 @@
         wavs = [args.audio_in]
     for wav in wavs:
         wav_splits = wav.strip().split()
+        wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
         wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
+        
         # bytes_f = open(wav_path, "rb")
         # bytes_data = bytes_f.read()
         with wave.open(wav_path, "rb") as wav_file:
-            # get the audio parameters
             params = wav_file.getparams()
-            # get the length of the header info
             # header_length = wav_file.getheaders()[0][1]
-            # read the audio frames, skipping the header info
             # wav_file.setpos(header_length)
             frames = wav_file.readframes(wav_file.getnframes())
 
-        # convert the audio frames to bytes
         audio_bytes = bytes(frames)
         # stride = int(args.chunk_size/1000*16000*2)
         stride = int(60*args.chunk_size[1]/args.chunk_interval/1000*16000*2)
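+        # each message carries 60*chunk_size[1]/chunk_interval ms of audio;
+        # at 16 kHz, 16-bit PCM that is stride_ms/1000*16000*2 bytes (1920 for the defaults)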
@@ -106,12 +129,12 @@
             beg = i*stride
             data = audio_bytes[beg:beg+stride]
             data = data.decode('ISO-8859-1')
-            message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "is_speaking": is_speaking, "audio": data, "is_finished": is_finished})
+            message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "is_speaking": is_speaking, "audio": data, "is_finished": is_finished, "wav_name": wav_name})
             voices.put(message)
             # print("data_chunk: ", len(data_chunk))
             # print(voices.qsize())
-        
-            await asyncio.sleep(60*args.chunk_size[1]/args.chunk_interval/1000)
+            sleep_duration = 0.001 if args.send_without_sleep else 60*args.chunk_size[1]/args.chunk_interval/1000
+            await asyncio.sleep(sleep_duration)
 
     is_finished = True
     message = json.dumps({"is_finished": is_finished})
@@ -126,31 +149,57 @@
             data = voices.get()
             voices.task_done()
             try:
-                await websocket.send(data) # send the data through the ws object
+                await websocket.send(data)
             except Exception as e:
                 print('Exception occurred:', e)
+                traceback.print_exc()
+                exit(0)
             await asyncio.sleep(0.005)
         await asyncio.sleep(0.005)
 
 
 
-async def message():
+async def message(id):
     global websocket
     text_print = ""
+    text_print_2pass_online = ""
+    text_print_2pass_offline = ""
     while True:
         try:
             meg = await websocket.recv()
             meg = json.loads(meg)
-            # print(meg, end = '')
-            # print("\r")
-            text = meg["text"][0]
-            text_print += text
-            text_print = text_print[-55:]
-            os.system('clear')
-            print("\r"+text_print)
+            wav_name = meg.get("wav_name", "demo")
+            # print(wav_name)
+            text = meg["text"]
+            if ibest_writer is not None:
+                ibest_writer["text"][wav_name] = text
+            
+            if meg["mode"] == "offline":
+                text_print += " {}".format(text)
+                text_print = text_print[-args.words_max_print:]
+                os.system('clear')
+                print("\rpid"+str(id)+": "+text_print)
+            elif meg["mode"] == "online":
+                text_print += "{}".format(text)
+                text_print = text_print[-args.words_max_print:]
+                os.system('clear')
+                print("\rpid"+str(id)+": "+text_print)
+            else:
+                if meg["mode"] == "2pass-online":
+                    text_print_2pass_online += " {}".format(text)
+                    text_print = text_print_2pass_offline + text_print_2pass_online
+                else:
+                    text_print_2pass_online = ""
+                    text_print = text_print_2pass_offline + "{}".format(text)
+                    text_print_2pass_offline += "{}".format(text)
+                text_print = text_print[-args.words_max_print:]
+                os.system('clear')
+                print("\rpid" + str(id) + ": " + text_print)
+
         except Exception as e:
             print("Exception:", e)
-
+            traceback.print_exc()
+            exit(0)
 
 async def print_messge():
     global websocket
@@ -161,22 +210,36 @@
             print(meg)
         except Exception as e:
             print("Exception:", e)
+            traceback.print_exc()
+            exit(0)
 
-
-async def ws_client():
-    global websocket # a global variable holding the websocket connection object
-    # uri = "ws://11.167.134.197:8899"
+async def ws_client(id):
+    global websocket
     uri = "ws://{}:{}".format(args.host, args.port)
-    #ws = await websockets.connect(uri, subprotocols=["binary"]) # create a persistent connection
     async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None):
         if args.audio_in is not None:
-            task = asyncio.create_task(record_from_scp()) # background task for recording
+            task = asyncio.create_task(record_from_scp())
         else:
-            task = asyncio.create_task(record_microphone())  # background task for recording
-        task2 = asyncio.create_task(ws_send()) # background task for sending
-        task3 = asyncio.create_task(message()) # background task for receiving messages
+            task = asyncio.create_task(record_microphone())
+        task2 = asyncio.create_task(ws_send())
+        task3 = asyncio.create_task(message(id))
         await asyncio.gather(task, task2, task3)
 
+def one_thread(id):
+   asyncio.get_event_loop().run_until_complete(ws_client(id))
+   asyncio.get_event_loop().run_forever()
 
-asyncio.get_event_loop().run_until_complete(ws_client()) # start the coroutine
-asyncio.get_event_loop().run_forever()
+
+if __name__ == '__main__':
+    process_list = []
+    for i in range(args.test_thread_num):   
+        p = Process(target=one_thread,args=(i,))
+        p.start()
+        process_list.append(p)
+
+    for p in process_list:
+        p.join()
+
+    print('end')
+ 
+
diff --git a/funasr/runtime/python/websocket/ws_server_2pass.py b/funasr/runtime/python/websocket/ws_server_2pass.py
new file mode 100644
index 0000000..ced67ff
--- /dev/null
+++ b/funasr/runtime/python/websocket/ws_server_2pass.py
@@ -0,0 +1,182 @@
+import asyncio
+import json
+import websockets
+import time
+import logging
+import tracemalloc
+import numpy as np
+
+from parse_args import args
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
+
+tracemalloc.start()
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+
+websocket_users = set()
+
+print("model loading")
+# asr
+inference_pipeline_asr = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model=args.asr_model,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+    model_revision=None)
+
+
+# vad
+inference_pipeline_vad = pipeline(
+    task=Tasks.voice_activity_detection,
+    model=args.vad_model,
+    model_revision=None,
+    output_dir=None,
+    batch_size=1,
+    mode='online',
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+)
+
+if args.punc_model != "":
+    inference_pipeline_punc = pipeline(
+        task=Tasks.punctuation,
+        model=args.punc_model,
+        model_revision=None,
+        ngpu=args.ngpu,
+        ncpu=args.ncpu,
+    )
+else:
+    inference_pipeline_punc = None
+
+inference_pipeline_asr_online = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model=args.asr_model_online,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+    model_revision='v1.0.4')
+
+print("model loaded")
+
+async def ws_serve(websocket, path):
+    frames = []
+    frames_asr = []
+    frames_asr_online = []
+    global websocket_users
+    websocket_users.add(websocket)
+    websocket.param_dict_asr = {}
+    websocket.param_dict_asr_online = {"cache": dict()}
+    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
+    websocket.param_dict_punc = {'cache': list()}
+    websocket.vad_pre_idx = 0
+    speech_start = False
+
+    try:
+        async for message in websocket:
+            message = json.loads(message)
+            is_finished = message["is_finished"]
+            if not is_finished:
+                audio = bytes(message['audio'], 'ISO-8859-1')
+                frames.append(audio)
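+                # 16 kHz, 16-bit PCM: 32 bytes per millisecond of audio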
+                duration_ms = len(audio)//32
+                websocket.vad_pre_idx += duration_ms
+
+                is_speaking = message["is_speaking"]
+                websocket.param_dict_vad["is_final"] = not is_speaking
+                websocket.param_dict_asr_online["is_final"] = not is_speaking
+                websocket.param_dict_asr_online["chunk_size"] = message["chunk_size"]
+                websocket.wav_name = message.get("wav_name", "demo")
+                # asr online
+                frames_asr_online.append(audio)
+                if len(frames_asr_online) % message["chunk_interval"] == 0:
+                    audio_in = b"".join(frames_asr_online)
+                    await async_asr_online(websocket, audio_in)
+                    frames_asr_online = []
+                if speech_start:
+                    frames_asr.append(audio)
+                # vad online
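+                # speech_start_i is the segment start offset in ms (False if no new segment);
+                # speech_end_i is True once the VAD has seen the segment end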
+                speech_start_i, speech_end_i = await async_vad(websocket, audio)
+                if speech_start_i:
+                    speech_start = True
+                    beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
+                    frames_pre = frames[-beg_bias:]
+                    frames_asr = []
+                    frames_asr.extend(frames_pre)
+                # asr punc offline
+                if speech_end_i or not is_speaking:
+                    audio_in = b"".join(frames_asr)
+                    await async_asr(websocket, audio_in)
+                    frames_asr = []
+                    speech_start = False
+                    frames_asr_online = []
+                    websocket.param_dict_asr_online = {"cache": dict()}
+                    if not is_speaking:
+                        websocket.vad_pre_idx = 0
+                        frames = []
+                        websocket.param_dict_vad = {'in_cache': dict()}
+                    else:
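+                        # still speaking: keep a short tail of chunks as pre-roll for the next segment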
+                        frames = frames[-20:]
+
+     
+    except websockets.ConnectionClosed:
+        print("ConnectionClosed...", websocket_users)
+        websocket_users.remove(websocket)
+    except websockets.InvalidState:
+        print("InvalidState...")
+    except Exception as e:
+        print("Exception:", e)
+
+
+async def async_vad(websocket, audio_in):
+
+    segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
+
+    speech_start = False
+    speech_end = False
+    
+    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
+        return speech_start, speech_end
+    if segments_result["text"][0][0] != -1:
+        speech_start = segments_result["text"][0][0]
+    if segments_result["text"][0][1] != -1:
+        speech_end = True
+    return speech_start, speech_end
+
+
+async def async_asr(websocket, audio_in):
+    if len(audio_in) > 0:
+        audio_in = load_bytes(audio_in)
+        rec_result = inference_pipeline_asr(audio_in=audio_in,
+                                            param_dict=websocket.param_dict_asr)
+        if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"]) > 0:
+            rec_result = inference_pipeline_punc(text_in=rec_result['text'],
+                                                 param_dict=websocket.param_dict_punc)
+        message = json.dumps({"mode": "2pass-offline", "text": rec_result["text"], "wav_name": websocket.wav_name})
+        await websocket.send(message)
+
+
+async def async_asr_online(websocket, audio_in):
+    if len(audio_in) > 0:
+        audio_in = load_bytes(audio_in)
+        rec_result = inference_pipeline_asr_online(audio_in=audio_in,
+                                                   param_dict=websocket.param_dict_asr_online)
+        if websocket.param_dict_asr_online["is_final"]:
+            websocket.param_dict_asr_online["cache"] = dict()
+        if "text" in rec_result:
+            if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
+                # print("online", rec_result)
+                message = json.dumps({"mode": "2pass-online", "text": rec_result["text"], "wav_name": websocket.wav_name})
+                await websocket.send(message)
+
+
+start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
+asyncio.get_event_loop().run_until_complete(start_server)
+asyncio.get_event_loop().run_forever()
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/ws_server_offline.py b/funasr/runtime/python/websocket/ws_server_offline.py
new file mode 100644
index 0000000..15578f6
--- /dev/null
+++ b/funasr/runtime/python/websocket/ws_server_offline.py
@@ -0,0 +1,150 @@
+import asyncio
+import json
+import websockets
+import time
+import logging
+import tracemalloc
+import numpy as np
+
+from parse_args import args
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
+
+tracemalloc.start()
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+
+websocket_users = set()
+
+print("model loading")
+# asr
+inference_pipeline_asr = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model=args.asr_model,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+    model_revision=None)
+
+
+# vad
+inference_pipeline_vad = pipeline(
+    task=Tasks.voice_activity_detection,
+    model=args.vad_model,
+    model_revision=None,
+    output_dir=None,
+    batch_size=1,
+    mode='online',
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
+)
+
+if args.punc_model != "":
+    inference_pipeline_punc = pipeline(
+        task=Tasks.punctuation,
+        model=args.punc_model,
+        model_revision=None,
+        ngpu=args.ngpu,
+        ncpu=args.ncpu,
+    )
+else:
+    inference_pipeline_punc = None
+
+print("model loaded")
+
+async def ws_serve(websocket, path):
+    frames = []
+    frames_asr = []
+    global websocket_users
+    websocket_users.add(websocket)
+    websocket.param_dict_asr = {}
+    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
+    websocket.param_dict_punc = {'cache': list()}
+    websocket.vad_pre_idx = 0
+    speech_start = False
+
+    try:
+        async for message in websocket:
+            message = json.loads(message)
+            is_finished = message["is_finished"]
+            if not is_finished:
+                audio = bytes(message['audio'], 'ISO-8859-1')
+                frames.append(audio)
+                duration_ms = len(audio)//32
+                websocket.vad_pre_idx += duration_ms
+
+                is_speaking = message["is_speaking"]
+                websocket.param_dict_vad["is_final"] = not is_speaking
+                websocket.wav_name = message.get("wav_name", "demo")
+                if speech_start:
+                    frames_asr.append(audio)
+                speech_start_i, speech_end_i = await async_vad(websocket, audio)
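+                # on a VAD start, rewind beg_bias buffered chunks so the segment keeps its onset audio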
+                if speech_start_i:
+                    speech_start = True
+                    beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
+                    frames_pre = frames[-beg_bias:]
+                    frames_asr = []
+                    frames_asr.extend(frames_pre)
+                if speech_end_i or not is_speaking:
+                    audio_in = b"".join(frames_asr)
+                    await async_asr(websocket, audio_in)
+                    frames_asr = []
+                    speech_start = False
+                    if not is_speaking:
+                        websocket.vad_pre_idx = 0
+                        frames = []
+                        websocket.param_dict_vad = {'in_cache': dict()}
+                    else:
+                        frames = frames[-20:]
+
+     
+    except websockets.ConnectionClosed:
+        print("ConnectionClosed...", websocket_users)
+        websocket_users.remove(websocket)
+    except websockets.InvalidState:
+        print("InvalidState...")
+    except Exception as e:
+        print("Exception:", e)
+
+
+async def async_vad(websocket, audio_in):
+
+    segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
+
+    speech_start = False
+    speech_end = False
+    
+    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
+        return speech_start, speech_end
+    if segments_result["text"][0][0] != -1:
+        speech_start = segments_result["text"][0][0]
+    if segments_result["text"][0][1] != -1:
+        speech_end = True
+    return speech_start, speech_end
+
+
+async def async_asr(websocket, audio_in):
+    if len(audio_in) > 0:
+        audio_in = load_bytes(audio_in)
+        rec_result = inference_pipeline_asr(audio_in=audio_in,
+                                            param_dict=websocket.param_dict_asr)
+        if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"]) > 0:
+            rec_result = inference_pipeline_punc(text_in=rec_result['text'],
+                                                 param_dict=websocket.param_dict_punc)
+        message = json.dumps({"mode": "offline", "text": rec_result["text"], "wav_name": websocket.wav_name})
+        await websocket.send(message)
+
+
+start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
+asyncio.get_event_loop().run_until_complete(start_server)
+asyncio.get_event_loop().run_forever()
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/ws_server_online.py b/funasr/runtime/python/websocket/ws_server_online.py
index 7ef0e21..3c0fb16 100644
--- a/funasr/runtime/python/websocket/ws_server_online.py
+++ b/funasr/runtime/python/websocket/ws_server_online.py
@@ -12,7 +12,7 @@
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.utils.logger import get_logger
-from funasr_onnx.utils.frontend import load_bytes
+from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
 
 tracemalloc.start()
 
@@ -28,6 +28,8 @@
 inference_pipeline_asr_online = pipeline(
     task=Tasks.auto_speech_recognition,
     model=args.asr_model_online,
+    ngpu=args.ngpu,
+    ncpu=args.ncpu,
     model_revision='v1.0.4')
 
 print("model loaded")
@@ -35,14 +37,10 @@
 
 
 async def ws_serve(websocket, path):
-    frames_online = []
+    frames_asr_online = []
     global websocket_users
-    websocket.send_msg = Queue()
     websocket_users.add(websocket)
     websocket.param_dict_asr_online = {"cache": dict()}
-    websocket.speek_online = Queue()
-    ss_online = threading.Thread(target=asr_online, args=(websocket,))
-    ss_online.start()
 
     try:
         async for message in websocket:
@@ -53,54 +51,37 @@
 
                 is_speaking = message["is_speaking"]
                 websocket.param_dict_asr_online["is_final"] = not is_speaking
-
+                websocket.wav_name = message.get("wav_name", "demo")
                 websocket.param_dict_asr_online["chunk_size"] = message["chunk_size"]
                 
-    
-                frames_online.append(audio)
-    
-                if len(frames_online) % message["chunk_interval"] == 0 or not is_speaking:
-                    
-                    audio_in = b"".join(frames_online)
-                    websocket.speek_online.put(audio_in)
-                    frames_online = []
+                frames_asr_online.append(audio)
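+                # decode once every chunk_interval chunks, and flush when speech ends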
+                if len(frames_asr_online) % message["chunk_interval"] == 0 or not is_speaking:
+                    audio_in = b"".join(frames_asr_online)
+                    await async_asr_online(websocket, audio_in)
+                    frames_asr_online = []
 
-            if not websocket.send_msg.empty():
-                await websocket.send(websocket.send_msg.get())
-                websocket.send_msg.task_done()
 
      
     except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)    # 閾炬帴鏂紑
+        print("ConnectionClosed...", websocket_users)
         websocket_users.remove(websocket)
     except websockets.InvalidState:
-        print("InvalidState...")    # 鏃犳晥鐘舵��
+        print("InvalidState...")
     except Exception as e:
         print("Exception:", e)
  
-
-
-def asr_online(websocket):  # ASR inference
-    global websocket_users
-    while websocket in websocket_users:
-        if not websocket.speek_online.empty():
-            audio_in = websocket.speek_online.get()
-            websocket.speek_online.task_done()
+async def async_asr_online(websocket, audio_in):
             if len(audio_in) > 0:
-                # print(len(audio_in))
                 audio_in = load_bytes(audio_in)
                 rec_result = inference_pipeline_asr_online(audio_in=audio_in,
                                                            param_dict=websocket.param_dict_asr_online)
                 if websocket.param_dict_asr_online["is_final"]:
                     websocket.param_dict_asr_online["cache"] = dict()
-                
                 if "text" in rec_result:
                     if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
-                        print(rec_result["text"])
-                        message = json.dumps({"mode": "online", "text": rec_result["text"]})
-                        websocket.send_msg.put(message)
-        
-        time.sleep(0.005)
+                        message = json.dumps({"mode": "online", "text": rec_result["text"], "wav_name": websocket.wav_name})
+                        await websocket.send(message)
+
 
 
 start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
diff --git a/funasr/runtime/websocket/CMakeLists.txt b/funasr/runtime/websocket/CMakeLists.txt
index 07d96d9..e89537b 100644
--- a/funasr/runtime/websocket/CMakeLists.txt
+++ b/funasr/runtime/websocket/CMakeLists.txt
@@ -8,7 +8,7 @@
 option(ENABLE_WEBSOCKET "Whether to build websocket server" ON)
  
 if(ENABLE_WEBSOCKET)
-  cmake_policy(SET CMP0135 NEW)
+  # cmake_policy(SET CMP0135 NEW)
 
   include(FetchContent)
   FetchContent_Declare(websocketpp
diff --git a/funasr/runtime/websocket/readme.md b/funasr/runtime/websocket/readme.md
index 6ff3e50..078184e 100644
--- a/funasr/runtime/websocket/readme.md
+++ b/funasr/runtime/websocket/readme.md
@@ -43,48 +43,39 @@
 
 ```shell
 cd bin
-websocketmain  [--model_thread_num <int>] [--decoder_thread_num
-                        <int>] [--io_thread_num <int>] [--port <int>]
-                        [--listen_ip <string>] [--wav-scp <string>]
-                        [--wav-path <string>] [--punc-config <string>]
-                        [--punc-model <string>] --am-config <string>
-                        --am-cmvn <string> --am-model <string>
-                        [--vad-config <string>] [--vad-cmvn <string>]
-                        [--vad-model <string>] [--] [--version] [-h]
+./websocketmain  [--model_thread_num <int>] [--decoder_thread_num <int>]
+                    [--io_thread_num <int>] [--port <int>] [--listen_ip
+                    <string>] [--punc-quant <string>] [--punc-dir <string>]
+                    [--vad-quant <string>] [--vad-dir <string>] [--quantize
+                    <string>] --model-dir <string> [--] [--version] [-h]
 Where:
-   --wav-scp <string>
-     wave scp path
-   --wav-path <string>
-     wave file path
+   --model-dir <string>
+     (required)  the asr model path, which contains model.onnx, config.yaml, am.mvn
+   --quantize <string>
+     false (Default), load the model of model.onnx in model_dir. If set true, load the model of model_quant.onnx in model_dir
 
-   --punc-config <string>
-     punc config path
-   --punc-model <string>
-     punc model path
+   --vad-dir <string>
+     the vad model path, which contains model.onnx, vad.yaml, vad.mvn
+   --vad-quant <string>
+     false (Default), load the model of model.onnx in vad_dir. If set true, load the model of model_quant.onnx in vad_dir
 
-   --am-config <string>
-     (required)  am config path
-   --am-cmvn <string>
-     (required)  am cmvn path
-   --am-model <string>
-     (required)  am model path
+   --punc-dir <string>
+     the punc model path, which contains model.onnx, punc.yaml
+   --punc-quant <string>
+     false (Default), load the model of model.onnx in punc_dir. If set true, load the model of model_quant.onnx in punc_dir
 
-   --vad-config <string>
-     vad config path
-   --vad-cmvn <string>
-     vad cmvn path
-   --vad-model <string>
-     vad model path
    --decoder_thread_num <int>
-     number of threads for decoder
+     number of threads for decoder, default:8
    --io_thread_num <int>
-     number of threads for network io
+     number of threads for network io, default:8
+   --port <int>
+     listen port, default:8889
   
-   Required: --am-config <string> --am-cmvn <string> --am-model <string> 
-   If use vad, please add: [--vad-config <string>] [--vad-cmvn <string>] [--vad-model <string>]
-   If use punc, please add: [--punc-config <string>] [--punc-model <string>] 
+   Required:  --model-dir <string>
+   If use vad, please add: --vad-dir <string>
+   If use punc, please add: --punc-dir <string>
 example:
-   websocketmain --am-config /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/config.yaml --am-model /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.onnx --am-cmvn /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/am.mvn
+   websocketmain --model-dir /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
 ```
 
 ## Run websocket client test
diff --git a/funasr/runtime/websocket/websocketclient.cpp b/funasr/runtime/websocket/websocketclient.cpp
index 9ef1d5e..3ab4e99 100644
--- a/funasr/runtime/websocket/websocketclient.cpp
+++ b/funasr/runtime/websocket/websocketclient.cpp
@@ -120,7 +120,7 @@
     uint64_t count = 0;
     std::stringstream val;
 
-    Audio audio(1);
+    funasr::Audio audio(1);
     int32_t sampling_rate = 16000;
 
     if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate)) {
diff --git a/funasr/runtime/websocket/websocketmain.cpp b/funasr/runtime/websocket/websocketmain.cpp
index 24e4269..4614b51 100644
--- a/funasr/runtime/websocket/websocketmain.cpp
+++ b/funasr/runtime/websocket/websocketmain.cpp
@@ -5,14 +5,11 @@
 /* 2022-2023 by zhaomingwork */
 
 // io server
-// Usage:websocketmain  [--model_thread_num <int>] [--decoder_thread_num
-//                        <int>] [--io_thread_num <int>] [--port <int>]
-//                        [--listen_ip <string>] [--wav-scp <string>]
-//                        [--wav-path <string>] [--punc-config <string>]
-//                        [--punc-model <string>] --am-config <string>
-//                        --am-cmvn <string> --am-model <string>
-//                        [--vad-config <string>] [--vad-cmvn <string>]
-//                        [--vad-model <string>] [--] [--version] [-h]
+// Usage:websocketmain  [--model_thread_num <int>] [--decoder_thread_num <int>]
+//                    [--io_thread_num <int>] [--port <int>] [--listen_ip
+//                    <string>] [--punc-quant <string>] [--punc-dir <string>]
+//                    [--vad-quant <string>] [--vad-dir <string>] [--quantize
+//                    <string>] --model-dir <string> [--] [--version] [-h]
 #include "websocketsrv.h"
 
 using namespace std;
@@ -29,29 +26,33 @@
     FLAGS_logtostderr = true;
 
     TCLAP::CmdLine cmd("websocketmain", ' ', "1.0");
-    TCLAP::ValueArg<std::string> vad_model("", VAD_MODEL_PATH, "vad model path",
-                                           false, "", "string");
-    TCLAP::ValueArg<std::string> vad_cmvn("", VAD_CMVN_PATH, "vad cmvn path",
-                                          false, "", "string");
-    TCLAP::ValueArg<std::string> vad_config(
-        "", VAD_CONFIG_PATH, "vad config path", false, "", "string");
-
-    TCLAP::ValueArg<std::string> am_model("", AM_MODEL_PATH, "am model path",
-                                          true, "", "string");
-    TCLAP::ValueArg<std::string> am_cmvn("", AM_CMVN_PATH, "am cmvn path", true,
-                                         "", "string");
-    TCLAP::ValueArg<std::string> am_config("", AM_CONFIG_PATH, "am config path",
-                                           true, "", "string");
-
-    TCLAP::ValueArg<std::string> punc_model(
-        "", PUNC_MODEL_PATH, "punc model path", false, "", "string");
-    TCLAP::ValueArg<std::string> punc_config(
-        "", PUNC_CONFIG_PATH, "punc config path", false, "", "string");
-
-    TCLAP::ValueArg<std::string> wav_path("", WAV_PATH, "wave file path", false,
-                                          "", "string");
-    TCLAP::ValueArg<std::string> wav_scp("", WAV_SCP, "wave scp path", false,
-                                         "", "string");
+    TCLAP::ValueArg<std::string> model_dir(
+        "", MODEL_DIR,
+        "the asr model path, which contains model.onnx, config.yaml, am.mvn",
+        true, "", "string");
+    TCLAP::ValueArg<std::string> quantize(
+        "", QUANTIZE,
+        "false (Default), load the model of model.onnx in model_dir. If set "
+        "true, load the model of model_quant.onnx in model_dir",
+        false, "false", "string");
+    TCLAP::ValueArg<std::string> vad_dir(
+        "", VAD_DIR,
+        "the vad model path, which contains model.onnx, vad.yaml, vad.mvn",
+        false, "", "string");
+    TCLAP::ValueArg<std::string> vad_quant(
+        "", VAD_QUANT,
+        "false (Default), load the model of model.onnx in vad_dir. If set "
+        "true, load the model of model_quant.onnx in vad_dir",
+        false, "false", "string");
+    TCLAP::ValueArg<std::string> punc_dir(
+        "", PUNC_DIR,
+        "the punc model path, which contains model.onnx, punc.yaml", false, "",
+        "string");
+    TCLAP::ValueArg<std::string> punc_quant(
+        "", PUNC_QUANT,
+        "false (Default), load the model of model.onnx in punc_dir. If set "
+        "true, load the model of model_quant.onnx in punc_dir",
+        false, "false", "string");
 
     TCLAP::ValueArg<std::string> listen_ip("", "listen_ip", "listen_ip", false,
                                            "0.0.0.0", "string");
@@ -63,16 +64,13 @@
     TCLAP::ValueArg<int> model_thread_num("", "model_thread_num",
                                           "model_thread_num", false, 1, "int");
 
-    cmd.add(vad_model);
-    cmd.add(vad_cmvn);
-    cmd.add(vad_config);
-    cmd.add(am_model);
-    cmd.add(am_cmvn);
-    cmd.add(am_config);
-    cmd.add(punc_model);
-    cmd.add(punc_config);
-    cmd.add(wav_path);
-    cmd.add(wav_scp);
+    cmd.add(model_dir);
+    cmd.add(quantize);
+    cmd.add(vad_dir);
+    cmd.add(vad_quant);
+    cmd.add(punc_dir);
+    cmd.add(punc_quant);
+
     cmd.add(listen_ip);
     cmd.add(port);
     cmd.add(io_thread_num);
@@ -81,17 +79,12 @@
     cmd.parse(argc, argv);
 
     std::map<std::string, std::string> model_path;
-    GetValue(vad_model, VAD_MODEL_PATH, model_path);
-    GetValue(vad_cmvn, VAD_CMVN_PATH, model_path);
-    GetValue(vad_config, VAD_CONFIG_PATH, model_path);
-    GetValue(am_model, AM_MODEL_PATH, model_path);
-    GetValue(am_cmvn, AM_CMVN_PATH, model_path);
-    GetValue(am_config, AM_CONFIG_PATH, model_path);
-    GetValue(punc_model, PUNC_MODEL_PATH, model_path);
-    GetValue(punc_config, PUNC_CONFIG_PATH, model_path);
-    GetValue(wav_path, WAV_PATH, model_path);
-    GetValue(wav_scp, WAV_SCP, model_path);
-
+    GetValue(model_dir, MODEL_DIR, model_path);
+    GetValue(quantize, QUANTIZE, model_path);
+    GetValue(vad_dir, VAD_DIR, model_path);
+    GetValue(vad_quant, VAD_QUANT, model_path);
+    GetValue(punc_dir, PUNC_DIR, model_path);
+    GetValue(punc_quant, PUNC_QUANT, model_path);
 
     std::string s_listen_ip = listen_ip.getValue();
     int s_port = port.getValue();
@@ -100,7 +93,6 @@
 
     int s_model_thread_num = model_thread_num.getValue();
 
- 
     asio::io_context io_decoder;  // context for decoding
 
     std::vector<std::thread> decoder_threads;
diff --git a/funasr/runtime/websocket/websocketsrv.cpp b/funasr/runtime/websocket/websocketsrv.cpp
index 7e54210..1a6adbf 100644
--- a/funasr/runtime/websocket/websocketsrv.cpp
+++ b/funasr/runtime/websocket/websocketsrv.cpp
@@ -25,8 +25,8 @@
     if (!buffer.empty()) {
       // fout.write(buffer.data(), buffer.size());
       // feed data to asr engine
-      FUNASR_RESULT Result = FunASRRecogPCMBuffer(
-          asr_hanlde, buffer.data(), buffer.size(), 16000, RASR_NONE, NULL);
+      FUNASR_RESULT Result = FunOfflineInferBuffer(
+          asr_hanlde, buffer.data(), buffer.size(), RASR_NONE, NULL, 16000);
 
       std::string asr_result =
           ((FUNASR_RECOG_RESULT*)Result)->msg;  // get decode result
@@ -149,7 +149,7 @@
   try {
     // init model with api
 
-    asr_hanlde = FunASRInit(model_path, thread_num);
+    asr_hanlde = FunOfflineInit(model_path, thread_num);
     std::cout << "model ready" << std::endl;
 
   } catch (const std::exception& e) {
diff --git a/funasr/runtime/websocket/websocketsrv.h b/funasr/runtime/websocket/websocketsrv.h
index 2d0c7bd..e484724 100644
--- a/funasr/runtime/websocket/websocketsrv.h
+++ b/funasr/runtime/websocket/websocketsrv.h
@@ -30,7 +30,7 @@
 
 #include "asio.hpp"
 #include "com-define.h"
-#include "libfunasrapi.h"
+#include "funasrruntime.h"
 #include "nlohmann/json.hpp"
 #include "tclap/CmdLine.h"
 typedef websocketpp::server<websocketpp::config::asio> server;
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index f8c1009..55a5d79 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -555,6 +555,12 @@
             help="The number of gradient accumulation",
         )
         group.add_argument(
+            "--bias_grad_times",
+            type=float,
+            default=1.0,
+            help="To scale the gradient of contextual related params",
+        )
+        group.add_argument(
             "--no_forward_run",
             type=str2bool,
             default=False,
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index d52c9c3..43ea5ab 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -42,6 +42,7 @@
 from funasr.models.joint_net.joint_network import JointNetwork
 from funasr.models.e2e_asr import ESPnetASRModel
 from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerOnline, ParaformerBert, BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_contextual_paraformer import NeatContextualParaformer
 from funasr.models.e2e_tp import TimestampPredictor
 from funasr.models.e2e_asr_mfcca import MFCCA
 from funasr.models.e2e_uni_asr import UniASR
@@ -128,6 +129,7 @@
         paraformer_bert=ParaformerBert,
         bicif_paraformer=BiCifParaformer,
         contextual_paraformer=ContextualParaformer,
+        neatcontextual_paraformer=NeatContextualParaformer,
         mfcca=MFCCA,
         timestamp_prediction=TimestampPredictor,
     ),
@@ -1647,7 +1649,6 @@
             normalize = None
 
         # 4. Encoder
-
         if getattr(args, "encoder", None) is not None:
             encoder_class = encoder_choices.get_class(args.encoder)
             encoder = encoder_class(input_size, **args.encoder_conf)
@@ -1683,7 +1684,7 @@
 
         # 7. Build model
 
-        if encoder.unified_model_training:
+        if hasattr(encoder, 'unified_model_training') and encoder.unified_model_training:
             model = UnifiedTransducerModel(
                 vocab_size=vocab_size,
                 token_list=token_list,
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index e9b18cd..b54f777 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -120,6 +120,6 @@
     if ignore_init_mismatch:
         src_state = filter_state_dict(dst_state, src_state)
 
-    logging.info("Loaded src_state keys: {}".format(src_state.keys()))
+    # logging.info("Loaded src_state keys: {}".format(src_state.keys()))
     dst_state.update(src_state)
     obj.load_state_dict(dst_state)
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 7c187e9..a40f031 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -95,6 +95,7 @@
     use_pai: bool
     oss_bucket: Union[oss2.Bucket, None]
     batch_interval: int
+    bias_grad_times: float
 
 class Trainer:
     """Trainer having a optimizer.
@@ -546,8 +547,11 @@
         no_forward_run = options.no_forward_run
         ngpu = options.ngpu
         use_wandb = options.use_wandb
+        bias_grad_times = options.bias_grad_times
         distributed = distributed_option.distributed
 
+        if bias_grad_times != 1.0:
+            logging.warning("Using bias_grad_times: {} for gradient scaling".format(bias_grad_times))
         if log_interval is None:
             try:
                 log_interval = max(len(iterator) // 20, 10)
@@ -690,6 +694,16 @@
                         scale_factor=0.55,
                     )
 
+                # for contextual training
+                if bias_grad_times != 1.0:
+                    # contextual related parameter names
+                    cr_pnames = ["bias_encoder", "bias_embed", "decoder.bias_decoder", "decoder.bias_output"]
+                    for name, param in model.named_parameters():
+                        for cr_pname in cr_pnames:
+                            if cr_pname in name and param.grad is not None:
+                                param.grad *= bias_grad_times
+                                break  # scale each matching parameter at most once
+
                 # compute the gradient norm to check if it is normal or not
                 grad_norm = torch.nn.utils.clip_grad_norm_(
                     model.parameters(),
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 87cc49e..4e7a8a9 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -80,6 +80,7 @@
 
 
 def time_stamp_sentence(punc_id_list, time_stamp_postprocessed, text_postprocessed):
+    punc_list = ['，', '。', '？', '、']
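+    # punc_id 2..5 -> '，' '。' '？' '、'; ids 2-4 follow the previous ','/'.'/'?' mapping, the '、' slot is inferred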
     res = []
     if text_postprocessed is None:
         return res
@@ -94,48 +95,48 @@
         res.append({
             'text': text_postprocessed.split(),
             "start": time_stamp_postprocessed[0][0],
-            "end": time_stamp_postprocessed[-1][1]
+            "end": time_stamp_postprocessed[-1][1],
+            'text_seg': text_postprocessed.split(),
+            "ts_list": time_stamp_postprocessed,
         })
         return res
     if len(punc_id_list) != len(time_stamp_postprocessed):
         print("  warning length mistach!!!!!!")
-    sentence_text = ''
+    sentence_text = ""
+    sentence_text_seg = ""
+    ts_list = []
     sentence_start = time_stamp_postprocessed[0][0]
     sentence_end = time_stamp_postprocessed[0][1]
     texts = text_postprocessed.split()
     punc_stamp_text_list = list(zip_longest(punc_id_list, time_stamp_postprocessed, texts, fillvalue=None))
     for punc_stamp_text in punc_stamp_text_list:
         punc_id, time_stamp, text = punc_stamp_text
-        sentence_text += text if text is not None else ''
+        # sentence_text += text if text is not None else ''
+        if text is not None:
+            if 'a' <= text[0] <= 'z' or 'A' <= text[0] <= 'Z':
+                sentence_text += ' ' + text
+            elif len(sentence_text) and ('a' <= sentence_text[-1] <= 'z' or 'A' <= sentence_text[-1] <= 'Z'):
+                sentence_text += ' ' + text
+            else:
+                sentence_text += text
+            sentence_text_seg += text + ' '
+        ts_list.append(time_stamp)
+
         punc_id = int(punc_id) if punc_id is not None else 1
         sentence_end = time_stamp[1] if time_stamp is not None else sentence_end
 
-        if punc_id == 2:
-            sentence_text += ','
+        if punc_id > 1:
+            sentence_text += punc_list[punc_id - 2]
             res.append({
                 'text': sentence_text,
                 "start": sentence_start,
-                "end": sentence_end
+                "end": sentence_end,
+                "text_seg": sentence_text_seg,
+                "ts_list": ts_list
             })
             sentence_text = ''
-            sentence_start = sentence_end
-        elif punc_id == 3:
-            sentence_text += '.'
-            res.append({
-                'text': sentence_text,
-                "start": sentence_start,
-                "end": sentence_end
-            })
-            sentence_text = ''
-            sentence_start = sentence_end
-        elif punc_id == 4:
-            sentence_text += '?'
-            res.append({
-                'text': sentence_text,
-                "start": sentence_start,
-                "end": sentence_end
-            })
-            sentence_text = ''
+            sentence_text_seg = ''
+            ts_list = []
             sentence_start = sentence_end
     return res
 
diff --git a/funasr/version.txt b/funasr/version.txt
index 6f2743d..8f0916f 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.4.4
+0.5.0
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 2f2f11d..9098ea6 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -112,6 +112,22 @@
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
 
+    def test_paraformer_large_online_common(self):
+        inference_pipeline = pipeline(
+            task=Tasks.auto_speech_recognition,
+            model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online')
+        rec_result = inference_pipeline(
+            audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+        logger.info("asr inference result: {0}".format(rec_result))
+
+    def test_paraformer_online_common(self):
+        inference_pipeline = pipeline(
+            task=Tasks.auto_speech_recognition,
+            model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online')
+        rec_result = inference_pipeline(
+            audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+        logger.info("asr inference result: {0}".format(rec_result))
+
     def test_paraformer_tiny_commandword(self):
         inference_pipeline = pipeline(
             task=Tasks.auto_speech_recognition,

--
Gitblit v1.9.1