From 6a59f94278b4d05f5b7715b11b6dd0bfffc28cce Mon Sep 17 00:00:00 2001
From: 志浩 <neo.dzh@alibaba-inc.com>
Date: Thu, 15 Jun 2023 17:12:15 +0800
Subject: [PATCH] Merge branch 'main' into dev_dzh

---
 egs/aishell/data2vec_transformer_finetune/run.sh                                                                 |    8 
 funasr/runtime/onnxruntime/src/ct-transformer.cpp                                                                |    9 
 funasr/runtime/html5/static/wsconnecter.js                                                                       |    9 
 funasr/runtime/python/websocket/wss_srv_asr.py                                                                   |   46 
 egs/wenetspeech/conformer/local/extract_meta.py                                                                  |  114 +
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py               |   33 
 docs/model_zoo/modelscope_models.md                                                                              |    5 
 egs/wenetspeech/conformer/local/path.sh                                                                          |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py            |   38 
 egs/librispeech/rnnt/README.md                                                                                   |   18 
 egs/librispeech_100h/rnnt/local/spm_train.py                                                                     |   12 
 egs/aishell/conformer/run.sh                                                                                     |    8 
 egs/aishell/transformer/utils/compute_cmvn.sh                                                                    |    6 
 egs/librispeech_100h/rnnt/README.md                                                                              |   16 
 funasr/runtime/websocket/readme.md                                                                               |   48 
 funasr/runtime/onnxruntime/src/precomp.h                                                                         |    3 
 tests/test_asr_inference_pipeline.py                                                                             |    6 
 funasr/runtime/onnxruntime/include/vad-model.h                                                                   |    9 
 funasr/runtime/onnxruntime/src/audio.cpp                                                                         |   78 
 egs/wenetspeech/conformer/local/data.sh                                                                          |  102 +
 funasr/models/encoder/sanm_encoder.py                                                                            |   55 
 funasr/runtime/python/websocket/README.md                                                                        |   55 
 egs/aishell2/conformer/utils                                                                                     |    2 
 egs/wenetspeech/conformer/local/text_normalize.pl                                                                |   24 
 funasr/bin/asr_infer.py                                                                                          |    8 
 egs/librispeech_100h/rnnt/local/download_and_untar.sh                                                            |   97 +
 egs/aishell/data2vec_paraformer_finetune/run.sh                                                                  |    8 
 funasr/runtime/websocket/funasr-ws-client.cpp                                                                    |  366 +++
 egs/aishell2/paraformer/utils                                                                                    |    2 
 funasr/models/e2e_vad.py                                                                                         |   26 
 funasr/runtime/websocket/websocket-server.h                                                                      |  137 +
 funasr/bin/build_trainer.py                                                                                      |    2 
 egs/aishell2/paraformer/run.sh                                                                                   |    8 
 funasr/runtime/onnxruntime/src/fsmn-vad.h                                                                        |   45 
 funasr/runtime/onnxruntime/bin/funasr-onnx-offline.cpp                                                           |   14 
 egs/wenetspeech/conformer/conf/train_asr_conformer.yaml                                                          |  104 +
 egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh                                                         |  135 +
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh         |  104 +
 funasr/runtime/python/websocket/wss_client_asr.py                                                                |  275 +-
 funasr/runtime/onnxruntime/src/vad-model.cpp                                                                     |   15 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh               |  104 +
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py          |   38 
 funasr/models/decoder/sanm_decoder.py                                                                            |    7 
 funasr/bin/vad_infer.py                                                                                          |    3 
 egs/aishell/rnnt/conf/train_conformer_rnnt_unified.yaml                                                          |   31 
 egs/aishell2/paraformerbert/run.sh                                                                               |    8 
 egs/librispeech/rnnt/path.sh                                                                                     |    5 
 funasr/utils/kwargs2args.py                                                                                      |   19 
 funasr/runtime/onnxruntime/include/audio.h                                                                       |   13 
 egs/aishell2/data2vec_pretrain/run.sh                                                                            |    2 
 funasr/runtime/html5/readme_cn.md                                                                                |    2 
 funasr/runtime/onnxruntime/CMakeLists.txt                                                                        |   13 
 funasr/models/e2e_asr_transducer.py                                                                              |    8 
 funasr/runtime/websocket/funasr-ws-server.cpp                                                                    |   82 
 funasr/models/e2e_asr_paraformer.py                                                                              |  485 ++++
 egs/librispeech/rnnt/local/data_prep.sh                                                                          |   58 
 egs/librispeech/rnnt/local/spm_train.py                                                                          |   12 
 egs/aishell2/paraformerbert/utils                                                                                |    2 
 egs/librispeech_100h/rnnt/run.sh                                                                                 |  213 ++
 funasr/runtime/onnxruntime/bin/funasr-onnx-offline-punc.cpp                                                      |    0 
 egs/librispeech/conformer/run.sh                                                                                 |    2 
 funasr/runtime/onnxruntime/src/paraformer.h                                                                      |    4 
 egs/librispeech_100h/conformer/run.sh                                                                            |    4 
 fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py                                            |   15 
 funasr/utils/prepare_data.py                                                                                     |    2 
 egs/aishell2/transformer/run.sh                                                                                  |    8 
 funasr/runtime/onnxruntime/bin/funasr-onnx-online-vad.cpp                                                        |   89 
 egs/aishell/paraformer/run.sh                                                                                    |    8 
 funasr/runtime/python/websocket/parse_args.py                                                                    |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py                |   38 
 tests/test_asr_vad_punc_inference_pipeline.py                                                                    |    1 
 egs/librispeech_100h/rnnt/local/data_prep.sh                                                                     |   58 
 egs/aishell/paraformerbert/run.sh                                                                                |    8 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py      |   38 
 funasr/runtime/onnxruntime/src/CMakeLists.txt                                                                    |   17 
 MODEL_LICENSE                                                                                                    |   73 
 egs/librispeech_100h/rnnt/conf/train_conformer_rnnt.yaml                                                         |   96 
 funasr/datasets/large_datasets/build_dataloader.py                                                               |    3 
 funasr/train/trainer.py                                                                                          |   19 
 setup.py                                                                                                         |    2 
 funasr/runtime/onnxruntime/src/fsmn-vad-online.h                                                                 |   88 
 funasr/runtime/onnxruntime/src/fsmn-vad.cpp                                                                      |   51 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py         |   33 
 egs/aishell/transformer/run.sh                                                                                   |    8 
 egs/librispeech_100h/rnnt/utils                                                                                  |    1 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/README.md        |    1 
 egs/aishell2/transformerLM/utils                                                                                 |    1 
 egs/librispeech_100h/rnnt/path.sh                                                                                |    5 
 docs/academic_recipe/asr_recipe.md                                                                               |  105 
 egs/librispeech/rnnt/run.sh                                                                                      |  222 ++
 egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml                                                 |    6 
 funasr/build_utils/build_asr_model.py                                                                            |    5 
 egs/wenetspeech/conformer/local/process_opus.py                                                                  |   88 
 funasr/runtime/onnxruntime/bin/CMakeLists.txt                                                                    |   16 
 egs/aishell/transformer/utils/compute_cmvn.py                                                                    |    4 
 egs/librispeech/rnnt/local/download_and_untar.sh                                                                 |   97 +
 egs/librispeech/rnnt/utils                                                                                       |    1 
 funasr/runtime/html5/readme.md                                                                                   |    2 
 funasr/runtime/html5/static/index.html                                                                           |   10 
 README.md                                                                                                        |    2 
 funasr/datasets/large_datasets/dataset.py                                                                        |    6 
 funasr/runtime/websocket/websocket-server.cpp                                                                    |   80 
 egs/wenetspeech/conformer/path.sh                                                                                |    5 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py |    5 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py   |   41 
 egs/aishell2/transformer/utils                                                                                   |    1 
 docs/academic_recipe/images/loss.png                                                                             |    0 
 funasr/runtime/onnxruntime/src/fsmn-vad-online.cpp                                                               |  198 ++
 funasr/version.txt                                                                                               |    2 
 egs/librispeech/rnnt/conf/decode_rnnt_conformer_streaming.yaml                                                   |    8 
 funasr/runtime/onnxruntime/bin/funasr-onnx-offline-vad.cpp                                                       |   20 
 funasr/runtime/onnxruntime/src/funasrruntime.cpp                                                                 |   18 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/utils                  |    1 
 egs/aishell/rnnt/run.sh                                                                                          |  127 
 egs_modelscope/tp/TEMPLATE/README.md                                                                             |    2 
 egs/wenetspeech/conformer/utils                                                                                  |    1 
 funasr/runtime/onnxruntime/include/funasrruntime.h                                                               |   13 
 docs/installation/docker.md                                                                                      |    8 
 fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv                                               |   16 
 funasr/runtime/html5/static/main.js                                                                              |   79 
 docs/images/dingding.jpg                                                                                         |    0 
 egs/librispeech_100h/rnnt/local/spm_encode.py                                                                    |   98 +
 egs/wenetspeech/conformer/run.sh                                                                                 |  222 ++
 funasr/runtime/onnxruntime/bin/funasr-onnx-offline-rtf.cpp                                                       |   12 
 egs/aishell2/conformer/run.sh                                                                                    |    8 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py         |   41 
 egs/librispeech_100h/rnnt/conf/decode_rnnt_conformer.yaml                                                        |    3 
 funasr/models/frontend/wav_frontend.py                                                                           |    6 
 /dev/null                                                                                                        |   99 -
 egs/librispeech/rnnt/conf/train_conformer_rnnt_unified.yaml                                                      |   98 +
 funasr/runtime/websocket/CMakeLists.txt                                                                          |   19 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/README.md              |    1 
 egs_modelscope/asr/TEMPLATE/README.md                                                                            |   22 
 egs/librispeech/rnnt/local/spm_encode.py                                                                         |   98 +
 egs/aishell/rnnt/README.md                                                                                       |   10 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/utils            |    1 
 funasr/bin/asr_inference_launch.py                                                                               |    4 
 137 files changed, 4,963 insertions(+), 798 deletions(-)

diff --git a/LICENSE b/LICENSE
deleted file mode 100644
index 303aaf8..0000000
--- a/LICENSE
+++ /dev/null
@@ -1,21 +0,0 @@
-MIT License
-
-Copyright (c) 2022 Alibaba
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in all
-copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
-SOFTWARE.
\ No newline at end of file
diff --git a/MODEL_LICENSE b/MODEL_LICENSE
new file mode 100644
index 0000000..3d9e410
--- /dev/null
+++ b/MODEL_LICENSE
@@ -0,0 +1,73 @@
+FunASR 模型开源协议
+
+版本号：1.0
+
+版权所有 (C) [2023-2028] [阿里巴巴集团]。保留所有权利。
+
+感谢您选择 FunASR 开源模型。FunASR 开源模型包含一系列免费且开源的工业模型，让大家可以使用、修改、分享和学习该模型。
+
+为了保证更好的社区合作，我们制定了以下协议，希望您仔细阅读并遵守本协议。
+
+1 定义
+本协议中，[FunASR 软件]指 FunASR 开源模型权重及其衍生品，包括 Finetune 后的模型；[您]指使用、修改、分享和学习[FunASR 软件]的个人或组织。
+
+2 许可和限制
+2.1 许可
+
+您可以在遵守本协议的前提下，自由地使用、复制、修改和分享[FunASR 软件]。
+
+2.2 限制
+
+您在使用、复制、修改和分享[FunASR 软件]时，必须注明出处以及作者信息，并保留[FunASR 软件]中相关模型名称。
+
+3 责任和风险承担
+[FunASR 软件]仅作为参考和学习使用，不对您使用或修改[FunASR 软件]造成的任何直接或间接损失承担任何责任。您对[FunASR 软件]的使用和修改应该自行承担风险。
+
+4 终止
+如果您违反本协议的任何条款，您的许可将自动终止，您必须停止使用、复制、修改和分享[FunASR 软件]。
+
+5 修订
+本协议可能会不时更新和修订。修订后的协议将在[FunASR 软件]官方仓库发布，并自动生效。如果您继续使用、复制、修改和分享[FunASR 软件]，即表示您同意修订后的协议。
+
+6 其他规定
+本协议受到[国家/地区]的法律管辖。如果任何条款被裁定为不合法、无效或无法执行，则该条款应被视为从本协议中删除，而其余条款应继续有效并具有约束力。
+
+如果您对本协议有任何问题或意见，请联系我们。
+
+版权所有 © [2023-2028] [阿里巴巴集团]。保留所有权利。
+
+FunASR Model Open Source License
+Version 1.0
+
+Copyright (C) [2023-2028] Alibaba Group. All rights reserved.
+
+Thank you for choosing the FunASR open source models. The FunASR open source models comprise a series of free, open-source industrial models that everyone can use, modify, share, and learn from.
+
+To ensure better community collaboration, we have developed the following agreement and hope that you carefully read and abide by it.
+
+1 Definitions
+In this agreement, [FunASR software] refers to the FunASR open source model weights and their derivatives, including fine-tuned models. [You] refers to individuals or organizations who use, modify, share, and learn from [FunASR software].
+
+2 License and Restrictions
+
+2.1 License
+You are free to use, copy, modify, and share [FunASR software] under the conditions of this agreement.
+
+2.2 Restrictions
+You should indicate the source and author information when using, copying, modifying, and sharing [FunASR software]. You should keep the relevant model names in [FunASR software].
+
+3 Responsibility and Risk
+[FunASR software] is provided for reference and learning purposes only; its authors are not responsible for any direct or indirect losses caused by your use or modification of [FunASR software]. You assume the responsibility and risks of your use and modification of [FunASR software].
+
+4 Termination
+If you violate any terms of this agreement, your license will be automatically terminated, and you must stop using, copying, modifying, and sharing [FunASR software].
+
+5 Revision
+This agreement may be updated and revised from time to time. The revised agreement will be published in the official FunASR repository and will take effect automatically. If you continue to use, copy, modify, and share [FunASR software], it means you agree to the revised agreement.
+
+6 Other Provisions
+This agreement is subject to the laws of [Country/Region]. If any provisions are found to be illegal, invalid, or unenforceable, they shall be deemed deleted from this agreement, and the remaining provisions shall remain valid and binding.
+
+If you have any questions or comments about this agreement, please contact us.
+
+Copyright (c) [2023-2028] Alibaba Group. All rights reserved.
diff --git a/README.md b/README.md
index 1fcbcdd..7c289e0 100644
--- a/README.md
+++ b/README.md
@@ -13,7 +13,6 @@
 | [**Highlights**](#highlights)
 | [**Installation**](#installation)
 | [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
-| [**Tutorial_CN**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
 | [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
 | [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
 | [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
@@ -88,6 +87,7 @@
 
 ## License
 This project is licensed under the [The MIT License](https://opensource.org/licenses/MIT). FunASR also contains various third-party components and some code modified from other repos under other open source licenses.
+The use of pretrained models is subject to the [model license](./MODEL_LICENSE).
 
 ## Citations
 
diff --git a/docs/academic_recipe/asr_recipe.md b/docs/academic_recipe/asr_recipe.md
index 4e8f072..5a11dc5 100644
--- a/docs/academic_recipe/asr_recipe.md
+++ b/docs/academic_recipe/asr_recipe.md
@@ -12,7 +12,7 @@
 Then you can directly start the recipe as follows:
 ```sh
 conda activate funasr
-. ./run.sh
+. ./run.sh --CUDA_VISIBLE_DEVICES="0,1" --gpu_num=2
 ```
 
 The training log files are saved in `${exp_dir}/exp/${model_dir}/log/train.log.*`, which can be viewed using the following command:
@@ -26,15 +26,18 @@
 ... 1epoch:train:801-850batch:850num_updates: ... loss_ctc=107.890, loss_att=87.832, acc=0.029, loss_pre=1.702 ...
 ```
 
-Also, users can use tensorboard to observe these training information by the following command:
-```sh
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
 At the end of each epoch, the evaluation metrics are calculated on the validation set, like follows:
 ```text
 ... [valid] loss_ctc=99.914, cer_ctc=1.000, loss_att=80.512, acc=0.029, cer=0.971, wer=1.000, loss_pre=1.952, loss=88.285 ...
 ```
+
+Also, users can use TensorBoard to monitor this training information with the following command:
+```sh
+tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
+```
+Here is an example of the loss curve:
+
+<img src="./images/loss.png" width="200"/>
 
 The inference results are saved in `${exp_dir}/exp/${model_dir}/decode_asr_*/$dset`. The main two files are `text.cer` and `text.cer.txt`. `text.cer` saves the comparison between the recognized text and the reference text, like follows:
 ```text
@@ -103,18 +106,37 @@
 龙
 <unk>
 ```
-* `<blank>`: indicates the blank token for CTC, must be in the first line
-* `<s>`: indicates the start-of-sentence token, must be in the second line
-* `</s>`: indicates the end-of-sentence token, must be in the third line
-* `<unk>`: indicates the out-of-vocabulary token, must be in the last line
+There are four special tokens that must be included:
+* `<blank>`: the blank token for CTC; must be on the first line
+* `<s>`: the start-of-sentence token; must be on the second line
+* `</s>`: the end-of-sentence token; must be on the third line
+* `<unk>`: the out-of-vocabulary token; must be on the last line
 
 ### Stage 3: LM Training
 
 ### Stage 4: ASR Training
-This stage achieves the training of the specified model. To start training, users should manually set `exp_dir` to specify the path for saving experimental results. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding. FunASR implements `train.py` for training different models and users can configure the following parameters if necessary.
+This stage performs the training of the specified model. To start training, users should manually set `exp_dir` to specify the path for saving experimental results. By default, the best `$keep_nbest_models` checkpoints on the validation set are averaged to generate a better model, which is then adopted for decoding. FunASR implements `train.py` for training different models, and users can configure the following parameters if necessary. The training command is as follows:
+
+```sh
+train.py \
+    --task_name asr \
+    --use_preprocessor true \
+    --token_list $token_list \
+    --data_dir ${feats_dir}/data \
+    --train_set ${train_set} \
+    --valid_set ${valid_set} \
+    --data_file_names "wav.scp,text" \
+    --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+    --speed_perturb ${speed_perturb} \
+    --resume true \
+    --output_dir ${exp_dir}/exp/${model_dir} \
+    --config $asr_config \
+    --ngpu $gpu_num \
+    ...
+```
 
 * `task_name`: `asr` (Default), specify the task type of the current recipe
-* `gpu_num`: `2` (Default), specify the number of GPUs for training. When `gpu_num > 1`, DistributedDataParallel (DDP, the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html)) training will be enabled. Correspondingly, `CUDA_VISIBLE_DEVICES` should be set to specify which ids of GPUs will be used.
+* `ngpu`: `2` (Default), specify the number of GPUs for training. When `ngpu > 1`, DistributedDataParallel (DDP; details can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html)) training will be enabled. Correspondingly, `CUDA_VISIBLE_DEVICES` should be set to specify which GPU ids will be used.
 * `use_preprocessor`: `true` (Default), specify whether to use pre-processing on each sample
 * `token_list`: the path of token list for training
 * `dataset_type`: `small` (Default). FunASR supports `small` dataset type for training small datasets. Besides, an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large datasets is supported and users can specify `dataset_type=large` to enable it.
@@ -122,6 +144,7 @@
 * `data_file_names`: `"wav.scp,text"` specify the speech and text file names for ASR
 * `cmvn_file`: the path of cmvn file
 * `resume`: `true`, whether to enable "checkpoint training"
+* `output_dir`: the path for saving training results
 * `config`: the path of configuration file, which is usually a YAML file in `conf` directory. In FunASR, the parameters of the training, including model, optimization, dataset, etc., can also be set in this file. Note that if the same parameters are specified in both recipe and config file, the parameters of recipe will be employed
 
 ### Stage 5: Decoding
@@ -153,31 +176,32 @@
 ## Change settings
 Here we explain how to perform common custom settings, which can help users to modify scripts according to their own needs.
 
-* Training with specified GPUs
+### Training with specified GPUs
 
-For example, if users want to use 2 GPUs with id `2` and `3, users can run the following command:
+For example, if users want to use 2 GPUs with ids `2` and `3`, they can run the following command:
 ```sh
 . ./run.sh --CUDA_VISIBLE_DEVICES "2,3" --gpu_num 2 
 ```
 
-* Start from/Stop at a specified stage
+### Start from/Stop at a specified stage
 
 The recipe includes several stages. Users can start from or stop at any stage. For example, the following command starts from the third stage and stops at the fifth stage:
 ```sh
 . ./run.sh --stage 3 --stop_stage 5
 ```
 
-* Training Steps
+### Specify total training steps
+
 FunASR supports two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total number of training epochs, while `max_update` indicates the total number of training steps. If both parameters are specified, training stops as soon as either limit is reached.
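+
+A minimal command-line sketch (assuming both limits can be passed to `train.py` like the other training parameters shown above; the values are illustrative):
+```sh
+train.py ... --max_epoch 60 --max_update 100000 ...
+```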
 
-* Change the configuration of the model
+### Change the configuration of the model
 
 The configuration of the model is set in the config file `conf/train_*.yaml`. Specifically, the default encoder configuration of paraformer is as follows:
 ```
 encoder: conformer
 encoder_conf:
     output_size: 256    # dimension of attention
-    attention_heads: 4  # number of heads in multi-head attention
+    attention_heads: 4  # the number of heads in multi-head attention
     linear_units: 2048  # the number of units of position-wise feed forward
     num_blocks: 12      # the number of encoder blocks
     dropout_rate: 0.1
@@ -195,6 +219,49 @@
 ```
 Users can change the encoder configuration by modifying these values. For example, if users want an encoder with 16 conformer blocks, each with 8 attention heads, they just need to change `num_blocks` from 12 to 16 and `attention_heads` from 4 to 8, as sketched below. Besides, the batch size, learning rate, and other training hyper-parameters are also set in this config file. To change these hyper-parameters, users just need to edit the corresponding values directly. For example, the default learning rate is `0.0005`; to change it to 0.0002, set `lr: 0.0002`.
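+
+A sketch of such an edit (unchanged keys omitted; see the full block above):
+```
+encoder: conformer
+encoder_conf:
+    attention_heads: 8  # changed from 4
+    num_blocks: 16      # changed from 12
+```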
 
-* Decoding by CPU or GPU
+### Change the input data type
+
+FunASR supports different input data types, including `sound`, `kaldi_ark`, `npy`, `text`, and `text_int`. Users can specify any number and any type of inputs via `data_names` and `data_types` (in `conf/train_*.yaml`). For example, the ASR task usually requires speech and transcripts as input. In FunASR, by default, speech is saved as raw audio (such as the wav format) and transcripts are saved in text format. Correspondingly, `data_names` and `data_types` are set as follows (see `conf/train_*.yaml`):
+```text
+dataset_conf:
+    data_names: speech,text
+    data_types: sound,text
+    ...
+```
+When the input type changes to FBank features, users just need to set `data_types: kaldi_ark,text` in the config file, as shown below. Note that `data_file_names` used in `train.py` should also be changed to the new file names.
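+
+A sketch of the modified section (only `data_types` changes relative to the default above):
+```text
+dataset_conf:
+    data_names: speech,text
+    data_types: kaldi_ark,text
+    ...
+```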
+
+### How to resume training
+FunASR supports resuming training as follows:
+```shell
+train.py ... --resume true ...
+```
+
+### How to transfer / fine-tuning from pre-trained models
+
+FunASR supports transferring / fine-tuning from a pre-trained model by specifying the `init_param` parameter. The usage format is as follows:
+```shell
+train.py ... --init_param <file_path>:<src_key>:<dst_key>:<exclude_keys> ...
+```
+For example, the following command loads all pretrained parameters whose names start with `decoder`, except `decoder.embed`, and assigns them to `model.decoder2`:
+```shell
+train.py ... --init_param model.pb:decoder:decoder2:decoder.embed  ...
+```
+Besides, loading parameters from multiple pre-trained models is supported. For example, the following command loads the encoder parameters from pre-trained `model1.pb` and the decoder parameters from pre-trained `model2.pb`:
+```sh
+train.py ... --init_param model1.pb:encoder  --init_param model2.pb:decoder ...
+```
+
+### How to freeze part of the model parameters
+
+In certain situations, users may want to freeze part of the model parameters and update only the rest. FunASR employs `freeze_param` to achieve this. For example, to freeze all parameters matching `encoder.*`, users need to set `freeze_param` as follows:
+```sh
+train.py ... --freeze_param encoder ...
+```
+
+### ModelScope Usage
+
+Users can use ModelScope for inference and fine-tuning based on a trained academic model. To achieve this, users need to run stage 6 in the recipe script, as sketched below. In this stage, the relevant files required by ModelScope are generated automatically. Users can then use the corresponding ModelScope interface by replacing the model name with the path of the locally trained model. For the detailed usage of the ModelScope interface, please refer to [ModelScope Usage](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html).
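+
+A minimal sketch (assuming the recipe exposes `--stage`/`--stop_stage` as shown earlier and numbers the ModelScope preparation as stage 6):
+```sh
+. ./run.sh --stage 6 --stop_stage 6
+```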
+
+### Decoding by CPU or GPU
 
 We support both CPU and GPU decoding. For CPU decoding, set `gpu_inference=false` and use `njob` to specify the total number of CPU jobs. For GPU decoding, first set `gpu_inference=true`; then use `gpuid_list` to specify which GPUs to use for decoding and `njob` to specify the number of decoding jobs on each GPU. A sketch is shown below.
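+
+A command-line sketch (assuming these recipe-level parameters are overridable from the command line like `--stage` above; the stage number, GPU ids, and job counts are illustrative):
+```sh
+# CPU decoding: 8 parallel CPU jobs
+. ./run.sh --stage 5 --stop_stage 5 --gpu_inference false --njob 8
+# GPU decoding: 2 jobs on each of GPUs 0 and 1
+. ./run.sh --stage 5 --stop_stage 5 --gpu_inference true --gpuid_list "0,1" --njob 2
+```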
diff --git a/docs/academic_recipe/images/loss.png b/docs/academic_recipe/images/loss.png
new file mode 100644
index 0000000..f559864
--- /dev/null
+++ b/docs/academic_recipe/images/loss.png
Binary files differ
diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index 6ac3ab8..9c9166c 100644
--- a/docs/images/dingding.jpg
+++ b/docs/images/dingding.jpg
Binary files differ
diff --git a/docs/installation/docker.md b/docs/installation/docker.md
index bc80819..a63f2e7 100644
--- a/docs/installation/docker.md
+++ b/docs/installation/docker.md
@@ -52,8 +52,12 @@
 
 ## Run Docker
 ```shell
-sudo docker run -itd --name funasr <image-name>:<tag> bash
-sudo docker exec -it funasr bash
+# cpu
+sudo docker run -itd --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+# gpu
+sudo docker run -itd --gpus all --name funasr -v <local_dir:dir_in_docker> <image-name>:<tag> /bin/bash
+
+sudo docker exec -it funasr /bin/bash
 ```
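+
+For example, to mount a local data directory into the container (hypothetical paths and image tag):
+```shell
+sudo docker run -itd --gpus all --name funasr -v $PWD/data:/workspace/data funasr:latest /bin/bash
+```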
 
 ## Stop Docker
diff --git a/docs/model_zoo/modelscope_models.md b/docs/model_zoo/modelscope_models.md
index e844209..2e4e51c 100644
--- a/docs/model_zoo/modelscope_models.md
+++ b/docs/model_zoo/modelscope_models.md
@@ -1,7 +1,10 @@
 # Pretrained Models on ModelScope
 
 ## Model License
--  Apache License 2.0
+You are free to use, copy, modify, and share FunASR models under the conditions of this agreement. You should indicate the model source and author information when using, copying, modifying, and sharing FunASR models, and you should keep the relevant model names in [FunASR software]. The full model license can be found in the [license](https://github.com/alibaba-damo-academy/FunASR/blob/main/MODEL_LICENSE) file.
+
+## Model Usage
+Refer to the [docs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html).
 
 ## Model Zoo
 Here we provided several pretrained models on different datasets. The details of models and datasets can be found on [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition).
diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 633e697..e8643e9 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -85,14 +85,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -130,7 +130,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index 694191a..1f96873 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -88,14 +88,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -134,7 +134,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 27ba90c..7b01a5f 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -88,14 +88,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -134,7 +134,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index b7e3a82..7d79211 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -85,14 +85,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
    
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -130,7 +130,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --dataset_type small \
                 --data_dir ${feats_dir}/data \
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 8a614e3..efc831f 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -89,14 +89,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
    
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -141,7 +141,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell/rnnt/README.md b/egs/aishell/rnnt/README.md
index 45f1f3f..817c9b2 100644
--- a/egs/aishell/rnnt/README.md
+++ b/egs/aishell/rnnt/README.md
@@ -5,14 +5,14 @@
 - 8 gpu(Tesla V100)
 - Feature info: using 80 dims fbank, global cmvn, speed perturb(0.9, 1.0, 1.1), specaugment
 - Train config: conf/train_conformer_rnnt_unified.yaml
-- chunk config: chunk size 16, full left chunk
+- chunk config: chunk size 16, 1 left chunk
 - LM config: LM was not used
 - Model size: 90M
 
 ## Results (CER)
-- Decode config: conf/train_conformer_rnnt_unified.yaml
+- Decode config: conf/decode_rnnt_conformer_streaming.yaml
 
-|   testset   | CER(%)  |
+|   testset   |  CER(%) |
 |:-----------:|:-------:|
-|     dev     |  5.53   |
-|    test     |  6.24   |
+|     dev     |  5.43   |
+|    test     |  6.04   |
diff --git a/egs/aishell/rnnt/conf/train_conformer_rnnt_unified.yaml b/egs/aishell/rnnt/conf/train_conformer_rnnt_unified.yaml
index 8a1c40c..59f9936 100644
--- a/egs/aishell/rnnt/conf/train_conformer_rnnt_unified.yaml
+++ b/egs/aishell/rnnt/conf/train_conformer_rnnt_unified.yaml
@@ -25,19 +25,28 @@
     hidden_size: 512
     embed_dropout_rate: 0.5
     dropout_rate: 0.5
-
 joint_network_conf:
     joint_space_size: 512
 
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+
 # Auxiliary CTC
+model: rnnt_unified
 model_conf:
     auxiliary_ctc_weight: 0.0
 
 # minibatch related
 use_amp: true
-batch_type: unsorted
-batch_size: 16
-num_workers: 16
 
 # optimization related
 accum_grad: 1
@@ -59,8 +68,6 @@
 scheduler_conf:
    warmup_steps: 25000
 
-normalize: None
-
 specaug: specaug
 specaug_conf:
     apply_time_warp: true
@@ -77,4 +84,16 @@
     - 50
     num_time_mask: 5
 
+dataset_conf:
+    data_names: speech,text
+    data_types: sound,text
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 2048
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 16000
+    num_workers: 8
+
 log_interval: 50
diff --git a/egs/aishell/rnnt/run.sh b/egs/aishell/rnnt/run.sh
index bcd4a8b..9facc8b 100755
--- a/egs/aishell/rnnt/run.sh
+++ b/egs/aishell/rnnt/run.sh
@@ -3,8 +3,8 @@
 . ./path.sh || exit 1;
 
 # machines configuration
-CUDA_VISIBLE_DEVICES="0,1,2,3"
-gpu_num=4
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
 count=1
 gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
 # for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
@@ -13,25 +13,23 @@
 infer_cmd=utils/run.pl
 
 # general configuration
-feats_dir= #feature output dictionary
-exp_dir=
+feats_dir="../DATA" #feature output directory
+exp_dir="."
 lang=zh
-dumpdir=dump/fbank
-feats_type=fbank
 token_type=char
-scp=feats.scp
-type=kaldi_ark
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
 stage=0
-stop_stage=4
+stop_stage=5
 
 # feature configuration
 feats_dim=80
-sample_frequency=16000
-nj=32
-speed_perturb="0.9,1.0,1.1"
+nj=64
 
 # data
-data_aishell=
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
 
 # exp tag
 tag="exp1"
@@ -49,10 +47,10 @@
 test_sets="dev test"
 
 asr_config=conf/train_conformer_rnnt_unified.yaml
-model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
 
 inference_config=conf/decode_rnnt_conformer_streaming.yaml
-inference_asr_model=valid.cer_transducer_chunk.ave_5best.pth
+inference_asr_model=valid.cer_transducer_chunk.ave_10best.pb
 
 # you can set gpu num for decoding here
 gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
@@ -66,10 +64,16 @@
     _ngpu=0
 fi
 
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
 if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
     echo "stage 0: Data preparation"
     # Data preparation
-    local/aishell_data_prep.sh ${data_aishell}/data_aishell/wav ${data_aishell}/data_aishell/transcript ${feats_dir}
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
     for x in train dev test; do
         cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
         paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
@@ -79,46 +83,9 @@
     done
 fi
 
-feat_train_dir=${feats_dir}/${dumpdir}/train; mkdir -p ${feat_train_dir}
-feat_dev_dir=${feats_dir}/${dumpdir}/dev; mkdir -p ${feat_dev_dir}
-feat_test_dir=${feats_dir}/${dumpdir}/test; mkdir -p ${feat_test_dir}
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
-    echo "stage 1: Feature Generation"
-    # compute fbank features
-    fbankdir=${feats_dir}/fbank
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} --speed_perturb ${speed_perturb} \
-        ${feats_dir}/data/train ${exp_dir}/exp/make_fbank/train ${fbankdir}/train
-    utils/fix_data_feat.sh ${fbankdir}/train
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
-        ${feats_dir}/data/dev ${exp_dir}/exp/make_fbank/dev ${fbankdir}/dev
-    utils/fix_data_feat.sh ${fbankdir}/dev
-    utils/compute_fbank.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --sample_frequency ${sample_frequency} \
-        ${feats_dir}/data/test ${exp_dir}/exp/make_fbank/test ${fbankdir}/test
-    utils/fix_data_feat.sh ${fbankdir}/test
-     
-    # compute global cmvn
-    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} \
-        ${fbankdir}/train ${exp_dir}/exp/make_fbank/train
-
-    # apply cmvn 
-    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
-        ${fbankdir}/train ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/train ${feat_train_dir}
-    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
-        ${fbankdir}/dev ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/dev ${feat_dev_dir}
-    utils/apply_cmvn.sh --cmd "$train_cmd" --nj $nj \
-        ${fbankdir}/test ${fbankdir}/train/cmvn.json ${exp_dir}/exp/make_fbank/test ${feat_test_dir}
-    
-    cp ${fbankdir}/train/text ${fbankdir}/train/speech_shape ${fbankdir}/train/text_shape ${feat_train_dir}
-    cp ${fbankdir}/dev/text ${fbankdir}/dev/speech_shape ${fbankdir}/dev/text_shape ${feat_dev_dir}
-    cp ${fbankdir}/test/text ${fbankdir}/test/speech_shape ${fbankdir}/test/text_shape ${feat_test_dir}
-
-    utils/fix_data_feat.sh ${feat_train_dir}
-    utils/fix_data_feat.sh ${feat_dev_dir}
-    utils/fix_data_feat.sh ${feat_test_dir}
-
-    #generate ark list 
-    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_train_dir} ${fbankdir}/train ${feat_train_dir}
-    utils/gen_ark_list.sh --cmd "$train_cmd" --nj $nj ${feat_dev_dir} ${fbankdir}/dev ${feat_dev_dir}
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} ${feats_dir}/data/${train_set}
 fi
 
 token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
@@ -126,31 +93,29 @@
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
     mkdir -p ${feats_dir}/data/${lang}_token_list/char/
-   
+
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
     echo "<s>" >> ${token_list}
     echo "</s>" >> ${token_list}
-    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/train/text | cut -f 2- -d" " | tr " " "\n" \
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
         | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
-    num_token=$(cat ${token_list} | wc -l)
     echo "<unk>" >> ${token_list}
-    vocab_size=$(cat ${token_list} | wc -l)
-    awk -v v=,${vocab_size} '{print $0v}' ${feat_train_dir}/text_shape > ${feat_train_dir}/text_shape.char
-    awk -v v=,${vocab_size} '{print $0v}' ${feat_dev_dir}/text_shape > ${feat_dev_dir}/text_shape.char
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/train 
-    mkdir -p ${feats_dir}/asr_stats_fbank_zh_char/dev
-    cp ${feat_train_dir}/speech_shape ${feat_train_dir}/text_shape ${feat_train_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/train
-    cp ${feat_dev_dir}/speech_shape ${feat_dev_dir}/text_shape ${feat_dev_dir}/text_shape.char ${feats_dir}/asr_stats_fbank_zh_char/dev
 fi
 
-# Training Stage
+# LM Training Stage
 world_size=$gpu_num  # run on one machine
 if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
-    echo "stage 3: Training"
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
     mkdir -p ${exp_dir}/exp/${model_dir}
     mkdir -p ${exp_dir}/exp/${model_dir}/log
-    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    INIT_FILE=./ddp_init
     if [ -f $INIT_FILE ];then
         rm -f $INIT_FILE
     fi 
@@ -161,26 +126,23 @@
             rank=$i
             local_rank=$i
             gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
-            asr_train_transducer.py \
+            train.py \
+                --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
                 --token_type char \
                 --token_list $token_list \
-                --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/${scp},speech,${type} \
-                --train_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${train_set}/text,text,text \
-                --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/speech_shape \
-                --train_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${train_set}/text_shape.char \
-                --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/${scp},speech,${type} \
-                --valid_data_path_and_name_and_type ${feats_dir}/${dumpdir}/${valid_set}/text,text,text \
-                --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/speech_shape \
-                --valid_shape_file ${feats_dir}/asr_stats_fbank_zh_char/${valid_set}/text_shape.char  \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --data_file_names "wav.scp,text" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
+                --speed_perturb ${speed_perturb} \
                 --resume true \
                 --output_dir ${exp_dir}/exp/${model_dir} \
                 --config $asr_config \
-                --input_size $feats_dim \
                 --ngpu $gpu_num \
                 --num_worker_count $count \
-                --multiprocessing_distributed true \
                 --dist_init_method $init_method \
                 --dist_world_size $world_size \
                 --dist_rank $rank \
@@ -191,8 +153,8 @@
 fi
 
 # Testing Stage
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
-    echo "stage 4: Inference"
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
     for dset in ${test_sets}; do
         asr_exp=${exp_dir}/exp/${model_dir}
         inference_tag="$(basename "${inference_config}" .yaml)"
@@ -203,7 +165,7 @@
             exit 0
         fi
         mkdir -p "${_logdir}"
-        _data="${feats_dir}/${dumpdir}/${dset}"
+        _data="${feats_dir}/data/${dset}"
         key_file=${_data}/${scp}
         num_scp_file="$(<${key_file} wc -l)"
         _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
@@ -224,6 +186,7 @@
                 --njob ${njob} \
                 --gpuid_list ${gpuid_list} \
                 --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
                 --key_file "${_logdir}"/keys.JOB.scp \
                 --asr_train_config "${asr_exp}"/config.yaml \
                 --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 38343ae..e492fb2 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -85,14 +85,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -130,7 +130,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell/transformer/utils/compute_cmvn.py b/egs/aishell/transformer/utils/compute_cmvn.py
index 6c9b445..4986a5a 100755
--- a/egs/aishell/transformer/utils/compute_cmvn.py
+++ b/egs/aishell/transformer/utils/compute_cmvn.py
@@ -27,7 +27,7 @@
         help="the path of wav scps",
     )
     parser.add_argument(
-        "--config",
+        "--config_file",
         type=str,
         help="the config file for computing cmvn",
     )
@@ -89,7 +89,7 @@
     #         var_stats += np.sum(np.square(mat), axis=0)
     #         total_frames += mat.shape[0]
 
-    with open(args.config) as f:
+    with open(args.config_file) as f:
         configs = yaml.safe_load(f)
         frontend_configs = configs.get("frontend_conf", {})
         num_mel_bins = frontend_configs.get("n_mels", 80)
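
Note: as the hunk above shows, the renamed --config_file flag is parsed with yaml.safe_load and the mel dimension is taken from the frontend_conf block. A minimal standalone sketch of that lookup, with an illustrative fallback of 80 bins:

    import yaml

    def read_num_mel_bins(config_file: str, default: int = 80) -> int:
        # Mirror the lookup in compute_cmvn.py: fall back to `default`
        # when frontend_conf or n_mels is missing from the YAML config.
        with open(config_file) as f:
            configs = yaml.safe_load(f)
        return configs.get("frontend_conf", {}).get("n_mels", default)
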
diff --git a/egs/aishell/transformer/utils/compute_cmvn.sh b/egs/aishell/transformer/utils/compute_cmvn.sh
index ad8813d..4e7f80e 100755
--- a/egs/aishell/transformer/utils/compute_cmvn.sh
+++ b/egs/aishell/transformer/utils/compute_cmvn.sh
@@ -2,11 +2,11 @@
 
 . ./path.sh || exit 1;
 # Begin configuration section.
-fbankdir=$1
+fbankdir=
 nj=32
 cmd=./utils/run.pl
 feats_dim=80
-config=
+config_file=
 scale=1.0
 
 echo "$0 $@"
@@ -29,7 +29,7 @@
     python utils/compute_cmvn.py \
       --dim ${feats_dim} \
       --wav_path $split_dir \
-      --config $config \
+      --config_file $config_file \
       --idx JOB \
 
 python utils/combine_cmvn_file.py --dim ${feats_dim} --cmvn_dir $split_dir --nj $nj --output_dir ${fbankdir}/cmvn
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index b5e8db1..193c4a3 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -87,14 +87,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
    
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -132,7 +132,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell2/conformer/utils b/egs/aishell2/conformer/utils
index 0853fce..fe070dd 120000
--- a/egs/aishell2/conformer/utils
+++ b/egs/aishell2/conformer/utils
@@ -1 +1 @@
-../transformer/utils/
\ No newline at end of file
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/aishell2/data2vec_pretrain/run.sh b/egs/aishell2/data2vec_pretrain/run.sh
index e741919..f07deb5 100755
--- a/egs/aishell2/data2vec_pretrain/run.sh
+++ b/egs/aishell2/data2vec_pretrain/run.sh
@@ -66,7 +66,7 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
 token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index cf3ceb2..4268cf6 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -87,14 +87,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -132,7 +132,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell2/paraformer/utils b/egs/aishell2/paraformer/utils
index 0853fce..fe070dd 120000
--- a/egs/aishell2/paraformer/utils
+++ b/egs/aishell2/paraformer/utils
@@ -1 +1 @@
-../transformer/utils/
\ No newline at end of file
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index eee2912..548c4db 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -90,14 +90,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -142,7 +142,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell2/paraformerbert/utils b/egs/aishell2/paraformerbert/utils
index 0853fce..fe070dd 120000
--- a/egs/aishell2/paraformerbert/utils
+++ b/egs/aishell2/paraformerbert/utils
@@ -1 +1 @@
-../transformer/utils/
\ No newline at end of file
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index 895e403..22c3342 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -87,14 +87,14 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
-token_list=${feats_dir}/data/${lang}_token_list/char/tokens.txt
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
 echo "dictionary: ${token_list}"
 if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
     echo "stage 2: Dictionary Preparation"
-    mkdir -p ${feats_dir}/data/${lang}_token_list/char/
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
 
     echo "make a dictionary"
     echo "<blank>" > ${token_list}
@@ -132,7 +132,7 @@
                 --task_name asr \
                 --gpu_id $gpu_id \
                 --use_preprocessor true \
-                --token_type char \
+                --token_type $token_type \
                 --token_list $token_list \
                 --data_dir ${feats_dir}/data \
                 --train_set ${train_set} \
diff --git a/egs/aishell2/transformer/utils b/egs/aishell2/transformer/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/aishell2/transformer/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/aishell2/transformer/utils/apply_cmvn.py b/egs/aishell2/transformer/utils/apply_cmvn.py
deleted file mode 100755
index b5c5086..0000000
--- a/egs/aishell2/transformer/utils/apply_cmvn.py
+++ /dev/null
@@ -1,79 +0,0 @@
-from kaldiio import ReadHelper
-from kaldiio import WriteHelper
-
-import argparse
-import json
-import math
-import numpy as np
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="apply cmvn",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--ark-file",
-        "-a",
-        default=False,
-        required=True,
-        type=str,
-        help="fbank ark file",
-    )
-    parser.add_argument(
-        "--cmvn-file",
-        "-c",
-        default=False,
-        required=True,
-        type=str,
-        help="cmvn file",
-    )
-    parser.add_argument(
-        "--ark-index",
-        "-i",
-        default=1,
-        required=True,
-        type=int,
-        help="ark index",
-    )
-    parser.add_argument(
-        "--output-dir",
-        "-o",
-        default=False,
-        required=True,
-        type=str,
-        help="output dir",
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    ark_file = args.output_dir + "/feats." + str(args.ark_index) + ".ark"
-    scp_file = args.output_dir + "/feats." + str(args.ark_index) + ".scp"
-    ark_writer = WriteHelper('ark,scp:{},{}'.format(ark_file, scp_file))
-
-    with open(args.cmvn_file) as f:
-        cmvn_stats = json.load(f)
-
-    means = cmvn_stats['mean_stats']
-    vars = cmvn_stats['var_stats']
-    total_frames = cmvn_stats['total_frames']
-
-    for i in range(len(means)):
-        means[i] /= total_frames
-        vars[i] = vars[i] / total_frames - means[i] * means[i]
-        if vars[i] < 1.0e-20:
-            vars[i] = 1.0e-20
-        vars[i] = 1.0 / math.sqrt(vars[i])
-
-    with ReadHelper('ark:{}'.format(args.ark_file)) as ark_reader:
-        for key, mat in ark_reader:
-            mat = (mat - means) * vars
-            ark_writer(key, mat)
-
-
-if __name__ == '__main__':
-    main()
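
Note: the normalization the deleted apply_cmvn.py performed reduces to converting accumulated first- and second-order statistics into a per-dimension mean and inverse standard deviation, then computing (mat - mean) * inv_std. A minimal NumPy sketch of that conversion (names are illustrative):

    import numpy as np

    def stats_to_normalizer(mean_stats, var_stats, total_frames, floor=1.0e-20):
        # mean = E[x]; var = E[x^2] - E[x]^2, floored for numerical safety.
        mean = np.asarray(mean_stats, dtype=np.float64) / total_frames
        var = np.asarray(var_stats, dtype=np.float64) / total_frames - mean * mean
        inv_std = 1.0 / np.sqrt(np.maximum(var, floor))
        return mean, inv_std
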
diff --git a/egs/aishell2/transformer/utils/apply_cmvn.sh b/egs/aishell2/transformer/utils/apply_cmvn.sh
deleted file mode 100755
index f8fd1d1..0000000
--- a/egs/aishell2/transformer/utils/apply_cmvn.sh
+++ /dev/null
@@ -1,29 +0,0 @@
-#!/usr/bin/env bash
-
-. ./path.sh || exit 1;
-# Begin configuration section.
-nj=32
-cmd=./utils/run.pl
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-fbankdir=$1
-cmvn_file=$2
-logdir=$3
-output_dir=$4
-
-dump_dir=${output_dir}/ark; mkdir -p ${dump_dir}
-mkdir -p ${logdir}
-
-$cmd JOB=1:$nj $logdir/apply_cmvn.JOB.log \
-    python utils/apply_cmvn.py -a $fbankdir/ark/feats.JOB.ark \
-        -c $cmvn_file -i JOB -o ${dump_dir} \
-        || exit 1;
-
-for n in $(seq $nj); do
-    cat ${dump_dir}/feats.$n.scp || exit 1
-done > ${output_dir}/feats.scp || exit 1
-
-echo "$0: Succeeded apply cmvn"
diff --git a/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.py b/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.py
deleted file mode 100755
index 50d18d1..0000000
--- a/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.py
+++ /dev/null
@@ -1,143 +0,0 @@
-from kaldiio import ReadHelper, WriteHelper
-
-import argparse
-import numpy as np
-
-
-def build_LFR_features(inputs, m=7, n=6):
-    LFR_inputs = []
-    T = inputs.shape[0]
-    T_lfr = int(np.ceil(T / n))
-    left_padding = np.tile(inputs[0], ((m - 1) // 2, 1))
-    inputs = np.vstack((left_padding, inputs))
-    T = T + (m - 1) // 2
-    for i in range(T_lfr):
-        if m <= T - i * n:
-            LFR_inputs.append(np.hstack(inputs[i * n:i * n + m]))
-        else:
-            num_padding = m - (T - i * n)
-            frame = np.hstack(inputs[i * n:])
-            for _ in range(num_padding):
-                frame = np.hstack((frame, inputs[-1]))
-            LFR_inputs.append(frame)
-    return np.vstack(LFR_inputs)
-
-
-def build_CMVN_features(inputs, mvn_file):  # noqa
-    with open(mvn_file, 'r', encoding='utf-8') as f:
-        lines = f.readlines()
-
-    add_shift_list = []
-    rescale_list = []
-    for i in range(len(lines)):
-        line_item = lines[i].split()
-        if line_item[0] == '<AddShift>':
-            line_item = lines[i + 1].split()
-            if line_item[0] == '<LearnRateCoef>':
-                add_shift_line = line_item[3:(len(line_item) - 1)]
-                add_shift_list = list(add_shift_line)
-                continue
-        elif line_item[0] == '<Rescale>':
-            line_item = lines[i + 1].split()
-            if line_item[0] == '<LearnRateCoef>':
-                rescale_line = line_item[3:(len(line_item) - 1)]
-                rescale_list = list(rescale_line)
-                continue
-
-    for j in range(inputs.shape[0]):
-        for k in range(inputs.shape[1]):
-            add_shift_value = add_shift_list[k]
-            rescale_value = rescale_list[k]
-            inputs[j, k] = float(inputs[j, k]) + float(add_shift_value)
-            inputs[j, k] = float(inputs[j, k]) * float(rescale_value)
-
-    return inputs
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="apply low_frame_rate and cmvn",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--ark-file",
-        "-a",
-        default=False,
-        required=True,
-        type=str,
-        help="fbank ark file",
-    )
-    parser.add_argument(
-        "--lfr",
-        "-f",
-        default=True,
-        type=str,
-        help="low frame rate",
-    )
-    parser.add_argument(
-        "--lfr-m",
-        "-m",
-        default=7,
-        type=int,
-        help="number of frames to stack",
-    )
-    parser.add_argument(
-        "--lfr-n",
-        "-n",
-        default=6,
-        type=int,
-        help="number of frames to skip",
-    )
-    parser.add_argument(
-        "--cmvn-file",
-        "-c",
-        default=False,
-        required=True,
-        type=str,
-        help="global cmvn file",
-    )
-    parser.add_argument(
-        "--ark-index",
-        "-i",
-        default=1,
-        required=True,
-        type=int,
-        help="ark index",
-    )
-    parser.add_argument(
-        "--output-dir",
-        "-o",
-        default=False,
-        required=True,
-        type=str,
-        help="output dir",
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    dump_ark_file = args.output_dir + "/feats." + str(args.ark_index) + ".ark"
-    dump_scp_file = args.output_dir + "/feats." + str(args.ark_index) + ".scp"
-    shape_file = args.output_dir + "/len." + str(args.ark_index)
-    ark_writer = WriteHelper('ark,scp:{},{}'.format(dump_ark_file, dump_scp_file))
-
-    shape_writer = open(shape_file, 'w')
-    with ReadHelper('ark:{}'.format(args.ark_file)) as ark_reader:
-        for key, mat in ark_reader:
-            if args.lfr:
-                lfr = build_LFR_features(mat, args.lfr_m, args.lfr_n)
-            else:
-                lfr = mat
-            cmvn = build_CMVN_features(lfr, args.cmvn_file)
-            dims = cmvn.shape[1]
-            lens = cmvn.shape[0]
-            shape_writer.write(key + " " + str(lens) + "," + str(dims) + '\n')
-            ark_writer(key, cmvn)
-
-
-if __name__ == '__main__':
-    main()
-
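
Note: build_LFR_features above stacks m consecutive frames and advances n frames per output step, so a (T, d) input becomes (ceil(T/n), m*d) after padding. A worked shape check with the defaults m=7, n=6 used in the deleted script:

    import numpy as np

    m, n = 7, 6            # stack 7 frames, advance 6 per step
    T, d = 100, 80         # e.g. 100 frames of 80-dim fbank
    T_lfr = int(np.ceil(T / n))
    print(T_lfr, m * d)    # -> 17 560
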
diff --git a/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.sh b/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.sh
deleted file mode 100755
index 3119fdb..0000000
--- a/egs/aishell2/transformer/utils/apply_lfr_and_cmvn.sh
+++ /dev/null
@@ -1,38 +0,0 @@
-#!/usr/bin/env bash
-
-
-# Begin configuration section.
-nj=32
-cmd=utils/run.pl
-
-# feature configuration
-lfr=True
-lfr_m=7
-lfr_n=6
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-fbankdir=$1
-cmvn_file=$2
-logdir=$3
-output_dir=$4
-
-dump_dir=${output_dir}/ark; mkdir -p ${dump_dir}
-mkdir -p ${logdir}
-
-$cmd JOB=1:$nj $logdir/apply_lfr_and_cmvn.JOB.log \
-    python utils/apply_lfr_and_cmvn.py -a $fbankdir/ark/feats.JOB.ark \
-        -f $lfr -m $lfr_m -n $lfr_n -c $cmvn_file -i JOB -o ${dump_dir} \
-        || exit 1;
-
-for n in $(seq $nj); do
-    cat ${dump_dir}/feats.$n.scp || exit 1
-done > ${output_dir}/feats.scp || exit 1
-
-for n in $(seq $nj); do
-  cat ${dump_dir}/len.$n || exit 1
-done > ${output_dir}/speech_shape || exit 1
-
-echo "$0: Succeeded apply low frame rate and cmvn"
diff --git a/egs/aishell2/transformer/utils/cmvn_converter.py b/egs/aishell2/transformer/utils/cmvn_converter.py
deleted file mode 100644
index d405d12..0000000
--- a/egs/aishell2/transformer/utils/cmvn_converter.py
+++ /dev/null
@@ -1,51 +0,0 @@
-import argparse
-import json
-import numpy as np
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="cmvn converter",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--cmvn_json",
-        default=False,
-        required=True,
-        type=str,
-        help="cmvn json file",
-    )
-    parser.add_argument(
-        "--am_mvn",
-        default=False,
-        required=True,
-        type=str,
-        help="am mvn file",
-    )
-    return parser
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    with open(args.cmvn_json, "r") as fin:
-        cmvn_dict = json.load(fin)
-
-    mean_stats = np.array(cmvn_dict["mean_stats"])
-    var_stats = np.array(cmvn_dict["var_stats"])
-    total_frame = np.array(cmvn_dict["total_frames"])
-
-    mean = -1.0 * mean_stats / total_frame
-    var = 1.0 / np.sqrt(var_stats / total_frame - mean * mean)
-    dims = mean.shape[0]
-    with open(args.am_mvn, 'w') as fout:
-        fout.write("<Nnet>" + "\n" + "<Splice> " + str(dims) + " " + str(dims) + '\n' + "[ 0 ]" + "\n" + "<AddShift> " + str(dims) + " " + str(dims) + "\n")
-        mean_str = str(list(mean)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
-        fout.write("<LearnRateCoef> 0 " + mean_str + '\n')
-        fout.write("<Rescale> " + str(dims) + " " + str(dims) + '\n')
-        var_str = str(list(var)).replace(',', '').replace('[', '[ ').replace(']', ' ]')
-        fout.write("<LearnRateCoef> 0 " + var_str + '\n')
-        fout.write("</Nnet>" + '\n')
-
-if __name__ == '__main__':
-    main()
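
Note: cmvn_converter.py stores the negated mean on the <AddShift> row and the inverse standard deviation on the <Rescale> row, so applying the resulting am.mvn "net" computes (x - mean) / std in a single shift-and-scale pass. A toy numerical check of that identity (values are made up):

    import numpy as np

    mean_stats = np.array([4.0, 10.0])
    var_stats = np.array([10.0, 60.0])
    total_frames = 2
    mean = mean_stats / total_frames
    std = np.sqrt(var_stats / total_frames - mean ** 2)
    add_shift, rescale = -mean, 1.0 / std          # what am.mvn stores
    x = np.array([3.0, 7.0])
    assert np.allclose((x + add_shift) * rescale, (x - mean) / std)
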
diff --git a/egs/aishell2/transformer/utils/combine_cmvn_file.py b/egs/aishell2/transformer/utils/combine_cmvn_file.py
deleted file mode 100755
index c525973..0000000
--- a/egs/aishell2/transformer/utils/combine_cmvn_file.py
+++ /dev/null
@@ -1,72 +0,0 @@
-import argparse
-import json
-import os
-
-import numpy as np
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="combine cmvn file",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--dim",
-        default=80,
-        type=int,
-        help="feature dim",
-    )
-    parser.add_argument(
-        "--cmvn_dir",
-        default=False,
-        required=True,
-        type=str,
-        help="cmvn dir",
-    )
-
-    parser.add_argument(
-        "--nj",
-        default=1,
-        required=True,
-        type=int,
-        help="num of cmvn files",
-    )
-    parser.add_argument(
-        "--output_dir",
-        default=False,
-        required=True,
-        type=str,
-        help="output dir",
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    total_means = np.zeros(args.dim)
-    total_vars = np.zeros(args.dim)
-    total_frames = 0
-
-    cmvn_file = os.path.join(args.output_dir, "cmvn.json")
-
-    for i in range(1, args.nj + 1):
-        with open(os.path.join(args.cmvn_dir, "cmvn.{}.json".format(str(i)))) as fin:
-            cmvn_stats = json.load(fin)
-
-        total_means += np.array(cmvn_stats["mean_stats"])
-        total_vars += np.array(cmvn_stats["var_stats"])
-        total_frames += cmvn_stats["total_frames"]
-
-    cmvn_info = {
-        'mean_stats': list(total_means.tolist()),
-        'var_stats': list(total_vars.tolist()),
-        'total_frames': total_frames
-    }
-    with open(cmvn_file, 'w') as fout:
-        fout.write(json.dumps(cmvn_info))
-
-
-if __name__ == '__main__':
-    main()
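
Note: because mean_stats and var_stats are plain sums over frames, the per-job statistics combine by elementwise addition before the final mean/variance conversion, which is all combine_cmvn_file.py does. A compact sketch of that reduction (file layout as in the deleted script):

    import json, os
    import numpy as np

    def combine_cmvn(cmvn_dir, nj, dim=80):
        # Accumulate per-job sums and frame counts into global totals.
        mean = np.zeros(dim); var = np.zeros(dim); frames = 0
        for i in range(1, nj + 1):
            with open(os.path.join(cmvn_dir, f"cmvn.{i}.json")) as f:
                s = json.load(f)
            mean += np.array(s["mean_stats"])
            var += np.array(s["var_stats"])
            frames += s["total_frames"]
        return {"mean_stats": mean.tolist(), "var_stats": var.tolist(),
                "total_frames": frames}
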
diff --git a/egs/aishell2/transformer/utils/compute_cmvn.py b/egs/aishell2/transformer/utils/compute_cmvn.py
deleted file mode 100755
index 949cc08..0000000
--- a/egs/aishell2/transformer/utils/compute_cmvn.py
+++ /dev/null
@@ -1,104 +0,0 @@
-import argparse
-import json
-import os
-
-import numpy as np
-import torchaudio
-import torchaudio.compliance.kaldi as kaldi
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="computer global cmvn",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--dim",
-        default=80,
-        type=int,
-        help="feature dimension",
-    )
-    parser.add_argument(
-        "--wav_path",
-        default=False,
-        required=True,
-        type=str,
-        help="the path of wav scps",
-    )
-    parser.add_argument(
-        "--idx",
-        default=1,
-        required=True,
-        type=int,
-        help="index",
-    )
-    return parser
-
-
-def compute_fbank(wav_file,
-                  num_mel_bins=80,
-                  frame_length=25,
-                  frame_shift=10,
-                  dither=0.0,
-                  resample_rate=16000,
-                  speed=1.0,
-                  window_type="hamming"):
-    waveform, sample_rate = torchaudio.load(wav_file)
-    if resample_rate != sample_rate:
-        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate,
-                                                  new_freq=resample_rate)(waveform)
-    if speed != 1.0:
-        waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
-            waveform, resample_rate,
-            [['speed', str(speed)], ['rate', str(resample_rate)]]
-        )
-
-    waveform = waveform * (1 << 15)
-    mat = kaldi.fbank(waveform,
-                      num_mel_bins=num_mel_bins,
-                      frame_length=frame_length,
-                      frame_shift=frame_shift,
-                      dither=dither,
-                      energy_floor=0.0,
-                      window_type=window_type,
-                      sample_frequency=resample_rate)
-
-    return mat.numpy()
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    wav_scp_file = os.path.join(args.wav_path, "wav.{}.scp".format(args.idx))
-    cmvn_file = os.path.join(args.wav_path, "cmvn.{}.json".format(args.idx))
-
-    mean_stats = np.zeros(args.dim)
-    var_stats = np.zeros(args.dim)
-    total_frames = 0
-
-    # with ReadHelper('ark:{}'.format(ark_file)) as ark_reader:
-    #     for key, mat in ark_reader:
-    #         mean_stats += np.sum(mat, axis=0)
-    #         var_stats += np.sum(np.square(mat), axis=0)
-    #         total_frames += mat.shape[0]
-    with open(wav_scp_file) as f:
-        lines = f.readlines()
-        for line in lines:
-            _, wav_file = line.strip().split()
-            fbank = compute_fbank(wav_file, num_mel_bins=args.dim)
-            mean_stats += np.sum(fbank, axis=0)
-            var_stats += np.sum(np.square(fbank), axis=0)
-            total_frames += fbank.shape[0]
-
-    cmvn_info = {
-        'mean_stats': list(mean_stats.tolist()),
-        'var_stats': list(var_stats.tolist()),
-        'total_frames': total_frames
-    }
-    with open(cmvn_file, 'w') as fout:
-        fout.write(json.dumps(cmvn_info))
-
-
-if __name__ == '__main__':
-    main()
diff --git a/egs/aishell2/transformer/utils/compute_cmvn.sh b/egs/aishell2/transformer/utils/compute_cmvn.sh
deleted file mode 100755
index 75d88a2..0000000
--- a/egs/aishell2/transformer/utils/compute_cmvn.sh
+++ /dev/null
@@ -1,34 +0,0 @@
-#!/usr/bin/env bash
-
-. ./path.sh || exit 1;
-# Begin configuration section.
-nj=32
-cmd=./utils/run.pl
-feats_dim=80
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-fbankdir=$1
-
-split_dir=${fbankdir}/cmvn/split_${nj};
-mkdir -p $split_dir
-split_scps=""
-for n in $(seq $nj); do
-    split_scps="$split_scps $split_dir/wav.$n.scp"
-done
-utils/split_scp.pl ${fbankdir}/wav.scp $split_scps || exit 1;
-
-logdir=${fbankdir}/cmvn/log
-$cmd JOB=1:$nj $logdir/cmvn.JOB.log \
-    python utils/compute_cmvn.py \
-      --dim ${feats_dim} \
-      --wav_path $split_dir \
-      --idx JOB
-
-python utils/combine_cmvn_file.py --dim ${feats_dim} --cmvn_dir $split_dir --nj $nj --output_dir ${fbankdir}/cmvn
-
-python utils/cmvn_converter.py --cmvn_json ${fbankdir}/cmvn/cmvn.json --am_mvn ${fbankdir}/cmvn/am.mvn
-
-echo "$0: Succeeded compute global cmvn"
diff --git a/egs/aishell2/transformer/utils/compute_fbank.py b/egs/aishell2/transformer/utils/compute_fbank.py
deleted file mode 100755
index 9c3904f..0000000
--- a/egs/aishell2/transformer/utils/compute_fbank.py
+++ /dev/null
@@ -1,171 +0,0 @@
-from kaldiio import WriteHelper
-
-import argparse
-import numpy as np
-import json
-import torch
-import torchaudio
-import torchaudio.compliance.kaldi as kaldi
-
-
-def compute_fbank(wav_file,
-                  num_mel_bins=80,
-                  frame_length=25,
-                  frame_shift=10,
-                  dither=0.0,
-                  resample_rate=16000,
-                  speed=1.0,
-                  window_type="hamming"):
-
-    waveform, sample_rate = torchaudio.load(wav_file)
-    if resample_rate != sample_rate:
-        waveform = torchaudio.transforms.Resample(orig_freq=sample_rate,
-                                                  new_freq=resample_rate)(waveform)
-    if speed != 1.0:
-        waveform, _ = torchaudio.sox_effects.apply_effects_tensor(
-            waveform, resample_rate,
-            [['speed', str(speed)], ['rate', str(resample_rate)]]
-        )
-
-    waveform = waveform * (1 << 15)
-    mat = kaldi.fbank(waveform,
-                      num_mel_bins=num_mel_bins,
-                      frame_length=frame_length,
-                      frame_shift=frame_shift,
-                      dither=dither,
-                      energy_floor=0.0,
-                      window_type=window_type,
-                      sample_frequency=resample_rate)
-
-    return mat.numpy()
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="computer features",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--wav-lists",
-        "-w",
-        default=False,
-        required=True,
-        type=str,
-        help="input wav lists",
-    )
-    parser.add_argument(
-        "--text-files",
-        "-t",
-        default=False,
-        required=True,
-        type=str,
-        help="input text files",
-    )
-    parser.add_argument(
-        "--dims",
-        "-d",
-        default=80,
-        type=int,
-        help="feature dims",
-    )
-    parser.add_argument(
-        "--max-lengths",
-        "-m",
-        default=1500,
-        type=int,
-        help="max frame numbers",
-    )
-    parser.add_argument(
-        "--sample-frequency",
-        "-s",
-        default=16000,
-        type=int,
-        help="sample frequency",
-    )
-    parser.add_argument(
-        "--speed-perturb",
-        "-p",
-        default="1.0",
-        type=str,
-        help="speed perturb",
-    )
-    parser.add_argument(
-        "--ark-index",
-        "-a",
-        default=1,
-        required=True,
-        type=int,
-        help="ark index",
-    )
-    parser.add_argument(
-        "--output-dir",
-        "-o",
-        default=False,
-        required=True,
-        type=str,
-        help="output dir",
-    )
-    parser.add_argument(
-        "--window-type",
-        default="hamming",
-        required=False,
-        type=str,
-        help="window type"
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    ark_file = args.output_dir + "/ark/feats." + str(args.ark_index) + ".ark"
-    scp_file = args.output_dir + "/ark/feats." + str(args.ark_index) + ".scp"
-    text_file = args.output_dir + "/txt/text." + str(args.ark_index) + ".txt"  
-    feats_shape_file = args.output_dir + "/ark/len." + str(args.ark_index)
-    text_shape_file = args.output_dir + "/txt/len." + str(args.ark_index)
-
-    ark_writer = WriteHelper('ark,scp:{},{}'.format(ark_file, scp_file))
-    text_writer = open(text_file, 'w')
-    feats_shape_writer = open(feats_shape_file, 'w')
-    text_shape_writer = open(text_shape_file, 'w')
-
-    speed_perturb_list = args.speed_perturb.split(',')
-    
-    for speed in speed_perturb_list:
-        with open(args.wav_lists, 'r', encoding='utf-8') as wavfile:
-            with open(args.text_files, 'r', encoding='utf-8') as textfile:
-                for wav, text in zip(wavfile, textfile): 
-                    s_w = wav.strip().split()
-                    wav_id = s_w[0]
-                    wav_file = s_w[1]
-
-                    s_t = text.strip().split()
-                    text_id = s_t[0]
-                    txt = s_t[1:]
-                    fbank = compute_fbank(wav_file,
-                                          num_mel_bins=args.dims,
-                                          resample_rate=args.sample_frequency,
-                                          speed=float(speed),
-                                          window_type=args.window_type
-                                          )
-                    feats_dims = fbank.shape[1]
-                    feats_lens = fbank.shape[0]
-                    if feats_lens >= args.max_lengths:
-                        continue
-                    txt_lens = len(txt)
-                    if speed == "1.0":
-                        wav_id_sp = wav_id
-                    else: 
-                        wav_id_sp = wav_id + "_sp" + speed
-
-                    feats_shape_writer.write(wav_id_sp + " " + str(feats_lens) + "," + str(feats_dims) + '\n')
-                    text_shape_writer.write(wav_id_sp + " " + str(txt_lens) + '\n')
-
-                    text_writer.write(wav_id_sp + " " + " ".join(txt) + '\n')
-                    ark_writer(wav_id_sp, fbank)
-                    
-
-if __name__ == '__main__':
-    main()
-
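
Note: compute_fbank wraps torchaudio's Kaldi-compatible fbank; the `waveform * (1 << 15)` step rescales the float waveform from [-1, 1) to 16-bit sample range before feature extraction, matching Kaldi's convention. A minimal usage sketch (the wav path is a placeholder):

    import torchaudio
    import torchaudio.compliance.kaldi as kaldi

    waveform, sample_rate = torchaudio.load("example.wav")  # placeholder path
    waveform = waveform * (1 << 15)   # float [-1, 1) -> 16-bit sample range
    mat = kaldi.fbank(waveform, num_mel_bins=80, frame_length=25,
                      frame_shift=10, dither=0.0, energy_floor=0.0,
                      window_type="hamming", sample_frequency=sample_rate)
    print(mat.shape)                  # (num_frames, 80)
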
diff --git a/egs/aishell2/transformer/utils/compute_fbank.sh b/egs/aishell2/transformer/utils/compute_fbank.sh
deleted file mode 100755
index 8704b31..0000000
--- a/egs/aishell2/transformer/utils/compute_fbank.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/usr/bin/env bash
-
-. ./path.sh || exit 1;
-# Begin configuration section.
-nj=32
-cmd=./utils/run.pl
-
-# feature configuration
-feats_dim=80
-sample_frequency=16000
-speed_perturb="1.0"
-window_type="hamming"
-max_lengths=1500
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-data=$1
-logdir=$2
-fbankdir=$3
-
-[ ! -f $data/wav.scp ] && echo "$0: no such file $data/wav.scp" && exit 1;
-[ ! -f $data/text ] && echo "$0: no such file $data/text" && exit 1;
-
-python utils/split_data.py $data $data $nj
-
-ark_dir=${fbankdir}/ark; mkdir -p ${ark_dir}
-text_dir=${fbankdir}/txt; mkdir -p ${text_dir}
-mkdir -p ${logdir}
-
-$cmd JOB=1:$nj $logdir/make_fbank.JOB.log \
-    python utils/compute_fbank.py -w $data/split${nj}/JOB/wav.scp -t $data/split${nj}/JOB/text \
-        -d $feats_dim -s $sample_frequency -m ${max_lengths} -p ${speed_perturb} -a JOB -o ${fbankdir} \
-        --window-type ${window_type} \
-        || exit 1;
-
-for n in $(seq $nj); do
-    cat ${ark_dir}/feats.$n.scp || exit 1
-done > $fbankdir/feats.scp || exit 1
-
-for n in $(seq $nj); do
-    cat ${text_dir}/text.$n.txt || exit 1
-done > $fbankdir/text || exit 1
-
-for n in $(seq $nj); do
-    cat ${ark_dir}/len.$n || exit 1
-done > $fbankdir/speech_shape || exit 1
-
-for n in $(seq $nj); do
-    cat ${text_dir}/len.$n || exit 1
-done > $fbankdir/text_shape || exit 1
-
-echo "$0: Succeeded compute FBANK features"
diff --git a/egs/aishell2/transformer/utils/compute_wer.py b/egs/aishell2/transformer/utils/compute_wer.py
deleted file mode 100755
index 26a9f49..0000000
--- a/egs/aishell2/transformer/utils/compute_wer.py
+++ /dev/null
@@ -1,157 +0,0 @@
-import os
-import numpy as np
-import sys
-
-def compute_wer(ref_file,
-                hyp_file,
-                cer_detail_file):
-    rst = {
-        'Wrd': 0,
-        'Corr': 0,
-        'Ins': 0,
-        'Del': 0,
-        'Sub': 0,
-        'Snt': 0,
-        'Err': 0.0,
-        'S.Err': 0.0,
-        'wrong_words': 0,
-        'wrong_sentences': 0
-    }
-
-    hyp_dict = {}
-    ref_dict = {}
-    with open(hyp_file, 'r') as hyp_reader:
-        for line in hyp_reader:
-            key = line.strip().split()[0]
-            value = line.strip().split()[1:]
-            hyp_dict[key] = value
-    with open(ref_file, 'r') as ref_reader:
-        for line in ref_reader:
-            key = line.strip().split()[0]
-            value = line.strip().split()[1:]
-            ref_dict[key] = value
-
-    cer_detail_writer = open(cer_detail_file, 'w')
-    for hyp_key in hyp_dict:
-        if hyp_key in ref_dict:
-           out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
-           rst['Wrd'] += out_item['nwords']
-           rst['Corr'] += out_item['cor']
-           rst['wrong_words'] += out_item['wrong']
-           rst['Ins'] += out_item['ins']
-           rst['Del'] += out_item['del']
-           rst['Sub'] += out_item['sub']
-           rst['Snt'] += 1
-           if out_item['wrong'] > 0:
-               rst['wrong_sentences'] += 1
-           cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
-           cer_detail_writer.write("ref:" + '\t' + " ".join(list(map(lambda x: x.lower(), ref_dict[hyp_key]))) + '\n')
-           cer_detail_writer.write("hyp:" + '\t' + " ".join(list(map(lambda x: x.lower(), hyp_dict[hyp_key]))) + '\n')
-
-    if rst['Wrd'] > 0:
-        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
-    if rst['Snt'] > 0:
-        rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)
-
-    cer_detail_writer.write('\n')
-    cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words'])+ " / " + str(rst['Wrd']) +
-                            ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
-    cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
-    cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')
-
-     
-def compute_wer_by_line(hyp,
-                        ref):
-    hyp = list(map(lambda x: x.lower(), hyp))
-    ref = list(map(lambda x: x.lower(), ref))
-
-    len_hyp = len(hyp)
-    len_ref = len(ref)
-
-    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)
-
-    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)
-
-    for i in range(len_hyp + 1):
-        cost_matrix[i][0] = i
-    for j in range(len_ref + 1):
-        cost_matrix[0][j] = j
-
-    for i in range(1, len_hyp + 1):
-        for j in range(1, len_ref + 1):
-            if hyp[i - 1] == ref[j - 1]:
-                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
-            else:
-                substitution = cost_matrix[i - 1][j - 1] + 1
-                insertion = cost_matrix[i - 1][j] + 1
-                deletion = cost_matrix[i][j - 1] + 1
-
-                compare_val = [substitution, insertion, deletion]
-
-                min_val = min(compare_val)
-                operation_idx = compare_val.index(min_val) + 1
-                cost_matrix[i][j] = min_val
-                ops_matrix[i][j] = operation_idx
-
-    match_idx = []
-    i = len_hyp
-    j = len_ref
-    rst = {
-        'nwords': len_ref,
-        'cor': 0,
-        'wrong': 0,
-        'ins': 0,
-        'del': 0,
-        'sub': 0
-    }
-    while i >= 0 or j >= 0:
-        i_idx = max(0, i)
-        j_idx = max(0, j)
-
-        if ops_matrix[i_idx][j_idx] == 0:  # correct
-            if i - 1 >= 0 and j - 1 >= 0:
-                match_idx.append((j - 1, i - 1))
-                rst['cor'] += 1
-
-            i -= 1
-            j -= 1
-
-        elif ops_matrix[i_idx][j_idx] == 2:  # insert
-            i -= 1
-            rst['ins'] += 1
-
-        elif ops_matrix[i_idx][j_idx] == 3:  # delete
-            j -= 1
-            rst['del'] += 1
-
-        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
-            i -= 1
-            j -= 1
-            rst['sub'] += 1
-
-        if i < 0 and j >= 0:
-            rst['del'] += 1
-        elif j < 0 and i >= 0:
-            rst['ins'] += 1
-
-    match_idx.reverse()
-    wrong_cnt = cost_matrix[len_hyp][len_ref]
-    rst['wrong'] = wrong_cnt
-
-    return rst
-
-def print_cer_detail(rst):
-    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
-            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
-            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(rst['cor']/rst['nwords'])
-            + ",cer:" + '{:.2%}'.format(rst['wrong']/rst['nwords']))
-
-if __name__ == '__main__':
-    if len(sys.argv) != 4:
-        print("usage : python compute-wer.py test.ref test.hyp test.wer")
-        sys.exit(0)
-
-    ref_file = sys.argv[1]
-    hyp_file = sys.argv[2]
-    cer_detail_file = sys.argv[3]
-    compute_wer(ref_file, hyp_file, cer_detail_file)
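
Note: the summary lines written above follow the Kaldi convention: %WER = 100 * (S + D + I) / (S + D + C) over the alignment's edit counts, and %SER is the percentage of scored sentences containing any error. A one-line check of the formula with made-up counts:

    def wer(sub, dele, ins, cor):
        # %WER as printed by compute_wer.py's summary line
        return 100.0 * (sub + dele + ins) / (sub + dele + cor)

    print(round(wer(sub=3, dele=1, ins=2, cor=94), 2))  # -> 6.12
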
diff --git a/egs/aishell2/transformer/utils/download_model.py b/egs/aishell2/transformer/utils/download_model.py
deleted file mode 100755
index 70ea179..0000000
--- a/egs/aishell2/transformer/utils/download_model.py
+++ /dev/null
@@ -1,20 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser(
-        description="download model configs",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument("--model_name",
-                        type=str,
-                        default="damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch",
-                        help="model name in ModelScope")
-    args = parser.parse_args()
-
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=args.model_name)
diff --git a/egs/aishell2/transformer/utils/error_rate_zh b/egs/aishell2/transformer/utils/error_rate_zh
deleted file mode 100755
index 6871a07..0000000
--- a/egs/aishell2/transformer/utils/error_rate_zh
+++ /dev/null
@@ -1,370 +0,0 @@
-#!/usr/bin/env python3
-# coding=utf8
-
-# Copyright  2021  Jiayu DU
-
-import sys
-import argparse
-import json
-import logging
-logging.basicConfig(stream=sys.stderr, level=logging.INFO, format='[%(levelname)s] %(message)s')
-
-DEBUG = None
-
-def GetEditType(ref_token, hyp_token):
-    if ref_token == None and hyp_token != None:
-        return 'I'
-    elif ref_token != None and hyp_token == None:
-        return 'D'
-    elif ref_token == hyp_token:
-        return 'C'
-    elif ref_token != hyp_token:
-        return 'S'
-    else:
-        raise RuntimeError
-
-class AlignmentArc:
-    def __init__(self, src, dst, ref, hyp):
-        self.src = src
-        self.dst = dst
-        self.ref = ref
-        self.hyp = hyp
-        self.edit_type = GetEditType(ref, hyp)
-
-def similarity_score_function(ref_token, hyp_token):
-    return 0 if (ref_token == hyp_token) else -1.0
-
-def insertion_score_function(token):
-    return -1.0
-
-def deletion_score_function(token):
-    return -1.0
-
-def EditDistance(
-        ref,
-        hyp, 
-        similarity_score_function = similarity_score_function,
-        insertion_score_function = insertion_score_function,
-        deletion_score_function = deletion_score_function):
-    assert(len(ref) != 0)
-    class DPState:
-        def __init__(self):
-            self.score = -float('inf')
-            # backpointer
-            self.prev_r = None
-            self.prev_h = None
-    
-    def print_search_grid(S, R, H, fstream):
-        print(file=fstream)
-        for r in range(R):
-            for h in range(H):
-                print(F'[{r},{h}]:{S[r][h].score:4.3f}:({S[r][h].prev_r},{S[r][h].prev_h}) ', end='', file=fstream)
-            print(file=fstream)
-
-    R = len(ref) + 1
-    H = len(hyp) + 1
-
-    # Construct DP search space, a (R x H) grid
-    S = [ [] for r in range(R) ]
-    for r in range(R):
-        S[r] = [ DPState() for x in range(H) ]
-
-    # initialize DP search grid origin, S(r = 0, h = 0)
-    S[0][0].score = 0.0
-    S[0][0].prev_r = None
-    S[0][0].prev_h = None
-
-    # initialize REF axis
-    for r in range(1, R):
-        S[r][0].score = S[r-1][0].score + deletion_score_function(ref[r-1])
-        S[r][0].prev_r = r-1
-        S[r][0].prev_h = 0
-
-    # initialize HYP axis
-    for h in range(1, H):
-        S[0][h].score = S[0][h-1].score + insertion_score_function(hyp[h-1])
-        S[0][h].prev_r = 0
-        S[0][h].prev_h = h-1
-
-    best_score = S[0][0].score
-    best_state = (0, 0)
-
-    for r in range(1, R):
-        for h in range(1, H):
-            sub_or_cor_score = similarity_score_function(ref[r-1], hyp[h-1])
-            new_score = S[r-1][h-1].score + sub_or_cor_score
-            if new_score >= S[r][h].score:
-                S[r][h].score = new_score
-                S[r][h].prev_r = r-1
-                S[r][h].prev_h = h-1
-
-            del_score = deletion_score_function(ref[r-1])
-            new_score = S[r-1][h].score + del_score
-            if new_score >= S[r][h].score:
-                S[r][h].score = new_score
-                S[r][h].prev_r = r - 1
-                S[r][h].prev_h = h
-
-            ins_score = insertion_score_function(hyp[h-1])
-            new_score = S[r][h-1].score + ins_score
-            if new_score >= S[r][h].score:
-                S[r][h].score = new_score
-                S[r][h].prev_r = r
-                S[r][h].prev_h = h-1
-
-    best_score = S[R-1][H-1].score
-    best_state = (R-1, H-1)
-
-    if DEBUG:
-        print_search_grid(S, R, H, sys.stderr)
-
-    # Backtracing best alignment path, i.e. a list of arcs
-    # arc = (src, dst, ref, hyp, edit_type)
-    # src/dst = (r, h), where r/h refers to search grid state-id along Ref/Hyp axis
-    best_path = []
-    r, h = best_state[0], best_state[1]
-    prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
-    score = S[r][h].score
-    # loop invariant:
-    #   1. (prev_r, prev_h) -> (r, h) is a "forward arc" on best alignment path
-    #   2. score is the value of point(r, h) on DP search grid
-    while prev_r != None or prev_h != None:
-        src = (prev_r, prev_h)
-        dst = (r, h)
-        if (r == prev_r + 1 and h == prev_h + 1): # Substitution or correct
-            arc = AlignmentArc(src, dst, ref[prev_r], hyp[prev_h])
-        elif (r == prev_r + 1 and h == prev_h): # Deletion
-            arc = AlignmentArc(src, dst, ref[prev_r], None)
-        elif (r == prev_r and h == prev_h + 1): # Insertion
-            arc = AlignmentArc(src, dst, None, hyp[prev_h])
-        else:
-            raise RuntimeError
-        best_path.append(arc)
-        r, h = prev_r, prev_h
-        prev_r, prev_h = S[r][h].prev_r, S[r][h].prev_h
-        score = S[r][h].score
-    
-    best_path.reverse()
-    return (best_path, best_score)
-
-def PrettyPrintAlignment(alignment, stream = sys.stderr):
-    def get_token_str(token):
-        if token == None:
-            return "*"
-        return token
-    
-    def is_double_width_char(ch):
-        if (ch >= '\u4e00') and (ch <= '\u9fa5'): # codepoint ranges for Chinese chars
-            return True
-        # TODO: support other double-width-char language such as Japanese, Korean 
-        else:
-            return False
-    
-    def display_width(token_str):
-        m = 0
-        for c in token_str:
-            if is_double_width_char(c):
-                m += 2
-            else:
-                m += 1
-        return m
-
-    R = '  REF  : '
-    H = '  HYP  : '
-    E = '  EDIT : '
-    for arc in alignment:
-        r = get_token_str(arc.ref)
-        h = get_token_str(arc.hyp)
-        e = arc.edit_type if arc.edit_type != 'C' else ''
-
-        nr, nh, ne = display_width(r), display_width(h), display_width(e)
-        n = max(nr, nh, ne) + 1
-
-        R += r + ' ' * (n-nr)
-        H += h + ' ' * (n-nh)
-        E += e + ' ' * (n-ne)
-
-    print(R, file=stream)
-    print(H, file=stream)
-    print(E, file=stream)
-
-def CountEdits(alignment):
-    c, s, i, d = 0, 0, 0, 0
-    for arc in alignment:
-        if arc.edit_type == 'C':
-            c += 1
-        elif arc.edit_type == 'S':
-            s += 1
-        elif arc.edit_type == 'I':
-            i += 1
-        elif arc.edit_type == 'D':
-            d += 1
-        else:
-            raise RuntimeError
-    return (c, s, i, d)
-
-def ComputeTokenErrorRate(c, s, i, d):
-    return 100.0 * (s + d + i) / (s + d + c)
-
-def ComputeSentenceErrorRate(num_err_utts, num_utts):
-    assert(num_utts != 0)
-    return 100.0 * num_err_utts / num_utts
-
-
-class EvaluationResult:
-    def __init__(self):
-        self.num_ref_utts = 0
-        self.num_hyp_utts = 0
-        self.num_eval_utts = 0 # seen in both ref & hyp
-        self.num_hyp_without_ref = 0
-
-        self.C = 0
-        self.S = 0
-        self.I = 0
-        self.D = 0
-        self.token_error_rate = 0.0
-
-        self.num_utts_with_error = 0
-        self.sentence_error_rate = 0.0
-    
-    def to_json(self):
-        return json.dumps(self.__dict__)
-    
-    def to_kaldi(self):
-        info = (
-            F'%WER {self.token_error_rate:.2f} [ {self.S + self.D + self.I} / {self.C + self.S + self.D}, {self.I} ins, {self.D} del, {self.S} sub ]\n'
-            F'%SER {self.sentence_error_rate:.2f} [ {self.num_utts_with_error} / {self.num_eval_utts} ]\n'
-        )
-        return info
-    
-    def to_sclite(self):
-        return "TODO"
-    
-    def to_espnet(self):
-        return "TODO"
-    
-    def to_summary(self):
-        #return json.dumps(self.__dict__, indent=4)
-        summary = (
-            '==================== Overall Statistics ====================\n'
-            F'num_ref_utts: {self.num_ref_utts}\n'
-            F'num_hyp_utts: {self.num_hyp_utts}\n'
-            F'num_hyp_without_ref: {self.num_hyp_without_ref}\n'
-            F'num_eval_utts: {self.num_eval_utts}\n'
-            F'sentence_error_rate: {self.sentence_error_rate:.2f}%\n'
-            F'token_error_rate: {self.token_error_rate:.2f}%\n'
-            F'token_stats:\n'
-            F'  - tokens:{self.C + self.S + self.D:>7}\n'
-            F'  - edits: {self.S + self.I + self.D:>7}\n'
-            F'  - cor:   {self.C:>7}\n'
-            F'  - sub:   {self.S:>7}\n'
-            F'  - ins:   {self.I:>7}\n'
-            F'  - del:   {self.D:>7}\n'
-            '============================================================\n'
-        )
-        return summary
-
-
-class Utterance:
-    def __init__(self, uid, text):
-        self.uid = uid
-        self.text = text
-
-
-def LoadUtterances(filepath, format):
-    utts = {}
-    if format == 'text': # utt_id word1 word2 ...
-        with open(filepath, 'r', encoding='utf8') as f:
-            for line in f:
-                line = line.strip()
-                if line:
-                    cols = line.split(maxsplit=1)
-                    assert(len(cols) == 2 or len(cols) == 1)
-                    uid = cols[0]
-                    text = cols[1] if len(cols) == 2 else ''
-                    if utts.get(uid) != None:
-                        raise RuntimeError(F'Found duplicated utterence id {uid}')
-                    utts[uid] = Utterance(uid, text)
-    else:
-        raise RuntimeError(F'Unsupported text format {format}')
-    return utts
-
-
-def tokenize_text(text, tokenizer):
-    if tokenizer == 'whitespace':
-        return text.split()
-    elif tokenizer == 'char':
-        return [ ch for ch in ''.join(text.split()) ]
-    else:
-        raise RuntimeError(F'ERROR: Unsupported tokenizer {tokenizer}')
-
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    # optional
-    parser.add_argument('--tokenizer', choices=['whitespace', 'char'], default='whitespace', help='whitespace for WER, char for CER')
-    parser.add_argument('--ref-format', choices=['text'], default='text', help='reference format, first col is utt_id, the rest is text')
-    parser.add_argument('--hyp-format', choices=['text'], default='text', help='hypothesis format, first col is utt_id, the rest is text')
-    # required
-    parser.add_argument('--ref', type=str, required=True, help='input reference file')
-    parser.add_argument('--hyp', type=str, required=True, help='input hypothesis file')
-
-    parser.add_argument('result_file', type=str)
-    args = parser.parse_args()
-    logging.info(args)
-
-    ref_utts = LoadUtterances(args.ref, args.ref_format)
-    hyp_utts = LoadUtterances(args.hyp, args.hyp_format)
-
-    r = EvaluationResult()
-
-    # check valid utterances in hyp that have matched non-empty reference
-    eval_utts = []
-    r.num_hyp_without_ref = 0
-    for uid in sorted(hyp_utts.keys()):
-        if uid in ref_utts.keys(): # TODO: efficiency
-            if ref_utts[uid].text.strip(): # non-empty reference
-                eval_utts.append(uid)
-            else:
-                logging.warn(F'Found {uid} with empty reference, skipping...')
-        else:
-            logging.warn(F'Found {uid} without reference, skipping...')
-            r.num_hyp_without_ref += 1
-
-    r.num_hyp_utts = len(hyp_utts)
-    r.num_ref_utts = len(ref_utts)
-    r.num_eval_utts = len(eval_utts)
-
-    with open(args.result_file, 'w+', encoding='utf8') as fo:
-        for uid in eval_utts:
-            ref = ref_utts[uid]
-            hyp = hyp_utts[uid]
-
-            alignment, score = EditDistance(
-                tokenize_text(ref.text, args.tokenizer),
-                tokenize_text(hyp.text, args.tokenizer)
-            )
-            
-            c, s, i, d = CountEdits(alignment)
-            utt_ter = ComputeTokenErrorRate(c, s, i, d)
-
-            # utt-level evaluation result
-            print(F'{{"uid":{uid}, "score":{score}, "ter":{utt_ter:.2f}, "cor":{c}, "sub":{s}, "ins":{i}, "del":{d}}}', file=fo)
-            PrettyPrintAlignment(alignment, fo)
-
-            r.C += c
-            r.S += s
-            r.I += i
-            r.D += d
-
-            if utt_ter > 0:
-                r.num_utts_with_error += 1
-
-        # corpus level evaluation result
-        r.sentence_error_rate = ComputeSentenceErrorRate(r.num_utts_with_error, r.num_eval_utts)
-        r.token_error_rate = ComputeTokenErrorRate(r.C, r.S, r.I, r.D)
-
-        print(r.to_summary(), file=fo)
-
-    print(r.to_json())
-    print(r.to_kaldi())
diff --git a/egs/aishell2/transformer/utils/extract_embeds.py b/egs/aishell2/transformer/utils/extract_embeds.py
deleted file mode 100755
index 7b817d8..0000000
--- a/egs/aishell2/transformer/utils/extract_embeds.py
+++ /dev/null
@@ -1,47 +0,0 @@
-from transformers import AutoTokenizer, AutoModel, pipeline
-import numpy as np
-import sys
-import os
-import torch
-from kaldiio import WriteHelper
-import re
-text_file_json = sys.argv[1]
-out_ark = sys.argv[2]
-out_scp = sys.argv[3]
-out_shape = sys.argv[4]
-device = int(sys.argv[5])
-model_path = sys.argv[6]
-
-model = AutoModel.from_pretrained(model_path)
-tokenizer = AutoTokenizer.from_pretrained(model_path)
-extractor = pipeline(task="feature-extraction", model=model, tokenizer=tokenizer, device=device)
-
-with open(text_file_json, 'r') as f:
-    js = f.readlines()
-
-
-f_shape = open(out_shape, "w")
-with WriteHelper('ark,scp:{},{}'.format(out_ark, out_scp)) as writer:
-    with torch.no_grad():
-        for idx, line in enumerate(js):
-            id, tokens = line.strip().split(" ", 1)
-            tokens = re.sub(" ", "", tokens.strip())
-            tokens = ' '.join([j for j in tokens])
-            token_num = len(tokens.split(" "))
-            outputs = extractor(tokens)
-            outputs = np.array(outputs)
-            embeds = outputs[0, 1:-1, :]
-
-            token_num_embeds, dim = embeds.shape
-            if token_num == token_num_embeds:
-                writer(id, embeds)
-                shape_line = "{} {},{}\n".format(id, token_num_embeds, dim)
-                f_shape.write(shape_line)
-            else:
-                print("{}, size has changed, {}, {}, {}".format(id, token_num, token_num_embeds, tokens))
-
-
-
-f_shape.close()
-
-
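
Note: extract_embeds.py keeps outputs[0, 1:-1, :] so the special boundary tokens the tokenizer adds ([CLS]/[SEP] for BERT-style models) are dropped and the embedding count matches the character count. A minimal sketch of the same extraction; the model name here is a placeholder, not the recipe's:

    from transformers import AutoModel, AutoTokenizer, pipeline
    import numpy as np

    model_path = "bert-base-chinese"   # placeholder BERT-like model
    extractor = pipeline("feature-extraction",
                         model=AutoModel.from_pretrained(model_path),
                         tokenizer=AutoTokenizer.from_pretrained(model_path))
    outputs = np.array(extractor("你 好"))
    embeds = outputs[0, 1:-1, :]       # strip [CLS]/[SEP] embeddings
    print(embeds.shape)                # (2, hidden_size)
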
diff --git a/egs/aishell2/transformer/utils/filter_scp.pl b/egs/aishell2/transformer/utils/filter_scp.pl
deleted file mode 100755
index 003530d..0000000
--- a/egs/aishell2/transformer/utils/filter_scp.pl
+++ /dev/null
@@ -1,87 +0,0 @@
-#!/usr/bin/env perl
-# Copyright 2010-2012 Microsoft Corporation
-#                     Johns Hopkins University (author: Daniel Povey)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This script takes a list of utterance-ids or any file whose first field
-# of each line is an utterance-id, and filters an scp
-# file (or any file whose "n-th" field is an utterance id), printing
-# out only those lines whose "n-th" field is in id_list. The index of
-# the "n-th" field is 1, by default, but can be changed by using
-# the -f <n> switch
-
-$exclude = 0;
-$field = 1;
-$shifted = 0;
-
-do {
-  $shifted=0;
-  if ($ARGV[0] eq "--exclude") {
-    $exclude = 1;
-    shift @ARGV;
-    $shifted=1;
-  }
-  if ($ARGV[0] eq "-f") {
-    $field = $ARGV[1];
-    shift @ARGV; shift @ARGV;
-    $shifted=1
-  }
-} while ($shifted);
-
-if(@ARGV < 1 || @ARGV > 2) {
-  die "Usage: filter_scp.pl [--exclude] [-f <field-to-filter-on>] id_list [in.scp] > out.scp \n" .
-      "Prints only the input lines whose f'th field (default: first) is in 'id_list'.\n" .
-      "Note: only the first field of each line in id_list matters.  With --exclude, prints\n" .
-      "only the lines that were *not* in id_list.\n" .
-      "Caution: previously, the -f option was interpreted as a zero-based field index.\n" .
-      "If your older scripts (written before Oct 2014) stopped working and you used the\n" .
-      "-f option, add 1 to the argument.\n" .
-      "See also: scripts/filter_scp.pl .\n";
-}
-
-
-$idlist = shift @ARGV;
-open(F, "<$idlist") || die "Could not open id-list file $idlist";
-while(<F>) {
-  @A = split;
-  @A>=1 || die "Invalid id-list file line $_";
-  $seen{$A[0]} = 1;
-}
-
-if ($field == 1) { # Treat this as special case, since it is common.
-  while(<>) {
-    $_ =~ m/\s*(\S+)\s*/ || die "Bad line $_, could not get first field.";
-    # $1 is what we filter on.
-    if ((!$exclude && $seen{$1}) || ($exclude && !defined $seen{$1})) {
-      print $_;
-    }
-  }
-} else {
-  while(<>) {
-    @A = split;
-    @A > 0 || die "Invalid scp file line $_";
-    @A >= $field || die "Invalid scp file line $_";
-    if ((!$exclude && $seen{$A[$field-1]}) || ($exclude && !defined $seen{$A[$field-1]})) {
-      print $_;
-    }
-  }
-}
-
-# tests:
-# the following should print "foo 1"
-# ( echo foo 1; echo bar 2 ) | scripts/filter_scp.pl <(echo foo)
-# the following should print "bar 2".
-# ( echo foo 1; echo bar 2 ) | scripts/filter_scp.pl -f 2 <(echo 2)
diff --git a/egs/aishell2/transformer/utils/fix_data.sh b/egs/aishell2/transformer/utils/fix_data.sh
deleted file mode 100755
index b1a2bb8..0000000
--- a/egs/aishell2/transformer/utils/fix_data.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-
-echo "$0 $@"
-data_dir=$1
-
-if [ ! -f ${data_dir}/wav.scp ]; then
-  echo "$0: wav.scp is not found"
-  exit 1;
-fi
-
-if [ ! -f ${data_dir}/text ]; then
-  echo "$0: text is not found"
-  exit 1;
-fi
-
-
-
-mkdir -p ${data_dir}/.backup
-
-awk '{print $1}' ${data_dir}/wav.scp > ${data_dir}/.backup/wav_id
-awk '{print $1}' ${data_dir}/text > ${data_dir}/.backup/text_id
-
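-# Each id occurs at most once per list, so 'sort | uniq -d' keeps the ids
-# that appear in both, i.e. utterances with both a wav.scp entry and a text.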
-sort ${data_dir}/.backup/wav_id ${data_dir}/.backup/text_id | uniq -d > ${data_dir}/.backup/id
-
-cp ${data_dir}/wav.scp ${data_dir}/.backup/wav.scp
-cp ${data_dir}/text ${data_dir}/.backup/text
-
-mv ${data_dir}/wav.scp ${data_dir}/wav.scp.bak
-mv ${data_dir}/text ${data_dir}/text.bak
-
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/wav.scp.bak | sort -k1,1 -u > ${data_dir}/wav.scp
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text.bak | sort -k1,1 -u > ${data_dir}/text
-
-rm ${data_dir}/wav.scp.bak
-rm ${data_dir}/text.bak
diff --git a/egs/aishell2/transformer/utils/fix_data_feat.sh b/egs/aishell2/transformer/utils/fix_data_feat.sh
deleted file mode 100755
index 84eea36..0000000
--- a/egs/aishell2/transformer/utils/fix_data_feat.sh
+++ /dev/null
@@ -1,52 +0,0 @@
-#!/usr/bin/env bash
-
-echo "$0 $@"
-data_dir=$1
-
-if [ ! -f ${data_dir}/feats.scp ]; then
-  echo "$0: feats.scp is not found"
-  exit 1;
-fi
-
-if [ ! -f ${data_dir}/text ]; then
-  echo "$0: text is not found"
-  exit 1;
-fi
-
-if [ ! -f ${data_dir}/speech_shape ]; then
-  echo "$0: feature lengths is not found"
-  exit 1;
-fi
-
-if [ ! -f ${data_dir}/text_shape ]; then
-  echo "$0: text lengths is not found"
-  exit 1;
-fi
-
-mkdir -p ${data_dir}/.backup
-
-awk '{print $1}' ${data_dir}/feats.scp > ${data_dir}/.backup/wav_id
-awk '{print $1}' ${data_dir}/text > ${data_dir}/.backup/text_id
-
-sort ${data_dir}/.backup/wav_id ${data_dir}/.backup/text_id | uniq -d > ${data_dir}/.backup/id
-
-cp ${data_dir}/feats.scp ${data_dir}/.backup/feats.scp
-cp ${data_dir}/text ${data_dir}/.backup/text
-cp ${data_dir}/speech_shape ${data_dir}/.backup/speech_shape
-cp ${data_dir}/text_shape ${data_dir}/.backup/text_shape
-
-mv ${data_dir}/feats.scp ${data_dir}/feats.scp.bak
-mv ${data_dir}/text ${data_dir}/text.bak
-mv ${data_dir}/speech_shape ${data_dir}/speech_shape.bak
-mv ${data_dir}/text_shape ${data_dir}/text_shape.bak
-
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/feats.scp.bak | sort -k1,1 -u > ${data_dir}/feats.scp
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text.bak | sort -k1,1 -u > ${data_dir}/text
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/speech_shape.bak | sort -k1,1 -u > ${data_dir}/speech_shape
-utils/filter_scp.pl -f 1 ${data_dir}/.backup/id ${data_dir}/text_shape.bak | sort -k1,1 -u > ${data_dir}/text_shape
-
-rm ${data_dir}/feats.scp.bak
-rm ${data_dir}/text.bak
-rm ${data_dir}/speech_shape.bak
-rm ${data_dir}/text_shape.bak
-
diff --git a/egs/aishell2/transformer/utils/gen_ark_list.sh b/egs/aishell2/transformer/utils/gen_ark_list.sh
deleted file mode 100755
index aebf356..0000000
--- a/egs/aishell2/transformer/utils/gen_ark_list.sh
+++ /dev/null
@@ -1,22 +0,0 @@
-#!/usr/bin/env bash
-
-
-# Begin configuration section.
-nj=32
-cmd=./utils/run.pl
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-ark_dir=$1
-txt_dir=$2
-output_dir=$3
-
-[ ! -d ${ark_dir}/ark ] && echo "$0: ark data is required" && exit 1;
-[ ! -d ${txt_dir}/txt ] && echo "$0: txt data is required" && exit 1;
-
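-# Emit one line per job pairing a feature archive with its text shard,
-# e.g. (paths illustrative): exp/ark/feats.1.ark exp/txt/text.1.txt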
-for n in $(seq $nj); do
-  echo "${ark_dir}/ark/feats.$n.ark ${txt_dir}/txt/text.$n.txt" || exit 1
-done > ${output_dir}/ark_txt.scp || exit 1
-
diff --git a/egs/aishell2/transformer/utils/gen_modelscope_configuration.py b/egs/aishell2/transformer/utils/gen_modelscope_configuration.py
deleted file mode 100644
index d340970..0000000
--- a/egs/aishell2/transformer/utils/gen_modelscope_configuration.py
+++ /dev/null
@@ -1,118 +0,0 @@
-import argparse
-import json
-import os
-
-if __name__ == '__main__':
-    parser = argparse.ArgumentParser()
-    parser.add_argument(
-        "--task",
-        type=str,
-        default="auto-speech-recognition",
-        help="task name",
-    )
-    parser.add_argument(
-        "--type",
-        type=str,
-        default="generic-asr",
-    )
-    parser.add_argument(
-        "--am_model_name",
-        type=str,
-        default="model.pb",
-        help="model file name",
-    )
-    parser.add_argument(
-        "--mode",
-        type=str,
-        default="paraformer",
-        help="mode for decoding",
-    )
-    parser.add_argument(
-        "--lang",
-        type=str,
-        default="zh-cn",
-        help="language",
-    )
-    parser.add_argument(
-        "--batch_size",
-        type=int,
-        default=1,
-        help="batch size",
-    )
-    parser.add_argument(
-        "--am_model_config",
-        type=str,
-        default="config.yaml",
-        help="config file",
-    )
-    parser.add_argument(
-        "--mvn_file",
-        type=str,
-        default="am.mvn",
-        help="cmvn file",
-    )
-    parser.add_argument(
-        "--model_name",
-        type=str,
-        help="model name",
-    )
-    parser.add_argument(
-        "--pipeline_type",
-        type=str,
-        default="asr-inference",
-        help="pipeline type",
-    )
-    parser.add_argument(
-        "--vocab_size",
-        type=int,
-        help="vocab_size",
-    )
-    parser.add_argument(
-        "--dataset",
-        type=str,
-        help="dataset name",
-    )
-    parser.add_argument(
-        "--output_dir",
-        type=str,
-        help="output path",
-    )
-    parser.add_argument(
-        "--nat",
-        type=str,
-        default="",
-        help="nat",
-    )
-    parser.add_argument(
-        "--tag",
-        type=str,
-        default="exp1",
-        help="model name tag",
-    )
-    args = parser.parse_args()
-
-    model = {
-        "type": args.type,
-        "am_model_name": args.am_model_name,
-        "model_config": {
-            "type": "pytorch",
-            "code_base": "funasr",
-            "mode": args.mode,
-            "lang": args.lang,
-            "batch_size": args.batch_size,
-            "am_model_config": args.am_model_config,
-            "mvn_file": args.mvn_file,
-            "model": "speech_{}_asr{}-{}-16k-{}-vocab{}-pytorch-{}".format(args.model_name, args.nat, args.lang,
-                                                                           args.dataset, args.vocab_size, args.tag),
-        }
-    }
-    pipeline = {"type": args.pipeline_type}
-    json_dict = {
-        "framework": "pytorch",
-        "task": args.task,
-        "model": model,
-        "pipeline": pipeline,
-    }
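-    # The resulting configuration.json has the shape (values illustrative):
-    #   {"framework": "pytorch", "task": "auto-speech-recognition",
-    #    "model": {...}, "pipeline": {"type": "asr-inference"}}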
-
-    with open(os.path.join(args.output_dir, "configuration.json"), "w") as f:
-        json.dump(json_dict, f, indent=4)
diff --git a/egs/aishell2/transformer/utils/parse_options.sh b/egs/aishell2/transformer/utils/parse_options.sh
deleted file mode 100755
index 71fb9e5..0000000
--- a/egs/aishell2/transformer/utils/parse_options.sh
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
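-#
-# A minimal usage sketch (variable names illustrative):
-#   nj=4                        # default value, may be overridden
-#   . utils/parse_options.sh    # passing "--nj 8" now sets nj=8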
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefned-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-    *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
diff --git a/egs/aishell2/transformer/utils/print_args.py b/egs/aishell2/transformer/utils/print_args.py
deleted file mode 100755
index b0c61e5..0000000
--- a/egs/aishell2/transformer/utils/print_args.py
+++ /dev/null
@@ -1,45 +0,0 @@
-#!/usr/bin/env python
-import sys
-
-
-def get_commandline_args(no_executable=True):
-    extra_chars = [
-        " ",
-        ";",
-        "&",
-        "|",
-        "<",
-        ">",
-        "?",
-        "*",
-        "~",
-        "`",
-        '"',
-        "'",
-        "\\",
-        "{",
-        "}",
-        "(",
-        ")",
-    ]
-
-    # Escape the extra characters for shell
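-    # e.g. an argument  foo bar  (contains a space) is emitted as 'foo bar',
-    # so the printed command line can be pasted back into a shell verbatim.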
-    argv = [
-        arg.replace("'", "'\\''")
-        if all(char not in arg for char in extra_chars)
-        else "'" + arg.replace("'", "'\\''") + "'"
-        for arg in sys.argv
-    ]
-
-    if no_executable:
-        return " ".join(argv[1:])
-    else:
-        return sys.executable + " " + " ".join(argv)
-
-
-def main():
-    print(get_commandline_args())
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/aishell2/transformer/utils/proc_conf_oss.py b/egs/aishell2/transformer/utils/proc_conf_oss.py
deleted file mode 100755
index c4a90c5..0000000
--- a/egs/aishell2/transformer/utils/proc_conf_oss.py
+++ /dev/null
@@ -1,35 +0,0 @@
-from pathlib import Path
-
-import torch
-import yaml
-
-
-class NoAliasSafeDumper(yaml.SafeDumper):
-    # Disable anchor/alias in yaml because it looks ugly
-    def ignore_aliases(self, data):
-        return True
-
-
-def yaml_no_alias_safe_dump(data, stream=None, **kwargs):
-    """Safe-dump in yaml with no anchor/alias"""
-    return yaml.dump(
-        data, stream, allow_unicode=True, Dumper=NoAliasSafeDumper, **kwargs
-    )
-
-
-def gen_conf(file, out_dir):
-    conf = torch.load(file)["config"]
-    conf["oss_bucket"] = "null"
-    print(conf)
-    output_dir = Path(out_dir)
-    output_dir.mkdir(parents=True, exist_ok=True)
-    with (output_dir / "config.yaml").open("w", encoding="utf-8") as f:
-        yaml_no_alias_safe_dump(conf, f, indent=4, sort_keys=False)
-
-
-if __name__ == "__main__":
-    import sys
-
-    in_f = sys.argv[1]
-    out_f = sys.argv[2]
-    gen_conf(in_f, out_f)
diff --git a/egs/aishell2/transformer/utils/proce_text.py b/egs/aishell2/transformer/utils/proce_text.py
deleted file mode 100755
index 9e517a4..0000000
--- a/egs/aishell2/transformer/utils/proce_text.py
+++ /dev/null
@@ -1,31 +0,0 @@
-
-import sys
-import re
-
-in_f = sys.argv[1]
-out_f = sys.argv[2]
-
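-# Strip BPE and markup symbols from each transcript and re-tokenize it into
-# space-separated characters, e.g. (hypothetical line)
-# "utt1 hel@@ lo<unk>" -> "utt1 h e l l o".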
-
-with open(in_f, "r", encoding="utf-8") as f:
-  lines = f.readlines()
-
-with open(out_f, "w", encoding="utf-8") as f:
-  for line in lines:
-    outs = line.strip().split(" ", 1)
-    if len(outs) == 2:
-      idx, text = outs
-      text = re.sub("</s>", "", text)
-      text = re.sub("<s>", "", text)
-      text = re.sub("@@", "", text)
-      text = re.sub("@", "", text)
-      text = re.sub("<unk>", "", text)
-      text = re.sub(" ", "", text)
-      text = text.lower()
-    else:
-      idx = outs[0]
-      text = " "
-
-    text = [x for x in text]
-    text = " ".join(text)
-    out = "{} {}\n".format(idx, text)
-    f.write(out)
diff --git a/egs/aishell2/transformer/utils/run.pl b/egs/aishell2/transformer/utils/run.pl
deleted file mode 100755
index 483f95b..0000000
--- a/egs/aishell2/transformer/utils/run.pl
+++ /dev/null
@@ -1,356 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# In general, doing
-#  run.pl some.log a b c is like running the command a b c in
-# the bash shell, and putting the standard error and output into some.log.
-# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
-#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
-# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
-# If any of the jobs fails, this script will fail.
-
-# A typical example is:
-#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
-# and run.pl will run something like:
-# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
-#
-# Basically it takes the command-line arguments, quotes them
-# as necessary to preserve spaces, and evaluates them with bash.
-# In addition it puts the command line at the top of the log, and
-# the start and end times of the command at the beginning and end.
-# The reason why this is useful is so that we can create a different
-# version of this program that uses a queueing system instead.
-
-#use Data::Dumper;
-
-@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
-
-#print STDERR "COMMAND-LINE: " .  Dumper(\@ARGV) . "\n";
-$job_pick = 'all';
-$max_jobs_run = -1;
-$jobstart = 1;
-$jobend = 1;
-$ignored_opts = ""; # These will be ignored.
-
-# First parse an option like JOB=1:4, and any
-# options that would normally be given to
-# queue.pl, which we will just discard.
-
-for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
-  # allow the JOB=1:n option to be interleaved with the
-  # options to qsub.
-  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
-    # parse any options that would normally go to qsub, but which will be ignored here.
-    my $switch = shift @ARGV;
-    if ($switch eq "-V") {
-      $ignored_opts .= "-V ";
-    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
-      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
-      # if the command appears multiple times uses the smallest option.
-      if ( $max_jobs_run <= 0 ) {
-          $max_jobs_run =  shift @ARGV;
-      } else {
-        my $new_constraint = shift @ARGV;
-        if ( ($new_constraint < $max_jobs_run) ) {
-          $max_jobs_run = $new_constraint;
-        }
-      }
-      
-      if (! ($max_jobs_run > 0)) {
-        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
-      }
-    } else {
-      my $argument = shift @ARGV;
-      if ($argument =~ m/^--/) {
-        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
-      }
-      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
-        $ignored_opts .= "-sync "; # Note: in the
-        # corresponding code in queue.pl it says instead, just "$sync = 1;".
-      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
-        my $argument2 = shift @ARGV;
-        $ignored_opts .= "$switch $argument $argument2 ";
-      } elsif ($switch eq "--gpu") {
-        $using_gpu = $argument;
-      } elsif ($switch eq "--pick") {
-        if($argument =~ m/^(all|failed|incomplete)$/) {
-          $job_pick = $argument;
-        } else {
-          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
-        }
-      } else {
-        # Ignore option.
-        $ignored_opts .= "$switch $argument ";
-      }
-    }
-  }
-  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
-    $jobname = $1;
-    $jobstart = $2;
-    $jobend = $3;
-    if ($jobstart > $jobend) {
-      die "run.pl: invalid job range $ARGV[0]";
-    }
-    if ($jobstart <= 0) {
-      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
-    }
-    shift;
-  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
-    $jobname = $1;
-    $jobstart = $2;
-    $jobend = $2;
-    shift;
-  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
-    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
-  }
-}
-
-# Users found this message confusing so we are removing it.
-# if ($ignored_opts ne "") {
-#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
-# }
-
-if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
-                           # then work out the number of processors if possible,
-                           # and set it based on that.
-  $max_jobs_run = 0;
-  if ($using_gpu) {
-    if (open(P, "nvidia-smi -L |")) {
-      $max_jobs_run++ while (<P>);
-      close(P);
-    }
-    if ($max_jobs_run == 0) {
-      $max_jobs_run = 1;
-      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
-    }
-  } elsif (open(P, "</proc/cpuinfo")) {  # Linux
-    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
-    if ($max_jobs_run == 0) {
-      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
-      $max_jobs_run = 10;  # reasonable default.
-    }
-    close(P);
-  } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
-    while (<P>) {
-      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
-        $max_jobs_run = $1;
-        last;
-      }
-    }
-    close(P);
-    if ($max_jobs_run == 0) {
-      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
-      $max_jobs_run = 10;  # reasonable default.
-    }
-  } else {
-    # allow at most 32 jobs at once, on non-UNIX systems; change this code
-    # if you need to change this default.
-    $max_jobs_run = 32;
-  }
-  # The just-computed value of $max_jobs_run is just the number of processors
-  # (or our best guess); and if it happens that the number of jobs we need to
-  # run is just slightly above $max_jobs_run, it will make sense to increase
-  # $max_jobs_run to equal the number of jobs, so we don't have a small number
-  # of leftover jobs.
-  $num_jobs = $jobend - $jobstart + 1;
-  if (!$using_gpu &&
-      $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
-    $max_jobs_run = $num_jobs;
-  }
-}
-
-sub pick_or_exit {
-  # pick_or_exit ( $logfile ) 
-  # Invoked before each job is started helps to run jobs selectively.
-  #
-  # Given the name of the output logfile decides whether the job must be 
-  # executed (by returning from the subroutine) or not (by terminating the
-  # process calling exit)
-  # 
-  # PRE: $job_pick is a global variable set by command line switch --pick
-  #      and indicates which class of jobs must be executed.
-  #
-  # 1) If a failed job is not executed the process exit code will indicate 
-  #    failure, just as if the task was just executed  and failed.
-  #
-  # 2) If a task is incomplete it will be executed. Incomplete may be either
-  #    a job whose log file does not contain the accounting notes in the end,
-  #    or a job whose log file does not exist.
-  #
-  # 3) If the $job_pick is set to 'all' (default behavior) a task will be
-  #    executed regardless of the result of previous attempts.
-  #
-  # This logic could have been implemented in the main execution loop
-  # but a subroutine to preserve the current level of readability of
-  # that part of the code.
-  #
-  # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
-  #
-  if($job_pick eq 'all'){
-    return; # no need to bother with the previous log
-  }
-  open my $fh, "<", $_[0] or return; # job not executed yet
-  my $log_line;
-  my $cur_line;
-  while ($cur_line = <$fh>) {
-    if( $cur_line =~ m/# Ended \(code .*/ ) {
-      $log_line = $cur_line;
-    }
-  }
-  close $fh;
-  if (! defined($log_line)){
-    return; # incomplete
-  }
-  if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
-    exit(0); # complete
-  } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
-    if ($job_pick !~ m/^(failed|all)$/) {
-      exit(1); # failed but not going to run
-    } else {
-      return; # failed
-    }
-  } elsif ( $log_line =~ m/.*\S.*/ ) {
-    return; # incomplete jobs are always run
-  }
-}
-
-
-$logfile = shift @ARGV;
-
-if (defined $jobname && $logfile !~ m/$jobname/ &&
-    $jobend > $jobstart) {
-  print STDERR "run.pl: you are trying to run a parallel job but "
-    . "you are putting the output into just one log file ($logfile)\n";
-  exit(1);
-}
-
-$cmd = "";
-
-foreach $x (@ARGV) {
-    if ($x =~ m/^\S+$/) { $cmd .=  $x . " "; }
-    elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
-    else { $cmd .= "\"$x\" "; }
-}
-
-#$Data::Dumper::Indent=0;
-$ret = 0;
-$numfail = 0;
-%active_pids=();
-
-use POSIX ":sys_wait_h";
-for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
-  if (scalar(keys %active_pids) >= $max_jobs_run) {
-
-    # Lets wait for a change in any child's status
-    # Then we have to work out which child finished
-    $r = waitpid(-1, 0);
-    $code = $?;
-    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
-    if ( defined $active_pids{$r} ) {
-        $jid=$active_pids{$r};
-        $fail[$jid]=$code;
-        if ($code !=0) { $numfail++;}
-        delete $active_pids{$r};
-        # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
-    } else {
-        die "run.pl: Cannot find the PID of the child process that just finished.";
-    }
-
-    # In theory we could do a non-blocking waitpid over all jobs running just
-    # to find out if only one or more jobs finished during the previous waitpid()
-    # However, we just omit this and will reap the next one in the next pass
-    # through the for(;;) cycle
-  }
-  $childpid = fork();
-  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
-  if ($childpid == 0) { # We're in the child... this branch
-    # executes the job and returns (possibly with an error status).
-    if (defined $jobname) {
-      $cmd =~ s/$jobname/$jobid/g;
-      $logfile =~ s/$jobname/$jobid/g;
-    }
-    # exit if the job does not need to be executed
-    pick_or_exit( $logfile );
-
-    system("mkdir -p `dirname $logfile` 2>/dev/null");
-    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
-    print F "# " . $cmd . "\n";
-    print F "# Started at " . `date`;
-    $starttime = `date +'%s'`;
-    print F "#\n";
-    close(F);
-
-    # Pipe into bash.. make sure we're not using any other shell.
-    open(B, "|bash") || die "run.pl: Error opening shell command";
-    print B "( " . $cmd . ") 2>>$logfile >> $logfile";
-    close(B);                   # If there was an error, exit status is in $?
-    $ret = $?;
-
-    $lowbits = $ret & 127;
-    $highbits = $ret >> 8;
-    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
-    else { $return_str = "code $highbits"; }
-
-    $endtime = `date +'%s'`;
-    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
-    $enddate = `date`;
-    chop $enddate;
-    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
-    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
-    close(F);
-    exit($ret == 0 ? 0 : 1);
-  } else {
-    $pid[$jobid] = $childpid;
-    $active_pids{$childpid} = $jobid;
-    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
-  }
-}
-
-# Now we have submitted all the jobs, lets wait until all the jobs finish
-foreach $child (keys %active_pids) {
-    $jobid=$active_pids{$child};
-    $r = waitpid($pid[$jobid], 0);
-    $code = $?;
-    if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
-    if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # child reaped; record its exit code
-}
-
-# Some sanity checks:
-# The $fail array should not contain undefined codes
-# The number of non-zeros in that array  should be equal to $numfail
-# We cannot do foreach() here, as the JOB ids do not start at zero
-$failed_jids=0;
-for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
-  $job_return = $fail[$jobid];
-  if (not defined $job_return ) {
-    # print Dumper(\@fail);
-
-    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
-      "even after we waited for all jobs to finish" ;
-  }
-  if ($job_return != 0 ){ $failed_jids++;}
-}
-if ($failed_jids != $numfail) {
-  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
-}
-if ($numfail > 0) { $ret = 1; }
-
-if ($ret != 0) {
-  $njobs = $jobend - $jobstart + 1;
-  if ($njobs == 1) {
-    if (defined $jobname) {
-      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
-                                         # that job.
-    }
-    print STDERR "run.pl: job failed, log is in $logfile\n";
-    if ($logfile =~ m/JOB/) {
-      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
-    }
-  }
-  else {
-    $logfile =~ s/$jobname/*/g;
-    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
-  }
-}
-
-
-exit ($ret);
diff --git a/egs/aishell2/transformer/utils/shuffle_list.pl b/egs/aishell2/transformer/utils/shuffle_list.pl
deleted file mode 100755
index a116200..0000000
--- a/egs/aishell2/transformer/utils/shuffle_list.pl
+++ /dev/null
@@ -1,44 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2013  Johns Hopkins University (author: Daniel Povey)
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-if ($ARGV[0] eq "--srand") {
-  $n = $ARGV[1];
-  $n =~ m/\d+/ || die "Bad argument to --srand option: \"$n\"";
-  srand($ARGV[1]);
-  shift;
-  shift;
-} else {
-  srand(0); # Gives inconsistent behavior if we don't seed.
-}
-
-if (@ARGV > 1 || $ARGV[0] =~ m/^-.+/) { # >1 args, or an option we
-  # don't understand.
-  print "Usage: shuffle_list.pl [--srand N] [input file]  > output\n";
-  print "randomizes the order of lines of input.\n";
-  exit(1);
-}
-
-@lines = ();
-while (<>) {
-  push @lines, [ (rand(), $_)] ;
-}
-
-@lines = sort { $a->[0] cmp $b->[0] } @lines;
-foreach $l (@lines) {
-    print $l->[1];
-}
\ No newline at end of file
diff --git a/egs/aishell2/transformer/utils/split_data.py b/egs/aishell2/transformer/utils/split_data.py
deleted file mode 100755
index 060eae6..0000000
--- a/egs/aishell2/transformer/utils/split_data.py
+++ /dev/null
@@ -1,60 +0,0 @@
-import os
-import sys
-import random
-
-
-in_dir = sys.argv[1]
-out_dir = sys.argv[2]
-num_split = sys.argv[3]
-
-
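-# Split a list into `num` nearly equal chunks; the last chunk absorbs the
-# remainder, e.g. split_scp(list(range(10)), 3) ->
-# [[0, 1, 2], [3, 4, 5], [6, 7, 8, 9]].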
-def split_scp(scp, num):
-    assert len(scp) >= num
-    avg = len(scp) // num
-    out = []
-    begin = 0
-
-    for i in range(num):
-        if i == num - 1:
-            out.append(scp[begin:])
-        else:
-            out.append(scp[begin:begin+avg])
-        begin += avg
-
-    return out
-
-
-assert os.path.exists("{}/wav.scp".format(in_dir))
-assert os.path.exists("{}/text".format(in_dir))
-
-with open("{}/wav.scp".format(in_dir), 'r') as infile:
-    wav_list = infile.readlines()
-
-with open("{}/text".format(in_dir), 'r') as infile:
-    text_list = infile.readlines()
-
-assert len(wav_list) == len(text_list)
-
-x = list(zip(wav_list, text_list))
-random.shuffle(x)
-wav_shuffle_list, text_shuffle_list = zip(*x)
-
-num_split = int(num_split)
-wav_split_list = split_scp(wav_shuffle_list, num_split)
-text_split_list = split_scp(text_shuffle_list, num_split)
-
-for idx, wav_list in enumerate(wav_split_list, 1):
-    path = out_dir + "/split" + str(num_split) + "/" + str(idx)
-    if not os.path.exists(path):
-        os.makedirs(path)
-    with open("{}/wav.scp".format(path), 'w') as wav_writer:
-        for line in wav_list:
-            wav_writer.write(line)
-
-for idx, text_list in enumerate(text_split_list, 1):
-    path = out_dir + "/split" + str(num_split) + "/" + str(idx)
-    if not os.path.exists(path):
-        os.makedirs(path)
-    with open("{}/text".format(path), 'w') as text_writer:
-        for line in text_list:
-            text_writer.write(line)
diff --git a/egs/aishell2/transformer/utils/split_scp.pl b/egs/aishell2/transformer/utils/split_scp.pl
deleted file mode 100755
index 0876dcb..0000000
--- a/egs/aishell2/transformer/utils/split_scp.pl
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2010-2011 Microsoft Corporation
-
-# See ../../COPYING for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABLITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This program splits up any kind of .scp or archive-type file.
-# If there is no utt2spk option it will work on any text file and
-# will split it up with an approximately equal number of lines in
-# each.
-# With the --utt2spk option it will work on anything that has the
-# utterance-id as the first entry on each line; the utt2spk file is
-# of the form "utterance speaker" (on each line).
-# It splits it into equal size chunks as far as it can.  If you use the utt2spk
-# option it will make sure these chunks coincide with speaker boundaries.  In
-# this case, if there are more chunks than speakers (and in some other
-# circumstances), some of the resulting chunks will be empty and it will print
-# an error message and exit with nonzero status.
-# You will normally call this like:
-# split_scp.pl scp scp.1 scp.2 scp.3 ...
-# or
-# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
-# Note that you can use this script to split the utt2spk file itself,
-# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
-
-# You can also call the scripts like:
-# split_scp.pl -j 3 0 scp scp.0
-# [note: with this option, it assumes zero-based indexing of the split parts,
-# i.e. the second number must be 0 <= n < num-jobs.]
-
-use warnings;
-
-$num_jobs = 0;
-$job_id = 0;
-$utt2spk_file = "";
-$one_based = 0;
-
-for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
-    if ($ARGV[0] eq "-j") {
-        shift @ARGV;
-        $num_jobs = shift @ARGV;
-        $job_id = shift @ARGV;
-    }
-    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
-        $utt2spk_file=$1;
-        shift;
-    }
-    if ($ARGV[0] eq '--one-based') {
-        $one_based = 1;
-        shift @ARGV;
-    }
-}
-
-if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
-                       $job_id - $one_based >= $num_jobs)) {
-  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
-      ($one_based ? " --one-based" : "") . "'\n"
-}
-
-$one_based
-    and $job_id--;
-
-if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
-    die
-"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
-   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
- ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
-}
-
-$error = 0;
-$inscp = shift @ARGV;
-if ($num_jobs == 0) { # without -j option
-    @OUTPUTS = @ARGV;
-} else {
-    for ($j = 0; $j < $num_jobs; $j++) {
-        if ($j == $job_id) {
-            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
-            else { push @OUTPUTS, "-"; }
-        } else {
-            push @OUTPUTS, "/dev/null";
-        }
-    }
-}
-
-if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
-    while(<$u_fh>) {
-        @A = split;
-        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
-        ($u,$s) = @A;
-        $utt2spk{$u} = $s;
-    }
-    close $u_fh;
-    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
-    @spkrs = ();
-    while(<$i_fh>) {
-        @A = split;
-        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
-        $u = $A[0];
-        $s = $utt2spk{$u};
-        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
-        if(!defined $spk_count{$s}) {
-            push @spkrs, $s;
-            $spk_count{$s} = 0;
-            $spk_data{$s} = [];  # ref to new empty array.
-        }
-        $spk_count{$s}++;
-        push @{$spk_data{$s}}, $_;
-    }
-    # Now split as equally as possible ..
-    # First allocate spks to files by allocating an approximately
-    # equal number of speakers.
-    $numspks = @spkrs;  # number of speakers.
-    $numscps = @OUTPUTS; # number of output files.
-    if ($numspks < $numscps) {
-      die "$0: Refusing to split data because number of speakers $numspks " .
-          "is less than the number of output .scp files $numscps\n";
-    }
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scparray[$scpidx] = []; # [] is array reference.
-    }
-    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
-        $scpidx = int(($spkidx*$numscps) / $numspks);
-        $spk = $spkrs[$spkidx];
-        push @{$scparray[$scpidx]}, $spk;
-        $scpcount[$scpidx] += $spk_count{$spk};
-    }
-
-    # Now will try to reassign beginning + ending speakers
-    # to different scp's and see if it gets more balanced.
-    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
-    # We can show that if considering changing just 2 scp's, we minimize
-    # this by minimizing the squared difference in sizes.  This is
-    # equivalent to minimizing the absolute difference in sizes.  This
-    # shows this method is bound to converge.
-
-    $changed = 1;
-    while($changed) {
-        $changed = 0;
-        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-            # First try to reassign ending spk of this scp.
-            if($scpidx < $numscps-1) {
-                $sz = @{$scparray[$scpidx]};
-                if($sz > 0) {
-                    $spk = $scparray[$scpidx]->[$sz-1];
-                    $count = $spk_count{$spk};
-                    $nutt1 = $scpcount[$scpidx];
-                    $nutt2 = $scpcount[$scpidx+1];
-                    if( abs( ($nutt2+$count) - ($nutt1-$count))
-                        < abs($nutt2 - $nutt1))  { # Would decrease
-                        # size-diff by reassigning spk...
-                        $scpcount[$scpidx+1] += $count;
-                        $scpcount[$scpidx] -= $count;
-                        pop @{$scparray[$scpidx]};
-                        unshift @{$scparray[$scpidx+1]}, $spk;
-                        $changed = 1;
-                    }
-                }
-            }
-            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
-                $spk = $scparray[$scpidx]->[0];
-                $count = $spk_count{$spk};
-                $nutt1 = $scpcount[$scpidx-1];
-                $nutt2 = $scpcount[$scpidx];
-                if( abs( ($nutt2-$count) - ($nutt1+$count))
-                    < abs($nutt2 - $nutt1))  { # Would decrease
-                    # size-diff by reassigning spk...
-                    $scpcount[$scpidx-1] += $count;
-                    $scpcount[$scpidx] -= $count;
-                    shift @{$scparray[$scpidx]};
-                    push @{$scparray[$scpidx-1]}, $spk;
-                    $changed = 1;
-                }
-            }
-        }
-    }
-    # Now print out the files...
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scpfile = $OUTPUTS[$scpidx];
-        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
-                         : open($f_fh, '>&', \*STDOUT)) ||
-            die "$0: Could not open scp file $scpfile for writing: $!\n";
-        $count = 0;
-        if(@{$scparray[$scpidx]} == 0) {
-            print STDERR "$0: error: split_scp.pl producing empty .scp file " .
-                         "$scpfile (too many splits and too few speakers?)\n";
-            $error = 1;
-        } else {
-            foreach $spk ( @{$scparray[$scpidx]} ) {
-                print $f_fh @{$spk_data{$spk}};
-                $count += $spk_count{$spk};
-            }
-            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
-        }
-        close($f_fh);
-    }
-} else {
-   # This block is the "normal" case where there is no --utt2spk
-   # option and we just break into equal size chunks.
-
-    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
-
-    $numscps = @OUTPUTS;  # size of array.
-    @F = ();
-    while(<$i_fh>) {
-        push @F, $_;
-    }
-    $numlines = @F;
-    if($numlines == 0) {
-        print STDERR "$0: error: empty input scp file $inscp\n";
-        $error = 1;
-    }
-    $linesperscp = int( $numlines / $numscps); # the "whole part"..
-    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
-    $remainder = $numlines - ($linesperscp * $numscps);
-    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
-    # [just doing int() rounds down].
-    $n = 0;
-    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
-        $scpfile = $OUTPUTS[$scpidx];
-        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
-                         : open($o_fh, '>&', \*STDOUT)) ||
-            die "$0: Could not open scp file $scpfile for writing: $!\n";
-        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
-            print $o_fh $F[$n++];
-        }
-        close($o_fh) || die "$0: Error closing scp file $scpfile: $!\n";
-    }
-    $n == $numlines || die "$n != $numlines [code error]";
-}
-
-exit ($error);
diff --git a/egs/aishell2/transformer/utils/subset_data_dir_tr_cv.sh b/egs/aishell2/transformer/utils/subset_data_dir_tr_cv.sh
deleted file mode 100755
index e16cebd..0000000
--- a/egs/aishell2/transformer/utils/subset_data_dir_tr_cv.sh
+++ /dev/null
@@ -1,30 +0,0 @@
-#!/usr/bin/env bash
-
-dev_num_utt=1000
-
-echo "$0 $@"
-. utils/parse_options.sh || exit 1;
-
-train_data=$1
-out_dir=$2
-
-[ ! -f ${train_data}/wav.scp ] && echo "$0: no such file ${train_data}/wav.scp" && exit 1;
-[ ! -f ${train_data}/text ] && echo "$0: no such file ${train_data}/text" && exit 1;
-
-mkdir -p ${out_dir}/train && mkdir -p ${out_dir}/dev
-
-cp ${train_data}/wav.scp ${out_dir}/train/wav.scp.bak
-cp ${train_data}/text ${out_dir}/train/text.bak
-
-num_utt=$(wc -l <${out_dir}/train/wav.scp.bak)
-
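-# Shuffling wav.scp and text with the same --srand seed yields the same
-# permutation for both (they have the same line count), so utterances stay
-# paired after the head/tail split below.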
-utils/shuffle_list.pl --srand 1 ${out_dir}/train/wav.scp.bak > ${out_dir}/train/wav.scp.shuf
-head -n ${dev_num_utt} ${out_dir}/train/wav.scp.shuf > ${out_dir}/dev/wav.scp
-tail -n $((${num_utt}-${dev_num_utt})) ${out_dir}/train/wav.scp.shuf > ${out_dir}/train/wav.scp
-
-utils/shuffle_list.pl --srand 1 ${out_dir}/train/text.bak > ${out_dir}/train/text.shuf
-head -n ${dev_num_utt} ${out_dir}/train/text.shuf > ${out_dir}/dev/text
-tail -n $((${num_utt}-${dev_num_utt})) ${out_dir}/train/text.shuf > ${out_dir}/train/text
-
-rm ${out_dir}/train/wav.scp.bak ${out_dir}/train/text.bak
-rm ${out_dir}/train/wav.scp.shuf ${out_dir}/train/text.shuf
diff --git a/egs/aishell2/transformer/utils/text2token.py b/egs/aishell2/transformer/utils/text2token.py
deleted file mode 100755
index 56c3913..0000000
--- a/egs/aishell2/transformer/utils/text2token.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-
-# Copyright 2017 Johns Hopkins University (Shinji Watanabe)
-#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
-
-
-import argparse
-import codecs
-import re
-import sys
-
-is_python2 = sys.version_info[0] == 2
-
-
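-# exist_or_not returns the (start, end) span from match_pos that contains
-# index i, or (None, None) when i lies outside every matched span.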
-def exist_or_not(i, match_pos):
-    start_pos = None
-    end_pos = None
-    for pos in match_pos:
-        if pos[0] <= i < pos[1]:
-            start_pos = pos[0]
-            end_pos = pos[1]
-            break
-
-    return start_pos, end_pos
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="convert raw text to tokenized text",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--nchar",
-        "-n",
-        default=1,
-        type=int,
-        help="number of characters to split, i.e., \
-                        aabb -> a a b b with -n 1 and aa bb with -n 2",
-    )
-    parser.add_argument(
-        "--skip-ncols", "-s", default=0, type=int, help="skip first n columns"
-    )
-    parser.add_argument("--space", default="<space>", type=str, help="space symbol")
-    parser.add_argument(
-        "--non-lang-syms",
-        "-l",
-        default=None,
-        type=str,
-        help="list of non-linguistic symbols, e.g., <NOISE> etc.",
-    )
-    parser.add_argument("text", type=str, default=False, nargs="?", help="input text")
-    parser.add_argument(
-        "--trans_type",
-        "-t",
-        type=str,
-        default="char",
-        choices=["char", "phn"],
-        help="""Transcript type. char/phn. e.g., for TIMIT FADG0_SI1279 -
-                        If trans_type is char,
-                        read from SI1279.WRD file -> "bricks are an alternative"
-                        Else if trans_type is phn,
-                        read from SI1279.PHN file -> "sil b r ih sil k s aa r er n aa l
-                        sil t er n ih sil t ih v sil" """,
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    rs = []
-    if args.non_lang_syms is not None:
-        with codecs.open(args.non_lang_syms, "r", encoding="utf-8") as f:
-            nls = [x.rstrip() for x in f.readlines()]
-            rs = [re.compile(re.escape(x)) for x in nls]
-
-    if args.text:
-        f = codecs.open(args.text, encoding="utf-8")
-    else:
-        f = codecs.getreader("utf-8")(sys.stdin if is_python2 else sys.stdin.buffer)
-
-    sys.stdout = codecs.getwriter("utf-8")(
-        sys.stdout if is_python2 else sys.stdout.buffer
-    )
-    line = f.readline()
-    n = args.nchar
-    while line:
-        x = line.split()
-        print(" ".join(x[: args.skip_ncols]), end=" ")
-        a = " ".join(x[args.skip_ncols :])
-
-        # get all matched positions
-        match_pos = []
-        for r in rs:
-            i = 0
-            while i >= 0:
-                m = r.search(a, i)
-                if m:
-                    match_pos.append([m.start(), m.end()])
-                    i = m.end()
-                else:
-                    break
-
-        if args.trans_type == "phn":
-            a = a.split(" ")
-        else:
-            if len(match_pos) > 0:
-                chars = []
-                i = 0
-                while i < len(a):
-                    start_pos, end_pos = exist_or_not(i, match_pos)
-                    if start_pos is not None:
-                        chars.append(a[start_pos:end_pos])
-                        i = end_pos
-                    else:
-                        chars.append(a[i])
-                        i += 1
-                a = chars
-
-            a = [a[j : j + n] for j in range(0, len(a), n)]
-
-        a_flat = []
-        for z in a:
-            a_flat.append("".join(z))
-
-        a_chars = [z.replace(" ", args.space) for z in a_flat]
-        if args.trans_type == "phn":
-            a_chars = [z.replace("sil", args.space) for z in a_chars]
-        print(" ".join(a_chars))
-        line = f.readline()
-
-
-if __name__ == "__main__":
-    main()
diff --git a/egs/aishell2/transformer/utils/text_tokenize.py b/egs/aishell2/transformer/utils/text_tokenize.py
deleted file mode 100755
index 962ea11..0000000
--- a/egs/aishell2/transformer/utils/text_tokenize.py
+++ /dev/null
@@ -1,106 +0,0 @@
-import re
-import argparse
-
-
-def load_dict(seg_file):
-    seg_dict = {}
-    with open(seg_file, 'r') as infile:
-        for line in infile:
-            s = line.strip().split()
-            key = s[0]
-            value = s[1:]
-            seg_dict[key] = " ".join(value)
-    return seg_dict
-
-
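-# Greedy forward maximum matching: at each position take the longest
-# dictionary word that starts there, e.g. with dic = {"ab": 1, "abc": 1}
-# the text "abcd" segments to ["abc", "d"].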
-def forward_segment(text, dic):
-    word_list = []
-    i = 0
-    while i < len(text):
-        longest_word = text[i]
-        for j in range(i + 1, len(text) + 1):
-            word = text[i:j]
-            if word in dic:
-                if len(word) > len(longest_word):
-                    longest_word = word
-        word_list.append(longest_word)
-        i += len(longest_word)
-    return word_list
-
-
-def tokenize(txt,
-             seg_dict):
-    out_txt = ""
-    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
-    for word in txt:
-        if pattern.match(word):
-            if word in seg_dict:
-                out_txt += seg_dict[word] + " "
-            else:
-                out_txt += "<unk>" + " "
-        else:
-            continue
-    return out_txt.strip()
-
-
-def get_parser():
-    parser = argparse.ArgumentParser(
-        description="text tokenize",
-        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
-    )
-    parser.add_argument(
-        "--text-file",
-        "-t",
-        default=False,
-        required=True,
-        type=str,
-        help="input text",
-    )
-    parser.add_argument(
-        "--seg-file",
-        "-s",
-        default=False,
-        required=True,
-        type=str,
-        help="seg file",
-    )
-    parser.add_argument(
-        "--txt-index",
-        "-i",
-        default=1,
-        required=True,
-        type=int,
-        help="txt index",
-    )
-    parser.add_argument(
-        "--output-dir",
-        "-o",
-        default=False,
-        required=True,
-        type=str,
-        help="output dir",
-    )
-    return parser
-
-
-def main():
-    parser = get_parser()
-    args = parser.parse_args()
-
-    txt_writer = open("{}/text.{}.txt".format(args.output_dir, args.txt_index), 'w')
-    shape_writer = open("{}/len.{}".format(args.output_dir, args.txt_index), 'w')
-    seg_dict = load_dict(args.seg_file)
-    with open(args.text_file, 'r') as infile:
-        for line in infile:
-            s = line.strip().split()
-            text_id = s[0]
-            text_list = forward_segment("".join(s[1:]).lower(), seg_dict)
-            text = tokenize(text_list, seg_dict)
-            lens = len(text.strip().split())
-            txt_writer.write(text_id + " " + text + '\n')
-            shape_writer.write(text_id + " " + str(lens) + '\n')
-
-
-if __name__ == '__main__':
-    main()
-
diff --git a/egs/aishell2/transformer/utils/text_tokenize.sh b/egs/aishell2/transformer/utils/text_tokenize.sh
deleted file mode 100755
index 6b74fef..0000000
--- a/egs/aishell2/transformer/utils/text_tokenize.sh
+++ /dev/null
@@ -1,35 +0,0 @@
-#!/usr/bin/env bash
-
-
-# Begin configuration section.
-nj=32
-cmd=utils/run.pl
-
-echo "$0 $@"
-
-. utils/parse_options.sh || exit 1;
-
-# tokenize configuration
-text_dir=$1
-seg_file=$2
-logdir=$3
-output_dir=$4
-
-txt_dir=${output_dir}/txt; mkdir -p ${output_dir}/txt
-mkdir -p ${logdir}
-
-$cmd JOB=1:$nj $logdir/text_tokenize.JOB.log \
-  python utils/text_tokenize.py -t ${text_dir}/txt/text.JOB.txt \
-      -s ${seg_file} -i JOB -o ${txt_dir} \
-      || exit 1;
-
-# concatenate the text files together.
-for n in $(seq $nj); do
-  cat ${txt_dir}/text.$n.txt || exit 1
-done > ${output_dir}/text || exit 1
-
-for n in $(seq $nj); do
-  cat ${txt_dir}/len.$n || exit 1
-done > ${output_dir}/text_shape || exit 1
-
-echo "$0: Succeeded text tokenize"
diff --git a/egs/aishell2/transformer/utils/textnorm_zh.py b/egs/aishell2/transformer/utils/textnorm_zh.py
deleted file mode 100755
index 79feb83..0000000
--- a/egs/aishell2/transformer/utils/textnorm_zh.py
+++ /dev/null
@@ -1,834 +0,0 @@
-#!/usr/bin/env python3
-# coding=utf-8
-
-# Authors:
-#   2019.5 Zhiyang Zhou (https://github.com/Joee1995/chn_text_norm.git)
-#   2019.9 Jiayu DU
-#
-# requirements:
-#   - python 3.X
-# notes: python 2.X WILL fail or produce misleading results
-
-import sys, os, argparse, codecs, string, re
-
-# ================================================================================ #
-#                                    basic constant
-# ================================================================================ #
-CHINESE_DIGIS = u'零一二三四五六七八九'
-BIG_CHINESE_DIGIS_SIMPLIFIED = u'零壹贰叁肆伍陆柒捌玖'
-BIG_CHINESE_DIGIS_TRADITIONAL = u'零壹貳參肆伍陸柒捌玖'
-SMALLER_BIG_CHINESE_UNITS_SIMPLIFIED = u'十百千万'
-SMALLER_BIG_CHINESE_UNITS_TRADITIONAL = u'拾佰仟萬'
-LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'亿兆京垓秭穰沟涧正载'
-LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'億兆京垓秭穰溝澗正載'
-SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED = u'十百千万'
-SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL = u'拾佰仟萬'
-
-ZERO_ALT = u'〇'
-ONE_ALT = u'幺'
-TWO_ALTS = [u'两', u'兩']
-
-POSITIVE = [u'正', u'正']
-NEGATIVE = [u'负', u'負']
-POINT = [u'点', u'點']
-# PLUS = [u'加', u'加']
-# SIL = [u'杠', u'槓']
-
-FILLER_CHARS = ['呃', '啊']
-ER_WHITELIST = '(儿女|儿子|儿孙|女儿|儿媳|妻儿|' \
-             '胎儿|婴儿|新生儿|婴幼儿|幼儿|少儿|小儿|儿歌|儿童|儿科|托儿所|孤儿|' \
-             '儿戏|儿化|台儿庄|鹿儿岛|正儿八经|吊儿郎当|生儿育女|托儿带女|养儿防老|痴儿呆女|' \
-             '佳儿佳妇|儿怜兽扰|儿无常父|儿不嫌母丑|儿行千里母担忧|儿大不由爷|苏乞儿)'
-
-# 涓枃鏁板瓧绯荤粺绫诲瀷
-NUMBERING_TYPES = ['low', 'mid', 'high']
-
-CURRENCY_NAMES = '(人民币|美元|日元|英镑|欧元|马克|法郎|加拿大元|澳元|港币|先令|芬兰马克|爱尔兰镑|' \
-                 '里拉|荷兰盾|埃斯库多|比塞塔|印尼盾|林吉特|新西兰元|比索|卢布|新加坡元|韩元|泰铢)'
-CURRENCY_UNITS = '((亿|千万|百万|万|千|百)|(亿|千万|百万|万|千|百|)元|(亿|千万|百万|万|千|百|)块|角|毛|分)'
-COM_QUANTIFIERS = '(匹|张|座|回|场|尾|条|个|首|阙|阵|网|炮|顶|丘|棵|只|支|袭|辆|挑|担|颗|壳|窠|曲|墙|群|腔|' \
-                  '砣|座|客|贯|扎|捆|刀|令|打|手|罗|坡|山|岭|江|溪|钟|队|单|双|对|出|口|头|脚|板|跳|枝|件|贴|' \
-                  '针|线|管|名|位|身|堂|课|本|页|家|款|片|王|钱|盏|匹|车|组|批|箱|斗|馏|(千|毫|微)克|' \
-                  '毫|厘|分|寸|尺|丈|里|寻|常|铺|程|(千|分|厘|毫|微)米|撮|勺|合|升|斗|石|盘|碗|碟|叠|桶|笼|盆|' \
-                  '盒|杯|钟|斛|锅|簋|篮|盘|桶|罐|瓶|壶|卮|盏|箩|箱|煲|啖|袋|钵|年|月|日|季|刻|时|周|天|秒|分|旬|' \
-                  '纪|岁|世|更|夜|春|夏|秋|冬|代|伏|辈|丸|泡|粒|颗|幢|堆|条|根|支|道|面|片|张|颗|块)'
-
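-# The three alternation strings above are plain regex groups; later
-# normalization rules in this file interpolate them into larger patterns
-# for matching currency and quantifier expressions.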
-# punctuation information is based on the Zhon project (https://github.com/tsroten/zhon.git)
-CHINESE_PUNC_STOP = '！？｡。'
-CHINESE_PUNC_NON_STOP = '＂＃＄％＆＇（）＊＋，－／：；＜＝＞＠［＼］＾＿｀｛｜｝～｟｠｢｣､　、〃〈〉《》「」『』【】〔〕〖〗〘〙〚〛〜〝〞〟〰〾〿–—‘’‛“”„‟…‧﹏'
-CHINESE_PUNC_LIST = CHINESE_PUNC_STOP + CHINESE_PUNC_NON_STOP
-
-# ================================================================================ #
-#                                    basic class
-# ================================================================================ #
-class ChineseChar(object):
-    """
-    涓枃瀛楃
-    姣忎釜瀛楃瀵瑰簲绠�浣撳拰绻佷綋,
-    e.g. 绠�浣� = '璐�', 绻佷綋 = '璨�'
-    杞崲鏃跺彲杞崲涓虹畝浣撴垨绻佷綋
-    """
-
-    def __init__(self, simplified, traditional):
-        self.simplified = simplified
-        self.traditional = traditional
-        #self.__repr__ = self.__str__
-
-    def __str__(self):
-        return self.simplified or self.traditional or None
-
-    def __repr__(self):
-        return self.__str__()
-
-
-class ChineseNumberUnit(ChineseChar):
-    """
-    涓枃鏁板瓧/鏁颁綅瀛楃
-    姣忎釜瀛楃闄ょ箒绠�浣撳杩樻湁涓�涓澶栫殑澶у啓瀛楃
-    e.g. '闄�' 鍜� '闄�'
-    """
-
-    def __init__(self, power, simplified, traditional, big_s, big_t):
-        super(ChineseNumberUnit, self).__init__(simplified, traditional)
-        self.power = power
-        self.big_s = big_s
-        self.big_t = big_t
-
-    def __str__(self):
-        return '10^{}'.format(self.power)
-
-    @classmethod
-    def create(cls, index, value, numbering_type=NUMBERING_TYPES[1], small_unit=False):
-
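-        # Power per system type, e.g. for 'mid': power = (index + 2) * 4, so
-        # index 0 -> 10^8 ('亿'), index 1 -> 10^12 ('兆'), and so on.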
-        if small_unit:
-            return ChineseNumberUnit(power=index + 1,
-                                     simplified=value[0], traditional=value[1], big_s=value[1], big_t=value[1])
-        elif numbering_type == NUMBERING_TYPES[0]:
-            return ChineseNumberUnit(power=index + 8,
-                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
-        elif numbering_type == NUMBERING_TYPES[1]:
-            return ChineseNumberUnit(power=(index + 2) * 4,
-                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
-        elif numbering_type == NUMBERING_TYPES[2]:
-            return ChineseNumberUnit(power=pow(2, index + 3),
-                                     simplified=value[0], traditional=value[1], big_s=value[0], big_t=value[1])
-        else:
-            raise ValueError(
-                'Counting type should be in {0} ({1} provided).'.format(NUMBERING_TYPES, numbering_type))
-
-
-class ChineseNumberDigit(ChineseChar):
-    """
-    涓枃鏁板瓧瀛楃
-    """
-
-    def __init__(self, value, simplified, traditional, big_s, big_t, alt_s=None, alt_t=None):
-        super(ChineseNumberDigit, self).__init__(simplified, traditional)
-        self.value = value
-        self.big_s = big_s
-        self.big_t = big_t
-        self.alt_s = alt_s
-        self.alt_t = alt_t
-
-    def __str__(self):
-        return str(self.value)
-
-    @classmethod
-    def create(cls, i, v):
-        return ChineseNumberDigit(i, v[0], v[1], v[2], v[3])
-
-
-class ChineseMath(ChineseChar):
-    """
-    涓枃鏁颁綅瀛楃
-    """
-
-    def __init__(self, simplified, traditional, symbol, expression=None):
-        super(ChineseMath, self).__init__(simplified, traditional)
-        self.symbol = symbol
-        self.expression = expression
-        self.big_s = simplified
-        self.big_t = traditional
-
-
-CC, CNU, CND, CM = ChineseChar, ChineseNumberUnit, ChineseNumberDigit, ChineseMath
-
-
-class NumberSystem(object):
-    """
-    涓枃鏁板瓧绯荤粺
-    """
-    pass
-
-
-class MathSymbol(object):
-    """
-    Math symbols for the Chinese number system (traditional/simplified), e.g.
-    positive = ['正', '正']
-    negative = ['负', '負']
-    point = ['点', '點']
-    """
-
-    def __init__(self, positive, negative, point):
-        self.positive = positive
-        self.negative = negative
-        self.point = point
-
-    def __iter__(self):
-        for v in self.__dict__.values():
-            yield v
-
-
-# class OtherSymbol(object):
-#     """
-#     Other symbols.
-#     """
-#
-#     def __init__(self, sil):
-#         self.sil = sil
-#
-#     def __iter__(self):
-#         for v in self.__dict__.values():
-#             yield v
-
-
-# ================================================================================ #
-#                                    basic utils
-# ================================================================================ #
-def create_system(numbering_type=NUMBERING_TYPES[1]):
-    """
-    Create the number system for the given numbering type (default: mid).
-    NUMBERING_TYPES = ['low', 'mid', 'high'] are the Chinese numbering conventions:
-        low:  '兆' = '亿' * '十' = $10^{9}$,  '京' = '兆' * '十', etc.
-        mid:  '兆' = '亿' * '万' = $10^{12}$, '京' = '兆' * '万', etc.
-        high: '兆' = '亿' * '亿' = $10^{16}$, '京' = '兆' * '兆', etc.
-    Returns the corresponding NumberSystem instance.
-    """
-
-    # Chinese number units of '亿' and larger
-    all_larger_units = zip(
-        LARGER_CHINESE_NUMERING_UNITS_SIMPLIFIED, LARGER_CHINESE_NUMERING_UNITS_TRADITIONAL)
-    larger_units = [CNU.create(i, v, numbering_type, False)
-                    for i, v in enumerate(all_larger_units)]
-    # Chinese number units of '十, 百, 千, 万'
-    all_smaller_units = zip(
-        SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED, SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL)
-    smaller_units = [CNU.create(i, v, small_unit=True)
-                     for i, v in enumerate(all_smaller_units)]
-    # digits
-    chinese_digis = zip(CHINESE_DIGIS, CHINESE_DIGIS,
-                        BIG_CHINESE_DIGIS_SIMPLIFIED, BIG_CHINESE_DIGIS_TRADITIONAL)
-    digits = [CND.create(i, v) for i, v in enumerate(chinese_digis)]
-    digits[0].alt_s, digits[0].alt_t = ZERO_ALT, ZERO_ALT
-    digits[1].alt_s, digits[1].alt_t = ONE_ALT, ONE_ALT
-    digits[2].alt_s, digits[2].alt_t = TWO_ALTS[0], TWO_ALTS[1]
-
-    # symbols
-    positive_cn = CM(POSITIVE[0], POSITIVE[1], '+', lambda x: x)
-    negative_cn = CM(NEGATIVE[0], NEGATIVE[1], '-', lambda x: -x)
-    point_cn = CM(POINT[0], POINT[1], '.', lambda x,
-                  y: float(str(x) + '.' + str(y)))
-    # sil_cn = CM(SIL[0], SIL[1], '-', lambda x, y: float(str(x) + '-' + str(y)))
-    system = NumberSystem()
-    system.units = smaller_units + larger_units
-    system.digits = digits
-    system.math = MathSymbol(positive_cn, negative_cn, point_cn)
-    # system.symbols = OtherSymbol(sil_cn)
-    return system
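-
-# Orientation sketch (illustrative, per the docstring above): under the default
-# 'mid' convention, create_system() yields small units 十/百/千/万 (10^1..10^4)
-# plus 亿 = 10^8, 兆 = 10^12, ..., each larger unit 10^4 times the previous one.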
-
-
-def chn2num(chinese_string, numbering_type=NUMBERING_TYPES[1]):
-
-    def get_symbol(char, system):
-        for u in system.units:
-            if char in [u.traditional, u.simplified, u.big_s, u.big_t]:
-                return u
-        for d in system.digits:
-            if char in [d.traditional, d.simplified, d.big_s, d.big_t, d.alt_s, d.alt_t]:
-                return d
-        for m in system.math:
-            if char in [m.traditional, m.simplified]:
-                return m
-
-    def string2symbols(chinese_string, system):
-        int_string, dec_string = chinese_string, ''
-        for p in [system.math.point.simplified, system.math.point.traditional]:
-            if p in chinese_string:
-                int_string, dec_string = chinese_string.split(p)
-                break
-        return [get_symbol(c, system) for c in int_string], \
-               [get_symbol(c, system) for c in dec_string]
-
-    def correct_symbols(integer_symbols, system):
-        """
-        一百八 to 一百八十 (a trailing digit inherits the next lower unit)
-        一亿一千三百万 to 一亿 一千万 三百万 (units are rebased to absolute powers)
-        """
-
-        if integer_symbols and isinstance(integer_symbols[0], CNU):
-            if integer_symbols[0].power == 1:
-                integer_symbols = [system.digits[1]] + integer_symbols
-
-        if len(integer_symbols) > 1:
-            if isinstance(integer_symbols[-1], CND) and isinstance(integer_symbols[-2], CNU):
-                integer_symbols.append(
-                    CNU(integer_symbols[-2].power - 1, None, None, None, None))
-
-        result = []
-        unit_count = 0
-        for s in integer_symbols:
-            if isinstance(s, CND):
-                result.append(s)
-                unit_count = 0
-            elif isinstance(s, CNU):
-                current_unit = CNU(s.power, None, None, None, None)
-                unit_count += 1
-
-            if unit_count == 1:
-                result.append(current_unit)
-            elif unit_count > 1:
-                for i in range(len(result)):
-                    if isinstance(result[-i - 1], CNU) and result[-i - 1].power < current_unit.power:
-                        result[-i - 1] = CNU(result[-i - 1].power +
-                                             current_unit.power, None, None, None, None)
-        return result
-
-    def compute_value(integer_symbols):
-        """
-        Compute the value.
-        When the current unit is larger than the previous one, all previously
-        accumulated values are multiplied by the current unit as well,
-        e.g. '两千万' = 2000 * 10000, not 2000 + 10000.
-        """
-        value = [0]
-        last_power = 0
-        for s in integer_symbols:
-            if isinstance(s, CND):
-                value[-1] = s.value
-            elif isinstance(s, CNU):
-                value[-1] *= pow(10, s.power)
-                if s.power > last_power:
-                    value[:-1] = list(map(lambda v: v *
-                                                    pow(10, s.power), value[:-1]))
-                    last_power = s.power
-                value.append(0)
-        return sum(value)
-
-    system = create_system(numbering_type)
-    int_part, dec_part = string2symbols(chinese_string, system)
-    int_part = correct_symbols(int_part, system)
-    int_str = str(compute_value(int_part))
-    dec_str = ''.join([str(d.value) for d in dec_part])
-    if dec_part:
-        return '{0}.{1}'.format(int_str, dec_str)
-    else:
-        return int_str
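-
-# Usage sketch (hypothetical inputs): chn2num('一万两千三百四十五') -> '12345',
-# chn2num('三点一四') -> '3.14'.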
-
-
-def num2chn(number_string, numbering_type=NUMBERING_TYPES[1], big=False,
-            traditional=False, alt_zero=False, alt_one=False, alt_two=True,
-            use_zeros=True, use_units=True):
-
-    def get_value(value_string, use_zeros=True):
-
-        striped_string = value_string.lstrip('0')
-
-        # record nothing if all zeros
-        if not striped_string:
-            return []
-
-        # record one digits
-        elif len(striped_string) == 1:
-            if use_zeros and len(value_string) != len(striped_string):
-                return [system.digits[0], system.digits[int(striped_string)]]
-            else:
-                return [system.digits[int(striped_string)]]
-
-        # recursively record multiple digits
-        else:
-            result_unit = next(u for u in reversed(
-                system.units) if u.power < len(striped_string))
-            result_string = value_string[:-result_unit.power]
-            return get_value(result_string) + [result_unit] + get_value(striped_string[-result_unit.power:])
-
-    system = create_system(numbering_type)
-
-    int_dec = number_string.split('.')
-    if len(int_dec) == 1:
-        int_string = int_dec[0]
-        dec_string = ""
-    elif len(int_dec) == 2:
-        int_string = int_dec[0]
-        dec_string = int_dec[1]
-    else:
-        raise ValueError(
-            "invalid input num string with more than one dot: {}".format(number_string))
-
-    if use_units and len(int_string) > 1:
-        result_symbols = get_value(int_string)
-    else:
-        result_symbols = [system.digits[int(c)] for c in int_string]
-    dec_symbols = [system.digits[int(c)] for c in dec_string]
-    if dec_string:
-        result_symbols += [system.math.point] + dec_symbols
-
-    if alt_two:
-        liang = CND(2, system.digits[2].alt_s, system.digits[2].alt_t,
-                    system.digits[2].big_s, system.digits[2].big_t)
-        for i, v in enumerate(result_symbols):
-            if isinstance(v, CND) and v.value == 2:
-                next_symbol = result_symbols[i +
-                                             1] if i < len(result_symbols) - 1 else None
-                previous_symbol = result_symbols[i - 1] if i > 0 else None
-                if isinstance(next_symbol, CNU) and isinstance(previous_symbol, (CNU, type(None))):
-                    if next_symbol.power != 1 and ((previous_symbol is None) or (previous_symbol.power != 1)):
-                        result_symbols[i] = liang
-
-    # if big is True, '两' will not be used and `alt_two` has no impact on output
-    if big:
-        attr_name = 'big_'
-        if traditional:
-            attr_name += 't'
-        else:
-            attr_name += 's'
-    else:
-        if traditional:
-            attr_name = 'traditional'
-        else:
-            attr_name = 'simplified'
-
-    result = ''.join([getattr(s, attr_name) for s in result_symbols])
-
-    # if not use_zeros:
-    #     result = result.strip(getattr(system.digits[0], attr_name))
-
-    if alt_zero:
-        result = result.replace(
-            getattr(system.digits[0], attr_name), system.digits[0].alt_s)
-
-    if alt_one:
-        result = result.replace(
-            getattr(system.digits[1], attr_name), system.digits[1].alt_s)
-
-    for i, p in enumerate(POINT):
-        if result.startswith(p):
-            return CHINESE_DIGIS[0] + result
-
-    # ^10, 11, .., 19
-    if len(result) >= 2 and result[1] in [SMALLER_CHINESE_NUMERING_UNITS_SIMPLIFIED[0],
-                                          SMALLER_CHINESE_NUMERING_UNITS_TRADITIONAL[0]] and \
-            result[0] in [CHINESE_DIGIS[1], BIG_CHINESE_DIGIS_SIMPLIFIED[1], BIG_CHINESE_DIGIS_TRADITIONAL[1]]:
-        result = result[1:]
-
-    return result
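-
-# Usage sketch (hypothetical inputs): num2chn('12345') -> '一万两千三百四十五'
-# (alt_two=True turns 二千 into 两千); num2chn('205', big=True) -> '贰佰零伍'.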
-
-
-# ================================================================================ #
-#                          different types of rewriters
-# ================================================================================ #
-class Cardinal:
-    """
-    CARDINAL class.
-    """
-
-    def __init__(self, cardinal=None, chntext=None):
-        self.cardinal = cardinal
-        self.chntext = chntext
-
-    def chntext2cardinal(self):
-        return chn2num(self.chntext)
-
-    def cardinal2chntext(self):
-        return num2chn(self.cardinal)
-
-class Digit:
-    """
-    DIGIT class.
-    """
-
-    def __init__(self, digit=None, chntext=None):
-        self.digit = digit
-        self.chntext = chntext
-
-    # def chntext2digit(self):
-    #     return chn2num(self.chntext)
-
-    def digit2chntext(self):
-        return num2chn(self.digit, alt_two=False, use_units=False)
-
-
-class TelePhone:
-    """
-    TELEPHONE class.
-    """
-
-    def __init__(self, telephone=None, raw_chntext=None, chntext=None):
-        self.telephone = telephone
-        self.raw_chntext = raw_chntext
-        self.chntext = chntext
-
-    # def chntext2telephone(self):
-    #     sil_parts = self.raw_chntext.split('<SIL>')
-    #     self.telephone = '-'.join([
-    #         str(chn2num(p)) for p in sil_parts
-    #     ])
-    #     return self.telephone
-
-    def telephone2chntext(self, fixed=False):
-
-        if fixed:
-            sil_parts = self.telephone.split('-')
-            self.raw_chntext = '<SIL>'.join([
-                num2chn(part, alt_two=False, use_units=False) for part in sil_parts
-            ])
-            self.chntext = self.raw_chntext.replace('<SIL>', '')
-        else:
-            sp_parts = self.telephone.strip('+').split()
-            self.raw_chntext = '<SP>'.join([
-                num2chn(part, alt_two=False, use_units=False) for part in sp_parts
-            ])
-            self.chntext = self.raw_chntext.replace('<SP>', '')
-        return self.chntext
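-
-    # Usage sketch (hypothetical input): telephone='0595-23865596' with fixed=True
-    # reads each dash-separated group digit by digit -> '零五九五二三八六五五九六'.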
-
-
-class Fraction:
-    """
-    FRACTION class.
-    """
-
-    def __init__(self, fraction=None, chntext=None):
-        self.fraction = fraction
-        self.chntext = chntext
-
-    def chntext2fraction(self):
-        denominator, numerator = self.chntext.split('分之')
-        return chn2num(numerator) + '/' + chn2num(denominator)
-
-    def fraction2chntext(self):
-        numerator, denominator = self.fraction.split('/')
-        return num2chn(denominator) + '分之' + num2chn(numerator)
-
-
-class Date:
-    """
-    DATE class.
-    """
-
-    def __init__(self, date=None, chntext=None):
-        self.date = date
-        self.chntext = chntext
-
-    # def chntext2date(self):
-    #     chntext = self.chntext
-    #     try:
-    #         year, other = chntext.strip().split('年', maxsplit=1)
-    #         year = Digit(chntext=year).digit2chntext() + '年'
-    #     except ValueError:
-    #         other = chntext
-    #         year = ''
-    #     if other:
-    #         try:
-    #             month, day = other.strip().split('月', maxsplit=1)
-    #             month = Cardinal(chntext=month).chntext2cardinal() + '月'
-    #         except ValueError:
-    #             day = chntext
-    #             month = ''
-    #         if day:
-    #             day = Cardinal(chntext=day[:-1]).chntext2cardinal() + day[-1]
-    #     else:
-    #         month = ''
-    #         day = ''
-    #     date = year + month + day
-    #     self.date = date
-    #     return self.date
-
-    def date2chntext(self):
-        date = self.date
-        try:
-            year, other = date.strip().split('年', 1)
-            year = Digit(digit=year).digit2chntext() + '年'
-        except ValueError:
-            other = date
-            year = ''
-        if other:
-            try:
-                month, day = other.strip().split('月', 1)
-                month = Cardinal(cardinal=month).cardinal2chntext() + '月'
-            except ValueError:
-                day = date
-                month = ''
-            if day:
-                day = Cardinal(cardinal=day[:-1]).cardinal2chntext() + day[-1]
-        else:
-            month = ''
-            day = ''
-        chntext = year + month + day
-        self.chntext = chntext
-        return self.chntext
-
-
-class Money:
-    """
-    MONEY class.
-    """
-
-    def __init__(self, money=None, chntext=None):
-        self.money = money
-        self.chntext = chntext
-
-    # def chntext2money(self):
-    #     return self.money
-
-    def money2chntext(self):
-        money = self.money
-        pattern = re.compile(r'(\d+(\.\d+)?)')
-        matchers = pattern.findall(money)
-        if matchers:
-            for matcher in matchers:
-                money = money.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext())
-        self.chntext = money
-        return self.chntext
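-
-    # Usage sketch (hypothetical input): Money(money='12块5').money2chntext()
-    # rewrites each number via Cardinal -> '十二块五'.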
-
-
-class Percentage:
-    """
-    PERCENTAGE class.
-    """
-
-    def __init__(self, percentage=None, chntext=None):
-        self.percentage = percentage
-        self.chntext = chntext
-
-    def chntext2percentage(self):
-        return chn2num(self.chntext.strip().strip('百分之')) + '%'
-
-    def percentage2chntext(self):
-        return '百分之' + num2chn(self.percentage.strip().strip('%'))
-
-
-def remove_erhua(text, er_whitelist):
-    """
-    Strip the erhua suffix '儿' except in whitelisted words:
-    他女儿在那边儿 -> 他女儿在那边
-    """
-
-    er_pattern = re.compile(er_whitelist)
-    new_str = ''
-    while re.search('儿', text):
-        a = re.search('儿', text).span()
-        remove_er_flag = 0
-
-        if er_pattern.search(text):
-            b = er_pattern.search(text).span()
-            if b[0] <= a[0]:
-                remove_er_flag = 1
-
-        if remove_er_flag == 0:
-            new_str = new_str + text[0:a[0]]
-            text = text[a[1]:]
-        else:
-            new_str = new_str + text[0:b[1]]
-            text = text[b[1]:]
-
-    text = new_str + text
-    return text
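-
-# Usage sketch (hypothetical input): remove_erhua('他女儿在那边儿', ER_WHITELIST)
-# keeps the whitelisted '女儿' and drops the trailing '儿' -> '他女儿在那边'.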
-
-# ================================================================================ #
-#                            NSW Normalizer
-# ================================================================================ #
-class NSWNormalizer:
-    def __init__(self, raw_text):
-        self.raw_text = '^' + raw_text + '$'
-        self.norm_text = ''
-
-    def _particular(self):
-        text = self.norm_text
-        pattern = re.compile(r"(([a-zA-Z]+)浜�([a-zA-Z]+))")
-        matchers = pattern.findall(text)
-        if matchers:
-            # print('particular')
-            for matcher in matchers:
-                text = text.replace(matcher[0], matcher[1]+'2'+matcher[2], 1)
-        self.norm_text = text
-        return self.norm_text
-
-    def normalize(self):
-        text = self.raw_text
-
-        # Normalize dates
-        pattern = re.compile(r"\D+((([089]\d|(19|20)\d{2})年)?(\d{1,2}月(\d{1,2}[日号])?)?)")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('date')
-            for matcher in matchers:
-                text = text.replace(matcher[0], Date(date=matcher[0]).date2chntext(), 1)
-
-        # Normalize money amounts
-        pattern = re.compile(r"\D+((\d+(\.\d+)?)[多余几]?" + CURRENCY_UNITS + r"(\d" + CURRENCY_UNITS + r"?)?)")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('money')
-            for matcher in matchers:
-                text = text.replace(matcher[0], Money(money=matcher[0]).money2chntext(), 1)
-
-        # Normalize landline/mobile phone numbers
-        # Mobile
-        # http://www.jihaoba.com/news/show/13680
-        # China Mobile: 139, 138, 137, 136, 135, 134, 159, 158, 157, 150, 151, 152, 188, 187, 182, 183, 184, 178, 198
-        # China Unicom: 130, 131, 132, 156, 155, 186, 185, 176
-        # China Telecom: 133, 153, 189, 180, 181, 177
-        pattern = re.compile(r"\D((\+?86 ?)?1([38]\d|5[0-35-9]|7[678]|9[89])\d{8})\D")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('telephone')
-            for matcher in matchers:
-                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(), 1)
-        # Landline
-        pattern = re.compile(r"\D((0(10|2[1-3]|[3-9]\d{2})-?)?[1-9]\d{6,7})\D")
-        matchers = pattern.findall(text)
-        if matchers:
-            # print('fixed telephone')
-            for matcher in matchers:
-                text = text.replace(matcher[0], TelePhone(telephone=matcher[0]).telephone2chntext(fixed=True), 1)
-
-        # Normalize fractions
-        pattern = re.compile(r"(\d+/\d+)")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('fraction')
-            for matcher in matchers:
-                text = text.replace(matcher, Fraction(fraction=matcher).fraction2chntext(), 1)
-
-        # Normalize percentages
-        text = text.replace('％', '%')
-        pattern = re.compile(r"(\d+(\.\d+)?%)")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('percentage')
-            for matcher in matchers:
-                text = text.replace(matcher[0], Percentage(percentage=matcher[0]).percentage2chntext(), 1)
-
-        # Normalize cardinal + quantifier
-        pattern = re.compile(r"(\d+(\.\d+)?)[多余几]?" + COM_QUANTIFIERS)
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('cardinal+quantifier')
-            for matcher in matchers:
-                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
-
-        # Normalize digit strings (serial numbers/IDs)
-        pattern = re.compile(r"(\d{4,32})")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('digit')
-            for matcher in matchers:
-                text = text.replace(matcher, Digit(digit=matcher).digit2chntext(), 1)
-
-        # Normalize plain cardinals
-        pattern = re.compile(r"(\d+(\.\d+)?)")
-        matchers = pattern.findall(text)
-        if matchers:
-            #print('cardinal')
-            for matcher in matchers:
-                text = text.replace(matcher[0], Cardinal(cardinal=matcher[0]).cardinal2chntext(), 1)
-
-        self.norm_text = text
-        self._particular()
-
-        return self.norm_text.lstrip('^').rstrip('$')
-
-
-def nsw_test_case(raw_text):
-    print('I:' + raw_text)
-    print('O:' + NSWNormalizer(raw_text).normalize())
-    print('')
-
-
-def nsw_test():
-    nsw_test_case('固话：0595-23865596或23880880。')
-    nsw_test_case('固话：0595-23865596或23880880。')
-    nsw_test_case('手机：+86 19859213959或15659451527。')
-    nsw_test_case('分数：32477/76391。')
-    nsw_test_case('百分数：80.03%。')
-    nsw_test_case('编号：31520181154418。')
-    nsw_test_case('纯数：2983.07克或12345.60米。')
-    nsw_test_case('日期：1999年2月20日或09年3月15号。')
-    nsw_test_case('金钱：12块5，34.5元，20.1万')
-    nsw_test_case('特殊：O2O或B2C。')
-    nsw_test_case('3456万吨')
-    nsw_test_case('2938个')
-    nsw_test_case('938')
-    nsw_test_case('今天吃了115个小笼包231个馒头')
-    nsw_test_case('有62％的概率')
-
-
-if __name__ == '__main__':
-    #nsw_test()
-
-    p = argparse.ArgumentParser()
-    p.add_argument('ifile', help='input filename, assume utf-8 encoding')
-    p.add_argument('ofile', help='output filename')
-    p.add_argument('--to_upper', action='store_true', help='convert to upper case')
-    p.add_argument('--to_lower', action='store_true', help='convert to lower case')
-    p.add_argument('--has_key', action='store_true', help="input text has Kaldi's key as first field.")
-    p.add_argument('--remove_fillers', type=bool, default=True, help='remove filler chars such as "呃, 啊"')
-    p.add_argument('--remove_erhua', type=bool, default=True, help='remove erhua chars such as "这儿"')
-    p.add_argument('--log_interval', type=int, default=10000, help='log interval in number of processed lines')
-    args = p.parse_args()
-
-    ifile = codecs.open(args.ifile, 'r', 'utf8')
-    ofile = codecs.open(args.ofile, 'w+', 'utf8')
-
-    n = 0
-    for l in ifile:
-        key = ''
-        text = ''
-        if args.has_key:
-            cols = l.split(maxsplit=1)
-            key = cols[0]
-            if len(cols) == 2:
-                text = cols[1].strip()
-            else:
-                text = ''
-        else:
-            text = l.strip()
-
-        # cases
-        if args.to_upper and args.to_lower:
-            sys.stderr.write('text norm: to_upper OR to_lower?\n')
-            exit(1)
-        if args.to_upper:
-            text = text.upper()
-        if args.to_lower:
-            text = text.lower()
-
-        # Filler chars removal
-        if args.remove_fillers:
-            for ch in FILLER_CHARS:
-                text = text.replace(ch, '')
-
-        if args.remove_erhua:
-            text = remove_erhua(text, ER_WHITELIST)
-
-        # NSW(Non-Standard-Word) normalization
-        text = NSWNormalizer(text).normalize()
-
-        # Punctuations removal
-        old_chars = CHINESE_PUNC_LIST + string.punctuation # includes all CN and EN punctuations
-        new_chars = ' ' * len(old_chars)
-        del_chars = ''
-        text = text.translate(str.maketrans(old_chars, new_chars, del_chars))
-
-        #
-        if args.has_key:
-            ofile.write(key + '\t' + text + '\n')
-        else:
-            ofile.write(text + '\n')
-
-        n += 1
-        if n % args.log_interval == 0:
-            sys.stderr.write("text norm: {} lines done.\n".format(n))
-
-    sys.stderr.write("text norm: {} lines done in total.\n".format(n))
-
-    ifile.close()
-    ofile.close()
diff --git a/egs/aishell2/transformerLM/utils b/egs/aishell2/transformerLM/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/aishell2/transformerLM/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/aishell2/transformerLM/utils/parse_options.sh b/egs/aishell2/transformerLM/utils/parse_options.sh
deleted file mode 100755
index 71fb9e5..0000000
--- a/egs/aishell2/transformerLM/utils/parse_options.sh
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env bash
-
-# Copyright 2012  Johns Hopkins University (Author: Daniel Povey);
-#                 Arnab Ghoshal, Karel Vesely
-
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABILITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# Parse command-line options.
-# To be sourced by another script (as in ". parse_options.sh").
-# Option format is: --option-name arg
-# and shell variable "option_name" gets set to value "arg."
-# The exception is --help, which takes no arguments, but prints the
-# $help_message variable (if defined).
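-
-# Usage sketch (hypothetical variables): if the calling script defines nj=4 and
-# cmd=run.pl before sourcing this file, invoking it as "myscript.sh --nj 8"
-# sets nj=8 and leaves cmd unchanged.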
-
-
-###
-### The --config file options have lower priority to command line
-### options, so we need to import them first...
-###
-
-# Now import all the configs specified by command-line, in left-to-right order
-for ((argpos=1; argpos<$#; argpos++)); do
-  if [ "${!argpos}" == "--config" ]; then
-    argpos_plus1=$((argpos+1))
-    config=${!argpos_plus1}
-    [ ! -r $config ] && echo "$0: missing config '$config'" && exit 1
-    . $config  # source the config file.
-  fi
-done
-
-
-###
-### Now we process the command line options
-###
-while true; do
-  [ -z "${1:-}" ] && break;  # break if there are no arguments
-  case "$1" in
-    # If the enclosing script is called with --help option, print the help
-    # message and exit.  Scripts should put help messages in $help_message
-    --help|-h) if [ -z "$help_message" ]; then echo "No help found." 1>&2;
-      else printf "$help_message\n" 1>&2 ; fi;
-      exit 0 ;;
-    --*=*) echo "$0: options to scripts must be of the form --name value, got '$1'"
-      exit 1 ;;
-    # If the first command-line argument begins with "--" (e.g. --foo-bar),
-    # then work out the variable name as $name, which will equal "foo_bar".
-    --*) name=`echo "$1" | sed s/^--// | sed s/-/_/g`;
-      # Next we test whether the variable in question is undefined-- if so it's
-      # an invalid option and we die.  Note: $0 evaluates to the name of the
-      # enclosing script.
-      # The test [ -z ${foo_bar+xxx} ] will return true if the variable foo_bar
-      # is undefined.  We then have to wrap this test inside "eval" because
-      # foo_bar is itself inside a variable ($name).
-      eval '[ -z "${'$name'+xxx}" ]' && echo "$0: invalid option $1" 1>&2 && exit 1;
-
-      oldval="`eval echo \\$$name`";
-      # Work out whether we seem to be expecting a Boolean argument.
-      if [ "$oldval" == "true" ] || [ "$oldval" == "false" ]; then
-        was_bool=true;
-      else
-        was_bool=false;
-      fi
-
-      # Set the variable to the right value-- the escaped quotes make it work if
-      # the option had spaces, like --cmd "queue.pl -sync y"
-      eval $name=\"$2\";
-
-      # Check that Boolean-valued arguments are really Boolean.
-      if $was_bool && [[ "$2" != "true" && "$2" != "false" ]]; then
-        echo "$0: expected \"true\" or \"false\": $1 $2" 1>&2
-        exit 1;
-      fi
-      shift 2;
-      ;;
-  *) break;
-  esac
-done
-
-
-# Check for an empty argument to the --cmd option, which can easily occur as a
-# result of scripting errors.
-[ ! -z "${cmd+xxx}" ] && [ -z "$cmd" ] && echo "$0: empty argument to --cmd option" 1>&2 && exit 1;
-
-
-true; # so this script returns exit code 0.
diff --git a/egs/aishell2/transformerLM/utils/run.pl b/egs/aishell2/transformerLM/utils/run.pl
deleted file mode 100755
index 483f95b..0000000
--- a/egs/aishell2/transformerLM/utils/run.pl
+++ /dev/null
@@ -1,356 +0,0 @@
-#!/usr/bin/env perl
-use warnings; #sed replacement for -w perl parameter
-# In general, doing
-#  run.pl some.log a b c is like running the command a b c in
-# the bash shell, and putting the standard error and output into some.log.
-# To run parallel jobs (backgrounded on the host machine), you can do (e.g.)
-#  run.pl JOB=1:4 some.JOB.log a b c JOB is like running the command a b c JOB
-# and putting it in some.JOB.log, for each one. [Note: JOB can be any identifier].
-# If any of the jobs fails, this script will fail.
-
-# A typical example is:
-#  run.pl some.log my-prog "--opt=foo bar" foo \|  other-prog baz
-# and run.pl will run something like:
-# ( my-prog '--opt=foo bar' foo |  other-prog baz ) >& some.log
-#
-# Basically it takes the command-line arguments, quotes them
-# as necessary to preserve spaces, and evaluates them with bash.
-# In addition it puts the command line at the top of the log, and
-# the start and end times of the command at the beginning and end.
-# The reason why this is useful is so that we can create a different
-# version of this program that uses a queueing system instead.
-
-#use Data::Dumper;
-
-@ARGV < 2 && die "usage: run.pl log-file command-line arguments...";
-
-#print STDERR "COMMAND-LINE: " .  Dumper(\@ARGV) . "\n";
-$job_pick = 'all';
-$max_jobs_run = -1;
-$jobstart = 1;
-$jobend = 1;
-$ignored_opts = ""; # These will be ignored.
-
-# First parse an option like JOB=1:4, and any
-# options that would normally be given to
-# queue.pl, which we will just discard.
-
-for (my $x = 1; $x <= 2; $x++) { # This for-loop is to
-  # allow the JOB=1:n option to be interleaved with the
-  # options to qsub.
-  while (@ARGV >= 2 && $ARGV[0] =~ m:^-:) {
-    # parse any options that would normally go to qsub, but which will be ignored here.
-    my $switch = shift @ARGV;
-    if ($switch eq "-V") {
-      $ignored_opts .= "-V ";
-    } elsif ($switch eq "--max-jobs-run" || $switch eq "-tc") {
-      # we do support the option --max-jobs-run n, and its GridEngine form -tc n.
-      # if the option appears multiple times, the smallest value is used.
-      if ( $max_jobs_run <= 0 ) {
-          $max_jobs_run =  shift @ARGV;
-      } else {
-        my $new_constraint = shift @ARGV;
-        if ( ($new_constraint < $max_jobs_run) ) {
-          $max_jobs_run = $new_constraint;
-        }
-      }
-      
-      if (! ($max_jobs_run > 0)) {
-        die "run.pl: invalid option --max-jobs-run $max_jobs_run";
-      }
-    } else {
-      my $argument = shift @ARGV;
-      if ($argument =~ m/^--/) {
-        print STDERR "run.pl: WARNING: suspicious argument '$argument' to $switch; starts with '-'\n";
-      }
-      if ($switch eq "-sync" && $argument =~ m/^[yY]/) {
-        $ignored_opts .= "-sync "; # Note: in the
-        # corresponding code in queue.pl it says instead, just "$sync = 1;".
-      } elsif ($switch eq "-pe") { # e.g. -pe smp 5
-        my $argument2 = shift @ARGV;
-        $ignored_opts .= "$switch $argument $argument2 ";
-      } elsif ($switch eq "--gpu") {
-        $using_gpu = $argument;
-      } elsif ($switch eq "--pick") {
-        if($argument =~ m/^(all|failed|incomplete)$/) {
-          $job_pick = $argument;
-        } else {
-          print STDERR "run.pl: ERROR: --pick argument must be one of 'all', 'failed' or 'incomplete'"
-        }
-      } else {
-        # Ignore option.
-        $ignored_opts .= "$switch $argument ";
-      }
-    }
-  }
-  if ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+):(\d+)$/) { # e.g. JOB=1:20
-    $jobname = $1;
-    $jobstart = $2;
-    $jobend = $3;
-    if ($jobstart > $jobend) {
-      die "run.pl: invalid job range $ARGV[0]";
-    }
-    if ($jobstart <= 0) {
-      die "run.pl: invalid job range $ARGV[0], start must be strictly positive (this is required for GridEngine compatibility).";
-    }
-    shift;
-  } elsif ($ARGV[0] =~ m/^([\w_][\w\d_]*)+=(\d+)$/) { # e.g. JOB=1.
-    $jobname = $1;
-    $jobstart = $2;
-    $jobend = $2;
-    shift;
-  } elsif ($ARGV[0] =~ m/.+\=.*\:.*$/) {
-    print STDERR "run.pl: Warning: suspicious first argument to run.pl: $ARGV[0]\n";
-  }
-}
-
-# Users found this message confusing so we are removing it.
-# if ($ignored_opts ne "") {
-#   print STDERR "run.pl: Warning: ignoring options \"$ignored_opts\"\n";
-# }
-
-if ($max_jobs_run == -1) { # If --max-jobs-run option not set,
-                           # then work out the number of processors if possible,
-                           # and set it based on that.
-  $max_jobs_run = 0;
-  if ($using_gpu) {
-    if (open(P, "nvidia-smi -L |")) {
-      $max_jobs_run++ while (<P>);
-      close(P);
-    }
-    if ($max_jobs_run == 0) {
-      $max_jobs_run = 1;
-      print STDERR "run.pl: Warning: failed to detect number of GPUs from nvidia-smi, using ${max_jobs_run}\n";
-    }
-  } elsif (open(P, "</proc/cpuinfo")) {  # Linux
-    while (<P>) { if (m/^processor/) { $max_jobs_run++; } }
-    if ($max_jobs_run == 0) {
-      print STDERR "run.pl: Warning: failed to detect any processors from /proc/cpuinfo\n";
-      $max_jobs_run = 10;  # reasonable default.
-    }
-    close(P);
-  } elsif (open(P, "sysctl -a |")) {  # BSD/Darwin
-    while (<P>) {
-      if (m/hw\.ncpu\s*[:=]\s*(\d+)/) { # hw.ncpu = 4, or hw.ncpu: 4
-        $max_jobs_run = $1;
-        last;
-      }
-    }
-    close(P);
-    if ($max_jobs_run == 0) {
-      print STDERR "run.pl: Warning: failed to detect any processors from sysctl -a\n";
-      $max_jobs_run = 10;  # reasonable default.
-    }
-  } else {
-    # allow at most 32 jobs at once, on non-UNIX systems; change this code
-    # if you need to change this default.
-    $max_jobs_run = 32;
-  }
-  # The just-computed value of $max_jobs_run is just the number of processors
-  # (or our best guess); and if it happens that the number of jobs we need to
-  # run is just slightly above $max_jobs_run, it will make sense to increase
-  # $max_jobs_run to equal the number of jobs, so we don't have a small number
-  # of leftover jobs.
-  $num_jobs = $jobend - $jobstart + 1;
-  if (!$using_gpu &&
-      $num_jobs > $max_jobs_run && $num_jobs < 1.4 * $max_jobs_run) {
-    $max_jobs_run = $num_jobs;
-  }
-}
-
-sub pick_or_exit {
-  # pick_or_exit ( $logfile ) 
-  # Invoked before each job is started; helps to run jobs selectively.
-  #
-  # Given the name of the output logfile decides whether the job must be 
-  # executed (by returning from the subroutine) or not (by terminating the
-  # process calling exit)
-  # 
-  # PRE: $job_pick is a global variable set by command line switch --pick
-  #      and indicates which class of jobs must be executed.
-  #
-  # 1) If a failed job is not executed the process exit code will indicate 
-  #    failure, just as if the task was just executed  and failed.
-  #
-  # 2) If a task is incomplete it will be executed. Incomplete may be either
-  #    a job whose log file does not contain the accounting notes in the end,
-  #    or a job whose log file does not exist.
-  #
-  # 3) If the $job_pick is set to 'all' (default behavior) a task will be
-  #    executed regardless of the result of previous attempts.
-  #
-  # This logic could have been implemented in the main execution loop,
-  # but a subroutine is used to preserve the current level of readability
-  # of that part of the code.
-  #
-  # Alexandre Felipe, (o.alexandre.felipe@gmail.com) 14th of August of 2020
-  #
-  if($job_pick eq 'all'){
-    return; # no need to bother with the previous log
-  }
-  open my $fh, "<", $_[0] or return; # job not executed yet
-  my $log_line;
-  my $cur_line;
-  while ($cur_line = <$fh>) {
-    if( $cur_line =~ m/# Ended \(code .*/ ) {
-      $log_line = $cur_line;
-    }
-  }
-  close $fh;
-  if (! defined($log_line)){
-    return; # incomplete
-  }
-  if ( $log_line =~ m/# Ended \(code 0\).*/ ) {
-    exit(0); # complete
-  } elsif ( $log_line =~ m/# Ended \(code \d+(; signal \d+)?\).*/ ){
-    if ($job_pick !~ m/^(failed|all)$/) {
-      exit(1); # failed but not going to run
-    } else {
-      return; # failed
-    }
-  } elsif ( $log_line =~ m/.*\S.*/ ) {
-    return; # incomplete jobs are always run
-  }
-}
-
-
-$logfile = shift @ARGV;
-
-if (defined $jobname && $logfile !~ m/$jobname/ &&
-    $jobend > $jobstart) {
-  print STDERR "run.pl: you are trying to run a parallel job but "
-    . "you are putting the output into just one log file ($logfile)\n";
-  exit(1);
-}
-
-$cmd = "";
-
-foreach $x (@ARGV) {
-    if ($x =~ m/^\S+$/) { $cmd .=  $x . " "; }
-    elsif ($x =~ m:\":) { $cmd .= "'$x' "; }
-    else { $cmd .= "\"$x\" "; }
-}
-
-#$Data::Dumper::Indent=0;
-$ret = 0;
-$numfail = 0;
-%active_pids=();
-
-use POSIX ":sys_wait_h";
-for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
-  if (scalar(keys %active_pids) >= $max_jobs_run) {
-
-    # Let's wait for a change in any child's status
-    # Then we have to work out which child finished
-    $r = waitpid(-1, 0);
-    $code = $?;
-    if ($r < 0 ) { die "run.pl: Error waiting for child process"; } # should never happen.
-    if ( defined $active_pids{$r} ) {
-        $jid=$active_pids{$r};
-        $fail[$jid]=$code;
-        if ($code !=0) { $numfail++;}
-        delete $active_pids{$r};
-        # print STDERR "Finished: $r/$jid " .  Dumper(\%active_pids) . "\n";
-    } else {
-        die "run.pl: Cannot find the PID of the child process that just finished.";
-    }
-
-    # In theory we could do a non-blocking waitpid over all jobs running just
-    # to find out if only one or more jobs finished during the previous waitpid()
-    # However, we just omit this and will reap the next one in the next pass
-    # through the for(;;) cycle
-  }
-  $childpid = fork();
-  if (!defined $childpid) { die "run.pl: Error forking in run.pl (writing to $logfile)"; }
-  if ($childpid == 0) { # We're in the child... this branch
-    # executes the job and returns (possibly with an error status).
-    if (defined $jobname) {
-      $cmd =~ s/$jobname/$jobid/g;
-      $logfile =~ s/$jobname/$jobid/g;
-    }
-    # exit if the job does not need to be executed
-    pick_or_exit( $logfile );
-
-    system("mkdir -p `dirname $logfile` 2>/dev/null");
-    open(F, ">$logfile") || die "run.pl: Error opening log file $logfile";
-    print F "# " . $cmd . "\n";
-    print F "# Started at " . `date`;
-    $starttime = `date +'%s'`;
-    print F "#\n";
-    close(F);
-
-    # Pipe into bash.. make sure we're not using any other shell.
-    open(B, "|bash") || die "run.pl: Error opening shell command";
-    print B "( " . $cmd . ") 2>>$logfile >> $logfile";
-    close(B);                   # If there was an error, exit status is in $?
-    $ret = $?;
-
-    $lowbits = $ret & 127;
-    $highbits = $ret >> 8;
-    if ($lowbits != 0) { $return_str = "code $highbits; signal $lowbits" }
-    else { $return_str = "code $highbits"; }
-
-    $endtime = `date +'%s'`;
-    open(F, ">>$logfile") || die "run.pl: Error opening log file $logfile (again)";
-    $enddate = `date`;
-    chop $enddate;
-    print F "# Accounting: time=" . ($endtime - $starttime) . " threads=1\n";
-    print F "# Ended ($return_str) at " . $enddate . ", elapsed time " . ($endtime-$starttime) . " seconds\n";
-    close(F);
-    exit($ret == 0 ? 0 : 1);
-  } else {
-    $pid[$jobid] = $childpid;
-    $active_pids{$childpid} = $jobid;
-    # print STDERR "Queued: " .  Dumper(\%active_pids) . "\n";
-  }
-}
-
-# Now we have submitted all the jobs, let's wait until all the jobs finish
-foreach $child (keys %active_pids) {
-    $jobid=$active_pids{$child};
-    $r = waitpid($pid[$jobid], 0);
-    $code = $?;
-    if ($r == -1) { die "run.pl: Error waiting for child process"; } # should never happen.
-    if ($r != 0) { $fail[$jobid]=$code; $numfail++ if $code!=0; } # record this job's exit status.
-}
-
-# Some sanity checks:
-# The $fail array should not contain undefined codes
-# The number of non-zeros in that array  should be equal to $numfail
-# We cannot do foreach() here, as the JOB ids do not start at zero
-$failed_jids=0;
-for ($jobid = $jobstart; $jobid <= $jobend; $jobid++) {
-  $job_return = $fail[$jobid];
-  if (not defined $job_return ) {
-    # print Dumper(\@fail);
-
-    die "run.pl: Sanity check failed: we have indication that some jobs are running " .
-      "even after we waited for all jobs to finish" ;
-  }
-  if ($job_return != 0 ){ $failed_jids++;}
-}
-if ($failed_jids != $numfail) {
-  die "run.pl: Sanity check failed: cannot find out how many jobs failed ($failed_jids x $numfail)."
-}
-if ($numfail > 0) { $ret = 1; }
-
-if ($ret != 0) {
-  $njobs = $jobend - $jobstart + 1;
-  if ($njobs == 1) {
-    if (defined $jobname) {
-      $logfile =~ s/$jobname/$jobstart/; # only one numbered job, so replace name with
-                                         # that job.
-    }
-    print STDERR "run.pl: job failed, log is in $logfile\n";
-    if ($logfile =~ m/JOB/) {
-      print STDERR "run.pl: probably you forgot to put JOB=1:\$nj in your script.";
-    }
-  }
-  else {
-    $logfile =~ s/$jobname/*/g;
-    print STDERR "run.pl: $numfail / $njobs failed, log is in $logfile\n";
-  }
-}
-
-
-exit ($ret);
diff --git a/egs/aishell2/transformerLM/utils/split_scp.pl b/egs/aishell2/transformerLM/utils/split_scp.pl
deleted file mode 100755
index 0876dcb..0000000
--- a/egs/aishell2/transformerLM/utils/split_scp.pl
+++ /dev/null
@@ -1,246 +0,0 @@
-#!/usr/bin/env perl
-
-# Copyright 2010-2011 Microsoft Corporation
-
-# See ../../COPYING for clarification regarding multiple authors
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#  http://www.apache.org/licenses/LICENSE-2.0
-#
-# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
-# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
-# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
-# MERCHANTABILITY OR NON-INFRINGEMENT.
-# See the Apache 2 License for the specific language governing permissions and
-# limitations under the License.
-
-
-# This program splits up any kind of .scp or archive-type file.
-# If there is no utt2spk option it will work on any text  file and
-# will split it up with an approximately equal number of lines in
-# each one.
-# With the --utt2spk option it will work on anything that has the
-# utterance-id as the first entry on each line; the utt2spk file is
-# of the form "utterance speaker" (on each line).
-# It splits it into equal size chunks as far as it can.  If you use the utt2spk
-# option it will make sure these chunks coincide with speaker boundaries.  In
-# this case, if there are more chunks than speakers (and in some other
-# circumstances), some of the resulting chunks will be empty and it will print
-# an error message and exit with nonzero status.
-# You will normally call this like:
-# split_scp.pl scp scp.1 scp.2 scp.3 ...
-# or
-# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
-# Note that you can use this script to split the utt2spk file itself,
-# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
-
-# You can also call the scripts like:
-# split_scp.pl -j 3 0 scp scp.0
-# [note: with this option, it assumes zero-based indexing of the split parts,
-# i.e. the second number must be 0 <= n < num-jobs.]
-
-use warnings;
-
-$num_jobs = 0;
-$job_id = 0;
-$utt2spk_file = "";
-$one_based = 0;
-
-for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
-    if ($ARGV[0] eq "-j") {
-        shift @ARGV;
-        $num_jobs = shift @ARGV;
-        $job_id = shift @ARGV;
-    }
-    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
-        $utt2spk_file=$1;
-        shift;
-    }
-    if ($ARGV[0] eq '--one-based') {
-        $one_based = 1;
-        shift @ARGV;
-    }
-}
-
-if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
-                       $job_id - $one_based >= $num_jobs)) {
-  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
-      ($one_based ? " --one-based" : "") . "'\n"
-}
-
-$one_based
-    and $job_id--;
-
-if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
-    die
-"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
-   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
- ... where 0 <= job-id < num-jobs, or 1 <= job-id <= num-jobs if --one-based.\n";
-}
-
-$error = 0;
-$inscp = shift @ARGV;
-if ($num_jobs == 0) { # without -j option
-    @OUTPUTS = @ARGV;
-} else {
-    for ($j = 0; $j < $num_jobs; $j++) {
-        if ($j == $job_id) {
-            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
-            else { push @OUTPUTS, "-"; }
-        } else {
-            push @OUTPUTS, "/dev/null";
-        }
-    }
-}
-
-if ($utt2spk_file ne "") {  # We have the --utt2spk option...
-    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
-    while(<$u_fh>) {
-        @A = split;
-        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
-        ($u,$s) = @A;
-        $utt2spk{$u} = $s;
-    }
-    close $u_fh;
-    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
-    @spkrs = ();
-    while(<$i_fh>) {
-        @A = split;
-        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
-        $u = $A[0];
-        $s = $utt2spk{$u};
-        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
-        if(!defined $spk_count{$s}) {
-            push @spkrs, $s;
-            $spk_count{$s} = 0;
-            $spk_data{$s} = [];  # ref to new empty array.
-        }
-        $spk_count{$s}++;
-        push @{$spk_data{$s}}, $_;
-    }
-    # Now split as equally as possible ..
-    # First allocate spks to files by allocating an approximately
-    # equal number of speakers.
-    $numspks = @spkrs;  # number of speakers.
-    $numscps = @OUTPUTS; # number of output files.
-    if ($numspks < $numscps) {
-      die "$0: Refusing to split data because number of speakers $numspks " .
-          "is less than the number of output .scp files $numscps\n";
-    }
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scparray[$scpidx] = []; # [] is array reference.
-    }
-    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
-        $scpidx = int(($spkidx*$numscps) / $numspks);
-        $spk = $spkrs[$spkidx];
-        push @{$scparray[$scpidx]}, $spk;
-        $scpcount[$scpidx] += $spk_count{$spk};
-    }
-
-    # Now will try to reassign beginning + ending speakers
-    # to different scp's and see if it gets more balanced.
-    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
-    # We can show that if considering changing just 2 scp's, we minimize
-    # this by minimizing the squared difference in sizes.  This is
-    # equivalent to minimizing the absolute difference in sizes.  This
-    # shows this method is bound to converge.
-
-    $changed = 1;
-    while($changed) {
-        $changed = 0;
-        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-            # First try to reassign ending spk of this scp.
-            if($scpidx < $numscps-1) {
-                $sz = @{$scparray[$scpidx]};
-                if($sz > 0) {
-                    $spk = $scparray[$scpidx]->[$sz-1];
-                    $count = $spk_count{$spk};
-                    $nutt1 = $scpcount[$scpidx];
-                    $nutt2 = $scpcount[$scpidx+1];
-                    if( abs( ($nutt2+$count) - ($nutt1-$count))
-                        < abs($nutt2 - $nutt1))  { # Would decrease
-                        # size-diff by reassigning spk...
-                        $scpcount[$scpidx+1] += $count;
-                        $scpcount[$scpidx] -= $count;
-                        pop @{$scparray[$scpidx]};
-                        unshift @{$scparray[$scpidx+1]}, $spk;
-                        $changed = 1;
-                    }
-                }
-            }
-            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
-                $spk = $scparray[$scpidx]->[0];
-                $count = $spk_count{$spk};
-                $nutt1 = $scpcount[$scpidx-1];
-                $nutt2 = $scpcount[$scpidx];
-                if( abs( ($nutt2-$count) - ($nutt1+$count))
-                    < abs($nutt2 - $nutt1))  { # Would decrease
-                    # size-diff by reassigning spk...
-                    $scpcount[$scpidx-1] += $count;
-                    $scpcount[$scpidx] -= $count;
-                    shift @{$scparray[$scpidx]};
-                    push @{$scparray[$scpidx-1]}, $spk;
-                    $changed = 1;
-                }
-            }
-        }
-    }
-    # Now print out the files...
-    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
-        $scpfile = $OUTPUTS[$scpidx];
-        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
-                         : open($f_fh, '>&', \*STDOUT)) ||
-            die "$0: Could not open scp file $scpfile for writing: $!\n";
-        $count = 0;
-        if(@{$scparray[$scpidx]} == 0) {
-            print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
-                         "$scpfile (too many splits and too few speakers?)\n";
-            $error = 1;
-        } else {
-            foreach $spk ( @{$scparray[$scpidx]} ) {
-                print $f_fh @{$spk_data{$spk}};
-                $count += $spk_count{$spk};
-            }
-            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
-        }
-        close($f_fh);
-    }
-} else {
-   # This block is the "normal" case where there is no --utt2spk
-   # option and we just break into equal size chunks.
-
-    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
-
-    $numscps = @OUTPUTS;  # size of array.
-    @F = ();
-    while(<$i_fh>) {
-        push @F, $_;
-    }
-    $numlines = @F;
-    if($numlines == 0) {
-        print STDERR "$0: error: empty input scp file $inscp\n";
-        $error = 1;
-    }
-    $linesperscp = int( $numlines / $numscps); # the "whole part"..
-    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
-    $remainder = $numlines - ($linesperscp * $numscps);
-    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
-    # [just doing int() rounds down].
-    $n = 0;
-    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
-        $scpfile = $OUTPUTS[$scpidx];
-        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
-                         : open($o_fh, '>&', \*STDOUT)) ||
-            die "$0: Could not open scp file $scpfile for writing: $!\n";
-        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
-            print $o_fh $F[$n++];
-        }
-        close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
-    }
-    $n == $numlines || die "$n != $numlines [code error]";
-}
-
-exit ($error);
diff --git a/egs/librispeech/conformer/run.sh b/egs/librispeech/conformer/run.sh
index 21a8d33..5296340 100755
--- a/egs/librispeech/conformer/run.sh
+++ b/egs/librispeech/conformer/run.sh
@@ -97,7 +97,7 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
 token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
diff --git a/egs/librispeech/rnnt/README.md b/egs/librispeech/rnnt/README.md
new file mode 100644
index 0000000..c64e2e8
--- /dev/null
+++ b/egs/librispeech/rnnt/README.md
@@ -0,0 +1,18 @@
+
+# Streaming RNN-T Result
+
+## Training Config
+- 8 GPUs (Tesla V100)
+- Feature info: 80-dim fbank, global CMVN, speed perturbation (0.9, 1.0, 1.1), SpecAugment
+- Train config: conf/train_conformer_rnnt_unified.yaml
+- Chunk config: chunk size 16, 1 left chunk
+- LM config: no LM used
+- Model size: 90M
+
+## Results (WER)
+- Decode config: conf/decode_rnnt_conformer_streaming.yaml
+
+|      testset   |  WER(%) |
+|:--------------:|:-------:|
+|    test_clean  |   3.58  |
+|    test_other  |   9.27  |
diff --git a/egs/librispeech/rnnt/conf/decode_rnnt_conformer_streaming.yaml b/egs/librispeech/rnnt/conf/decode_rnnt_conformer_streaming.yaml
new file mode 100644
index 0000000..26e43c6
--- /dev/null
+++ b/egs/librispeech/rnnt/conf/decode_rnnt_conformer_streaming.yaml
@@ -0,0 +1,8 @@
+# The conformer transducer decoding configuration from @jeon30c
+beam_size: 10
+simu_streaming: false
+streaming: true
+chunk_size: 16
+left_context: 16
+right_context: 0
+
diff --git a/egs/librispeech/rnnt/conf/train_conformer_rnnt_unified.yaml b/egs/librispeech/rnnt/conf/train_conformer_rnnt_unified.yaml
new file mode 100644
index 0000000..39c6a6a
--- /dev/null
+++ b/egs/librispeech/rnnt/conf/train_conformer_rnnt_unified.yaml
@@ -0,0 +1,98 @@
+encoder: chunk_conformer
+encoder_conf:
+      activation_type: swish
+      time_reduction_factor: 2
+      unified_model_training: true
+      default_chunk_size: 16
+      jitter_range: 4
+      left_chunk_size: 1
+      embed_vgg_like: false
+      subsampling_factor: 4
+      linear_units: 2048
+      output_size: 512
+      attention_heads: 8
+      dropout_rate: 0.5
+      positional_dropout_rate: 0.5
+      attention_dropout_rate: 0.5
+      cnn_module_kernel: 15
+      num_blocks: 12    
+
+# decoder related
+rnnt_decoder: rnnt
+rnnt_decoder_conf:
+    embed_size: 512
+    hidden_size: 512
+    embed_dropout_rate: 0.5
+    dropout_rate: 0.5
+    use_embed_mask: true
+
+joint_network_conf:
+    joint_space_size: 512
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+# Auxiliary CTC
+model: rnnt_unified
+model_conf:
+    auxiliary_ctc_weight: 0.0
+
+# minibatch related
+use_amp: true
+
+# optimization related
+accum_grad: 4
+grad_clip: 5
+max_epoch: 100
+val_scheduler_criterion:
+    - valid
+    - loss
+best_model_criterion:
+-   - valid
+    - cer_transducer_chunk
+    - min
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+   lr: 0.001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 25000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 40
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 50
+    num_time_mask: 5
+
+dataset_conf:
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 1024
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 10000
+    num_workers: 8
+
+log_interval: 50
+normalize: None
diff --git a/egs/librispeech/rnnt/local/data_prep.sh b/egs/librispeech/rnnt/local/data_prep.sh
new file mode 100755
index 0000000..c939b5f
--- /dev/null
+++ b/egs/librispeech/rnnt/local/data_prep.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Vassil Panayotov
+#           2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <src-dir> <dst-dir>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+  exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed
+if ! which flac >&/dev/null; then
+   echo "Please install 'flac' on ALL worker nodes!"
+   exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+
+mkdir -p $dst || exit 1
+
+[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
+[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
+
+
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
+trans=$dst/text; [[ -f "$trans" ]] && rm $trans
+
+for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
+  reader=$(basename $reader_dir)
+  if ! [ $reader -eq $reader ]; then  # not integer.
+    echo "$0: unexpected subdirectory name $reader"
+    exit 1
+  fi
+
+  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
+    chapter=$(basename $chapter_dir)
+    if ! [ "$chapter" -eq "$chapter" ]; then
+      echo "$0: unexpected chapter-subdirectory name $chapter"
+      exit 1
+    fi
+
+    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
+      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp || exit 1
+
+    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+    [ ! -f  $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+    cat $chapter_trans >>$trans
+  done
+done
+
+echo "$0: successfully prepared data in $dst"
+
+exit 0
diff --git a/egs/librispeech/rnnt/local/download_and_untar.sh b/egs/librispeech/rnnt/local/download_and_untar.sh
new file mode 100755
index 0000000..fe322e4
--- /dev/null
+++ b/egs/librispeech/rnnt/local/download_and_untar.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+  echo "          train-clean-100, train-clean-360, train-other-500."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0
+fi
+
+
+# Sizes of the archive files in bytes. These are from some older versions.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new contains the archive file sizes of the final release. Some of these are for
+# parts we probably won't download.
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tar.gz
+  else
+    echo "$data/$part.tar.gz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tar.gz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1
+  fi
+  full_url=$url/$part.tar.gz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  if ! wget -P $data --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1
+  fi
+fi
+
+if ! tar -C $data -xvzf $data/$part.tar.gz; then
+  echo "$0: error un-tarring archive $data/$part.tar.gz"
+  exit 1
+fi
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+  rm $data/$part.tar.gz
+fi
diff --git a/egs/librispeech/rnnt/local/spm_encode.py b/egs/librispeech/rnnt/local/spm_encode.py
new file mode 100755
index 0000000..9e1c15f
--- /dev/null
+++ b/egs/librispeech/rnnt/local/spm_encode.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True,
+                        help="sentencepiece model to use for encoding")
+    parser.add_argument("--inputs", nargs="+", default=['-'],
+                        help="input files to filter/encode")
+    parser.add_argument("--outputs", nargs="+", default=['-'],
+                        help="path to save encoded outputs")
+    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+    parser.add_argument("--min-len", type=int, metavar="N",
+                        help="filter sentence pairs with fewer than N tokens")
+    parser.add_argument("--max-len", type=int, metavar="N",
+                        help="filter sentence pairs with more than N tokens")
+    args = parser.parse_args()
+
+    assert len(args.inputs) == len(args.outputs), \
+        "number of input and output paths should match"
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.output_format == "piece":
+        def encode(l):
+            return sp.EncodeAsPieces(l)
+    elif args.output_format == "id":
+        def encode(l):
+            return list(map(str, sp.EncodeAsIds(l)))
+    else:
+        raise NotImplementedError
+
+    if args.min_len is not None or args.max_len is not None:
+        def valid(line):
+            return (
+                (args.min_len is None or len(line) >= args.min_len) and
+                (args.max_len is None or len(line) <= args.max_len)
+            )
+    else:
+        def valid(lines):
+            return True
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            stack.enter_context(open(input, "r", encoding="utf-8"))
+            if input != "-" else sys.stdin
+            for input in args.inputs
+        ]
+        outputs = [
+            stack.enter_context(open(output, "w", encoding="utf-8"))
+            if output != "-" else sys.stdout
+            for output in args.outputs
+        ]
+
+        stats = {
+            "num_empty": 0,
+            "num_filtered": 0,
+        }
+
+        def encode_line(line):
+            line = line.strip()
+            if len(line) > 0:
+                line = encode(line)
+                if valid(line):
+                    return line
+                else:
+                    stats["num_filtered"] += 1
+            else:
+                stats["num_empty"] += 1
+            return None
+
+        for i, lines in enumerate(zip(*inputs), start=1):
+            enc_lines = list(map(encode_line, lines))
+            if not any(enc_line is None for enc_line in enc_lines):
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    print(" ".join(enc_line), file=output_h)
+            if i % 10000 == 0:
+                print("processed {} lines".format(i), file=sys.stderr)
+
+        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech/rnnt/local/spm_train.py b/egs/librispeech/rnnt/local/spm_train.py
new file mode 100755
index 0000000..134a0b1
--- /dev/null
+++ b/egs/librispeech/rnnt/local/spm_train.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+import sys
+
+import sentencepiece as spm
+
+if __name__ == "__main__":
+    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
diff --git a/egs/librispeech/rnnt/path.sh b/egs/librispeech/rnnt/path.sh
new file mode 100644
index 0000000..7972642
--- /dev/null
+++ b/egs/librispeech/rnnt/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/librispeech/rnnt/run.sh b/egs/librispeech/rnnt/run.sh
new file mode 100755
index 0000000..efccd85
--- /dev/null
+++ b/egs/librispeech/rnnt/run.sh
@@ -0,0 +1,222 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="" #feature output dictionary
+exp_dir=""
+lang=en
+token_type=bpe
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=0
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=
+data_url=www.openslr.org/resources/12
+
+# bpe model
+nbpe=5000
+bpemode=unigram
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode; it will exit on:
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
+set -e
+set -u
+set -o pipefail
+
+train_set=train_960
+valid_set=dev
+test_sets="test_clean test_other dev_clean dev_other"
+
+asr_config=conf/train_conformer_rnnt_unified.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_rnnt_conformer_streaming.yaml
+inference_asr_model=valid.cer_transducer_chunk.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    for part in dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500; do
+        local/download_and_untar.sh ${raw_data} ${data_url} ${part}
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    for x in dev-clean dev-other test-clean test-other train-clean-100 train-clean-360 train-other-500; do
+        local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+    done
+    mkdir -p $feats_dir/data/$valid_set
+    dev_sets="dev_clean dev_other"
+    for file in wav.scp text; do
+        ( for f in $dev_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$valid_set/$file || exit 1;
+    done
+    mkdir -p $feats_dir/data/$train_set
+    train_sets="train_clean_100 train_clean_360 train_other_500"
+    for file in wav.scp text; do
+        ( for f in $train_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$train_set/$file || exit 1;
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} ${feats_dir}/data/${train_set}
+fi
+
+token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+    echo "stage 2: Dictionary and Json Data Preparation"
+    mkdir -p ${feats_dir}/data/lang_char/
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
+    local/spm_train.py --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
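+    # encode the training text with the freshly trained BPE model and append the unique pieces to the token list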
+    local/spm_encode.py --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=./ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --split_with_space false \
+                --bpemodel ${bpemodel}.model \
+                --token_type $token_type \
+                --token_list $token_list \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --multiprocessing_distributed true \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        done
+        wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
diff --git a/egs/librispeech/rnnt/utils b/egs/librispeech/rnnt/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/librispeech/rnnt/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/librispeech_100h/conformer/run.sh b/egs/librispeech_100h/conformer/run.sh
index e85c8eb..41df5a4 100755
--- a/egs/librispeech_100h/conformer/run.sh
+++ b/egs/librispeech_100h/conformer/run.sh
@@ -93,7 +93,7 @@
 
 if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
     echo "stage 1: Feature and CMVN Generation"
-    utils/compute_cmvn.sh ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config $asr_config --scale 1.0
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
 fi
 
 token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
@@ -120,7 +120,7 @@
 
 # ASR Training Stage
 world_size=$gpu_num  # run on one machine
-if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4; then
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
     echo "stage 4: ASR Training"
     mkdir -p ${exp_dir}/exp/${model_dir}
     mkdir -p ${exp_dir}/exp/${model_dir}/log
diff --git a/egs/librispeech_100h/rnnt/README.md b/egs/librispeech_100h/rnnt/README.md
new file mode 100644
index 0000000..abadb2d
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/README.md
@@ -0,0 +1,16 @@
+
+# Conformer Transducer Result
+
+## Training Config
+- Feature info: 80-dim fbank, global CMVN, speed perturb (0.9, 1.0, 1.1), SpecAugment
+- Train config: conf/train_conformer_rnnt.yaml
+- LM config: LM was not used
+- Model size: 30.54M
+
+## Results (WER)
+- Decode config: conf/decode_rnnt_conformer.yaml
+
+|      testset   |  WER(%) |
+|:--------------:|:-------:|
+|    test_clean  |  6.64   |
+|    test_other  |  17.12  |
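+
+To rerun just the decoding step with this config, a minimal sketch (the
+`feats_dir`/`exp_dir` paths below are placeholders for your own setup):
+
+```sh
+# stage 4 of run.sh is the inference stage
+bash run.sh --stage 4 --stop_stage 4 \
+    --feats_dir /path/to/feats --exp_dir /path/to/exp
+```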
diff --git a/egs/librispeech_100h/rnnt/conf/decode_rnnt_conformer.yaml b/egs/librispeech_100h/rnnt/conf/decode_rnnt_conformer.yaml
new file mode 100644
index 0000000..1a841d6
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/conf/decode_rnnt_conformer.yaml
@@ -0,0 +1,3 @@
+# The conformer transducer decoding configuration from @jeon30c
+beam_size: 10
+
diff --git a/egs/librispeech_100h/rnnt/conf/train_conformer_rnnt.yaml b/egs/librispeech_100h/rnnt/conf/train_conformer_rnnt.yaml
new file mode 100644
index 0000000..7ef9ec3
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/conf/train_conformer_rnnt.yaml
@@ -0,0 +1,96 @@
+encoder: chunk_conformer
+encoder_conf:
+      activation_type: swish
+      time_reduction_factor: 2
+      embed_vgg_like: false
+      subsampling_factor: 4
+      linear_units: 1024
+      output_size: 256
+      attention_heads: 4
+      dropout_rate: 0.1
+      positional_dropout_rate: 0.1
+      attention_dropout_rate: 0.1
+      cnn_module_kernel: 31
+      num_blocks: 15
+
+# decoder related
+rnnt_decoder: rnnt
+rnnt_decoder_conf:
+    embed_size: 256
+    hidden_size: 256
+    embed_dropout_rate: 0.1
+    dropout_rate: 0.2
+    use_embed_mask: true
+
+joint_network_conf:
+    joint_space_size: 320
+
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+ 
+# Auxiliary CTC
+model: rnnt
+model_conf:
+    auxiliary_ctc_weight: 0.3
+
+# mixed precision related
+use_amp: true
+
+# optimization related
+accum_grad: 8
+grad_clip: 5
+max_epoch: 200
+val_scheduler_criterion:
+    - valid
+    - loss
+best_model_criterion:
+-   - valid
+    - cer_transducer
+    - min
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+   lr: 0.002
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 25000
+
+normalize: None
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0
+    - 0.05
+    num_time_mask: 5
+
+dataset_conf:
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 1024
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 5000
+    num_workers: 8
+
+log_interval: 50
diff --git a/egs/librispeech_100h/rnnt/local/data_prep.sh b/egs/librispeech_100h/rnnt/local/data_prep.sh
new file mode 100755
index 0000000..c939b5f
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/local/data_prep.sh
@@ -0,0 +1,58 @@
+#!/usr/bin/env bash
+
+# Copyright 2014  Vassil Panayotov
+#           2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+if [ "$#" -ne 2 ]; then
+  echo "Usage: $0 <src-dir> <dst-dir>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data/LibriSpeech/dev-clean data/dev-clean"
+  exit 1
+fi
+
+src=$1
+dst=$2
+
+# all utterances are FLAC compressed
+if ! which flac >&/dev/null; then
+   echo "Please install 'flac' on ALL worker nodes!"
+   exit 1
+fi
+
+spk_file=$src/../SPEAKERS.TXT
+
+mkdir -p $dst || exit 1
+
+[ ! -d $src ] && echo "$0: no such directory $src" && exit 1
+[ ! -f $spk_file ] && echo "$0: expected file $spk_file to exist" && exit 1
+
+
+wav_scp=$dst/wav.scp; [[ -f "$wav_scp" ]] && rm $wav_scp
+trans=$dst/text; [[ -f "$trans" ]] && rm $trans
+
+for reader_dir in $(find -L $src -mindepth 1 -maxdepth 1 -type d | sort); do
+  reader=$(basename $reader_dir)
+  if ! [ $reader -eq $reader ]; then  # not integer.
+    echo "$0: unexpected subdirectory name $reader"
+    exit 1
+  fi
+
+  for chapter_dir in $(find -L $reader_dir/ -mindepth 1 -maxdepth 1 -type d | sort); do
+    chapter=$(basename $chapter_dir)
+    if ! [ "$chapter" -eq "$chapter" ]; then
+      echo "$0: unexpected chapter-subdirectory name $chapter"
+      exit 1
+    fi
+
+    find -L $chapter_dir/ -iname "*.flac" | sort | xargs -I% basename % .flac | \
+      awk -v "dir=$chapter_dir" '{printf "%s %s/%s.flac\n", $0, dir, $0}' >>$wav_scp || exit 1
+
+    chapter_trans=$chapter_dir/${reader}-${chapter}.trans.txt
+    [ ! -f  $chapter_trans ] && echo "$0: expected file $chapter_trans to exist" && exit 1
+    cat $chapter_trans >>$trans
+  done
+done
+
+echo "$0: successfully prepared data in $dst"
+
+exit 0
diff --git a/egs/librispeech_100h/rnnt/local/download_and_untar.sh b/egs/librispeech_100h/rnnt/local/download_and_untar.sh
new file mode 100755
index 0000000..fe322e4
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/local/download_and_untar.sh
@@ -0,0 +1,97 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a15/vpanayotov/data www.openslr.org/resources/11 dev-clean"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: dev-clean, test-clean, dev-other, test-other,"
+  echo "          train-clean-100, train-clean-360, train-other-500."
+  exit 1
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1
+fi
+
+part_ok=false
+list="dev-clean test-clean dev-other test-other train-clean-100 train-clean-360 train-other-500"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1
+fi
+
+if [ -f $data/LibriSpeech/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0
+fi
+
+
+# Sizes of the archive files in bytes. These are from some older versions.
+sizes_old="371012589 347390293 379743611 361838298 6420417880 23082659865 30626749128"
+# sizes_new contains the archive file sizes of the final release. Some of these are for
+# parts we probably won't download.
+sizes_new="337926286 314305928 695964615 297279345 87960560420 33373768 346663984 328757843 6387309499 23049477885 30593501606"
+
+if [ -f $data/$part.tar.gz ]; then
+  size=$(/bin/ls -l $data/$part.tar.gz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes_old $sizes_new; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tar.gz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tar.gz
+  else
+    echo "$data/$part.tar.gz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tar.gz ]; then
+  if ! which wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1
+  fi
+  full_url=$url/$part.tar.gz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  if ! wget -P $data --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1
+  fi
+fi
+
+if ! tar -C $data -xvzf $data/$part.tar.gz; then
+  echo "$0: error un-tarring archive $data/$part.tar.gz"
+  exit 1
+fi
+
+touch $data/LibriSpeech/$part/.complete
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tar.gz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tar.gz file since --remove-archive option was supplied."
+  rm $data/$part.tar.gz
+fi
diff --git a/egs/librispeech_100h/rnnt/local/spm_encode.py b/egs/librispeech_100h/rnnt/local/spm_encode.py
new file mode 100755
index 0000000..9e1c15f
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/local/spm_encode.py
@@ -0,0 +1,98 @@
+#!/usr/bin/env python
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+
+
+import argparse
+import contextlib
+import sys
+
+import sentencepiece as spm
+
+
+def main():
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--model", required=True,
+                        help="sentencepiece model to use for encoding")
+    parser.add_argument("--inputs", nargs="+", default=['-'],
+                        help="input files to filter/encode")
+    parser.add_argument("--outputs", nargs="+", default=['-'],
+                        help="path to save encoded outputs")
+    parser.add_argument("--output_format", choices=["piece", "id"], default="piece")
+    parser.add_argument("--min-len", type=int, metavar="N",
+                        help="filter sentence pairs with fewer than N tokens")
+    parser.add_argument("--max-len", type=int, metavar="N",
+                        help="filter sentence pairs with more than N tokens")
+    args = parser.parse_args()
+
+    assert len(args.inputs) == len(args.outputs), \
+        "number of input and output paths should match"
+
+    sp = spm.SentencePieceProcessor()
+    sp.Load(args.model)
+
+    if args.output_format == "piece":
+        def encode(l):
+            return sp.EncodeAsPieces(l)
+    elif args.output_format == "id":
+        def encode(l):
+            return list(map(str, sp.EncodeAsIds(l)))
+    else:
+        raise NotImplementedError
+
+    if args.min_len is not None or args.max_len is not None:
+        def valid(line):
+            return (
+                (args.min_len is None or len(line) >= args.min_len) and
+                (args.max_len is None or len(line) <= args.max_len)
+            )
+    else:
+        def valid(lines):
+            return True
+
+    with contextlib.ExitStack() as stack:
+        inputs = [
+            stack.enter_context(open(input, "r", encoding="utf-8"))
+            if input != "-" else sys.stdin
+            for input in args.inputs
+        ]
+        outputs = [
+            stack.enter_context(open(output, "w", encoding="utf-8"))
+            if output != "-" else sys.stdout
+            for output in args.outputs
+        ]
+
+        stats = {
+            "num_empty": 0,
+            "num_filtered": 0,
+        }
+
+        def encode_line(line):
+            line = line.strip()
+            if len(line) > 0:
+                line = encode(line)
+                if valid(line):
+                    return line
+                else:
+                    stats["num_filtered"] += 1
+            else:
+                stats["num_empty"] += 1
+            return None
+
+        for i, lines in enumerate(zip(*inputs), start=1):
+            enc_lines = list(map(encode_line, lines))
+            if not any(enc_line is None for enc_line in enc_lines):
+                for enc_line, output_h in zip(enc_lines, outputs):
+                    print(" ".join(enc_line), file=output_h)
+            if i % 10000 == 0:
+                print("processed {} lines".format(i), file=sys.stderr)
+
+        print("skipped {} empty lines".format(stats["num_empty"]), file=sys.stderr)
+        print("filtered {} lines".format(stats["num_filtered"]), file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/librispeech_100h/rnnt/local/spm_train.py b/egs/librispeech_100h/rnnt/local/spm_train.py
new file mode 100755
index 0000000..134a0b1
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/local/spm_train.py
@@ -0,0 +1,12 @@
+#!/usr/bin/env python3
+# Copyright (c) Facebook, Inc. and its affiliates.
+# All rights reserved.
+#
+# This source code is licensed under the license found in the
+# https://github.com/pytorch/fairseq/blob/master/LICENSE
+import sys
+
+import sentencepiece as spm
+
+if __name__ == "__main__":
+    spm.SentencePieceTrainer.Train(" ".join(sys.argv[1:]))
diff --git a/egs/librispeech_100h/rnnt/path.sh b/egs/librispeech_100h/rnnt/path.sh
new file mode 100644
index 0000000..7972642
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/librispeech_100h/rnnt/run.sh b/egs/librispeech_100h/rnnt/run.sh
new file mode 100755
index 0000000..ad6a4d4
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/run.sh
@@ -0,0 +1,213 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1"
+gpu_num=2
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="" #feature output dictionary
+exp_dir=""
+lang=en
+token_type=bpe
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+sample_frequency=16000
+stage=0
+stop_stage=4
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=
+data_url=www.openslr.org/resources/12
+
+# bpe model
+nbpe=5000
+bpemode=unigram
+
+# exp tag
+tag="exp2"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode; it will exit on:
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
+set -e
+set -u
+set -o pipefail
+
+train_set=train_clean_100
+valid_set=dev
+test_sets="test_clean test_other dev_clean dev_other"
+
+asr_config=conf/train_conformer_rnnt.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_rnnt_conformer.yaml
+#inference_config=conf/decode_asr_transformer_beam60_ctc0.3.yaml
+inference_asr_model=valid.cer_transducer.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
+
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    for part in dev-clean test-clean dev-other test-other train-clean-100; do
+        local/download_and_untar.sh ${raw_data} ${data_url} ${part}
+    done
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    for x in dev-clean dev-other test-clean test-other train-clean-100; do
+        local/data_prep.sh ${raw_data}/LibriSpeech/${x} ${feats_dir}/data/${x//-/_}
+    done
+    mkdir -p $feats_dir/data/$valid_set
+    dev_sets="dev_clean dev_other"
+    for file in wav.scp text; do
+        ( for f in $dev_sets; do cat $feats_dir/data/$f/$file; done ) | sort -k1 > $feats_dir/data/$valid_set/$file || exit 1;
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} ${feats_dir}/data/${train_set}
+fi
+
+token_list=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}_units.txt
+bpemodel=${feats_dir}/data/lang_char/${train_set}_${bpemode}${nbpe}
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    ### Task dependent. You have to check non-linguistic symbols used in the corpus.
+    echo "stage 2: Dictionary and Json Data Preparation"
+    mkdir -p ${feats_dir}/data/lang_char/
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    cut -f 2- -d" " ${feats_dir}/data/${train_set}/text > ${feats_dir}/data/lang_char/input.txt
+    local/spm_train.py --input=${feats_dir}/data/lang_char/input.txt --vocab_size=${nbpe} --model_type=${bpemode} --model_prefix=${bpemodel} --input_sentence_size=100000000
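+    # encode the training text with the freshly trained BPE model and append the unique pieces to the token list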
+    local/spm_encode.py --model=${bpemodel}.model --output_format=piece < ${feats_dir}/data/lang_char/input.txt | tr ' ' '\n' | sort | uniq | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=./ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --split_with_space false \
+                --bpemodel ${bpemodel}.model \
+                --token_type $token_type \
+                --token_list $token_list \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --multiprocessing_distributed true \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        done
+        wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} is already exists. if you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/cmvn.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode rnnt \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/compute_wer.py ${_data}/text ${_dir}/text ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
diff --git a/egs/librispeech_100h/rnnt/utils b/egs/librispeech_100h/rnnt/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/librispeech_100h/rnnt/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml b/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml
new file mode 100644
index 0000000..e35e820
--- /dev/null
+++ b/egs/wenetspeech/conformer/conf/decode_asr_transformer_5beam.yaml
@@ -0,0 +1,6 @@
+beam_size: 5
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.5
+lm_weight: 0.7
diff --git a/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
new file mode 100644
index 0000000..a9658b8
--- /dev/null
+++ b/egs/wenetspeech/conformer/conf/train_asr_conformer.yaml
@@ -0,0 +1,104 @@
+# network architecture
+# encoder related
+encoder: conformer
+encoder_conf:
+    output_size: 512    # dimension of attention
+    attention_heads: 8
+    linear_units: 2048  # the number of units of position-wise feed forward
+    num_blocks: 12      # the number of encoder blocks
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.0
+    input_layer: conv2d # encoder architecture type
+    normalize_before: true
+    rel_pos_type: latest
+    pos_enc_layer_type: rel_pos
+    selfattention_layer_type: rel_selfattn
+    activation_type: swish
+    macaron_style: true
+    use_cnn_module: true
+    cnn_module_kernel: 15
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 8
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.0
+    src_attention_dropout_rate: 0.0
+
+# CTC related
+ctc_conf:
+    ignore_nan_grad: true
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# optimization related
+accum_grad: 4
+grad_clip: 5
+patience: none
+max_epoch: 30
+val_scheduler_criterion:
+    - valid
+    - acc
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+   lr: 0.0015
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 30000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 30
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_range:
+    - 0
+    - 40
+    num_time_mask: 2
+
+dataset_conf:
+    data_names: speech,text
+    data_types: sound,text_nospace
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 2048
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 32000
+    num_workers: 8
+
+log_interval: 50
+normalize: None
diff --git a/egs/wenetspeech/conformer/local/data.sh b/egs/wenetspeech/conformer/local/data.sh
new file mode 100755
index 0000000..2b0a4be
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/data.sh
@@ -0,0 +1,102 @@
+#!/usr/bin/env bash
+# Set bash to 'debug' mode; it will exit on:
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands'
+set -e
+set -u
+set -o pipefail
+
+log() {
+    local fname=${BASH_SOURCE[1]##*/}
+    echo -e "$(date '+%Y-%m-%dT%H:%M:%S') (${fname}:${BASH_LINENO[0]}:${FUNCNAME[1]}) $*"
+}
+SECONDS=0
+
+# general configuration
+nj=10
+stage=2
+stop_stage=100
+set=L
+data_dir="data"
+WENETSPEECH=
+train_cmd=
+
+log "$0 $*"
+. utils/parse_options.sh
+
+if [ ! -d "${WENETSPEECH}/audio" ] && [ ! -f "${WENETSPEECH}/WenetSpeech.json" ]; then
+    echo "Valid WENETSPEECH data not found in ${WENETSPEECH}."
+    echo "Please follow the instruction in https://wenet-e2e.github.io/WenetSpeech/"
+    echo "and re-construct the data."
+    exit 1
+fi
+
+train_set=train_"$(echo "${set}" | tr "[:upper:]" "[:lower:]")"
+dev_set=dev
+test_sets="test_net test_meeting"
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    log "data preparation"
+    mkdir -p ${data_dir}
+    abs_data_dir=$(readlink -f ${data_dir})
+    log "making Kaldi format data directory in ${abs_data_dir}"
+    local/wenetspeech_data_prep.sh \
+        --train-subset ${set} \
+        --stage 1 \
+        ${WENETSPEECH} \
+        ${abs_data_dir}
+
+    # prepare utt2spk and spk2utt files
+    for x in ${train_set} ${dev_set} ${test_sets}; do
+        dir=${data_dir}/${x}
+        paste -d " " <(cut -f 1 ${dir}/segments) <(cut -f 1 ${dir}/segments) | \
+            sort -u > ${dir}/utt2spk
+        utils/utt2spk_to_spk2utt.pl ${dir}/utt2spk > ${dir}/spk2utt
+    done
+fi
+
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    log "process the long term opus audio file, may take about 3 hours"
+    for x in ${train_set} ${dev_set} ${test_sets}; do
+        log "process audio for ${data_dir}/${x}"
+        dir=${data_dir}/${x}
+        mkdir -p ${dir}/logs
+
+        nutt=$(<${dir}/segments wc -l)
+        nj=$((nj<nutt?nj:nutt))
+
+        split_scps=""
+        for n in $(seq ${nj}); do
+            split_scps="${split_scps} ${dir}/logs/segments.${n}"
+        done
+        utils/split_scp.pl ${dir}/segments ${split_scps}
+
+        ${train_cmd} "JOB=1:${nj}" "${dir}/logs/process_audio.JOB.log"\
+            python3 local/process_opus.py \
+                ${dir}/wav.scp \
+                ${dir}/logs/segments.JOB   \
+                ${dir}/logs/wav.JOB.scp
+
+        # replace the `wav.scp` file and rename the `segments` file;
+        # renaming `segments` avoids the audio file formatting process in stage 3 of `asr.sh`
+        mv ${dir}/wav.scp ${dir}/wav.scp.org
+        mv ${dir}/segments ${dir}/segments.org
+        for n in $(seq ${nj}); do
+            cat ${dir}/logs/wav.${n}.scp || exit 1;
+        done | sort -u > ${dir}/wav.scp
+    done
+fi
+
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    log "format text file"
+    for x in ${train_set} ${dev_set} ${test_sets}; do
+        log "format text for ${data_dir}/${x}"
+        dir=${data_dir}/${x}
+        mv ${dir}/text ${dir}/text.org
+        paste -d " " <(cut -f 1 ${dir}/text.org) \
+            <(cut -f 2- ${dir}/text.org | local/text_normalize.pl) | \
+            sort -u > ${dir}/text
+        utils/fix_data_dir.sh ${dir}
+    done
+fi
+
+log "Successfully finished. [elapsed=${SECONDS}s]"
diff --git a/egs/wenetspeech/conformer/local/extract_meta.py b/egs/wenetspeech/conformer/local/extract_meta.py
new file mode 100755
index 0000000..6074162
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/extract_meta.py
@@ -0,0 +1,114 @@
+# Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
+#                 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import json
+import os
+import sys
+
+
+def get_args():
+    parser = argparse.ArgumentParser(
+        description="""
+      This script processes the raw json dataset of WenetSpeech,
+      where the long wav files are split into segments and
+      data in WeNet format is generated.
+      """
+    )
+    parser.add_argument("input_json", help="""Input json file of WenetSpeech""")
+    parser.add_argument("output_dir", help="""Output dir for prepared data""")
+
+    args = parser.parse_args()
+    return args
+
+
+def meta_analysis(input_json, output_dir):
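+    # Writes Kaldi-style metadata files into output_dir:
+    #   wav.scp, text, segments, utt2dur, utt2subsets, reco2dur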
+    input_dir = os.path.dirname(input_json)
+
+    if not os.path.exists(output_dir):
+        os.makedirs(output_dir)
+
+    try:
+        with open(input_json, "r") as injson:
+            json_data = json.load(injson)
+    except Exception:
+        sys.exit(f"Failed to load input json file: {input_json}")
+    else:
+        if json_data["audios"] is not None:
+            with open(f"{output_dir}/text", "w") as utt2text, open(
+                f"{output_dir}/segments", "w"
+            ) as segments, open(f"{output_dir}/utt2dur", "w") as utt2dur, open(
+                f"{output_dir}/wav.scp", "w"
+            ) as wavscp, open(
+                f"{output_dir}/utt2subsets", "w"
+            ) as utt2subsets, open(
+                f"{output_dir}/reco2dur", "w"
+            ) as reco2dur:
+                for long_audio in json_data["audios"]:
+                    try:
+                        long_audio_path = os.path.realpath(
+                            os.path.join(input_dir, long_audio["path"])
+                        )
+                        aid = long_audio["aid"]
+                        segments_lists = long_audio["segments"]
+                        duration = long_audio["duration"]
+                        assert os.path.exists(long_audio_path)
+                    except AssertionError:
+                        print(f"Warning: {aid} something is wrong, maybe AssertionError, skipped")
+                        continue
+                    except Exception:
+                        print(f"Warning: {aid} something is wrong, "
+                              f"maybe the error path: {long_audio_path}, skipped")
+                        continue
+                    else:
+                        wavscp.write(f"{aid}\t{long_audio_path}\n")
+                        reco2dur.write(f"{aid}\t{duration}\n")
+                        for segment_file in segments_lists:
+                            try:
+                                sid = segment_file["sid"]
+                                start_time = segment_file["begin_time"]
+                                end_time = segment_file["end_time"]
+                                dur = end_time - start_time
+                                text = segment_file["text"]
+                                segment_subsets = segment_file["subsets"]
+                            except Exception:
+                                print(f"Warning: {segment_file} something is wrong, skipped")
+                                continue
+                            else:
+                                utt2text.write(f"{sid}\t{text}\n")
+                                segments.write(
+                                    f"{sid}\t{aid}\t{start_time}\t{end_time}\n"
+                                )
+                                utt2dur.write(f"{sid}\t{dur}\n")
+                                segment_sub_names = " ".join(segment_subsets)
+                                utt2subsets.write(f"{sid}\t{segment_sub_names}\n")
+
+
+def main():
+    args = get_args()
+
+    meta_analysis(args.input_json, args.output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/aishell2/transformer/utils/__init__.py b/egs/wenetspeech/conformer/local/path.sh
similarity index 100%
rename from egs/aishell2/transformer/utils/__init__.py
rename to egs/wenetspeech/conformer/local/path.sh
diff --git a/egs/wenetspeech/conformer/local/process_opus.py b/egs/wenetspeech/conformer/local/process_opus.py
new file mode 100755
index 0000000..581953f
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/process_opus.py
@@ -0,0 +1,88 @@
+# Copyright 2021  NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# process_opus.py: segmentation and downsampling of opus audio
+
+# usage: python3 process_opus.py wav.scp segments output_wav.scp
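+#
+# the segments file is expected in Kaldi style, one entry per line:
+#   <utt_id> <wav_id> <start_time_sec> <end_time_sec>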
+
+import os
+import sys
+
+import torchaudio
+
+
+def read_file(wav_scp, segments):
+    wav_scp_dict = {}
+    with open(wav_scp, "r", encoding="UTF-8") as fin:
+        for line_str in fin:
+            wav_id, path = line_str.strip().split()
+            wav_scp_dict[wav_id] = path
+
+    utt_list = []
+    seg_path_list = []
+    start_time_list = []
+    end_time_list = []
+    with open(segments, "r", encoding="UTF-8") as fin:
+        for line_str in fin:
+            arr = line_str.strip().split()
+            assert len(arr) == 4
+            utt_list.append(arr[0])
+            seg_path_list.append(wav_scp_dict[arr[1]])
+            start_time_list.append(float(arr[2]))
+            end_time_list.append(float(arr[3]))
+    return utt_list, seg_path_list, start_time_list, end_time_list
+
+
+# TODO(Qijie): Fix the process logic
+def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
+    num_utts = len(utt_list)
+    step = max(1, int(num_utts * 0.01))  # progress-report interval; at least 1 to avoid modulo by zero
+    with open(output_wav_scp, "w", encoding="UTF-8") as fout:
+        previous_wav_path = ""
+        for i in range(num_utts):
+            utt_id = utt_list[i]
+            current_wav_path = seg_path_list[i]
+            output_dir = (os.path.dirname(current_wav_path)).replace(
+                "audio", "audio_seg"
+            )
+            seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
+
+            os.makedirs(output_dir, exist_ok=True)
+            if current_wav_path != previous_wav_path:
+                waveform, sample_rate = torchaudio.load(current_wav_path)
+            previous_wav_path = current_wav_path
+
+            start = int(start_time_list[i] * sample_rate)
+            end = int(end_time_list[i] * sample_rate)
+            target_audio = waveform[:, start:end]
+            torchaudio.save(seg_wav_path, target_audio, sample_rate)
+
+            fout.write("{} {}\n".format(utt_id, seg_wav_path))
+            if i % step == 0:
+                print("seg wav finished: {}%".format(int(i / step)))
+
+
+def main():
+    wav_scp = sys.argv[1]
+    segments = sys.argv[2]
+    output_wav_scp = sys.argv[3]
+
+    utt_list, seg_path_list, start_time_list, end_time_list = read_file(
+        wav_scp, segments
+    )
+    output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/egs/wenetspeech/conformer/local/text_normalize.pl b/egs/wenetspeech/conformer/local/text_normalize.pl
new file mode 100755
index 0000000..55b35e2
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/text_normalize.pl
@@ -0,0 +1,24 @@
+#!/usr/bin/env perl
+use utf8;
+use open qw(:std :utf8);
+use warnings;
+
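+# A worked example on a hypothetical input line:
+#   "我们的 test case 数据"
+#   -> uppercase:           "我们的 TEST CASE 数据"
+#   -> join English words:  "我们的 TEST_CASE 数据"
+#   -> strip whitespace:    "我们的TEST_CASE数据"
+#   -> restore word spaces: "我们的TEST CASE数据"
+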
+while (<STDIN>) {
+    chomp;
+    # remove non-ASCII whitespace characters (ideographic space U+3000, no-break space U+00A0)
+    if ($_ =~ /\x{3000}/) {$_ =~ s:\x{3000}::g;}
+    if ($_ =~ /\x{00A0}/) {$_ =~ s:\x{00A0}::g;}
+    # uppercase the letters
+    if ($_ =~ /[a-zA-Z]/) {$_ = uc $_;}
+    # join adjacent English words with "_"
+    # (the first rule is applied twice so runs of three or more words are fully joined)
+    if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
+    if ($_ =~ /([A-Z]+)\s+([A-Z]+)/) {$_ =~ s/([A-Z]+)\s+([A-Z]+)/$1\_$2/g;}
+    if ($_ =~ m/([A-Z]+)(\p{Han}+)/) {$_ =~ s/([A-Z]+)(\p{Han}+)/$1\_$2/g;}
+    if ($_ =~ m/(\p{Han}+)([A-Z]+)/) {$_ =~ s/(\p{Han}+)([A-Z]+)/$1\_$2/g;}
+    # remove the remaining whitespace (words are now joined by "_")
+    if ($_ =~ /\s+/) {$_ =~ s:\s+::g;}
+    # replace "_" with a normal whitespace
+    if ($_ =~ /\_/) {$_ =~ s:\_: :g;}
+
+    print "$_\n";
+}
diff --git a/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
new file mode 100755
index 0000000..4959328
--- /dev/null
+++ b/egs/wenetspeech/conformer/local/wenetspeech_data_prep.sh
@@ -0,0 +1,135 @@
+#!/usr/bin/env bash
+
+# Copyright 2021  Xiaomi Corporation (Author: Yongqing Wang)
+#                 Seasalt AI, Inc (Author: Guoguo Chen)
+#                 Mobvoi Inc(Author: Di Wu, Binbin Zhang)
+#                 NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+set -e
+set -o pipefail
+
+stage=1
+prefix=
+train_subset=L
+
+. utils/parse_options.sh || exit 1;
+
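+# usage: filter_by_id <id-list> <input> <output> [match-field, default: 1]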
+filter_by_id () {
+  idlist=$1
+  input=$2
+  output=$3
+  field=1
+  if [ $# -eq 4 ]; then
+    field=$4
+  fi
+  cat $input | perl -se '
+    open(F, "<$idlist") || die "Could not open id-list file $idlist";
+    while(<F>) {
+      @A = split;
+      @A>=1 || die "Invalid id-list file line $_";
+      $seen{$A[0]} = 1;
+    }
+    while(<>) {
+      @A = split;
+      @A > 0 || die "Invalid file line $_";
+      @A >= $field || die "Invalid file line $_";
+      if ($seen{$A[$field-1]}) {
+        print $_;
+      }
+    }' -- -idlist="$idlist" -field="$field" > $output ||\
+  (echo "$0: filter_by_id() error: $input" && exit 1) || exit 1;
+}
+
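+# usage: subset_data_dir <utt-list> <src-data-dir> <dest-data-dir>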
+subset_data_dir () {
+  utt_list=$1
+  src_dir=$2
+  dest_dir=$3
+  mkdir -p $dest_dir || exit 1;
+  # wav.scp text segments utt2dur
+  filter_by_id $utt_list $src_dir/utt2dur $dest_dir/utt2dur ||\
+    (echo "$0: subset_data_dir() error: $src_dir/utt2dur" && exit 1) || exit 1;
+  filter_by_id $utt_list $src_dir/text $dest_dir/text ||\
+    (echo "$0: subset_data_dir() error: $src_dir/text" && exit 1) || exit 1;
+  filter_by_id $utt_list $src_dir/segments $dest_dir/segments ||\
+    (echo "$0: subset_data_dir() error: $src_dir/segments" && exit 1) || exit 1;
+  awk '{print $2}' $dest_dir/segments | sort | uniq > $dest_dir/reco
+  filter_by_id $dest_dir/reco $src_dir/wav.scp $dest_dir/wav.scp ||\
+    (echo "$0: subset_data_dir() error: $src_dir/wav.scp" && exit 1) || exit 1;
+  rm -f $dest_dir/reco
+}
+
+if [ $# -ne 2 ]; then
+  echo "Usage: $0 [options] <wenetspeech-dataset-dir> <data-dir>"
+  echo " e.g.: $0 --train-subset L /disk1/audio_data/wenetspeech/ data/"
+  echo ""
+  echo "This script takes the WenetSpeech source directory, and prepares the"
+  echo "WeNet format data directory."
+  echo "  --prefix <prefix>                # Prefix for output data directory."
+  echo "  --stage <stage>                  # Processing stage."
+  echo "  --train-subset <L|M|S|W>     # Train subset to be created."
+  exit 1
+fi
+
+wenetspeech_dir=$1
+data_dir=$2
+
+declare -A subsets
+subsets=(
+  [L]="train_l"
+  [M]="train_m"
+  [S]="train_s"
+  [W]="train_w"
+  [DEV]="dev"
+  [TEST_NET]="test_net"
+  [TEST_MEETING]="test_meeting")
+
+prefix=${prefix:+${prefix}_}
+
+corpus_dir=$data_dir/${prefix}corpus/
+if [ $stage -le 1 ]; then
+  echo "$0: Extract meta into $corpus_dir"
+  # Sanity check.
+  [ ! -f $wenetspeech_dir/WenetSpeech.json ] &&\
+    echo "$0: Please download $wenetspeech_dir/WenetSpeech.json!" && exit 1;
+  [ ! -d $wenetspeech_dir/audio ] &&\
+    echo "$0: Please download $wenetspeech_dir/audio!" && exit 1;
+
+  [ ! -d $corpus_dir ] && mkdir -p $corpus_dir
+
+  # Files to be created:
+  # wav.scp text segments utt2dur
+  python3 local/extract_meta.py \
+    $wenetspeech_dir/WenetSpeech.json $corpus_dir || exit 1;
+fi
+
+if [ $stage -le 2 ]; then
+  echo "$0: Split data to train, dev, test_net, and test_meeting"
+  [ ! -f $corpus_dir/utt2subsets ] &&\
+    echo "$0: No such file $corpus_dir/utt2subsets!" && exit 1;
+  for label in $train_subset DEV TEST_NET TEST_MEETING; do
+    if [ ! ${subsets[$label]+set} ]; then
+      echo "$0: Subset $label is not defined in WenetSpeech.json." && exit 1;
+    fi
+    subset=${subsets[$label]}
+    [ ! -d $data_dir/${prefix}$subset ] && mkdir -p $data_dir/${prefix}$subset
+    cat $corpus_dir/utt2subsets | \
+       awk -v s=$label '{for (i=2;i<=NF;i++) if($i==s) print $0;}' \
+       > $corpus_dir/${prefix}${subset}_utt_list|| exit 1;
+    subset_data_dir $corpus_dir/${prefix}${subset}_utt_list \
+      $corpus_dir $data_dir/${prefix}$subset || exit 1;
+  done
+fi
+
+echo "$0: Done"
diff --git a/egs/wenetspeech/conformer/path.sh b/egs/wenetspeech/conformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/wenetspeech/conformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/wenetspeech/conformer/run.sh b/egs/wenetspeech/conformer/run.sh
new file mode 100644
index 0000000..db18361
--- /dev/null
+++ b/egs/wenetspeech/conformer/run.sh
@@ -0,0 +1,222 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3,4,5,6,7"
+gpu_num=8
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" # feature output directory
+exp_dir="."
+lang=zh
+token_type=char
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=3
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=/nfs/zhifu.gzf/wenetspeech_proc
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'strict' mode; it will exit on:
+# -e 'error', -u 'undefined variable', -o pipefail 'error in pipeline'
+set -e
+set -u
+set -o pipefail
+
+set=L
+train_set=train_l
+valid_set=dev
+test_sets="dev test_net test_meeting"
+
+asr_config=conf/train_asr_conformer.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer_5beam.yaml
+inference_asr_model=valid.acc.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$((ngpu * njob))
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
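+# e.g., with the defaults above (8 GPUs, njob=5), GPU decoding runs inference_nj=40 parallel jobs.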
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "For downloading data, please refer to https://github.com/wenet-e2e/WenetSpeech."
+    exit 0;
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/data.sh --set ${set} --nj $nj --data_dir $feats_dir --WENETSPEECH $raw_data --train_cmd $train_cmd
+    mkdir $feats_dir/data
+    mv $feats_dir/$train_set $feats_dir/data/$train_set
+    for x in $test_sets; do
+        mv $feats_dir/$x $feats_dir/data/
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 0.1
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i + 1)))
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --token_type $token_type \
+                --token_list $token_list \
+                --dataset_type large \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --data_file_names "wav.scp,text" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        done
+        wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d "${_dir}" ]; then
+            echo "$0: ${_dir} already exists. If you want to decode again, please delete this dir first."
+            exit 0
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+        python utils/proce_text.py ${_data}/text ${_data}/text.proc
+        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
+
+# Prepare files for ModelScope fine-tuning and inference
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "stage 6: ModelScope Preparation"
+    cp ${feats_dir}/data/${train_set}/cmvn/am.mvn ${exp_dir}/exp/${model_dir}/am.mvn
+    vocab_size=$(cat ${token_list} | wc -l)
+    python utils/gen_modelscope_configuration.py \
+        --am_model_name $inference_asr_model \
+        --mode asr \
+        --model_name conformer \
+        --dataset wenetspeech \
+        --output_dir $exp_dir/exp/$model_dir \
+        --vocab_size $vocab_size \
+        --tag $tag
+fi
\ No newline at end of file
diff --git a/egs/wenetspeech/conformer/utils b/egs/wenetspeech/conformer/utils
new file mode 120000
index 0000000..fe070dd
--- /dev/null
+++ b/egs/wenetspeech/conformer/utils
@@ -0,0 +1 @@
+../../aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/README.md b/egs_modelscope/asr/TEMPLATE/README.md
index 7ff04eb..0219c5b 100644
--- a/egs_modelscope/asr/TEMPLATE/README.md
+++ b/egs_modelscope/asr/TEMPLATE/README.md
@@ -20,11 +20,14 @@
 print(rec_result)
 ```
 #### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+##### Streaming Decoding
 ```python
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.4'
+    model_revision='v1.0.6',
+    update_model=False,
+    mode='paraformer_streaming'
     )
 import soundfile
 speech, sample_rate = soundfile.read("example/asr_example.wav")
@@ -41,6 +44,23 @@
 rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
 print(rec_result)
 ```
+
+##### Fake Streaming Decoding
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.6',
+    update_model=False,
+    mode="paraformer_fake_streaming"
+)
+audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+rec_result = inference_pipeline(audio_in=audio_in)
+print(rec_result)
+```
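+In the fake streaming mode the online model decodes a complete recording in one pass, which is convenient for quickly checking recognition quality; the streaming mode above instead feeds the model chunk by chunk with a decoding cache.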
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
 
 #### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
index bc511bb..bef3849 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
@@ -1,39 +1,13 @@
-import os
-import logging
-import torch
-import soundfile
-
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
 
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-os.environ["MODELSCOPE_CACHE"] = "./"
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.4'
+    model_revision='v1.0.6',
+    update_model=False,
+    mode="paraformer_fake_streaming"
 )
-
-model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")
-speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
-speech_length = speech.shape[0]
-
-sample_offset = 0
-chunk_size = [5, 10, 5] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
-stride_size =  chunk_size[1] * 960
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
-final_result = ""
-
-for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
-    if sample_offset + stride_size >= speech_length - 1:
-        stride_size = speech_length - sample_offset
-        param_dict["is_final"] = True
-    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
-                                    param_dict=param_dict)
-    if len(rec_result) != 0:
-        final_result += rec_result['text']
-        print(rec_result)
-print(final_result)
+audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+rec_result = inference_pipeline(audio_in=audio_in)
+print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
new file mode 100644
index 0000000..c1f4afe
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
@@ -0,0 +1,41 @@
+import os
+import logging
+import torch
+import soundfile
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+os.environ["MODELSCOPE_CACHE"] = "./"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.6',
+    update_model=False,
+    mode="paraformer_streaming"
+)
+
+model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online")
+speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
+speech_length = speech.shape[0]
+
+sample_offset = 0
+chunk_size = [5, 10, 5]  # [5, 10, 5] -> 600 ms, [8, 8, 4] -> 480 ms
+stride_size = chunk_size[1] * 960
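+# one chunk unit is 960 samples (60 ms at 16 kHz), so the stride is 10 * 960 = 9600 samples = 600 ms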
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+final_result = ""
+
+for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+    if sample_offset + stride_size >= speech_length - 1:
+        stride_size = speech_length - sample_offset
+        param_dict["is_final"] = True
+    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
+                                    param_dict=param_dict)
+    if len(rec_result) != 0:
+        final_result += rec_result['text']
+        print(rec_result)
+print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py
new file mode 100644
index 0000000..30034aa
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py
@@ -0,0 +1,38 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        model_revision='v1.0.6',
+        update_model=False,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online", data_path="./data")
+    params.output_dir = "./checkpoint"              # path to save the fine-tuned model
+    params.data_path = "./example_data/"            # data path
+    params.dataset_type = "small"                   # use "small" for small datasets; for more than 1000 hours of data, use "large"
+    params.batch_bins = 1000                        # batch size: with dataset_type="small" the unit is fbank feature frames, with dataset_type="large" it is milliseconds
+    params.max_epoch = 20                           # maximum number of training epochs
+    params.lr = 0.00005                             # learning rate
+
+    modelscope_finetune(params)
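+
+# Note: data_path is expected to contain the "train" and "validation" splits
+# mentioned above; each split typically provides a wav.scp and a text file.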
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
new file mode 100644
index 0000000..241ebef
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -0,0 +1,33 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+        model_revision='v1.0.6',
+        update_model=False,
+        mode="paraformer_fake_streaming",
+        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
+    )
+    inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--decoding_mode', type=str, default="normal")
+    parser.add_argument('--model_revision', type=str, default=None)
+    parser.add_argument('--mode', type=str, default=None)
+    parser.add_argument('--hotword_txt', type=str, default=None)
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
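+
+# Example invocation (a sketch using the defaults above):
+#   python infer.py --audio_in ./data/test/wav.scp --output_dir ./results/ --gpuid 0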
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh
new file mode 100644
index 0000000..0cab31e
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=32
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=32    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid} \
+            --mode "paraformer_fake_streaming"
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/utils b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/utils
new file mode 120000
index 0000000..a961ddc
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online/utils
@@ -0,0 +1 @@
+../../TEMPLATE/utils/
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
index abe6640..dae7766 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo.py
@@ -1,39 +1,13 @@
-import os
-import logging
-import torch
-import soundfile
-
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
 
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-os.environ["MODELSCOPE_CACHE"] = "./"
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
     model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
-    model_revision='v1.0.4'
+    model_revision='v1.0.6',
+    update_model=False,
+    mode="paraformer_fake_streaming"
 )
-
-model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
-speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
-speech_length = speech.shape[0]
-
-sample_offset = 0
-chunk_size = [8, 8, 4] #[5, 10, 5] 600ms, [8, 8, 4] 480ms
-stride_size =  chunk_size[1] * 960
-param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
-final_result = ""
-
-for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
-    if sample_offset + stride_size >= speech_length - 1:
-        stride_size = speech_length - sample_offset
-        param_dict["is_final"] = True
-    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
-                                    param_dict=param_dict)
-    if len(rec_result) != 0:
-        final_result += rec_result['text']
-        print(rec_result)
-print(final_result.strip())
+audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+rec_result = inference_pipeline(audio_in=audio_in)
+print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
new file mode 100644
index 0000000..199fcd8
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/demo_online.py
@@ -0,0 +1,41 @@
+import os
+import logging
+import torch
+import soundfile
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+os.environ["MODELSCOPE_CACHE"] = "./"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+    model_revision='v1.0.6',
+    update_model=False,
+    mode="paraformer_streaming"
+)
+
+model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
+speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
+speech_length = speech.shape[0]
+
+sample_offset = 0
+chunk_size = [8, 8, 4]  # [5, 10, 5] -> 600 ms, [8, 8, 4] -> 480 ms
+stride_size = chunk_size[1] * 960
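+# one chunk unit is 960 samples (60 ms at 16 kHz), so the stride is 8 * 960 = 7680 samples = 480 ms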
+param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
+final_result = ""
+
+for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
+    if sample_offset + stride_size >= speech_length - 1:
+        stride_size = speech_length - sample_offset
+        param_dict["is_final"] = True
+    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
+                                    param_dict=param_dict)
+    if len(rec_result) != 0:
+        final_result += rec_result['text']
+        print(rec_result)
+print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py
new file mode 100644
index 0000000..bd58c87
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/finetune.py
@@ -0,0 +1,38 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        model_revision='v1.0.6',
+        update_model=False,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = modelscope_args(model="damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online", data_path="./data")
+    params.output_dir = "./checkpoint"              # path to save the fine-tuned model
+    params.data_path = "./example_data/"            # data path
+    params.dataset_type = "small"                   # use "small" for small datasets; for more than 1000 hours of data, use "large"
+    params.batch_bins = 1000                        # batch size: with dataset_type="small" the unit is fbank feature frames, with dataset_type="large" it is milliseconds
+    params.max_epoch = 20                           # maximum number of training epochs
+    params.lr = 0.00005                             # learning rate
+
+    modelscope_finetune(params)
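+
+# Note: data_path is expected to contain the "train" and "validation" splits
+# mentioned above; each split typically provides a wav.scp and a text file.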
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
new file mode 100644
index 0000000..241ebef
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -0,0 +1,33 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+        model_revision='v1.0.6',
+        update_model=False,
+        mode="paraformer_fake_streaming",
+        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
+    )
+    inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--decoding_mode', type=str, default="normal")
+    parser.add_argument('--model_revision', type=str, default=None)
+    parser.add_argument('--mode', type=str, default=None)
+    parser.add_argument('--hotword_txt', type=str, default=None)
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
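+
+# Example invocation (a sketch using the defaults above):
+#   python infer.py --audio_in ./data/test/wav.scp --output_dir ./results/ --gpuid 0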
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh
new file mode 100644
index 0000000..f565825
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.sh
@@ -0,0 +1,104 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=32
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=32    # number of parallel jobs for CPU decoding (used when gpu_inference=false)
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid} \
+            --mode "paraformer_fake_streaming"
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+    echo "Computing WER ..."
+    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+    tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+    echo "SpeechIO TIOBE textnorm"
+    echo "$0 --> Normalizing REF text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${data_dir}/text \
+        ${output_dir}/1best_recog/ref.txt
+
+    echo "$0 --> Normalizing HYP text ..."
+    ./utils/textnorm_zh.py \
+        --has_key --to_upper \
+        ${output_dir}/1best_recog/text.proc \
+        ${output_dir}/1best_recog/rec.txt
+    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+    echo "$0 --> computing WER/CER and alignment ..."
+    ./utils/error_rate_zh \
+        --tokenizer char \
+        --ref ${output_dir}/1best_recog/ref.txt \
+        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/utils b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/utils
new file mode 120000
index 0000000..a961ddc
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/utils
@@ -0,0 +1 @@
+../../TEMPLATE/utils/
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index 5bc205c..f54399a 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -10,10 +10,9 @@
         task=Tasks.auto_speech_recognition,
         model=args.model,
         output_dir=args.output_dir,
-        batch_size=args.batch_size,
         param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
     )
-    inference_pipeline(audio_in=args.audio_in)
+    inference_pipeline(audio_in=args.audio_in, batch_size_token=args.batch_size_token)
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser()
@@ -22,7 +21,7 @@
     parser.add_argument('--output_dir', type=str, default="./results/")
     parser.add_argument('--decoding_mode', type=str, default="normal")
     parser.add_argument('--hotword_txt', type=str, default=None)
-    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--batch_size_token', type=int, default=5000)
     parser.add_argument('--gpuid', type=str, default="0")
     args = parser.parse_args()
     modelscope_infer(args)
diff --git a/egs_modelscope/tp/TEMPLATE/README.md b/egs_modelscope/tp/TEMPLATE/README.md
index 7cc8508..3c7129f 100644
--- a/egs_modelscope/tp/TEMPLATE/README.md
+++ b/egs_modelscope/tp/TEMPLATE/README.md
@@ -11,7 +11,7 @@
 inference_pipeline = pipeline(
     task=Tasks.speech_timestamp,
     model='damo/speech_timestamp_prediction-v1-16k-offline',
-    output_dir=None)
+    model_revision='v1.1.0')
 
 rec_result = inference_pipeline(
     audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
diff --git a/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv b/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv
index 662bf04..49eda5f 100644
--- a/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv
+++ b/fun_text_processing/inverse_text_normalization/id/id_unit_test.tsv
@@ -1,10 +1,10 @@
 dua ribu dua puluh dua	2022
-tiga ribu	300
+tiga ribu	3000
 sembilan ribu sembilan ratus sembilan puluh sembilan	9999
-seribu satu	100001
-ribu	100
+seribu satu	1001
+ribu	1000
 seribu	1000
-seribu dua ratus delapan puluh sembilan	10289
+seribu dua ratus delapan puluh sembilan	1289
 ribu dua ratus delapan puluh sembilan	1289
 nol satu dua tiga empat lima enam tujuh delapan sembilan	01 2345-6789
 empat belas	14
@@ -22,8 +22,8 @@
 seratus dua puluh tiga	123
 ratus dua puluh tiga	123
 dua puluh empat maret 	24 maret
-ribu tujuh puluh enam	10076
-seribu tujuh puluh enam	100076
-ribu tujuh puluh enam rupiah	10076 rupiah
+ribu tujuh puluh enam	1076
+seribu tujuh puluh enam	1076
+ribu tujuh puluh enam rupiah	1076 rupiah
 tujuh puluh enam	76
-ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima	+62 21 6539-0605
\ No newline at end of file
+ditambah enam dua dua satu enam lima tiga sembilan nol enam nol lima	+62 21 6539-0605
diff --git a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
index 539acbc..d2f1a77 100644
--- a/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
+++ b/fun_text_processing/inverse_text_normalization/id/taggers/cardinal.py
@@ -26,11 +26,10 @@
         graph_teen = pynini.string_file(get_abs_path("data/numbers/teen.tsv"))
         graph_hundreds = pynini.string_file(get_abs_path("data/numbers/hundreds.tsv"))
         graph_thousand = pynini.string_file(get_abs_path("data/numbers/thousand.tsv"))
-
-        graph_cents = pynini.cross("seratus", "100") | pynini.cross("ratus", "100") | pynini.union(graph_hundreds, pynutil.insert("0"))
+
         graph_hundred = pynini.cross("ratus", "") | pynini.cross("seratus", "")
 
-        graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("00"))
+        graph_hundred_component = pynini.union(graph_digit + delete_space + graph_hundred, pynutil.insert("0"))
         graph_hundred_component += delete_space
         graph_hundred_component += pynini.union(
             graph_teen | pynutil.insert("00"),
@@ -44,8 +43,8 @@
                 (graph_ties | pynutil.insert("0")) + delete_space + (
                             graph_digit | pynutil.insert("0")),
         )
-        graph_hundred_component = graph_hundred_component | graph_cents | graph_one_hundred_component
-
+        graph_hundred_component = graph_hundred_component | graph_one_hundred_component
+
         graph_hundred_component_at_least_one_none_zero_digit = graph_hundred_component @ (
             pynini.closure(DAMO_DIGIT) + (DAMO_DIGIT - "0") + pynini.closure(DAMO_DIGIT)
         )
@@ -54,14 +53,12 @@
         )
         graph_thousand = pynini.cross("ribu", "") | pynini.cross("seribu", "")
         graph_one_thousand_component = pynini.union(pynini.cross("ribu", "1") | pynini.cross("seribu", "1"))
-        graph_thousand_cents = pynini.cross("seribu", "10") | pynini.cross("ribu","10") | pynini.union(graph_thousand, pynutil.insert(""))
+
         graph_thousands = pynini.union(
             graph_hundred_component_at_least_one_none_zero_digit + delete_space + (pynutil.delete("ribu") | pynutil.delete("seribu")),
             pynutil.insert("000", weight=0.1),
         )
-        graph_thousand_component = pynini.union(graph_digit + delete_space + graph_thousand, pynutil.insert("000"))
-        graph_thousand_component += delete_space
-        graph_thousands = graph_thousands | graph_thousand_cents | graph_thousand_component | graph_one_thousand_component
+        graph_thousands = graph_thousands | (pynutil.insert("00") + graph_one_thousand_component)
 
         graph_million = pynini.union(
             graph_hundred_component_at_least_one_none_zero_digit + delete_space + (pynutil.delete("juta") | pynutil.delete("sejuta")),
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index 760fd07..47ce0ee 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -305,6 +305,7 @@
             nbest: int = 1,
             frontend_conf: dict = None,
             hotword_list_or_file: str = None,
+            decoding_ind: int = 0,
             **kwargs,
     ):
         assert check_argument_types()
@@ -415,6 +416,7 @@
         self.nbest = nbest
         self.frontend = frontend
         self.encoder_downsampling_factor = 1
+        self.decoding_ind = decoding_ind
         if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
             self.encoder_downsampling_factor = 4
 
@@ -452,7 +454,7 @@
         batch = to_device(batch, device=self.device)
 
         # b. Forward Encoder
-        enc, enc_len = self.asr_model.encode(**batch)
+        enc, enc_len = self.asr_model.encode(**batch, ind=self.decoding_ind)
         if isinstance(enc, tuple):
             enc = enc[0]
         # assert len(enc) == 1, len(enc)
@@ -491,9 +493,9 @@
             else:
                 if pre_token_length[i] == 0:
                     yseq = torch.tensor(
-                        [self.asr_model.sos] + [self.asr_model.eos], device=yseq.device
+                        [self.asr_model.sos] + [self.asr_model.eos], device=pre_acoustic_embeds.device
                     )
-                    score = torch.tensor(0.0, device=yseq.device)
+                    score = torch.tensor(0.0, device=pre_acoustic_embeds.device)
                 else:
                     yseq = am_scores.argmax(dim=-1)
                     score = am_scores.max(dim=-1)[0]
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index ec5e175..f84212d 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -1638,6 +1638,8 @@
         return inference_uniasr(**kwargs)
     elif mode == "paraformer":
         return inference_paraformer(**kwargs)
+    elif mode == "paraformer_fake_streaming":
+        return inference_paraformer(**kwargs)
     elif mode == "paraformer_streaming":
         return inference_paraformer_online(**kwargs)
     elif mode.startswith("paraformer_vad"):
@@ -1920,4 +1922,4 @@
 
 
 if __name__ == "__main__":
-    main()
\ No newline at end of file
+    main()
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index 94f7262..267e405 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -23,6 +23,8 @@
         from funasr.tasks.asr import ASRTask as ASRTask
     elif mode == "paraformer":
         from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+    elif mode == "paraformer_streaming":
+        from funasr.tasks.asr import ASRTaskParaformer as ASRTask
     elif mode == "paraformer_vad_punc":
         from funasr.tasks.asr import ASRTaskParaformer as ASRTask
     elif mode == "uniasr":
diff --git a/funasr/bin/vad_infer.py b/funasr/bin/vad_infer.py
index 245757c..e1698d0 100644
--- a/funasr/bin/vad_infer.py
+++ b/funasr/bin/vad_infer.py
@@ -175,7 +175,8 @@
         batch_size = speech.shape[0]
         segments = [[]] * batch_size
         if self.frontend is not None:
-            feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final)
+            reset = in_cache == dict()
+            feats, feats_len = self.frontend.forward(speech, speech_lengths, is_final, reset)
             fbanks, _ = self.frontend.get_fbank()
         else:
             raise Exception("Need to extract feats first, please configure frontend configuration")
diff --git a/funasr/build_utils/build_asr_model.py b/funasr/build_utils/build_asr_model.py
index ddc827f..46c11b0 100644
--- a/funasr/build_utils/build_asr_model.py
+++ b/funasr/build_utils/build_asr_model.py
@@ -23,7 +23,7 @@
 from funasr.models.joint_net.joint_network import JointNetwork
 from funasr.models.e2e_asr import ASRModel
 from funasr.models.e2e_asr_mfcca import MFCCA
-from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerBert, BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerOnline, ParaformerBert, BiCifParaformer, ContextualParaformer
 from funasr.models.e2e_tp import TimestampPredictor
 from funasr.models.e2e_uni_asr import UniASR
 from funasr.models.e2e_asr_transducer import TransducerModel, UnifiedTransducerModel
@@ -82,6 +82,7 @@
         asr=ASRModel,
         uniasr=UniASR,
         paraformer=Paraformer,
+        paraformer_online=ParaformerOnline,
         paraformer_bert=ParaformerBert,
         bicif_paraformer=BiCifParaformer,
         contextual_paraformer=ContextualParaformer,
@@ -293,7 +294,7 @@
             token_list=token_list,
             **args.model_conf,
         )
-    elif args.model in ["paraformer", "paraformer_bert", "bicif_paraformer", "contextual_paraformer"]:
+    elif args.model in ["paraformer", "paraformer_online", "paraformer_bert", "bicif_paraformer", "contextual_paraformer"]:
         # predictor
         predictor_class = predictor_choices.get_class(args.predictor)
         predictor = predictor_class(**args.predictor_conf)
diff --git a/funasr/datasets/large_datasets/build_dataloader.py b/funasr/datasets/large_datasets/build_dataloader.py
index 339292f..aa5d9be 100644
--- a/funasr/datasets/large_datasets/build_dataloader.py
+++ b/funasr/datasets/large_datasets/build_dataloader.py
@@ -77,12 +77,13 @@
             bpe_tokenizer = SentencepiecesTokenizer(args.bpemodel)
         self.dataset_conf = args.dataset_conf
         self.frontend_conf = args.frontend_conf
+        self.speed_perturb = args.speed_perturb if hasattr(args, "speed_perturb") else None 
         logging.info("dataloader config: {}".format(self.dataset_conf))
         batch_mode = self.dataset_conf.get("batch_mode", "padding")
         data_list = args.train_data_file if mode == "train" else args.valid_data_file
         self.dataset = Dataset(data_list, symbol_table, seg_dict, punc_dict, bpe_tokenizer,
                                self.dataset_conf, self.frontend_conf,
-                               speed_perturb=args.speed_perturb if mode == "train" else None,
+                               speed_perturb=self.speed_perturb if mode == "train" else None,
                                mode=mode, batch_mode=batch_mode)
 
     def build_iter(self, epoch, shuffle=True):
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 5df61fd..68b63e1 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -148,6 +148,12 @@
                         if "key" not in sample_dict:
                             sample_dict["key"] = segs[0]
                         sample_dict['hw_tag'] = 1
+                    elif data_type == "text_nospace":
+                        text = item
+                        segs = text.strip().split(maxsplit=1)
+                        sample_dict[data_name] = [x for x in segs[1]]
+                        if "key" not in sample_dict:
+                            sample_dict["key"] = segs[0]
                     else:
                         text = item
                         segs = text.strip().split()
diff --git a/funasr/models/decoder/sanm_decoder.py b/funasr/models/decoder/sanm_decoder.py
index 18cd343..ed920bf 100644
--- a/funasr/models/decoder/sanm_decoder.py
+++ b/funasr/models/decoder/sanm_decoder.py
@@ -935,6 +935,7 @@
         hlens: torch.Tensor,
         ys_in_pad: torch.Tensor,
         ys_in_lens: torch.Tensor,
+        chunk_mask: torch.Tensor = None,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Forward decoder.
 
@@ -955,9 +956,13 @@
         """
         tgt = ys_in_pad
         tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None]
-
+
         memory = hs_pad
         memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]
+        if chunk_mask is not None:
+            memory_mask = memory_mask * chunk_mask
+            if tgt_mask.size(1) != memory_mask.size(1):
+                memory_mask = torch.cat((memory_mask, memory_mask[:, -2:-1, :]), dim=1)
 
         x = tgt
         x, tgt_mask, memory, memory_mask, _ = self.decoders(
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 82acef2..686038e 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -153,6 +153,7 @@
             speech_lengths: torch.Tensor,
             text: torch.Tensor,
             text_lengths: torch.Tensor,
+            decoding_ind: int = None,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
         Args:
@@ -160,6 +161,7 @@
                 speech_lengths: (Batch, )
                 text: (Batch, Length)
                 text_lengths: (Batch,)
+                decoding_ind: int
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -176,7 +178,11 @@
         speech = speech[:, :speech_lengths.max()]
 
         # 1. Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        if hasattr(self.encoder, "overlap_chunk_cls"):
+            ind = self.encoder.overlap_chunk_cls.random_choice(self.training, decoding_ind)
+            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
+        else:
+            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
@@ -272,12 +278,13 @@
         return {"feats": feats, "feats_lengths": feats_lengths}
 
     def encode(
-            self, speech: torch.Tensor, speech_lengths: torch.Tensor
+            self, speech: torch.Tensor, speech_lengths: torch.Tensor, ind: int = 0,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Frontend + Encoder. Note that this method is used by asr_inference.py
         Args:
                 speech: (Batch, Length, ...)
                 speech_lengths: (Batch, )
+                ind: int
         """
         with autocast(False):
             # 1. Extract feats
@@ -299,11 +306,25 @@
         # feats: (Batch, Length, Dim)
         # -> encoder_out: (Batch, Length2, Dim2)
         if self.encoder.interctc_use_conditioning:
-            encoder_out, encoder_out_lens, _ = self.encoder(
-                feats, feats_lengths, ctc=self.ctc
-            )
+            if hasattr(self.encoder, "overlap_chunk_cls"):
+                encoder_out, encoder_out_lens, _ = self.encoder(
+                    feats, feats_lengths, ctc=self.ctc, ind=ind
+                )
+                encoder_out, encoder_out_lens = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
+                                                                                            encoder_out_lens,
+                                                                                            chunk_outs=None)
+            else:
+                encoder_out, encoder_out_lens, _ = self.encoder(
+                    feats, feats_lengths, ctc=self.ctc
+                )
         else:
-            encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
+            if hasattr(self.encoder, "overlap_chunk_cls"):
+                encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths, ind=ind)
+                encoder_out, encoder_out_lens = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
+                                                                                            encoder_out_lens,
+                                                                                            chunk_outs=None)
+            else:
+                encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
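With overlapped chunking, the encoder output repeats frames around chunk boundaries; `remove_chunk` strips that duplication so each frame is scored once downstream. A toy version of the idea, assuming fixed-size chunks where only the first `hop` frames of each chunk are kept (the real implementation is in `funasr.modules.streaming_utils.chunk_utilis`):

```python
import torch

def remove_overlap(chunked: torch.Tensor, chunk: int, hop: int) -> torch.Tensor:
    """Keep the first `hop` frames of every `chunk`-sized block (toy sketch)."""
    b, t, d = chunked.shape
    kept = chunked.view(b, t // chunk, chunk, d)[:, :, :hop, :]
    return kept.reshape(b, -1, d)

x = torch.arange(24.).view(1, 12, 2)             # 3 chunks of 4 frames
print(remove_overlap(x, chunk=4, hop=3).shape)   # torch.Size([1, 9, 2])
```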
@@ -592,9 +613,137 @@
     """
 
     def __init__(
-            self, *args, **kwargs,
+            self,
+            vocab_size: int,
+            token_list: Union[Tuple[str, ...], List[str]],
+            frontend: Optional[AbsFrontend],
+            specaug: Optional[AbsSpecAug],
+            normalize: Optional[AbsNormalize],
+            encoder: AbsEncoder,
+            decoder: AbsDecoder,
+            ctc: CTC,
+            ctc_weight: float = 0.5,
+            interctc_weight: float = 0.0,
+            ignore_id: int = -1,
+            blank_id: int = 0,
+            sos: int = 1,
+            eos: int = 2,
+            lsm_weight: float = 0.0,
+            length_normalized_loss: bool = False,
+            report_cer: bool = True,
+            report_wer: bool = True,
+            sym_space: str = "<space>",
+            sym_blank: str = "<blank>",
+            extract_feats_in_collect_stats: bool = True,
+            predictor=None,
+            predictor_weight: float = 0.0,
+            predictor_bias: int = 0,
+            sampling_ratio: float = 0.2,
+            decoder_attention_chunk_type: str = 'chunk',
+            share_embedding: bool = False,
+            preencoder: Optional[AbsPreEncoder] = None,
+            postencoder: Optional[AbsPostEncoder] = None,
+            use_1st_decoder_loss: bool = False,
     ):
-        super().__init__(*args, **kwargs)
+        assert check_argument_types()
+        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
+        assert 0.0 <= interctc_weight < 1.0, interctc_weight
+
+        super().__init__(
+            vocab_size=vocab_size,
+            token_list=token_list,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            ctc_weight=ctc_weight,
+            interctc_weight=interctc_weight,
+            ignore_id=ignore_id,
+            blank_id=blank_id,
+            sos=sos,
+            eos=eos,
+            lsm_weight=lsm_weight,
+            length_normalized_loss=length_normalized_loss,
+            report_cer=report_cer,
+            report_wer=report_wer,
+            sym_space=sym_space,
+            sym_blank=sym_blank,
+            extract_feats_in_collect_stats=extract_feats_in_collect_stats,
+            predictor=predictor,
+            predictor_weight=predictor_weight,
+            predictor_bias=predictor_bias,
+            sampling_ratio=sampling_ratio,
+        )
+        # note that eos is the same as sos (equivalent ID)
+        self.blank_id = blank_id
+        self.sos = vocab_size - 1 if sos is None else sos
+        self.eos = vocab_size - 1 if eos is None else eos
+        self.vocab_size = vocab_size
+        self.ignore_id = ignore_id
+        self.ctc_weight = ctc_weight
+        self.interctc_weight = interctc_weight
+        self.token_list = token_list.copy()
+
+        self.frontend = frontend
+        self.specaug = specaug
+        self.normalize = normalize
+        self.preencoder = preencoder
+        self.postencoder = postencoder
+        self.encoder = encoder
+
+        if not hasattr(self.encoder, "interctc_use_conditioning"):
+            self.encoder.interctc_use_conditioning = False
+        if self.encoder.interctc_use_conditioning:
+            self.encoder.conditioning_layer = torch.nn.Linear(
+                vocab_size, self.encoder.output_size()
+            )
+
+        self.error_calculator = None
+
+        if ctc_weight == 1.0:
+            self.decoder = None
+        else:
+            self.decoder = decoder
+
+        self.criterion_att = LabelSmoothingLoss(
+            size=vocab_size,
+            padding_idx=ignore_id,
+            smoothing=lsm_weight,
+            normalize_length=length_normalized_loss,
+        )
+
+        if report_cer or report_wer:
+            self.error_calculator = ErrorCalculator(
+                token_list, sym_space, sym_blank, report_cer, report_wer
+            )
+
+        if ctc_weight == 0.0:
+            self.ctc = None
+        else:
+            self.ctc = ctc
+
+        self.extract_feats_in_collect_stats = extract_feats_in_collect_stats
+        self.predictor = predictor
+        self.predictor_weight = predictor_weight
+        self.predictor_bias = predictor_bias
+        self.sampling_ratio = sampling_ratio
+        self.criterion_pre = mae_loss(normalize_length=length_normalized_loss)
+        self.step_cur = 0
+        self.scama_mask = None
+        if hasattr(self.encoder, "overlap_chunk_cls") and self.encoder.overlap_chunk_cls is not None:
+            from funasr.modules.streaming_utils.chunk_utilis import build_scama_mask_for_cross_attention_decoder
+            self.build_scama_mask_for_cross_attention_decoder_fn = build_scama_mask_for_cross_attention_decoder
+            self.decoder_attention_chunk_type = decoder_attention_chunk_type
+
+        self.share_embedding = share_embedding
+        if self.share_embedding:
+            self.decoder.embed = None
+
+        self.use_1st_decoder_loss = use_1st_decoder_loss
 
     def forward(
             self,
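When `share_embedding` is set, the constructor above drops `decoder.embed`, and target embeddings are instead looked up from the output projection's weight matrix (see the `self.decoder.output_layer.weight[ys_pad_masked]` lookup in the sampler later in this file's diff). A minimal sketch of that weight-tying trick:

```python
import torch

vocab, dim = 10, 4
output_layer = torch.nn.Linear(dim, vocab, bias=False)  # weight: (vocab, dim)

tokens = torch.tensor([[1, 3, 5]])
embeds = output_layer.weight[tokens]                    # (1, 3, dim) lookup
print(embeds.shape)                                     # torch.Size([1, 3, 4])
```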
@@ -602,6 +751,7 @@
             speech_lengths: torch.Tensor,
             text: torch.Tensor,
             text_lengths: torch.Tensor,
+            decoding_ind: int = None,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
         Args:
@@ -609,6 +759,7 @@
                 speech_lengths: (Batch, )
                 text: (Batch, Length)
                 text_lengths: (Batch,)
+                decoding_ind: int
         """
         assert text_lengths.dim() == 1, text_lengths.shape
         # Check that batch_size is unified
@@ -625,7 +776,11 @@
         speech = speech[:, :speech_lengths.max()]
 
         # 1. Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        if hasattr(self.encoder, "overlap_chunk_cls"):
+            ind = self.encoder.overlap_chunk_cls.random_choice(self.training, decoding_ind)
+            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
+        else:
+            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
             intermediate_outs = encoder_out[1]
@@ -638,8 +793,12 @@
 
         # 1. CTC branch
         if self.ctc_weight != 0.0:
+            if hasattr(self.encoder, "overlap_chunk_cls"):
+                encoder_out_ctc, encoder_out_lens_ctc = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
+                                                                                                    encoder_out_lens,
+                                                                                                    chunk_outs=None)
+            else:
+                encoder_out_ctc, encoder_out_lens_ctc = encoder_out, encoder_out_lens
             loss_ctc, cer_ctc = self._calc_ctc_loss(
-                encoder_out, encoder_out_lens, text, text_lengths
+                encoder_out_ctc, encoder_out_lens_ctc, text, text_lengths
             )
 
             # Collect CTC branch stats
@@ -652,8 +811,14 @@
             for layer_idx, intermediate_out in intermediate_outs:
                 # we assume intermediate_out has the same length & padding
                 # as those of encoder_out
+                if hasattr(self.encoder, "overlap_chunk_cls"):
+                    encoder_out_ctc, encoder_out_lens_ctc = \
+                        self.encoder.overlap_chunk_cls.remove_chunk(
+                            intermediate_out,
+                            encoder_out_lens,
+                            chunk_outs=None)
+                else:
+                    encoder_out_ctc, encoder_out_lens_ctc = intermediate_out, encoder_out_lens
                 loss_ic, cer_ic = self._calc_ctc_loss(
-                    intermediate_out, encoder_out_lens, text, text_lengths
+                    encoder_out_ctc, encoder_out_lens_ctc, text, text_lengths
                 )
                 loss_interctc = loss_interctc + loss_ic
 
@@ -672,7 +837,7 @@
 
         # 2b. Attention decoder branch
         if self.ctc_weight != 1.0:
-            loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
+            loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att = self._calc_att_predictor_loss(
                 encoder_out, encoder_out_lens, text, text_lengths
             )
 
@@ -684,8 +849,12 @@
         else:
             loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
 
+        if self.use_1st_decoder_loss and pre_loss_att is not None:
+            loss = loss + pre_loss_att
+
         # Collect Attn branch stats
         stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+        stats["pre_loss_att"] = pre_loss_att.detach() if pre_loss_att is not None else None
         stats["acc"] = acc_att
         stats["cer"] = cer_att
         stats["wer"] = wer_att
@@ -697,14 +866,67 @@
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
 
+    def encode(
+        self, speech: torch.Tensor, speech_lengths: torch.Tensor, ind: int = 0,
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Frontend + Encoder. Note that this method is used by asr_inference.py
+        Args:
+                speech: (Batch, Length, ...)
+                speech_lengths: (Batch, )
+                ind: int
+        """
+        with autocast(False):
+            # 1. Extract feats
+            feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+            # 2. Data augmentation
+            if self.specaug is not None and self.training:
+                feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+            # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+            if self.normalize is not None:
+                feats, feats_lengths = self.normalize(feats, feats_lengths)
+        # Pre-encoder, e.g. used for raw input data
+        if self.preencoder is not None:
+            feats, feats_lengths = self.preencoder(feats, feats_lengths)
+        
+        # 4. Forward encoder
+        # feats: (Batch, Length, Dim)
+        # -> encoder_out: (Batch, Length2, Dim2)
+        if self.encoder.interctc_use_conditioning:
+            encoder_out, encoder_out_lens, _ = self.encoder(
+                feats, feats_lengths, ctc=self.ctc, ind=ind
+            )
+        else:
+            encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths, ind=ind)
+        intermediate_outs = None
+        if isinstance(encoder_out, tuple):
+            intermediate_outs = encoder_out[1]
+            encoder_out = encoder_out[0]
+
+        # Post-encoder, e.g. NLU
+        if self.postencoder is not None:
+            encoder_out, encoder_out_lens = self.postencoder(
+                encoder_out, encoder_out_lens
+            )
+
+        assert encoder_out.size(0) == speech.size(0), (
+            encoder_out.size(),
+            speech.size(0),
+        )
+        assert encoder_out.size(1) <= encoder_out_lens.max(), (
+            encoder_out.size(),
+            encoder_out_lens.max(),
+        )
+
+        if intermediate_outs is not None:
+            return (encoder_out, intermediate_outs), encoder_out_lens
+
+        return encoder_out, encoder_out_lens
+
     def encode_chunk(
             self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Frontend + Encoder. Note that this method is used by asr_inference.py
-<<<<<<< HEAD
-=======
-
->>>>>>> 4cd79db451786548d8a100f25c3b03da0eb30f4b
         Args:
                 speech: (Batch, Length, ...)
                 speech_lengths: (Batch, )
@@ -750,11 +972,240 @@
 
         return encoder_out, torch.tensor([encoder_out.size(1)])
 
+    def _calc_att_predictor_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+            encoder_out.device)
+        if self.predictor_bias == 1:
+            _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
+            ys_pad_lens = ys_pad_lens + self.predictor_bias
+        mask_chunk_predictor = None
+        if self.encoder.overlap_chunk_cls is not None:
+            mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
+                None, device=encoder_out.device, batch_size=encoder_out.size(0))
+            mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+                None, device=encoder_out.device, batch_size=encoder_out.size(0))
+            encoder_out = encoder_out * mask_shfit_chunk
+        pre_acoustic_embeds, pre_token_length, pre_alphas, _ = self.predictor(encoder_out,
+                                                                              ys_pad,
+                                                                              encoder_out_mask,
+                                                                              ignore_id=self.ignore_id,
+                                                                              mask_chunk_predictor=mask_chunk_predictor,
+                                                                              target_label_length=ys_pad_lens,
+                                                                              )
+        predictor_alignments, predictor_alignments_len = self.predictor.gen_frame_alignments(pre_alphas,
+                                                                                             encoder_out_lens)
+
+        scama_mask = None
+        if self.encoder.overlap_chunk_cls is not None and self.decoder_attention_chunk_type == 'chunk':
+            encoder_chunk_size = self.encoder.overlap_chunk_cls.chunk_size_pad_shift_cur
+            attention_chunk_center_bias = 0
+            attention_chunk_size = encoder_chunk_size
+            decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
+            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls.\
+                get_mask_shift_att_chunk_decoder(None,
+                                                 device=encoder_out.device,
+                                                 batch_size=encoder_out.size(0)
+                                                 )
+            scama_mask = self.build_scama_mask_for_cross_attention_decoder_fn(
+                predictor_alignments=predictor_alignments,
+                encoder_sequence_length=encoder_out_lens,
+                chunk_size=1,
+                encoder_chunk_size=encoder_chunk_size,
+                attention_chunk_center_bias=attention_chunk_center_bias,
+                attention_chunk_size=attention_chunk_size,
+                attention_chunk_type=self.decoder_attention_chunk_type,
+                step=None,
+                predictor_mask_chunk_hopping=mask_chunk_predictor,
+                decoder_att_look_back_factor=decoder_att_look_back_factor,
+                mask_shift_att_chunk_decoder=mask_shift_att_chunk_decoder,
+                target_length=ys_pad_lens,
+                is_training=self.training,
+            )
+        elif self.encoder.overlap_chunk_cls is not None:
+            encoder_out, encoder_out_lens = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out,
+                                                                                        encoder_out_lens,
+                                                                                        chunk_outs=None)
+        # 0. sampler
+        decoder_out_1st = None
+        pre_loss_att = None
+        if self.sampling_ratio > 0.0:
+            if self.step_cur < 2:
+                logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+            if self.use_1st_decoder_loss:
+                sematic_embeds, decoder_out_1st, pre_loss_att = \
+                    self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad,
+                                           ys_pad_lens, pre_acoustic_embeds, scama_mask)
+            else:
+                sematic_embeds, decoder_out_1st = \
+                    self.sampler(encoder_out, encoder_out_lens, ys_pad,
+                                 ys_pad_lens, pre_acoustic_embeds, scama_mask)
+        else:
+            if self.step_cur < 2:
+                logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+            sematic_embeds = pre_acoustic_embeds
+
+        # 1. Forward decoder
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, scama_mask
+        )
+        decoder_out, _ = decoder_outs[0], decoder_outs[1]
+
+        if decoder_out_1st is None:
+            decoder_out_1st = decoder_out
+        # 2. Compute attention loss
+        loss_att = self.criterion_att(decoder_out, ys_pad)
+        acc_att = th_accuracy(
+            decoder_out_1st.view(-1, self.vocab_size),
+            ys_pad,
+            ignore_label=self.ignore_id,
+        )
+        loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
+
+        # Compute cer/wer using attention-decoder
+        if self.training or self.error_calculator is None:
+            cer_att, wer_att = None, None
+        else:
+            ys_hat = decoder_out_1st.argmax(dim=-1)
+            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+
+        return loss_att, acc_att, cer_att, wer_att, loss_pre, pre_loss_att
+
+    def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, chunk_mask=None):
+
+        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
+        ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
+        if self.share_embedding:
+            ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
+        else:
+            ys_pad_embed = self.decoder.embed(ys_pad_masked)
+        with torch.no_grad():
+            decoder_outs = self.decoder(
+                encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, chunk_mask
+            )
+            decoder_out, _ = decoder_outs[0], decoder_outs[1]
+            pred_tokens = decoder_out.argmax(-1)
+            nonpad_positions = ys_pad.ne(self.ignore_id)
+            seq_lens = (nonpad_positions).sum(1)
+            same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
+            input_mask = torch.ones_like(nonpad_positions)
+            bsz, seq_len = ys_pad.size()
+            for li in range(bsz):
+                target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
+                if target_num > 0:
+                    input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].cuda(), value=0)
+            input_mask = input_mask.eq(1)
+            input_mask = input_mask.masked_fill(~nonpad_positions, False)
+            input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
+
+        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
+            input_mask_expand_dim, 0)
+        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
+
+    def sampler_with_grad(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, chunk_mask=None):
+        tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
+        ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
+        if self.share_embedding:
+            ys_pad_embed = self.decoder.output_layer.weight[ys_pad_masked]
+        else:
+            ys_pad_embed = self.decoder.embed(ys_pad_masked)
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, chunk_mask
+        )
+        pre_loss_att = self.criterion_att(decoder_outs[0], ys_pad)
+        decoder_out, _ = decoder_outs[0], decoder_outs[1]
+        pred_tokens = decoder_out.argmax(-1)
+        nonpad_positions = ys_pad.ne(self.ignore_id)
+        seq_lens = (nonpad_positions).sum(1)
+        same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
+        input_mask = torch.ones_like(nonpad_positions)
+        bsz, seq_len = ys_pad.size()
+        for li in range(bsz):
+            target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
+            if target_num > 0:
+                input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].cuda(), value=0)
+        input_mask = input_mask.eq(1)
+        input_mask = input_mask.masked_fill(~nonpad_positions, False)
+        input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
+
+        sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
+            input_mask_expand_dim, 0)
+
+        return sematic_embeds * tgt_mask, decoder_out * tgt_mask, pre_loss_att
+
+    def calc_predictor(self, encoder_out, encoder_out_lens):
+
+        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+            encoder_out.device)
+        mask_chunk_predictor = None
+        if self.encoder.overlap_chunk_cls is not None:
+            mask_chunk_predictor = self.encoder.overlap_chunk_cls.get_mask_chunk_predictor(
+                None, device=encoder_out.device, batch_size=encoder_out.size(0))
+            mask_shfit_chunk = self.encoder.overlap_chunk_cls.get_mask_shfit_chunk(
+                None, device=encoder_out.device, batch_size=encoder_out.size(0))
+            encoder_out = encoder_out * mask_shfit_chunk
+        pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index = self.predictor(encoder_out,
+                                                                                           None,
+                                                                                           encoder_out_mask,
+                                                                                           ignore_id=self.ignore_id,
+                                                                                           mask_chunk_predictor=mask_chunk_predictor,
+                                                                                           target_label_length=None,
+                                                                                           )
+        predictor_alignments, predictor_alignments_len = self.predictor.gen_frame_alignments(pre_alphas,
+                                                                                             encoder_out_lens+1 if self.predictor.tail_threshold > 0.0 else encoder_out_lens)
+
+        scama_mask = None
+        if self.encoder.overlap_chunk_cls is not None and self.decoder_attention_chunk_type == 'chunk':
+            encoder_chunk_size = self.encoder.overlap_chunk_cls.chunk_size_pad_shift_cur
+            attention_chunk_center_bias = 0
+            attention_chunk_size = encoder_chunk_size
+            decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
+            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls.\
+                get_mask_shift_att_chunk_decoder(None,
+                                                 device=encoder_out.device,
+                                                 batch_size=encoder_out.size(0)
+                                                 )
+            scama_mask = self.build_scama_mask_for_cross_attention_decoder_fn(
+                predictor_alignments=predictor_alignments,
+                encoder_sequence_length=encoder_out_lens,
+                chunk_size=1,
+                encoder_chunk_size=encoder_chunk_size,
+                attention_chunk_center_bias=attention_chunk_center_bias,
+                attention_chunk_size=attention_chunk_size,
+                attention_chunk_type=self.decoder_attention_chunk_type,
+                step=None,
+                predictor_mask_chunk_hopping=mask_chunk_predictor,
+                decoder_att_look_back_factor=decoder_att_look_back_factor,
+                mask_shift_att_chunk_decoder=mask_shift_att_chunk_decoder,
+                target_length=None,
+                is_training=self.training,
+            )
+        self.scama_mask = scama_mask
+
+        return pre_acoustic_embeds, pre_token_length, pre_alphas, pre_peak_index
+
     def calc_predictor_chunk(self, encoder_out, cache=None):
 
         pre_acoustic_embeds, pre_token_length = \
             self.predictor.forward_chunk(encoder_out, cache["encoder"])
         return pre_acoustic_embeds, pre_token_length
+
+    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
+        decoder_outs = self.decoder(
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, self.scama_mask
+        )
+        decoder_out = decoder_outs[0]
+        decoder_out = torch.log_softmax(decoder_out, dim=-1)
+        return decoder_out, ys_pad_lens
 
     def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
         decoder_outs = self.decoder.forward_chunk(
@@ -1800,4 +2251,4 @@
                     "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
                                                                                   var_dict_tf[name_tf].shape))
 
-        return var_dict_torch_update
\ No newline at end of file
+        return var_dict_torch_update
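The `sampler`/`sampler_with_grad` pair added above implements Paraformer's glancing-style sampling: a first decoder pass counts how many tokens the acoustic embeddings already get right, and a number of positions proportional to the remaining errors is switched to ground-truth token embeddings before the final pass. A condensed sketch of just the mixing step, under simplified assumptions (no padding handling):

```python
import torch

def glancing_mix(acoustic, target_embed, wrong_cnt, ratio):
    """Replace `ratio * wrong_cnt` random positions with target embeddings.

    acoustic, target_embed: (B, L, D); wrong_cnt: (B,) mispredicted counts.
    Illustrative only; the patch's sampler also masks padded positions.
    """
    b, l, _ = acoustic.shape
    keep = torch.ones(b, l, dtype=torch.bool)
    for i in range(b):
        n = int(wrong_cnt[i].item() * ratio)
        if n > 0:
            keep[i, torch.randperm(l)[:n]] = False  # swap in target embeds here
    keep = keep.unsqueeze(-1)
    return acoustic.masked_fill(~keep, 0) + target_embed.masked_fill(keep, 0)

mixed = glancing_mix(torch.randn(2, 6, 8), torch.randn(2, 6, 8),
                     torch.tensor([4, 2]), ratio=0.5)
print(mixed.shape)  # torch.Size([2, 6, 8])
```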
diff --git a/funasr/models/e2e_asr_transducer.py b/funasr/models/e2e_asr_transducer.py
index 3120087..3f9f31c 100644
--- a/funasr/models/e2e_asr_transducer.py
+++ b/funasr/models/e2e_asr_transducer.py
@@ -108,7 +108,7 @@
         self.use_auxiliary_lm_loss = auxiliary_lm_loss_weight > 0
 
         if self.use_auxiliary_ctc:
-            self.ctc_lin = torch.nn.Linear(encoder.output_size, vocab_size)
+            self.ctc_lin = torch.nn.Linear(encoder.output_size(), vocab_size)
             self.ctc_dropout_rate = auxiliary_ctc_dropout_rate
 
         if self.use_auxiliary_lm_loss:
@@ -162,7 +162,9 @@
 
         # 1. Encoder
         encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
-
+        if hasattr(self.encoder, 'overlap_chunk_cls') and self.encoder.overlap_chunk_cls is not None:
+            encoder_out, encoder_out_lens = self.encoder.overlap_chunk_cls.remove_chunk(encoder_out, encoder_out_lens,
+                                                                                        chunk_outs=None)
         # 2. Transducer-related I/O preparation
         decoder_in, target, t_len, u_len = get_transducer_task_io(
             text,
@@ -577,7 +579,7 @@
         self.use_auxiliary_lm_loss = auxiliary_lm_loss_weight > 0
 
         if self.use_auxiliary_ctc:
-            self.ctc_lin = torch.nn.Linear(encoder.output_size, vocab_size)
+            self.ctc_lin = torch.nn.Linear(encoder.output_size(), vocab_size)
             self.ctc_dropout_rate = auxiliary_ctc_dropout_rate
 
         if self.use_auxiliary_att:
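The transducer change replaces `encoder.output_size` with `encoder.output_size()`: the attribute is a bound method, so the previous code handed `torch.nn.Linear` a callable instead of the integer feature size. A two-line illustration of why the call matters:

```python
import torch

class Enc:
    def output_size(self) -> int:
        return 256

enc = Enc()
lin = torch.nn.Linear(enc.output_size(), 100)  # ok: in_features == 256
# torch.nn.Linear(enc.output_size, 100)        # fails: a method is not an int
print(lin.weight.shape)                        # torch.Size([100, 256])
```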
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 82d8422..14d56a8 100644
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -226,7 +226,6 @@
                                                self.vad_opts.frame_in_ms)
         self.encoder = encoder
         # init variables
-        self.is_final = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -253,11 +252,10 @@
         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
-        self.ResetDetection()
         self.frontend = frontend
+        self.last_drop_frames = 0
 
     def AllResetDetection(self):
-        self.is_final = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
         self.latest_confirmed_speech_frame = 0
@@ -284,7 +282,8 @@
         self.data_buf = None
         self.data_buf_all = None
         self.waveform = None
-        self.ResetDetection()
+        self.last_drop_frames = 0
+        self.windows_detector.Reset()
 
     def ResetDetection(self):
         self.continous_silence_frame_count = 0
@@ -296,6 +295,15 @@
         self.windows_detector.Reset()
         self.sil_frame = 0
         self.frame_probs = []
+
+        if self.output_data_buf:
+            assert self.output_data_buf[-1].contain_seg_end_point == True
+            drop_frames = int(self.output_data_buf[-1].end_ms / self.vad_opts.frame_in_ms)
+            real_drop_frames = drop_frames - self.last_drop_frames
+            self.last_drop_frames = drop_frames
+            self.data_buf_all = self.data_buf_all[real_drop_frames * int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
+            self.decibel = self.decibel[real_drop_frames:]
+            self.scores = self.scores[:, real_drop_frames:, :]
 
     def ComputeDecibel(self) -> None:
         frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
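`ResetDetection` now releases everything up to the last confirmed segment end: the end time in milliseconds is converted to a frame count, the delta since the previous reset is taken, and the waveform buffer, per-frame decibel list, and score tensor are trimmed by that many frames (times samples-per-frame for the waveform). The bookkeeping, assuming 10 ms frames at 16 kHz:

```python
frame_in_ms, sample_rate = 10, 16000
samples_per_frame = frame_in_ms * sample_rate // 1000  # 160 samples per frame

last_drop_frames = 0
end_ms = 1240                                   # end of the confirmed segment
drop_frames = end_ms // frame_in_ms             # 124 frames dropped in total
real_drop = drop_frames - last_drop_frames      # newly droppable this reset
last_drop_frames = drop_frames

# The buffers then shrink by the dropped span, mirroring the patch:
#   data_buf_all = data_buf_all[real_drop * samples_per_frame:]
#   decibel      = decibel[real_drop:]
#   scores       = scores[:, real_drop:, :]
print(real_drop, real_drop * samples_per_frame)  # 124 19840
```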
@@ -324,7 +332,7 @@
         while self.data_buf_start_frame < frame_idx:
             if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                 self.data_buf_start_frame += 1
-                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
+                self.data_buf = self.data_buf_all[(self.data_buf_start_frame - self.last_drop_frames) * int(
                     self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
 
     def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
@@ -473,6 +481,8 @@
     def forward(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
                 is_final: bool = False
                 ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+        if not in_cache:
+            self.AllResetDetection()
         self.waveform = waveform  # compute decibel for each frame
         self.ComputeDecibel()
         self.ComputeScores(feats, in_cache)
@@ -501,6 +511,8 @@
     def forward_online(self, feats: torch.Tensor, waveform: torch.tensor, in_cache: Dict[str, torch.Tensor] = dict(),
                        is_final: bool = False, max_end_sil: int = 800
                        ) -> Tuple[List[List[List[int]]], Dict[str, torch.Tensor]]:
+        if not in_cache:
+            self.AllResetDetection()
         self.max_end_sil_frame_cnt_thresh = max_end_sil - self.vad_opts.speech_to_sil_time_thres
         self.waveform = waveform  # compute decibel for each frame
 
@@ -541,7 +553,7 @@
             return 0
         for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
             frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i - self.last_drop_frames)
             self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
 
         return 0
@@ -551,7 +563,7 @@
             return 0
         for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
             frame_state = FrameState.kFrameStateInvalid
-            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i - self.last_drop_frames)
             if i != 0:
                 self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
             else:
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index da67586..46eabd1 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -633,6 +633,8 @@
                 self.embed = torch.nn.Linear(input_size, output_size)
         elif input_layer == "pe":
             self.embed = SinusoidalPositionEncoder()
+        elif input_layer == "pe_online":
+            self.embed = StreamSinusoidalPositionEncoder()
         else:
             raise ValueError("unknown input_layer: " + input_layer)
         self.normalize_before = normalize_before
@@ -818,6 +820,59 @@
             return (xs_pad, intermediate_outs), olens, None
         return xs_pad, olens, None
 
+    def _add_overlap_chunk(self, feats: torch.Tensor, cache: dict = {}):
+        if len(cache) == 0:
+            return feats
+        cache["feats"] = to_device(cache["feats"], device=feats.device)
+        overlap_feats = torch.cat((cache["feats"], feats), dim=1)
+        cache["feats"] = overlap_feats[:, -(cache["chunk_size"][0] + cache["chunk_size"][2]):, :]
+        return overlap_feats
+
+    def forward_chunk(self,
+                      xs_pad: torch.Tensor,
+                      ilens: torch.Tensor,
+                      cache: dict = None,
+                      ctc: CTC = None,
+                      ):
+        xs_pad *= self.output_size() ** 0.5
+        if self.embed is None:
+            xs_pad = xs_pad
+        else:
+            xs_pad = self.embed(xs_pad, cache)
+        if cache["tail_chunk"]:
+            xs_pad = to_device(cache["feats"], device=xs_pad.device)
+        else:
+            xs_pad = self._add_overlap_chunk(xs_pad, cache)
+        encoder_outs = self.encoders0(xs_pad, None, None, None, None)
+        xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        intermediate_outs = []
+        if len(self.interctc_layer_idx) == 0:
+            encoder_outs = self.encoders(xs_pad, None, None, None, None)
+            xs_pad, masks = encoder_outs[0], encoder_outs[1]
+        else:
+            for layer_idx, encoder_layer in enumerate(self.encoders):
+                encoder_outs = encoder_layer(xs_pad, None, None, None, None)
+                xs_pad, masks = encoder_outs[0], encoder_outs[1]
+                if layer_idx + 1 in self.interctc_layer_idx:
+                    encoder_out = xs_pad
+
+                    # intermediate outputs are also normalized
+                    if self.normalize_before:
+                        encoder_out = self.after_norm(encoder_out)
+
+                    intermediate_outs.append((layer_idx + 1, encoder_out))
+
+                    if self.interctc_use_conditioning:
+                        ctc_out = ctc.softmax(encoder_out)
+                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        if len(intermediate_outs) > 0:
+            return (xs_pad, intermediate_outs), None, None
+        return xs_pad, ilens, None
+
     def gen_tf2torch_map_dict(self):
         tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
         tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
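`_add_overlap_chunk` above keeps a tail of previous features in `cache["feats"]` and prepends it to each incoming chunk, so the encoder sees left context across chunk boundaries; the cache is then refreshed to the last `chunk_size[0] + chunk_size[2]` frames. A toy run of that caching pattern (tensor sizes are illustrative):

```python
import torch

def add_overlap_chunk(feats, cache):
    if not cache:
        return feats
    overlap = torch.cat((cache["feats"], feats), dim=1)
    keep = cache["chunk_size"][0] + cache["chunk_size"][2]
    cache["feats"] = overlap[:, -keep:, :]  # retain context for the next chunk
    return overlap

cache = {"feats": torch.zeros(1, 5, 8), "chunk_size": [5, 10, 5]}
out = add_overlap_chunk(torch.randn(1, 10, 8), cache)
print(out.shape, cache["feats"].shape)  # (1, 15, 8) and (1, 10, 8)
```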
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 35fab57..f16bdd9 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -395,8 +395,10 @@
         return feats_pad, feats_lens, lfr_splice_frame_idxs
 
     def forward(
-            self, input: torch.Tensor, input_lengths: torch.Tensor, is_final: bool = False
+        self, input: torch.Tensor, input_lengths: torch.Tensor, is_final: bool = False, reset: bool = False
     ) -> Tuple[torch.Tensor, torch.Tensor]:
+        if reset:
+            self.cache_reset()
         batch_size = input.shape[0]
        assert batch_size == 1, 'online feature extraction currently supports batch_size == 1 only'
         waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths)  # input shape: B T D
@@ -500,4 +502,4 @@
         feats_pad = pad_sequence(feats,
                                  batch_first=True,
                                  padding_value=0.0)
-        return feats_pad, feats_lens
\ No newline at end of file
+        return feats_pad, feats_lens
diff --git a/funasr/runtime/html5/readme.md b/funasr/runtime/html5/readme.md
index 0fbafac..1e9031e 100644
--- a/funasr/runtime/html5/readme.md
+++ b/funasr/runtime/html5/readme.md
@@ -41,7 +41,7 @@
 `Tips:` asr service and html5 service should be deployed on the same device.
 ```shell
 cd ../python/websocket
-python ws_server_online.py --port 1095
+python wss_srv_asr.py --port 10095
 ```
 
 
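For a quick smoke test of the renamed server, a client can open a WebSocket on the same port and send the JSON handshake the HTML5 demo uses (`mode`, `wav_name`, `is_speaking`, `chunk_interval`, as seen in `main.js` below). A hedged sketch with the third-party `websockets` package; the plain `ws://` URI and the response shape are assumptions:

```python
import asyncio
import json

import websockets  # pip install websockets

async def smoke_test():
    async with websockets.connect("ws://127.0.0.1:10095") as ws:
        await ws.send(json.dumps({
            "mode": "2pass",       # "2pass" | "online" | "offline"
            "wav_name": "h5",
            "is_speaking": True,
            "chunk_interval": 10,
        }))
        print(await ws.recv())     # expect a JSON result from the server

asyncio.run(smoke_test())
```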
diff --git a/funasr/runtime/html5/readme_cn.md b/funasr/runtime/html5/readme_cn.md
index 38005e0..b859387 100644
--- a/funasr/runtime/html5/readme_cn.md
+++ b/funasr/runtime/html5/readme_cn.md
@@ -49,7 +49,7 @@
#### wss mode
 ```shell
 cd ../python/websocket
-python ws_server_online.py --port 1095
+python wss_srv_asr.py --port 10095
 ```
 
### Open the address in a browser
diff --git a/funasr/runtime/html5/static/index.html b/funasr/runtime/html5/static/index.html
index 3bc0fc8..99aa9b4 100644
--- a/funasr/runtime/html5/static/index.html
+++ b/funasr/runtime/html5/static/index.html
@@ -19,8 +19,15 @@
 			<div class="div_class_recordControl">
				asr server address (required):
 				<br>
-				<input id="wssip" type="text" style=" width: 100%;height:100%" value="wss://127.0.0.1:1095/"/>
+				<input id="wssip" type="text" style=" width: 100%;height:100%" value="wss://127.0.0.1:10095/"/>
 				<br>
+				<br>
+				<div style="border:2px solid #ccc;">
+				Select asr model mode:<br/>
+      <label><input name="asr_mode" type="radio" value="2pass" checked="true"/>2pass </label>&nbsp;&nbsp;
+      <label><input name="asr_mode" type="radio" value="online" />online </label>&nbsp;&nbsp;
+      <label><input name="asr_mode" type="radio" value="offline" />offline </label> 
+				</div>
 				<br>
 				璇煶璇嗗埆缁撴灉鏄剧ず锛�
 				<br>
@@ -29,6 +36,7 @@
 				<br>
                <div id="info_div">Please click Start</div>
 				<div class="div_class_buttons">
+					<button id="btnConnect">Connect</button>
					<button id="btnStart">Start</button>
					<button id="btnStop">Stop</button>
  
diff --git a/funasr/runtime/html5/static/main.js b/funasr/runtime/html5/static/main.js
index 6548aa3..22f53c1 100644
--- a/funasr/runtime/html5/static/main.js
+++ b/funasr/runtime/html5/static/main.js
@@ -23,28 +23,61 @@
 var sampleBuf=new Int16Array();
// Bind the button click handlers
 var btnStart = document.getElementById('btnStart');
-btnStart.onclick = start;
+btnStart.onclick = record;
 var btnStop = document.getElementById('btnStop');
 btnStop.onclick = stop;
 btnStop.disabled = true;
+btnStart.disabled = true;
  
-
+var btnConnect = document.getElementById('btnConnect');
+btnConnect.onclick = start;
  
-var rec_text=""
+var rec_text="";
+var offline_text="";
 var info_div = document.getElementById('info_div');
 
 //var now_ipaddress=window.location.href;
 //now_ipaddress=now_ipaddress.replace("https://","wss://");
 //now_ipaddress=now_ipaddress.replace("static/index.html","");
 //document.getElementById('wssip').value=now_ipaddress;
+ 
+function getAsrMode(){
+    var item = null;
+    var obj = document.getElementsByName("asr_mode");
+    for (var i = 0; i < obj.length; i++) { // iterate over the radio buttons
+        if (obj[i].checked) {
+            item = obj[i].value;
+            break;
+        }
+    }
+    console.log("asr mode: " + item);
+    return item;
+}
 
// Speech recognition results: parse jsonMsg and append the recognized text to the text area
 function getJsonMessage( jsonMsg ) {
+	//console.log(jsonMsg);
 	console.log( "message: " + JSON.parse(jsonMsg.data)['text'] );
 	var rectxt=""+JSON.parse(jsonMsg.data)['text'];
+	var asrmodel=JSON.parse(jsonMsg.data)['mode'];
+	if(asrmodel=="2pass-offline")
+	{
+		offline_text=offline_text+rectxt; //.replace(/ +/g,"");
+		rec_text=offline_text;
+	}
+	else
+	{
+		rec_text=rec_text+rectxt; //.replace(/ +/g,"");
+	}
 	var varArea=document.getElementById('varArea');
-	rec_text=rec_text+rectxt.replace(/ +/g,"");
+	
 	varArea.value=rec_text;
+	console.log( "offline_text: " + asrmodel+","+offline_text);
+	console.log( "rec_text: " + rec_text);
 	 
  
 }
@@ -53,11 +86,14 @@
 function getConnState( connState ) {
 	if ( connState === 0 ) {
  
-		rec.open( function(){
-			rec.start();
-			console.log("start recording");
+		//rec.open( function(){
+		//	rec.start();
+		//	console.log("start recording");
  
-		});
+		//});
+		btnStart.disabled = false;
+		btnConnect.disabled = true;
+		info_div.innerHTML='Connected! Please click Start';
 	} else if ( connState === 1 ) {
 		//stop();
 	} else if ( connState === 2 ) {
@@ -66,11 +102,19 @@
 		 
		alert("Failed to connect to "+document.getElementById('wssip').value+". Please check the asr address and port, make sure the html5 service and the asr service are in the same domain, or try another browser.");
 		btnStart.disabled = true;
-		info_div.innerHTML='Please click Start';
+
+		info_div.innerHTML='Please click Connect';
 	}
 }
 
-
+function record()
+{
+	rec.open( function(){
+		rec.start();
+		console.log("start recording");
+		btnStart.disabled = true;
+	});
+}
// Start, stop, and clear operations for recognition
 function start() {
 	
@@ -78,14 +122,15 @@
 	clear();
	//update the control states
  	    
-
+    info_div.innerHTML="Connecting to the asr server, please wait...";
	//start the connection
 	var ret=wsconnecter.wsStart();
 	if(ret==1){
 		isRec = true;
 		btnStart.disabled = true;
 		btnStop.disabled = false;
-	    info_div.innerHTML="Connecting to the asr server, please wait...";
+		btnConnect.disabled=true;
+
 	}
 }
 
@@ -97,7 +142,9 @@
 			"wav_name":  "h5",
 			"is_speaking":  false,
 			"chunk_interval":10,
+			"mode":getAsrMode(),
 		};
+		console.log(request);
 		if(sampleBuf.length>0){
 		wsconnecter.wsSend(sampleBuf,false);
 		console.log("sampleBuf.length"+sampleBuf.length);
@@ -114,7 +161,12 @@
 	isRec = false;
    info_div.innerHTML="Please wait...";
 	btnStop.disabled = true;
-	setTimeout(function(){btnStart.disabled = false;info_div.innerHTML="Please click Start";}, 3000 );
+	setTimeout(function(){
+		console.log("call stop ws!");
+		wsconnecter.wsStop();
+		btnStart.disabled = true;
+		btnConnect.disabled=false;
+		info_div.innerHTML="Please click Connect";}, 3000 );
 	rec.stop(function(blob,duration){
   
 		console.log(blob);
@@ -149,6 +201,7 @@
  
 	varArea.value="";
     rec_text="";
+	offline_text="";
  
 }
 
diff --git a/funasr/runtime/html5/static/wsconnecter.js b/funasr/runtime/html5/static/wsconnecter.js
index 82d751b..676a94a 100644
--- a/funasr/runtime/html5/static/wsconnecter.js
+++ b/funasr/runtime/html5/static/wsconnecter.js
@@ -28,7 +28,11 @@
 		if ( 'WebSocket' in window ) {
			speechSokt = new WebSocket( Uri ); // create the socket connection object
			speechSokt.onopen = function(e){onOpen(e);}; // register the response handlers
-			speechSokt.onclose = function(e){onClose(e);};
+			speechSokt.onclose = function(e){
+			    console.log("onclose ws!");
+			    speechSokt.close();
+				onClose(e);
+				};
 			speechSokt.onmessage = function(e){onMessage(e);};
 			speechSokt.onerror = function(e){onError(e);};
 			return 1;
@@ -42,6 +46,7 @@
	// Stop and send functions
 	this.wsStop = function () {
 		if(speechSokt != undefined) {
+			console.log("stop ws!");
 			speechSokt.close();
 		}
 	};
@@ -69,7 +74,9 @@
 			"wav_name":  "h5",
 			"is_speaking":  true,
 			"chunk_interval":10,
+			"mode":getAsrMode(),
 		};
+		console.log(request);
 		speechSokt.send( JSON.stringify(request) );
		console.log("connected successfully");
 		stateHandle(0);
diff --git a/funasr/runtime/onnxruntime/CMakeLists.txt b/funasr/runtime/onnxruntime/CMakeLists.txt
index 9f6013f..0847d1f 100644
--- a/funasr/runtime/onnxruntime/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/CMakeLists.txt
@@ -7,6 +7,8 @@
 # set(CMAKE_CXX_STANDARD 11)
 set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
 set(CMAKE_POSITION_INDEPENDENT_CODE ON)
+set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
+
 
 include(TestBigEndian)
 test_big_endian(BIG_ENDIAN)
@@ -30,12 +32,13 @@
 include_directories(${PROJECT_SOURCE_DIR}/third_party/kaldi-native-fbank)
 include_directories(${PROJECT_SOURCE_DIR}/third_party/yaml-cpp/include)
 
-add_subdirectory(third_party/yaml-cpp)
-add_subdirectory(third_party/kaldi-native-fbank/kaldi-native-fbank/csrc)
-add_subdirectory(src)
-
 if(ENABLE_GLOG)
     include_directories(${PROJECT_SOURCE_DIR}/third_party/glog)
     set(BUILD_TESTING OFF)
     add_subdirectory(third_party/glog)
-endif()
\ No newline at end of file
+endif()
+
+add_subdirectory(third_party/yaml-cpp)
+add_subdirectory(third_party/kaldi-native-fbank/kaldi-native-fbank/csrc)
+add_subdirectory(src)
+add_subdirectory(bin)
diff --git a/funasr/runtime/onnxruntime/bin/CMakeLists.txt b/funasr/runtime/onnxruntime/bin/CMakeLists.txt
new file mode 100644
index 0000000..962da0b
--- /dev/null
+++ b/funasr/runtime/onnxruntime/bin/CMakeLists.txt
@@ -0,0 +1,16 @@
+include_directories(${CMAKE_SOURCE_DIR}/include)
+
+add_executable(funasr-onnx-offline "funasr-onnx-offline.cpp")
+target_link_libraries(funasr-onnx-offline PUBLIC funasr)
+
+add_executable(funasr-onnx-offline-vad "funasr-onnx-offline-vad.cpp")
+target_link_libraries(funasr-onnx-offline-vad PUBLIC funasr)
+
+add_executable(funasr-onnx-online-vad "funasr-onnx-online-vad.cpp")
+target_link_libraries(funasr-onnx-online-vad PUBLIC funasr)
+
+add_executable(funasr-onnx-offline-punc "funasr-onnx-offline-punc.cpp")
+target_link_libraries(funasr-onnx-offline-punc PUBLIC funasr)
+
+add_executable(funasr-onnx-offline-rtf "funasr-onnx-offline-rtf.cpp")
+target_link_libraries(funasr-onnx-offline-rtf PUBLIC funasr)
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline-punc.cpp
similarity index 100%
rename from funasr/runtime/onnxruntime/src/funasr-onnx-offline-punc.cpp
rename to funasr/runtime/onnxruntime/bin/funasr-onnx-offline-punc.cpp
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline-rtf.cpp
similarity index 93%
rename from funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp
rename to funasr/runtime/onnxruntime/bin/funasr-onnx-offline-rtf.cpp
index 2d182e0..d2692ce 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-rtf.cpp
+++ b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline-rtf.cpp
@@ -28,7 +28,7 @@
 std::atomic<int> wav_index(0);
 std::mutex mtx;
 
-void runReg(FUNASR_HANDLE asr_handle, vector<string> wav_list, 
+void runReg(FUNASR_HANDLE asr_handle, vector<string> wav_list, vector<string> wav_ids,
             float* total_length, long* total_time, int core_id) {
     
     struct timeval start, end;
@@ -59,7 +59,7 @@
 
         if(result){
             string msg = FunASRGetResult(result, 0);
-            LOG(INFO) << "Thread: " << this_thread::get_id() <<" Result: " << msg.c_str();
+            LOG(INFO) << "Thread: " << this_thread::get_id() << "," << wav_ids[i] << " : " << msg.c_str();
 
             float snippet_time = FunASRGetRetSnippetTime(result);
             n_total_length += snippet_time;
@@ -146,9 +146,12 @@
 
     // read wav_path
     vector<string> wav_list;
+    vector<string> wav_ids;
+    string default_id = "wav_default_id";
     string wav_path_ = model_path.at(WAV_PATH);
     if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
         wav_list.emplace_back(wav_path_);
+        wav_ids.emplace_back(default_id);
     }
     else if(is_target_file(wav_path_, "scp")){
         ifstream in(wav_path_);
@@ -162,7 +165,8 @@
             istringstream iss(line);
             string column1, column2;
             iss >> column1 >> column2;
-            wav_list.emplace_back(column2); 
+            wav_list.emplace_back(column2);
+            wav_ids.emplace_back(column1);
         }
         in.close();
     }else{
@@ -178,7 +182,7 @@
     int rtf_threds = thread_num.getValue();
     for (int i = 0; i < rtf_threds; i++)
     {
-        threads.emplace_back(thread(runReg, asr_handle, wav_list, &total_length, &total_time, i));
+        threads.emplace_back(thread(runReg, asr_handle, wav_list, wav_ids, &total_length, &total_time, i));
     }
 
     for (auto& thread : threads)
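The RTF tool now keeps the first column of the scp file as a wav id so per-utterance results can be attributed in the logs; the Kaldi-style `wav.scp` format is one `<wav_id> <wav_path>` pair per line. The same parsing in a few lines of Python (illustrative):

```python
def read_scp(path):
    """Parse Kaldi-style wav.scp lines of the form '<wav_id> <wav_path>'."""
    wav_ids, wav_list = [], []
    with open(path, encoding="utf-8") as f:
        for line in f:
            parts = line.split(maxsplit=1)
            if len(parts) == 2:
                wav_ids.append(parts[0])
                wav_list.append(parts[1].strip())
    return wav_ids, wav_list
```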
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline-vad.cpp
similarity index 90%
rename from funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
rename to funasr/runtime/onnxruntime/bin/funasr-onnx-offline-vad.cpp
index 0f606c6..3aaffbd 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
+++ b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline-vad.cpp
@@ -38,8 +38,8 @@
     }
 }
 
-void print_segs(vector<vector<int>>* vec) {
-    string seg_out="[";
+void print_segs(vector<vector<int>>* vec, string &wav_id) {
+    string seg_out=wav_id + ": [";
     for (int i = 0; i < vec->size(); i++) {
         vector<int> inner_vec = (*vec)[i];
         seg_out += "[";
@@ -97,9 +97,12 @@
 
     // read wav_path
     vector<string> wav_list;
+    vector<string> wav_ids;
+    string default_id = "wav_default_id";
     string wav_path_ = model_path.at(WAV_PATH);
     if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
         wav_list.emplace_back(wav_path_);
+        wav_ids.emplace_back(default_id);
     }
     else if(is_target_file(wav_path_, "scp")){
         ifstream in(wav_path_);
@@ -113,7 +116,8 @@
             istringstream iss(line);
             string column1, column2;
             iss >> column1 >> column2;
-            wav_list.emplace_back(column2); 
+            wav_list.emplace_back(column2);
+            wav_ids.emplace_back(column1);
         }
         in.close();
     }else{
@@ -123,9 +127,11 @@
     
     float snippet_time = 0.0f;
     long taking_micros = 0;
-    for(auto& wav_file : wav_list){
+    for (int i = 0; i < wav_list.size(); i++) {
+        auto& wav_file = wav_list[i];
+        auto& wav_id = wav_ids[i];
         gettimeofday(&start, NULL);
-        FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), FSMN_VAD_OFFLINE, NULL, 16000);
+        FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), NULL, 16000);
         gettimeofday(&end, NULL);
         seconds = (end.tv_sec - start.tv_sec);
         taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
@@ -133,7 +139,7 @@
         if (result)
         {
             vector<std::vector<int>>* vad_segments = FsmnVadGetResult(result, 0);
-            print_segs(vad_segments);
+            print_segs(vad_segments, wav_id);
             snippet_time += FsmnVadGetRetSnippetTime(result);
             FsmnVadFreeResult(result);
         }
@@ -142,7 +148,7 @@
             LOG(ERROR) << ("No return data!\n");
         }
     }
- 
+
     LOG(INFO) << "Audio length: " << (double)snippet_time << " s";
     LOG(INFO) << "Model inference takes: " << (double)taking_micros / 1000000 <<" s";
     LOG(INFO) << "Model inference RTF: " << (double)taking_micros/ (snippet_time*1000000);
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline.cpp
similarity index 92%
rename from funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp
rename to funasr/runtime/onnxruntime/bin/funasr-onnx-offline.cpp
index 3472925..82668f8 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline.cpp
+++ b/funasr/runtime/onnxruntime/bin/funasr-onnx-offline.cpp
@@ -88,9 +88,12 @@
 
     // read wav_path
     vector<string> wav_list;
-    string wav_path_ = model_path.at(WAV_PATH); 
+    vector<string> wav_ids;
+    string default_id = "wav_default_id";
+    string wav_path_ = model_path.at(WAV_PATH);
     if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
         wav_list.emplace_back(wav_path_);
+        wav_ids.emplace_back(default_id);
     }
     else if(is_target_file(wav_path_, "scp")){
         ifstream in(wav_path_);
@@ -104,7 +107,8 @@
             istringstream iss(line);
             string column1, column2;
             iss >> column1 >> column2;
-            wav_list.emplace_back(column2); 
+            wav_list.emplace_back(column2);
+            wav_ids.emplace_back(column1);
         }
         in.close();
     }else{
@@ -114,7 +118,9 @@
     
     float snippet_time = 0.0f;
     long taking_micros = 0;
-    for(auto& wav_file : wav_list){
+    for (int i = 0; i < wav_list.size(); i++) {
+        auto& wav_file = wav_list[i];
+        auto& wav_id = wav_ids[i];
         gettimeofday(&start, NULL);
         FUNASR_RESULT result=FunOfflineInfer(asr_hanlde, wav_file.c_str(), RASR_NONE, NULL, 16000);
         gettimeofday(&end, NULL);
@@ -124,7 +130,7 @@
         if (result)
         {
             string msg = FunASRGetResult(result, 0);
-            LOG(INFO)<<"Result: "<<msg;
+            LOG(INFO)<< wav_id <<" : "<<msg;
             snippet_time += FunASRGetRetSnippetTime(result);
             FunASRFreeResult(result);
         }
diff --git a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp b/funasr/runtime/onnxruntime/bin/funasr-onnx-online-vad.cpp
similarity index 62%
copy from funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
copy to funasr/runtime/onnxruntime/bin/funasr-onnx-online-vad.cpp
index 0f606c6..68e32e5 100644
--- a/funasr/runtime/onnxruntime/src/funasr-onnx-offline-vad.cpp
+++ b/funasr/runtime/onnxruntime/bin/funasr-onnx-online-vad.cpp
@@ -18,6 +18,7 @@
 #include "funasrruntime.h"
 #include "tclap/CmdLine.h"
 #include "com-define.h"
+#include "audio.h"
 
 using namespace std;
 
@@ -38,10 +39,16 @@
     }
 }
 
-void print_segs(vector<vector<int>>* vec) {
-    string seg_out="[";
+void print_segs(vector<vector<int>>* vec, string &wav_id) {
+    if((*vec).size() == 0){
+        return;
+    }    
+    string seg_out=wav_id + ": [";
     for (int i = 0; i < vec->size(); i++) {
         vector<int> inner_vec = (*vec)[i];
+        if(inner_vec.size() == 0){
+            continue;
+        }
         seg_out += "[";
         for (int j = 0; j < inner_vec.size(); j++) {
             seg_out += to_string(inner_vec[j]);
@@ -97,9 +104,12 @@
 
     // read wav_path
     vector<string> wav_list;
+    vector<string> wav_ids;
+    string default_id = "wav_default_id";
     string wav_path_ = model_path.at(WAV_PATH);
     if(is_target_file(wav_path_, "wav") || is_target_file(wav_path_, "pcm")){
         wav_list.emplace_back(wav_path_);
+        wav_ids.emplace_back(default_id);
     }
     else if(is_target_file(wav_path_, "scp")){
         ifstream in(wav_path_);
@@ -113,39 +123,76 @@
             istringstream iss(line);
             string column1, column2;
             iss >> column1 >> column2;
-            wav_list.emplace_back(column2); 
+            wav_list.emplace_back(column2);
+            wav_ids.emplace_back(column1);
         }
         in.close();
     }else{
         LOG(ERROR)<<"Please check the wav extension!";
         exit(-1);
     }
-    
+    // init the online vad handle, wrapping the offline model for streaming inference
+    FUNASR_HANDLE online_hanlde=FsmnVadOnlineInit(vad_hanlde);
     float snippet_time = 0.0f;
     long taking_micros = 0;
-    for(auto& wav_file : wav_list){
-        gettimeofday(&start, NULL);
-        FUNASR_RESULT result=FsmnVadInfer(vad_hanlde, wav_file.c_str(), FSMN_VAD_OFFLINE, NULL, 16000);
-        gettimeofday(&end, NULL);
-        seconds = (end.tv_sec - start.tv_sec);
-        taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+    for (int i = 0; i < wav_list.size(); i++) {
+        auto& wav_file = wav_list[i];
+        auto& wav_id = wav_ids[i];
 
-        if (result)
-        {
-            vector<std::vector<int>>* vad_segments = FsmnVadGetResult(result, 0);
-            print_segs(vad_segments);
-            snippet_time += FsmnVadGetRetSnippetTime(result);
-            FsmnVadFreeResult(result);
-        }
-        else
-        {
-            LOG(ERROR) << ("No return data!\n");
+        int32_t sampling_rate_ = -1;
+        funasr::Audio audio(1);
+        if(is_target_file(wav_file.c_str(), "wav")){
+            if(!audio.LoadWav2Char(wav_file.c_str(), &sampling_rate_)){
+                LOG(ERROR)<<"Failed to load "<< wav_file;
+                exit(-1);
+            }
+        }else if(is_target_file(wav_file.c_str(), "pcm")){
+            if (!audio.LoadPcmwav2Char(wav_file.c_str(), &sampling_rate_)){
+                LOG(ERROR)<<"Failed to load "<< wav_file;
+                exit(-1);
+            }
+        }else{
+            LOG(ERROR)<<"Wrong wav extension";
+            exit(-1);
+        }
+        char* speech_buff = audio.GetSpeechChar();
+        int buff_len = audio.GetSpeechLen()*2; // GetSpeechLen() counts 16-bit samples, so bytes = samples*2
+
+        int step = 3200; // 1600 samples = 100 ms of 16 kHz, 16-bit audio per chunk
+        bool is_final = false;
+
+        for (int sample_offset = 0; sample_offset < buff_len; sample_offset += std::min(step, buff_len - sample_offset)) {
+            if (sample_offset + step >= buff_len - 1) {
+                step = buff_len - sample_offset;
+                is_final = true;
+            } else {
+                is_final = false;
+            }
+            gettimeofday(&start, NULL);
+            FUNASR_RESULT result = FsmnVadInferBuffer(online_hanlde, speech_buff+sample_offset, step, NULL, is_final, 16000);
+            gettimeofday(&end, NULL);
+            seconds = (end.tv_sec - start.tv_sec);
+            taking_micros += ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
+
+            if (result)
+            {
+                vector<std::vector<int>>* vad_segments = FsmnVadGetResult(result, 0);
+                print_segs(vad_segments, wav_id);
+                snippet_time += FsmnVadGetRetSnippetTime(result);
+                FsmnVadFreeResult(result);
+            }
+            else
+            {
+                LOG(ERROR) << ("No return data!\n");
+            }
         }
     }
- 
+
     LOG(INFO) << "Audio length: " << (double)snippet_time << " s";
     LOG(INFO) << "Model inference takes: " << (double)taking_micros / 1000000 <<" s";
     LOG(INFO) << "Model inference RTF: " << (double)taking_micros/ (snippet_time*1000000);
+    FsmnVadUninit(online_hanlde);
     FsmnVadUninit(vad_hanlde);
     return 0;
 }
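
The streaming demo above walks the raw byte buffer in `step = 3200` increments, which at 16 kHz, 16-bit mono is 1600 samples, i.e. 100 ms of audio per `FsmnVadInferBuffer` call, and raises `is_final` on the last, possibly shorter, chunk. A standalone sketch of just that chunking policy, with the buffer length as a hypothetical value:

```cpp
#include <algorithm>
#include <cstdio>

int main() {
    const int sample_rate = 16000;   // assumed model sample rate
    const int bytes_per_sample = 2;  // 16-bit PCM
    int buff_len = 51200;            // hypothetical buffer length in bytes
    int step = 3200;                 // 100 ms per chunk, as in the demo above

    for (int offset = 0; offset < buff_len; offset += std::min(step, buff_len - offset)) {
        bool is_final = (offset + step >= buff_len - 1);
        if (is_final) {
            step = buff_len - offset;  // the final chunk may be shorter
        }
        double chunk_ms = 1000.0 * step / bytes_per_sample / sample_rate;
        std::printf("offset=%d bytes, chunk=%.0f ms, is_final=%d\n", offset, chunk_ms, (int)is_final);
    }
    return 0;
}
```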
diff --git a/funasr/runtime/onnxruntime/include/audio.h b/funasr/runtime/onnxruntime/include/audio.h
index 1eabd3e..d2100a4 100644
--- a/funasr/runtime/onnxruntime/include/audio.h
+++ b/funasr/runtime/onnxruntime/include/audio.h
@@ -33,8 +33,9 @@
 
 class Audio {
   private:
-    float *speech_data;
-    int16_t *speech_buff;
+    float *speech_data=nullptr;
+    int16_t *speech_buff=nullptr;
+    char* speech_char=nullptr;
     int speech_len;
     int speech_align_len;
     int offset;
@@ -47,18 +48,22 @@
     Audio(int data_type, int size);
     ~Audio();
     void Disp();
-    bool LoadWav(const char* filename, int32_t* sampling_rate);
     void WavResample(int32_t sampling_rate, const float *waveform, int32_t n);
     bool LoadWav(const char* buf, int n_len, int32_t* sampling_rate);
+    bool LoadWav(const char* filename, int32_t* sampling_rate);
+    bool LoadWav2Char(const char* filename, int32_t* sampling_rate);
     bool LoadPcmwav(const char* buf, int n_file_len, int32_t* sampling_rate);
     bool LoadPcmwav(const char* filename, int32_t* sampling_rate);
+    bool LoadPcmwav2Char(const char* filename, int32_t* sampling_rate);
     int FetchChunck(float *&dout, int len);
     int Fetch(float *&dout, int &len, int &flag);
     void Padding();
     void Split(OfflineStream* offline_streamj);
-    void Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments);
+    void Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished=true);
     float GetTimeLen();
     int GetQueueSize() { return (int)frame_queue.size(); }
+    char* GetSpeechChar(){return speech_char;}
+    int GetSpeechLen(){return speech_len;}
 };
 
 } // namespace funasr
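
A note on units for the new accessors: both `LoadWav2Char` and `LoadPcmwav2Char` store `speech_len` as a count of 16-bit samples, so callers must multiply by two to recover the byte length of `GetSpeechChar()`, exactly as the online demo does. A minimal usage sketch, with a hypothetical input path:

```cpp
#include <cstdint>
#include "audio.h"  // funasr::Audio

int main() {
    funasr::Audio audio(1);
    int32_t sampling_rate = -1;
    if (!audio.LoadWav2Char("/path/to/test.wav", &sampling_rate)) {  // hypothetical path
        return -1;
    }
    char* pcm_bytes = audio.GetSpeechChar();  // raw 16-bit PCM payload
    int n_bytes = audio.GetSpeechLen() * 2;   // GetSpeechLen() counts samples, not bytes
    double seconds = (double)audio.GetSpeechLen() / sampling_rate;
    (void)pcm_bytes; (void)n_bytes; (void)seconds;
    return 0;
}
```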
diff --git a/funasr/runtime/onnxruntime/include/funasrruntime.h b/funasr/runtime/onnxruntime/include/funasrruntime.h
index 5cfdb47..af430f7 100644
--- a/funasr/runtime/onnxruntime/include/funasrruntime.h
+++ b/funasr/runtime/onnxruntime/include/funasrruntime.h
@@ -46,12 +46,6 @@
 	FUNASR_MODEL_PARAFORMER = 3,
 }FUNASR_MODEL_TYPE;
 
-typedef enum
-{
- FSMN_VAD_OFFLINE=0,
- FSMN_VAD_ONLINE = 1,
-}FSMN_VAD_MODE;
-
 typedef void (* QM_CALLBACK)(int cur_step, int n_total); // n_total: total steps; cur_step: Current Step.
 	
 // ASR
@@ -68,11 +62,12 @@
 _FUNASRAPI const float	FunASRGetRetSnippetTime(FUNASR_RESULT result);
 
 // VAD
-_FUNASRAPI FUNASR_HANDLE  	FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode=FSMN_VAD_OFFLINE);
+_FUNASRAPI FUNASR_HANDLE  	FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num);
+_FUNASRAPI FUNASR_HANDLE  	FsmnVadOnlineInit(FUNASR_HANDLE fsmnvad_handle);
 // buffer
-_FUNASRAPI FUNASR_RESULT	FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+_FUNASRAPI FUNASR_RESULT	FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, QM_CALLBACK fn_callback, bool input_finished=true, int sampling_rate=16000);
 // file, support wav & pcm
-_FUNASRAPI FUNASR_RESULT	FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate=16000);
+_FUNASRAPI FUNASR_RESULT	FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, QM_CALLBACK fn_callback, int sampling_rate=16000);
 
 _FUNASRAPI std::vector<std::vector<int>>*	FsmnVadGetResult(FUNASR_RESULT result,int n_index);
 _FUNASRAPI void			 	FsmnVadFreeResult(FUNASR_RESULT result);
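
Taken together, the revised C API replaces the old `FSMN_VAD_MODE` flag with an explicit two-handle flow: initialize the offline model once, wrap it with `FsmnVadOnlineInit` for streaming, call `FsmnVadInferBuffer` per chunk with `input_finished` raised only at end of stream, then uninit both handles. A minimal sketch of the call sequence (model-path keys omitted, chunk contents hypothetical):

```cpp
#include <map>
#include <string>
#include <vector>
#include "funasrruntime.h"

int main() {
    // Fill with the same model/cmvn/config path keys the demo binaries use (omitted here).
    std::map<std::string, std::string> model_path;

    FUNASR_HANDLE vad_handle = FsmnVadInit(model_path, /*thread_num=*/1);
    FUNASR_HANDLE online_handle = FsmnVadOnlineInit(vad_handle);

    std::vector<char> chunk(3200, 0);  // hypothetical 100 ms of 16-bit, 16 kHz PCM
    bool input_finished = true;        // raise only on the last chunk of a stream

    FUNASR_RESULT result = FsmnVadInferBuffer(online_handle, chunk.data(), (int)chunk.size(),
                                              NULL, input_finished, 16000);
    if (result) {
        std::vector<std::vector<int>>* segments = FsmnVadGetResult(result, 0);
        (void)segments;  // nested segment boundaries, as printed by print_segs above
        FsmnVadFreeResult(result);
    }
    FsmnVadUninit(online_handle);
    FsmnVadUninit(vad_handle);
    return 0;
}
```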
diff --git a/funasr/runtime/onnxruntime/include/vad-model.h b/funasr/runtime/onnxruntime/include/vad-model.h
index b1b1e9d..07f1833 100644
--- a/funasr/runtime/onnxruntime/include/vad-model.h
+++ b/funasr/runtime/onnxruntime/include/vad-model.h
@@ -12,14 +12,9 @@
     virtual ~VadModel(){};
     virtual void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num)=0;
     virtual std::vector<std::vector<int>> Infer(std::vector<float> &waves, bool input_finished=true)=0;
-    virtual void ReadModel(const char* vad_model)=0;
-    virtual void LoadConfigFromYaml(const char* filename)=0;
-    virtual void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
-                    std::vector<float> &waves)=0;
-    virtual void LoadCmvn(const char *filename)=0;
-    virtual void InitCache()=0;
 };
 
-VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode);
+VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num);
+VadModel *CreateVadModel(void* fsmnvad_handle);
 } // namespace funasr
 #endif
diff --git a/funasr/runtime/onnxruntime/src/CMakeLists.txt b/funasr/runtime/onnxruntime/src/CMakeLists.txt
index 341a16a..d083d8e 100644
--- a/funasr/runtime/onnxruntime/src/CMakeLists.txt
+++ b/funasr/runtime/onnxruntime/src/CMakeLists.txt
@@ -1,11 +1,8 @@
 
 file(GLOB files1 "*.cpp")
-file(GLOB files2 "*.cc")
+set(files ${files1})
 
-set(files ${files1} ${files2})
-set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-
-add_library(funasr ${files})
+add_library(funasr SHARED ${files})
 
 if(WIN32)
     set(EXTRA_LIBS pthread yaml-cpp csrc glog)
@@ -24,13 +21,3 @@
 
 include_directories(${CMAKE_SOURCE_DIR}/include)
 target_link_libraries(funasr PUBLIC onnxruntime ${EXTRA_LIBS})
-
-add_executable(funasr-onnx-offline "funasr-onnx-offline.cpp")
-add_executable(funasr-onnx-offline-vad "funasr-onnx-offline-vad.cpp")
-add_executable(funasr-onnx-offline-punc "funasr-onnx-offline-punc.cpp")
-add_executable(funasr-onnx-offline-rtf "funasr-onnx-offline-rtf.cpp")
-target_link_libraries(funasr-onnx-offline PUBLIC funasr)
-target_link_libraries(funasr-onnx-offline-vad PUBLIC funasr)
-target_link_libraries(funasr-onnx-offline-punc PUBLIC funasr)
-target_link_libraries(funasr-onnx-offline-rtf PUBLIC funasr)
-
diff --git a/funasr/runtime/onnxruntime/src/audio.cpp b/funasr/runtime/onnxruntime/src/audio.cpp
index 6d63d67..23d0010 100644
--- a/funasr/runtime/onnxruntime/src/audio.cpp
+++ b/funasr/runtime/onnxruntime/src/audio.cpp
@@ -176,12 +176,12 @@
 {
     if (speech_buff != NULL) {
         free(speech_buff);
-        
     }
-
     if (speech_data != NULL) {
-        
         free(speech_data);
+    }
+    if (speech_char != NULL) {
+        free(speech_char);
     }
 }
 
@@ -296,8 +296,47 @@
         return false;
 }
 
-bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate)
+bool Audio::LoadWav2Char(const char *filename, int32_t* sampling_rate)
 {
+    WaveHeader header;
+    if (speech_char != NULL) {
+        free(speech_char);
+    }
+    offset = 0;
+    std::ifstream is(filename, std::ifstream::binary);
+    is.read(reinterpret_cast<char *>(&header), sizeof(header));
+    if(!is){
+        LOG(ERROR) << "Failed to read " << filename;
+        return false;
+    }
+    if (!header.Validate()) {
+        return false;
+    }
+    header.SeekToDataChunk(is);
+    if (!is) {
+        return false;
+    }
+    
+    *sampling_rate = header.sample_rate;
+    // header.subchunk2_size contains the number of bytes in the data.
+    // Each sample is assumed to occupy two bytes, so the byte count is divided by 2 here.
+    speech_len = header.subchunk2_size / 2;
+    speech_char = (char *)malloc(header.subchunk2_size);
+    memset(speech_char, 0, header.subchunk2_size);
+    is.read(speech_char, header.subchunk2_size);
+
+    return true;
+}
+
+bool Audio::LoadWav(const char* buf, int n_file_len, int32_t* sampling_rate)
+{ 
     WaveHeader header;
     if (speech_data != NULL) {
         free(speech_data);
@@ -441,6 +480,33 @@
 
 }
 
+bool Audio::LoadPcmwav2Char(const char* filename, int32_t* sampling_rate)
+{
+    if (speech_char != NULL) {
+        free(speech_char);
+    }
+    offset = 0;
+
+    FILE* fp;
+    fp = fopen(filename, "rb");
+    if (fp == nullptr)
+    {
+        LOG(ERROR) << "Failed to read " << filename;
+        return false;
+    }
+    fseek(fp, 0, SEEK_END);
+    uint32_t n_file_len = ftell(fp);
+    fseek(fp, 0, SEEK_SET);
+
+    speech_len = (n_file_len) / 2;
+    speech_char = (char *)malloc(n_file_len);
+    memset(speech_char, 0, n_file_len);
+    fread(speech_char, sizeof(int16_t), n_file_len/2, fp);
+    fclose(fp);
+    
+    return true;
+}
+
 int Audio::FetchChunck(float *&dout, int len)
 {
     if (offset >= speech_align_len) {
@@ -541,7 +607,7 @@
 }
 
 
-void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments)
+void Audio::Split(VadModel* vad_obj, vector<std::vector<int>>& vad_segments, bool input_finished)
 {
     AudioFrame *frame;
 
@@ -552,7 +618,7 @@
     frame = NULL;
 
     std::vector<float> pcm_data(speech_data, speech_data+sp_len);
-    vad_segments = vad_obj->Infer(pcm_data);
+    vad_segments = vad_obj->Infer(pcm_data, input_finished);
 }
 
 } // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/ct-transformer.cpp b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
index 38a5a70..58eec25 100644
--- a/funasr/runtime/onnxruntime/src/ct-transformer.cpp
+++ b/funasr/runtime/onnxruntime/src/ct-transformer.cpp
@@ -103,9 +103,10 @@
         vector<string> WordWithPunc;
         for (int i = 0; i < InputStr.size(); i++)
         {
-            if (i > 0 && !(InputStr[i][0] & 0x80) && (i + 1) <InputStr.size() && !(InputStr[i+1][0] & 0x80))// insert spaces between English words
+            // old condition, which looked ahead to the next token instead of back:
+            // if (i > 0 && !(InputStr[i][0] & 0x80) && (i + 1) <InputStr.size() && !(InputStr[i+1][0] & 0x80))
+            if (i > 0 && !(InputStr[i-1][0] & 0x80) && !(InputStr[i][0] & 0x80)) // previous and current tokens both start with an ASCII byte
             {
-                InputStr[i] = InputStr[i]+ " ";
+                InputStr[i] = " " + InputStr[i];
             }
             WordWithPunc.push_back(InputStr[i]);
 
@@ -128,7 +129,7 @@
                 NewPuncOut.assign(NewPunctuation.begin(), NewPunctuation.end() - 1);
                 NewPuncOut.push_back(PERIOD_INDEX);
             }
-            else if (NewString[NewString.size() - 1] == m_tokenizer.Id2Punc(PERIOD_INDEX) && NewString[NewString.size() - 1] == m_tokenizer.Id2Punc(QUESTION_INDEX))
+            else if (NewString[NewString.size() - 1] != m_tokenizer.Id2Punc(PERIOD_INDEX) && NewString[NewString.size() - 1] != m_tokenizer.Id2Punc(QUESTION_INDEX))
             {
                 NewSentenceOut = NewString;
                 NewSentenceOut.push_back(m_tokenizer.Id2Punc(PERIOD_INDEX));
@@ -187,4 +188,4 @@
     return punction;
 }
 
-} // namespace funasr
\ No newline at end of file
+} // namespace funasr
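
Two genuine logic fixes land in this file: the word-spacing condition now looks back at the previous token rather than ahead (prepending the space to the current token), and the sentence-final punctuation branch uses `!=` where the old `==` conjunction could never be true. A standalone sketch of the corrected spacing rule, with hypothetical tokens:

```cpp
#include <iostream>
#include <string>
#include <vector>

int main() {
    // Mixed token stream, as produced by the punctuation model's tokenizer.
    std::vector<std::string> tokens = {"hello", "world", "你好", "again"};
    std::string out;
    for (size_t i = 0; i < tokens.size(); i++) {
        // Previous and current tokens both start with an ASCII byte: insert a space.
        if (i > 0 && !(tokens[i - 1][0] & 0x80) && !(tokens[i][0] & 0x80)) {
            out += " ";
        }
        out += tokens[i];
    }
    std::cout << out << std::endl;  // prints "hello world你好again"
    return 0;
}
```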
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad-online.cpp b/funasr/runtime/onnxruntime/src/fsmn-vad-online.cpp
new file mode 100644
index 0000000..0346916
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad-online.cpp
@@ -0,0 +1,198 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+ * MIT License  (https://opensource.org/licenses/MIT)
+*/
+
+#include <fstream>
+#include "precomp.h"
+
+namespace funasr {
+
+void FsmnVadOnline::FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
+                               std::vector<float> &waves) {
+    knf::OnlineFbank fbank(fbank_opts_);
+    // cache merge
+    waves.insert(waves.begin(), input_cache_.begin(), input_cache_.end());
+    int frame_number = ComputeFrameNum(waves.size(), frame_sample_length_, frame_shift_sample_length_);
+    // Send the audio after the last frame shift position to the cache
+    input_cache_.clear();
+    input_cache_.insert(input_cache_.begin(), waves.begin() + frame_number * frame_shift_sample_length_, waves.end());
+    if (frame_number == 0) {
+        return;
+    }
+    // Drop trailing samples not covered by a complete frame (they were saved to input_cache_ above)
+    waves.erase(waves.begin() + (frame_number - 1) * frame_shift_sample_length_ + frame_sample_length_, waves.end());
+
+    std::vector<float> buf(waves.size());
+    for (int32_t i = 0; i != waves.size(); ++i) {
+        buf[i] = waves[i] * 32768;
+    }
+    fbank.AcceptWaveform(sample_rate, buf.data(), buf.size());
+    // fbank.AcceptWaveform(sample_rate, &waves[0], waves.size());
+    int32_t frames = fbank.NumFramesReady();
+    for (int32_t i = 0; i != frames; ++i) {
+        const float *frame = fbank.GetFrame(i);
+        vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
+        vad_feats.emplace_back(frame_vector);
+    }
+}
+
+void FsmnVadOnline::ExtractFeats(float sample_rate, vector<std::vector<float>> &vad_feats,
+                                 vector<float> &waves, bool input_finished) {
+  FbankKaldi(sample_rate, vad_feats, waves);
+  // cache deal & online lfr,cmvn
+  if (vad_feats.size() > 0) {
+    if (!reserve_waveforms_.empty()) {
+      waves.insert(waves.begin(), reserve_waveforms_.begin(), reserve_waveforms_.end());
+    }
+    if (lfr_splice_cache_.empty()) {
+      for (int i = 0; i < (lfr_m - 1) / 2; i++) {
+        lfr_splice_cache_.emplace_back(vad_feats[0]);
+      }
+    }
+    if (vad_feats.size() + lfr_splice_cache_.size() >= lfr_m) {
+      vad_feats.insert(vad_feats.begin(), lfr_splice_cache_.begin(), lfr_splice_cache_.end());
+      int frame_from_waves = (waves.size() - frame_sample_length_) / frame_shift_sample_length_ + 1;
+      int minus_frame = reserve_waveforms_.empty() ? (lfr_m - 1) / 2 : 0;
+      int lfr_splice_frame_idxs = OnlineLfrCmvn(vad_feats, input_finished);
+      int reserve_frame_idx = lfr_splice_frame_idxs - minus_frame;
+      reserve_waveforms_.clear();
+      reserve_waveforms_.insert(reserve_waveforms_.begin(),
+                                waves.begin() + reserve_frame_idx * frame_shift_sample_length_,
+                                waves.begin() + frame_from_waves * frame_shift_sample_length_);
+      int sample_length = (frame_from_waves - 1) * frame_shift_sample_length_ + frame_sample_length_;
+      waves.erase(waves.begin() + sample_length, waves.end());
+    } else {
+      reserve_waveforms_.clear();
+      reserve_waveforms_.insert(reserve_waveforms_.begin(),
+                                waves.begin() + frame_sample_length_ - frame_shift_sample_length_, waves.end());
+      lfr_splice_cache_.insert(lfr_splice_cache_.end(), vad_feats.begin(), vad_feats.end());
+    }
+  } else {
+    if (input_finished) {
+      if (!reserve_waveforms_.empty()) {
+        waves = reserve_waveforms_;
+      }
+      vad_feats = lfr_splice_cache_;
+      OnlineLfrCmvn(vad_feats, input_finished);
+    }
+  }
+  if(input_finished){
+      Reset();
+      ResetCache();
+  }
+}
+
+int FsmnVadOnline::OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_finished) {
+    vector<vector<float>> out_feats;
+    int T = vad_feats.size();
+    int T_lrf = ceil((T - (lfr_m - 1) / 2) / lfr_n);
+    int lfr_splice_frame_idxs = T_lrf;
+    vector<float> p;
+    for (int i = 0; i < T_lrf; i++) {
+        if (lfr_m <= T - i * lfr_n) {
+            for (int j = 0; j < lfr_m; j++) {
+                p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
+            }
+            out_feats.emplace_back(p);
+            p.clear();
+        } else {
+            if (input_finished) {
+                int num_padding = lfr_m - (T - i * lfr_n);
+                for (int j = 0; j < (vad_feats.size() - i * lfr_n); j++) {
+                    p.insert(p.end(), vad_feats[i * lfr_n + j].begin(), vad_feats[i * lfr_n + j].end());
+                }
+                for (int j = 0; j < num_padding; j++) {
+                    p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
+                }
+                out_feats.emplace_back(p);
+            } else {
+                lfr_splice_frame_idxs = i;
+                break;
+            }
+        }
+    }
+    lfr_splice_frame_idxs = std::min(T - 1, lfr_splice_frame_idxs * lfr_n);
+    lfr_splice_cache_.clear();
+    lfr_splice_cache_.insert(lfr_splice_cache_.begin(), vad_feats.begin() + lfr_splice_frame_idxs, vad_feats.end());
+
+    // Apply cmvn
+    for (auto &out_feat: out_feats) {
+        for (int j = 0; j < means_list_.size(); j++) {
+            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
+        }
+    }
+    vad_feats = out_feats;
+    return lfr_splice_frame_idxs;
+}
+
+std::vector<std::vector<int>>
+FsmnVadOnline::Infer(std::vector<float> &waves, bool input_finished) {
+    std::vector<std::vector<float>> vad_feats;
+    std::vector<std::vector<float>> vad_probs;
+    ExtractFeats(vad_sample_rate_, vad_feats, waves, input_finished);
+    fsmnvad_handle_->Forward(vad_feats, &vad_probs, &in_cache_, input_finished);
+
+    std::vector<std::vector<int>> vad_segments;
+    vad_segments = vad_scorer(vad_probs, waves, input_finished, true, vad_silence_duration_, vad_max_len_,
+                              vad_speech_noise_thres_, vad_sample_rate_);
+    return vad_segments;
+}
+
+void FsmnVadOnline::InitCache(){
+  std::vector<float> cache_feats(128 * 19 * 1, 0);
+  for (int i=0;i<4;i++){
+    in_cache_.emplace_back(cache_feats);
+  }
+};
+
+void FsmnVadOnline::Reset(){
+  in_cache_.clear();
+  InitCache();
+};
+
+void FsmnVadOnline::Test() {
+}
+
+void FsmnVadOnline::InitOnline(std::shared_ptr<Ort::Session> &vad_session,
+                               Ort::Env &env,
+                               std::vector<const char *> &vad_in_names,
+                               std::vector<const char *> &vad_out_names,
+                               knf::FbankOptions &fbank_opts,
+                               std::vector<float> &means_list,
+                               std::vector<float> &vars_list,
+                               int vad_sample_rate,
+                               int vad_silence_duration,
+                               int vad_max_len,
+                               double vad_speech_noise_thres) {
+    vad_session_ = vad_session;
+    vad_in_names_ = vad_in_names;
+    vad_out_names_ = vad_out_names;
+    fbank_opts_ = fbank_opts;
+    means_list_ = means_list;
+    vars_list_ = vars_list;
+    vad_sample_rate_ = vad_sample_rate;
+    vad_silence_duration_ = vad_silence_duration;
+    vad_max_len_ = vad_max_len;
+    vad_speech_noise_thres_ = vad_speech_noise_thres;
+}
+
+FsmnVadOnline::~FsmnVadOnline() {
+}
+
+FsmnVadOnline::FsmnVadOnline(FsmnVad* fsmnvad_handle):fsmnvad_handle_(fsmnvad_handle),session_options_{}{
+   InitCache();
+   InitOnline(fsmnvad_handle_->vad_session_,
+              fsmnvad_handle_->env_,
+              fsmnvad_handle_->vad_in_names_,
+              fsmnvad_handle_->vad_out_names_,
+              fsmnvad_handle_->fbank_opts_,
+              fsmnvad_handle_->means_list_,
+              fsmnvad_handle_->vars_list_,
+              fsmnvad_handle_->vad_sample_rate_,
+              fsmnvad_handle_->vad_silence_duration_,
+              fsmnvad_handle_->vad_max_len_,
+              fsmnvad_handle_->vad_speech_noise_thres_);
+}
+
+} // namespace funasr
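
The online front end assumes 25 ms frames with a 10 ms shift (400 and 160 samples at 16 kHz), so `ComputeFrameNum` yields `(n - 400)/160 + 1` complete frames per call, and everything after the last shift position is carried in `input_cache_`. A quick check of that arithmetic, under those assumed values:

```cpp
#include <cstdio>

// Mirrors FsmnVadOnline::ComputeFrameNum for 16 kHz input.
static int ComputeFrameNum(int sample_length, int frame_len, int frame_shift) {
    int frame_num = (sample_length - frame_len) / frame_shift + 1;
    return (frame_num >= 1 && sample_length >= frame_len) ? frame_num : 0;
}

int main() {
    const int frame_len = 16000 / 1000 * 25;    // 400 samples = 25 ms
    const int frame_shift = 16000 / 1000 * 10;  // 160 samples = 10 ms
    // A 100 ms chunk (1600 samples) gives (1600 - 400)/160 + 1 = 8 frames,
    // and 1600 - 8*160 = 320 samples stay cached for the next chunk.
    int n = 1600;
    int frames = ComputeFrameNum(n, frame_len, frame_shift);
    std::printf("%d samples -> %d frames, %d samples cached\n", n, frames, n - frames * frame_shift);
    return 0;
}
```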
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad-online.h b/funasr/runtime/onnxruntime/src/fsmn-vad-online.h
new file mode 100644
index 0000000..4d429b6
--- /dev/null
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad-online.h
@@ -0,0 +1,88 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+ * MIT License  (https://opensource.org/licenses/MIT)
+*/
+
+#pragma once 
+#include "precomp.h"
+
+namespace funasr {
+class FsmnVadOnline : public VadModel {
+/**
+ * Author: Speech Lab of DAMO Academy, Alibaba Group
+ * Deep-FSMN for Large Vocabulary Continuous Speech Recognition
+ * https://arxiv.org/abs/1803.05030
+*/
+
+public:
+    explicit FsmnVadOnline(FsmnVad* fsmnvad_handle);
+    ~FsmnVadOnline();
+    void Test();
+    std::vector<std::vector<int>> Infer(std::vector<float> &waves, bool input_finished);
+    void ExtractFeats(float sample_rate, vector<vector<float>> &vad_feats, vector<float> &waves, bool input_finished);
+    void Reset();
+
+private:
+    E2EVadModel vad_scorer = E2EVadModel();
+    // std::unique_ptr<FsmnVad> fsmnvad_handle_;
+    FsmnVad* fsmnvad_handle_ = nullptr;
+
+    void FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
+                    std::vector<float> &waves);
+    int OnlineLfrCmvn(vector<vector<float>> &vad_feats, bool input_finished);
+    void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num){}
+    void InitCache();
+    void InitOnline(std::shared_ptr<Ort::Session> &vad_session,
+                    Ort::Env &env,
+                    std::vector<const char *> &vad_in_names,
+                    std::vector<const char *> &vad_out_names,
+                    knf::FbankOptions &fbank_opts,
+                    std::vector<float> &means_list,
+                    std::vector<float> &vars_list,
+                    int vad_sample_rate,
+                    int vad_silence_duration,
+                    int vad_max_len,
+                    double vad_speech_noise_thres);
+
+    static int ComputeFrameNum(int sample_length, int frame_sample_length, int frame_shift_sample_length) {
+        int frame_num = static_cast<int>((sample_length - frame_sample_length) / frame_shift_sample_length + 1);
+        if (frame_num >= 1 && sample_length >= frame_sample_length)
+            return frame_num;
+        else
+            return 0;
+    }
+    void ResetCache() {
+        reserve_waveforms_.clear();
+        input_cache_.clear();
+        lfr_splice_cache_.clear();
+    }
+
+    // from fsmnvad_handle_
+    std::shared_ptr<Ort::Session> vad_session_ = nullptr;
+    Ort::Env env_;
+    Ort::SessionOptions session_options_;
+    std::vector<const char *> vad_in_names_;
+    std::vector<const char *> vad_out_names_;
+    knf::FbankOptions fbank_opts_;
+    std::vector<float> means_list_;
+    std::vector<float> vars_list_;
+
+    std::vector<std::vector<float>> in_cache_;
+    // The reserved waveforms by fbank
+    std::vector<float> reserve_waveforms_;
+    // waveforms reserved after last shift position
+    std::vector<float> input_cache_;
+    // lfr reserved cache
+    std::vector<std::vector<float>> lfr_splice_cache_;
+
+    int vad_sample_rate_ = MODEL_SAMPLE_RATE;
+    int vad_silence_duration_ = VAD_SILENCE_DURATION;
+    int vad_max_len_ = VAD_MAX_LEN;
+    double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES;
+    int lfr_m = VAD_LFR_M;
+    int lfr_n = VAD_LFR_N;
+    int frame_sample_length_ = vad_sample_rate_ / 1000 * 25;
+    int frame_shift_sample_length_ = vad_sample_rate_ / 1000 * 10;
+};
+
+} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
index 516dc88..697828b 100644
--- a/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad.cpp
@@ -37,14 +37,14 @@
         this->vad_max_len_ = post_conf["max_single_segment_time"].as<int>();
         this->vad_speech_noise_thres_ = post_conf["speech_noise_thres"].as<double>();
 
-        fbank_opts.frame_opts.dither = frontend_conf["dither"].as<float>();
-        fbank_opts.mel_opts.num_bins = frontend_conf["n_mels"].as<int>();
-        fbank_opts.frame_opts.samp_freq = (float)vad_sample_rate_;
-        fbank_opts.frame_opts.window_type = frontend_conf["window"].as<string>();
-        fbank_opts.frame_opts.frame_shift_ms = frontend_conf["frame_shift"].as<float>();
-        fbank_opts.frame_opts.frame_length_ms = frontend_conf["frame_length"].as<float>();
-        fbank_opts.energy_floor = 0;
-        fbank_opts.mel_opts.debug_mel = false;
+        fbank_opts_.frame_opts.dither = frontend_conf["dither"].as<float>();
+        fbank_opts_.mel_opts.num_bins = frontend_conf["n_mels"].as<int>();
+        fbank_opts_.frame_opts.samp_freq = (float)vad_sample_rate_;
+        fbank_opts_.frame_opts.window_type = frontend_conf["window"].as<string>();
+        fbank_opts_.frame_opts.frame_shift_ms = frontend_conf["frame_shift"].as<float>();
+        fbank_opts_.frame_opts.frame_length_ms = frontend_conf["frame_length"].as<float>();
+        fbank_opts_.energy_floor = 0;
+        fbank_opts_.mel_opts.debug_mel = false;
     }catch(exception const &e){
         LOG(ERROR) << "Error when load argument from vad config YAML.";
         exit(-1);
@@ -55,6 +55,7 @@
     try {
         vad_session_ = std::make_shared<Ort::Session>(
                 env_, vad_model, session_options_);
+        LOG(INFO) << "Successfully load model from " << vad_model;
     } catch (std::exception const &e) {
         LOG(ERROR) << "Error when load vad onnx model: " << e.what();
         exit(0);
@@ -109,7 +110,9 @@
 
 void FsmnVad::Forward(
         const std::vector<std::vector<float>> &chunk_feats,
-        std::vector<std::vector<float>> *out_prob) {
+        std::vector<std::vector<float>> *out_prob,
+        std::vector<std::vector<float>> *in_cache,
+        bool is_final) {
     Ort::MemoryInfo memory_info =
             Ort::MemoryInfo::CreateCpu(OrtDeviceAllocator, OrtMemTypeCPU);
 
@@ -132,9 +135,9 @@
     // 4 caches
     // cache node {batch,128,19,1}
     const int64_t cache_feats_shape[4] = {1, 128, 19, 1};
-    for (int i = 0; i < in_cache_.size(); i++) {
+    for (int i = 0; i < in_cache->size(); i++) {
       vad_inputs.emplace_back(std::move(Ort::Value::CreateTensor<float>(
-              memory_info, in_cache_[i].data(), in_cache_[i].size(), cache_feats_shape, 4)));
+              memory_info, (*in_cache)[i].data(), (*in_cache)[i].size(), cache_feats_shape, 4)));
     }
   
     // 4. Onnx infer
@@ -162,15 +165,17 @@
     }
   
     // get 4 caches outputs,each size is 128*19
-    // for (int i = 1; i < 5; i++) {
-    //   float* data = vad_ort_outputs[i].GetTensorMutableData<float>();
-    //   memcpy(in_cache_[i-1].data(), data, sizeof(float) * 128*19);
-    // }
+    if(!is_final){
+        for (int i = 1; i < 5; i++) {
+            float* data = vad_ort_outputs[i].GetTensorMutableData<float>();
+            memcpy((*in_cache)[i-1].data(), data, sizeof(float) * 128*19);
+        }
+    }
 }
 
 void FsmnVad::FbankKaldi(float sample_rate, std::vector<std::vector<float>> &vad_feats,
                          std::vector<float> &waves) {
-    knf::OnlineFbank fbank(fbank_opts);
+    knf::OnlineFbank fbank(fbank_opts_);
 
     std::vector<float> buf(waves.size());
     for (int32_t i = 0; i != waves.size(); ++i) {
@@ -180,7 +185,7 @@
     int32_t frames = fbank.NumFramesReady();
     for (int32_t i = 0; i != frames; ++i) {
         const float *frame = fbank.GetFrame(i);
-        std::vector<float> frame_vector(frame, frame + fbank_opts.mel_opts.num_bins);
+        std::vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
         vad_feats.emplace_back(frame_vector);
     }
 }
@@ -205,7 +210,7 @@
                 vector<string> means_lines{istream_iterator<string>{means_lines_stream}, istream_iterator<string>{}};
                 if (means_lines[0] == "<LearnRateCoef>") {
                     for (int j = 3; j < means_lines.size() - 1; j++) {
-                        means_list.push_back(stof(means_lines[j]));
+                        means_list_.push_back(stof(means_lines[j]));
                     }
                     continue;
                 }
@@ -216,8 +221,8 @@
                 vector<string> vars_lines{istream_iterator<string>{vars_lines_stream}, istream_iterator<string>{}};
                 if (vars_lines[0] == "<LearnRateCoef>") {
                     for (int j = 3; j < vars_lines.size() - 1; j++) {
-                        // vars_list.push_back(stof(vars_lines[j])*scale);
-                        vars_list.push_back(stof(vars_lines[j]));
+                        // vars_list_.push_back(stof(vars_lines[j])*scale);
+                        vars_list_.push_back(stof(vars_lines[j]));
                     }
                     continue;
                 }
@@ -263,8 +268,8 @@
     }
     // Apply cmvn
     for (auto &out_feat: out_feats) {
-        for (int j = 0; j < means_list.size(); j++) {
-            out_feat[j] = (out_feat[j] + means_list[j]) * vars_list[j];
+        for (int j = 0; j < means_list_.size(); j++) {
+            out_feat[j] = (out_feat[j] + means_list_[j]) * vars_list_[j];
         }
     }
     vad_feats = out_feats;
@@ -276,7 +281,7 @@
     std::vector<std::vector<float>> vad_probs;
     FbankKaldi(vad_sample_rate_, vad_feats, waves);
     LfrCmvn(vad_feats);
-    Forward(vad_feats, &vad_probs);
+    Forward(vad_feats, &vad_probs, &in_cache_, input_finished);
 
     E2EVadModel vad_scorer = E2EVadModel();
     std::vector<std::vector<int>> vad_segments;
diff --git a/funasr/runtime/onnxruntime/src/fsmn-vad.h b/funasr/runtime/onnxruntime/src/fsmn-vad.h
index a8ec4ce..adceb1f 100644
--- a/funasr/runtime/onnxruntime/src/fsmn-vad.h
+++ b/funasr/runtime/onnxruntime/src/fsmn-vad.h
@@ -22,7 +22,30 @@
     void Test();
     void InitVad(const std::string &vad_model, const std::string &vad_cmvn, const std::string &vad_config, int thread_num);
     std::vector<std::vector<int>> Infer(std::vector<float> &waves, bool input_finished=true);
+    void Forward(
+        const std::vector<std::vector<float>> &chunk_feats,
+        std::vector<std::vector<float>> *out_prob,
+        std::vector<std::vector<float>> *in_cache,
+        bool is_final);
     void Reset();
+    
+    std::shared_ptr<Ort::Session> vad_session_ = nullptr;
+    Ort::Env env_;
+    Ort::SessionOptions session_options_;
+    std::vector<const char *> vad_in_names_;
+    std::vector<const char *> vad_out_names_;
+    std::vector<std::vector<float>> in_cache_;
+    
+    knf::FbankOptions fbank_opts_;
+    std::vector<float> means_list_;
+    std::vector<float> vars_list_;
+
+    int vad_sample_rate_ = MODEL_SAMPLE_RATE;
+    int vad_silence_duration_ = VAD_SILENCE_DURATION;
+    int vad_max_len_ = VAD_MAX_LEN;
+    double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES;
+    int lfr_m = VAD_LFR_M;
+    int lfr_n = VAD_LFR_N;
 
 private:
 
@@ -37,31 +60,9 @@
                     std::vector<float> &waves);
 
     void LfrCmvn(std::vector<std::vector<float>> &vad_feats);
-
-    void Forward(
-            const std::vector<std::vector<float>> &chunk_feats,
-            std::vector<std::vector<float>> *out_prob);
-
     void LoadCmvn(const char *filename);
     void InitCache();
 
-    std::shared_ptr<Ort::Session> vad_session_ = nullptr;
-    Ort::Env env_;
-    Ort::SessionOptions session_options_;
-    std::vector<const char *> vad_in_names_;
-    std::vector<const char *> vad_out_names_;
-    std::vector<std::vector<float>> in_cache_;
-    
-    knf::FbankOptions fbank_opts;
-    std::vector<float> means_list;
-    std::vector<float> vars_list;
-
-    int vad_sample_rate_ = MODEL_SAMPLE_RATE;
-    int vad_silence_duration_ = VAD_SILENCE_DURATION;
-    int vad_max_len_ = VAD_MAX_LEN;
-    double vad_speech_noise_thres_ = VAD_SPEECH_NOISE_THRES;
-    int lfr_m = VAD_LFR_M;
-    int lfr_n = VAD_LFR_N;
 };
 
 } // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/funasrruntime.cpp b/funasr/runtime/onnxruntime/src/funasrruntime.cpp
index adef504..f504b39 100644
--- a/funasr/runtime/onnxruntime/src/funasrruntime.cpp
+++ b/funasr/runtime/onnxruntime/src/funasrruntime.cpp
@@ -11,9 +11,15 @@
 		return mm;
 	}
 
-	_FUNASRAPI FUNASR_HANDLE  FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num, FSMN_VAD_MODE mode)
+	_FUNASRAPI FUNASR_HANDLE  FsmnVadInit(std::map<std::string, std::string>& model_path, int thread_num)
 	{
-		funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num, mode);
+		funasr::VadModel* mm = funasr::CreateVadModel(model_path, thread_num);
+		return mm;
+	}
+
+	_FUNASRAPI FUNASR_HANDLE  FsmnVadOnlineInit(FUNASR_HANDLE fsmnvad_handle)
+	{
+		funasr::VadModel* mm = funasr::CreateVadModel(fsmnvad_handle);
 		return mm;
 	}
 
@@ -96,7 +102,7 @@
 	}
 
 	// APIs for VAD Infer
-	_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	_FUNASRAPI FUNASR_RESULT FsmnVadInferBuffer(FUNASR_HANDLE handle, const char* sz_buf, int n_len, QM_CALLBACK fn_callback, bool input_finished, int sampling_rate)
 	{
 		funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
 		if (!vad_obj)
@@ -110,13 +116,13 @@
 		p_result->snippet_time = audio.GetTimeLen();
 		
 		vector<std::vector<int>> vad_segments;
-		audio.Split(vad_obj, vad_segments);
+		audio.Split(vad_obj, vad_segments, input_finished);
 		p_result->segments = new vector<std::vector<int>>(vad_segments);
 
 		return p_result;
 	}
 
-	_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, FSMN_VAD_MODE mode, QM_CALLBACK fn_callback, int sampling_rate)
+	_FUNASRAPI FUNASR_RESULT FsmnVadInfer(FUNASR_HANDLE handle, const char* sz_filename, QM_CALLBACK fn_callback, int sampling_rate)
 	{
 		funasr::VadModel* vad_obj = (funasr::VadModel*)handle;
 		if (!vad_obj)
@@ -139,7 +145,7 @@
 		p_result->snippet_time = audio.GetTimeLen();
 		
 		vector<std::vector<int>> vad_segments;
-		audio.Split(vad_obj, vad_segments);
+		audio.Split(vad_obj, vad_segments, true);
 		p_result->segments = new vector<std::vector<int>>(vad_segments);
 
 		return p_result;
diff --git a/funasr/runtime/onnxruntime/src/online-feature.cpp b/funasr/runtime/onnxruntime/src/online-feature.cpp
deleted file mode 100644
index a21589c..0000000
--- a/funasr/runtime/onnxruntime/src/online-feature.cpp
+++ /dev/null
@@ -1,137 +0,0 @@
-/**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
- * MIT License  (https://opensource.org/licenses/MIT)
- * Contributed by zhuzizyf(China Telecom).
-*/
-
-#include "online-feature.h"
-#include <utility>
-
-namespace funasr {
-OnlineFeature::OnlineFeature(int sample_rate, knf::FbankOptions fbank_opts, int lfr_m, int lfr_n,
-                             std::vector<std::vector<float>> cmvns)
-  : sample_rate_(sample_rate),
-    fbank_opts_(std::move(fbank_opts)),
-    lfr_m_(lfr_m),
-    lfr_n_(lfr_n),
-    cmvns_(std::move(cmvns)) {
-  frame_sample_length_ = sample_rate_ / 1000 * 25;;
-  frame_shift_sample_length_ = sample_rate_ / 1000 * 10;
-}
-
-void OnlineFeature::ExtractFeats(vector<std::vector<float>> &vad_feats,
-                                 vector<float> waves, bool input_finished) {
-  input_finished_ = input_finished;
-  OnlineFbank(vad_feats, waves);
-  // cache deal & online lfr,cmvn
-  if (vad_feats.size() > 0) {
-    if (!reserve_waveforms_.empty()) {
-      waves.insert(waves.begin(), reserve_waveforms_.begin(), reserve_waveforms_.end());
-    }
-    if (lfr_splice_cache_.empty()) {
-      for (int i = 0; i < (lfr_m_ - 1) / 2; i++) {
-        lfr_splice_cache_.emplace_back(vad_feats[0]);
-      }
-    }
-    if (vad_feats.size() + lfr_splice_cache_.size() >= lfr_m_) {
-      vad_feats.insert(vad_feats.begin(), lfr_splice_cache_.begin(), lfr_splice_cache_.end());
-      int frame_from_waves = (waves.size() - frame_sample_length_) / frame_shift_sample_length_ + 1;
-      int minus_frame = reserve_waveforms_.empty() ? (lfr_m_ - 1) / 2 : 0;
-      int lfr_splice_frame_idxs = OnlineLfrCmvn(vad_feats);
-      int reserve_frame_idx = lfr_splice_frame_idxs - minus_frame;
-      reserve_waveforms_.clear();
-      reserve_waveforms_.insert(reserve_waveforms_.begin(),
-                                waves.begin() + reserve_frame_idx * frame_shift_sample_length_,
-                                waves.begin() + frame_from_waves * frame_shift_sample_length_);
-      int sample_length = (frame_from_waves - 1) * frame_shift_sample_length_ + frame_sample_length_;
-      waves.erase(waves.begin() + sample_length, waves.end());
-    } else {
-      reserve_waveforms_.clear();
-      reserve_waveforms_.insert(reserve_waveforms_.begin(),
-                                waves.begin() + frame_sample_length_ - frame_shift_sample_length_, waves.end());
-      lfr_splice_cache_.insert(lfr_splice_cache_.end(), vad_feats.begin(), vad_feats.end());
-    }
-
-  } else {
-    if (input_finished_) {
-      if (!reserve_waveforms_.empty()) {
-        waves = reserve_waveforms_;
-      }
-      vad_feats = lfr_splice_cache_;
-      OnlineLfrCmvn(vad_feats);
-      ResetCache();
-    }
-  }
-
-}
-
-int OnlineFeature::OnlineLfrCmvn(vector<vector<float>> &vad_feats) {
-  vector<vector<float>> out_feats;
-  int T = vad_feats.size();
-  int T_lrf = ceil((T - (lfr_m_ - 1) / 2) / lfr_n_);
-  int lfr_splice_frame_idxs = T_lrf;
-  vector<float> p;
-  for (int i = 0; i < T_lrf; i++) {
-    if (lfr_m_ <= T - i * lfr_n_) {
-      for (int j = 0; j < lfr_m_; j++) {
-        p.insert(p.end(), vad_feats[i * lfr_n_ + j].begin(), vad_feats[i * lfr_n_ + j].end());
-      }
-      out_feats.emplace_back(p);
-      p.clear();
-    } else {
-      if (input_finished_) {
-        int num_padding = lfr_m_ - (T - i * lfr_n_);
-        for (int j = 0; j < (vad_feats.size() - i * lfr_n_); j++) {
-          p.insert(p.end(), vad_feats[i * lfr_n_ + j].begin(), vad_feats[i * lfr_n_ + j].end());
-        }
-        for (int j = 0; j < num_padding; j++) {
-          p.insert(p.end(), vad_feats[vad_feats.size() - 1].begin(), vad_feats[vad_feats.size() - 1].end());
-        }
-        out_feats.emplace_back(p);
-      } else {
-        lfr_splice_frame_idxs = i;
-        break;
-      }
-    }
-  }
-  lfr_splice_frame_idxs = std::min(T - 1, lfr_splice_frame_idxs * lfr_n_);
-  lfr_splice_cache_.clear();
-  lfr_splice_cache_.insert(lfr_splice_cache_.begin(), vad_feats.begin() + lfr_splice_frame_idxs, vad_feats.end());
-
-  // Apply cmvn
-  for (auto &out_feat: out_feats) {
-    for (int j = 0; j < cmvns_[0].size(); j++) {
-      out_feat[j] = (out_feat[j] + cmvns_[0][j]) * cmvns_[1][j];
-    }
-  }
-  vad_feats = out_feats;
-  return lfr_splice_frame_idxs;
-}
-
-void OnlineFeature::OnlineFbank(vector<std::vector<float>> &vad_feats,
-                                vector<float> &waves) {
-
-  knf::OnlineFbank fbank(fbank_opts_);
-  // cache merge
-  waves.insert(waves.begin(), input_cache_.begin(), input_cache_.end());
-  int frame_number = ComputeFrameNum(waves.size(), frame_sample_length_, frame_shift_sample_length_);
-  // Send the audio after the last frame shift position to the cache
-  input_cache_.clear();
-  input_cache_.insert(input_cache_.begin(), waves.begin() + frame_number * frame_shift_sample_length_, waves.end());
-  if (frame_number == 0) {
-    return;
-  }
-  // Delete audio that haven't undergone fbank processing
-  waves.erase(waves.begin() + (frame_number - 1) * frame_shift_sample_length_ + frame_sample_length_, waves.end());
-
-  fbank.AcceptWaveform(sample_rate_, &waves[0], waves.size());
-  int32_t frames = fbank.NumFramesReady();
-  for (int32_t i = 0; i != frames; ++i) {
-    const float *frame = fbank.GetFrame(i);
-    vector<float> frame_vector(frame, frame + fbank_opts_.mel_opts.num_bins);
-    vad_feats.emplace_back(frame_vector);
-  }
-
-}
-
-} // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/onnxruntime/src/online-feature.h b/funasr/runtime/onnxruntime/src/online-feature.h
deleted file mode 100644
index 16e6e4b..0000000
--- a/funasr/runtime/onnxruntime/src/online-feature.h
+++ /dev/null
@@ -1,58 +0,0 @@
-/**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
- * MIT License  (https://opensource.org/licenses/MIT)
- * Contributed by zhuzizyf(China Telecom).
-*/
-#pragma once 
-#include <vector>
-#include "precomp.h"
-
-using namespace std;
-namespace funasr {
-class OnlineFeature {
-
-public:
-  OnlineFeature(int sample_rate, knf::FbankOptions fbank_opts, int lfr_m_, int lfr_n_,
-                std::vector<std::vector<float>> cmvns_);
-
-  void ExtractFeats(vector<vector<float>> &vad_feats, vector<float> waves, bool input_finished);
-
-private:
-  void OnlineFbank(vector<vector<float>> &vad_feats, vector<float> &waves);
-  int OnlineLfrCmvn(vector<vector<float>> &vad_feats);
-  
-  static int ComputeFrameNum(int sample_length, int frame_sample_length, int frame_shift_sample_length) {
-    int frame_num = static_cast<int>((sample_length - frame_sample_length) / frame_shift_sample_length + 1);
-    if (frame_num >= 1 && sample_length >= frame_sample_length)
-      return frame_num;
-    else
-      return 0;
-  }
-
-  void ResetCache() {
-    reserve_waveforms_.clear();
-    input_cache_.clear();
-    lfr_splice_cache_.clear();
-    input_finished_ = false;
-
-  }
-
-  knf::FbankOptions fbank_opts_;
-  // The reserved waveforms by fbank
-  std::vector<float> reserve_waveforms_;
-  // waveforms reserved after last shift position
-  std::vector<float> input_cache_;
-  // lfr reserved cache
-  std::vector<std::vector<float>> lfr_splice_cache_;
-  std::vector<std::vector<float>> cmvns_;
-
-  int sample_rate_ = 16000;
-  int frame_sample_length_ = sample_rate_ / 1000 * 25;;
-  int frame_shift_sample_length_ = sample_rate_ / 1000 * 10;
-  int lfr_m_;
-  int lfr_n_;
-  bool input_finished_ = false;
-
-};
-
-} // namespace funasr
diff --git a/funasr/runtime/onnxruntime/src/paraformer.h b/funasr/runtime/onnxruntime/src/paraformer.h
index 533c16f..9df0977 100644
--- a/funasr/runtime/onnxruntime/src/paraformer.h
+++ b/funasr/runtime/onnxruntime/src/paraformer.h
@@ -18,7 +18,7 @@
         //std::unique_ptr<knf::OnlineFbank> fbank_;
         knf::FbankOptions fbank_opts;
 
-        Vocab* vocab;
+        Vocab* vocab = nullptr;
         vector<float> means_list;
         vector<float> vars_list;
         const float scale = 22.6274169979695;
@@ -30,7 +30,7 @@
         void ApplyCmvn(vector<float> *v);
         string GreedySearch( float* in, int n_len, int64_t token_nums);
 
-        std::shared_ptr<Ort::Session> m_session;
+        std::shared_ptr<Ort::Session> m_session = nullptr;
         Ort::Env env_;
         Ort::SessionOptions session_options;
 
diff --git a/funasr/runtime/onnxruntime/src/precomp.h b/funasr/runtime/onnxruntime/src/precomp.h
index e607dbf..838dddc 100644
--- a/funasr/runtime/onnxruntime/src/precomp.h
+++ b/funasr/runtime/onnxruntime/src/precomp.h
@@ -36,8 +36,9 @@
 #include "offline-stream.h"
 #include "tokenizer.h"
 #include "ct-transformer.h"
-#include "fsmn-vad.h"
 #include "e2e-vad.h"
+#include "fsmn-vad.h"
+#include "fsmn-vad-online.h"
 #include "vocab.h"
 #include "audio.h"
 #include "tensor.h"
diff --git a/funasr/runtime/onnxruntime/src/vad-model.cpp b/funasr/runtime/onnxruntime/src/vad-model.cpp
index 336758f..c164c3e 100644
--- a/funasr/runtime/onnxruntime/src/vad-model.cpp
+++ b/funasr/runtime/onnxruntime/src/vad-model.cpp
@@ -1,14 +1,10 @@
 #include "precomp.h"
 
 namespace funasr {
-VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num, int mode)
+VadModel *CreateVadModel(std::map<std::string, std::string>& model_path, int thread_num)
 {
     VadModel *mm;
-    if(mode == FSMN_VAD_OFFLINE){
-        mm = new FsmnVad();
-    }else{
-        LOG(ERROR)<<"Online fsmn vad not imp!";
-    }
+    mm = new FsmnVad();
 
     string vad_model_path;
     string vad_cmvn_path;
@@ -25,4 +21,11 @@
     return mm;
 }
 
+VadModel *CreateVadModel(void* fsmnvad_handle)
+{
+    VadModel *mm;
+    mm = new FsmnVadOnline((FsmnVad*)fsmnvad_handle);
+    return mm;
+}
+
 } // namespace funasr
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/README.md b/funasr/runtime/python/websocket/README.md
index f489bac..fcdc83c 100644
--- a/funasr/runtime/python/websocket/README.md
+++ b/funasr/runtime/python/websocket/README.md
@@ -21,43 +21,10 @@
 ```
 
 ### Start server
-#### ASR offline server
-##### API-reference
-```shell
-python ws_server_offline.py \
---port [port id] \
---asr_model [asr model_name] \
---punc_model [punc model_name] \
---ngpu [0 or 1] \
---ncpu [1 or 4] \
---certfile [path of certfile for ssl] \
---keyfile [path of keyfile for ssl] 
-```
-##### Usage examples
-```shell
-python ws_server_offline.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-```
 
-#### ASR streaming server
 ##### API-reference
 ```shell
-python ws_server_online.py \
---port [port id] \
---asr_model_online [asr model_name] \
---ngpu [0 or 1] \
---ncpu [1 or 4] \
---certfile [path of certfile for ssl] \
---keyfile [path of keyfile for ssl] 
-```
-##### Usage examples
-```shell
-python ws_server_online.py --port 10095 --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
-```
-
-#### ASR offline/online 2pass server
-##### API-reference
-```shell
-python ws_server_2pass.py \
+python wss_srv_asr.py \
 --port [port id] \
 --asr_model [asr model_name] \
 --asr_model_online [asr model_name] \
@@ -69,7 +36,7 @@
 ```
 ##### Usage examples
 ```shell
-python ws_server_2pass.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"  --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
+python wss_srv_asr.py --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"  --asr_model_online "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online"
 ```
 
 ## For the client
@@ -84,7 +51,7 @@
 ### Start client
 #### API-reference
 ```shell
-python ws_client.py \
+python wss_client_asr.py \
 --host [ip_address] \
 --port [port id] \
 --chunk_size ["5,10,5"=600ms, "8,8,4"=480ms] \
@@ -93,43 +60,45 @@
 --audio_in [if set, loading from wav.scp, else recording from microphone] \
 --output_dir [if set, write the results to output_dir] \
 --send_without_sleep [only set for offline] \
---ssl [1 for wss connect, 0 for ws, default is 1]
+--ssl [1 for wss connect, 0 for ws, default is 1] \
+--mode [`online` for streaming asr, `offline` for non-streaming, `2pass` for both streaming and non-streaming asr]
 ```
+
 #### Usage examples
 ##### ASR offline client
 Recording from microphone
 ```shell
 # --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode offline --chunk_interval 10 --words_max_print 100
 ```
 Loading from wav.scp (Kaldi style)
 ```shell
 # --chunk_interval, "10": 600/10=60ms, "5": 600/5=120ms, "20": 600/20=30ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_interval 10 --words_max_print 100 --audio_in "./data/wav.scp" --send_without_sleep --output_dir "./results"
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode offline --chunk_interval 10 --words_max_print 100 --audio_in "./data/wav.scp" --send_without_sleep --output_dir "./results"
 ```
 
 ##### ASR streaming client
 Recording from microphone
 ```shell
 # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --words_max_print 100
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode online --chunk_size "5,10,5" --words_max_print 100
 ```
 Loading from wav.scp (Kaldi style)
 ```shell
 # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "5,10,5" --audio_in "./data/wav.scp" --output_dir "./results"
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode online --chunk_size "5,10,5" --audio_in "./data/wav.scp" --output_dir "./results"
 ```
 
 ##### ASR offline/online 2pass client
 Recording from microphone
 ```shell
 # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "8,8,4"
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4"
 ```
 Loading from wav.scp (Kaldi style)
 ```shell
 # --chunk_size, "5,10,5"=600ms, "8,8,4"=480ms
-python ws_client.py --host "0.0.0.0" --port 10095 --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
+python wss_client_asr.py --host "0.0.0.0" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
 ```
 ## Acknowledge
 1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
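
For context on the renamed client: the wire protocol is visible in the deleted `ws_client.py` below; each stream opens with a JSON control message carrying the chunking parameters and `"is_speaking": true`, raw PCM chunks follow, and `{"is_speaking": false}` closes the stream. A hypothetical opening message (values illustrative):

```json
{"chunk_size": [5, 10, 5], "chunk_interval": 10, "wav_name": "demo", "is_speaking": true}
```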
diff --git a/funasr/runtime/python/websocket/parse_args.py b/funasr/runtime/python/websocket/parse_args.py
index 82d9c90..ffecff7 100644
--- a/funasr/runtime/python/websocket/parse_args.py
+++ b/funasr/runtime/python/websocket/parse_args.py
@@ -33,7 +33,7 @@
                     help="0 for cpu, 1 for gpu")
 parser.add_argument("--ncpu",
                     type=int,
-                    default=1,
+                    default=4,
                     help="cpu cores")
 parser.add_argument("--certfile",
                     type=str,
diff --git a/funasr/runtime/python/websocket/ws_client.py b/funasr/runtime/python/websocket/ws_client.py
deleted file mode 100644
index f7dfcaf..0000000
--- a/funasr/runtime/python/websocket/ws_client.py
+++ /dev/null
@@ -1,292 +0,0 @@
-# -*- encoding: utf-8 -*-
-import os
-import time
-import websockets,ssl
-import asyncio
-# import threading
-import argparse
-import json
-import traceback
-from multiprocessing import Process
-from funasr.fileio.datadir_writer import DatadirWriter
-
-import logging
-
-logging.basicConfig(level=logging.ERROR)
-
-parser = argparse.ArgumentParser()
-parser.add_argument("--host",
-                    type=str,
-                    default="localhost",
-                    required=False,
-                    help="host ip, localhost, 0.0.0.0")
-parser.add_argument("--port",
-                    type=int,
-                    default=10095,
-                    required=False,
-                    help="grpc server port")
-parser.add_argument("--chunk_size",
-                    type=str,
-                    default="5, 10, 5",
-                    help="chunk")
-parser.add_argument("--chunk_interval",
-                    type=int,
-                    default=10,
-                    help="chunk")
-parser.add_argument("--audio_in",
-                    type=str,
-                    default=None,
-                    help="audio_in")
-parser.add_argument("--send_without_sleep",
-                    action="store_true",
-                    default=False,
-                    help="if audio_in is set, send_without_sleep")
-parser.add_argument("--test_thread_num",
-                    type=int,
-                    default=1,
-                    help="test_thread_num")
-parser.add_argument("--words_max_print",
-                    type=int,
-                    default=10000,
-                    help="chunk")
-parser.add_argument("--output_dir",
-                    type=str,
-                    default=None,
-                    help="output_dir")
-                    
-parser.add_argument("--ssl",
-                    type=int,
-                    default=1,
-                    help="1 for ssl connect, 0 for no ssl")
-
-args = parser.parse_args()
-args.chunk_size = [int(x) for x in args.chunk_size.split(",")]
-print(args)
-# voices = asyncio.Queue()
-from queue import Queue
-voices = Queue()
-
-ibest_writer = None
-if args.output_dir is not None:
-    writer = DatadirWriter(args.output_dir)
-    ibest_writer = writer[f"1best_recog"]
-
-async def record_microphone():
-    is_finished = False
-    import pyaudio
-    #print("2")
-    global voices 
-    FORMAT = pyaudio.paInt16
-    CHANNELS = 1
-    RATE = 16000
-    chunk_size = 60*args.chunk_size[1]/args.chunk_interval
-    CHUNK = int(RATE / 1000 * chunk_size)
-
-    p = pyaudio.PyAudio()
-
-    stream = p.open(format=FORMAT,
-                    channels=CHANNELS,
-                    rate=RATE,
-                    input=True,
-                    frames_per_buffer=CHUNK)
-
-    message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": "microphone", "is_speaking": True})
-    voices.put(message)
-    while True:
-
-        data = stream.read(CHUNK)
-        message = data  
-        
-        voices.put(message)
-
-        await asyncio.sleep(0.005)
-
-async def record_from_scp(chunk_begin,chunk_size):
-    import wave
-    global voices
-    is_finished = False
-    if args.audio_in.endswith(".scp"):
-        f_scp = open(args.audio_in)
-        wavs = f_scp.readlines()
-    else:
-        wavs = [args.audio_in]
-    if chunk_size>0:
-        wavs=wavs[chunk_begin:chunk_begin+chunk_size]
-    for wav in wavs:
-        wav_splits = wav.strip().split()
-        wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
-        wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
-        
-        # bytes_f = open(wav_path, "rb")
-        # bytes_data = bytes_f.read()
-        with wave.open(wav_path, "rb") as wav_file:
-            params = wav_file.getparams()
-            # header_length = wav_file.getheaders()[0][1]
-            # wav_file.setpos(header_length)
-            frames = wav_file.readframes(wav_file.getnframes())
-
-        audio_bytes = bytes(frames)
-        # stride = int(args.chunk_size/1000*16000*2)
-        stride = int(60*args.chunk_size[1]/args.chunk_interval/1000*16000*2)
-        chunk_num = (len(audio_bytes)-1)//stride + 1
-        # print(stride)
-        
-        # send first time
-        message = json.dumps({"chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": wav_name,"is_speaking": True})
-        voices.put(message)
-        is_speaking = True
-        for i in range(chunk_num):
-
-            beg = i*stride
-            data = audio_bytes[beg:beg+stride]
-            message = data  
-            voices.put(message)
-            if i == chunk_num-1:
-                is_speaking = False
-                message = json.dumps({"is_speaking": is_speaking})
-                voices.put(message)
-            # print("data_chunk: ", len(data_chunk))
-            # print(voices.qsize())
-            sleep_duration = 0.001 if args.send_without_sleep else 60*args.chunk_size[1]/args.chunk_interval/1000
-            await asyncio.sleep(sleep_duration)
-
-
-async def ws_send():
-    global voices
-    global websocket
-    print("started to sending data!")
-    while True:
-        while not voices.empty():
-            data = voices.get()
-            voices.task_done()
-            try:
-                await websocket.send(data)
-            except Exception as e:
-                print('Exception occurred:', e)
-                traceback.print_exc()
-                exit(0)
-            await asyncio.sleep(0.005)
-        await asyncio.sleep(0.005)
-
-
-
-async def message(id):
-    global websocket
-    text_print = ""
-    text_print_2pass_online = ""
-    text_print_2pass_offline = ""
-    while True:
-        try:
-            meg = await websocket.recv()
-            meg = json.loads(meg)
-            wav_name = meg.get("wav_name", "demo")
-            # print(wav_name)
-            text = meg["text"]
-            if ibest_writer is not None:
-                ibest_writer["text"][wav_name] = text
-            
-            if meg["mode"] == "online":
-                text_print += "{}".format(text)
-                text_print = text_print[-args.words_max_print:]
-                os.system('clear')
-                print("\rpid"+str(id)+": "+text_print)
-            elif meg["mode"] == "online":
-                text_print += "{}".format(text)
-                text_print = text_print[-args.words_max_print:]
-                os.system('clear')
-                print("\rpid"+str(id)+": "+text_print)
-            else:
-                if meg["mode"] == "2pass-online":
-                    text_print_2pass_online += "{}".format(text)
-                    text_print = text_print_2pass_offline + text_print_2pass_online
-                else:
-                    text_print_2pass_online = ""
-                    text_print = text_print_2pass_offline + "{}".format(text)
-                    text_print_2pass_offline += "{}".format(text)
-                text_print = text_print[-args.words_max_print:]
-                os.system('clear')
-                print("\rpid" + str(id) + ": " + text_print)
-
-        except Exception as e:
-            print("Exception:", e)
-            traceback.print_exc()
-            exit(0)
-
-async def print_message():
-    global websocket
-    while True:
-        try:
-            meg = await websocket.recv()
-            meg = json.loads(meg)
-            print(meg)
-        except Exception as e:
-            print("Exception:", e)
-            traceback.print_exc()
-            exit(0)
-
-async def ws_client(id,chunk_begin,chunk_size):
-    global websocket
-    if  args.ssl==1:
-       ssl_context = ssl.SSLContext()
-       ssl_context.check_hostname = False
-       ssl_context.verify_mode = ssl.CERT_NONE
-       uri = "wss://{}:{}".format(args.host, args.port)
-    else:
-       uri = "ws://{}:{}".format(args.host, args.port)
-       ssl_context=None
-    print("connect to",uri)
-    async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None,ssl=ssl_context):
-        if args.audio_in is not None:
-            task = asyncio.create_task(record_from_scp(chunk_begin,chunk_size))
-        else:
-            task = asyncio.create_task(record_microphone())
-        task2 = asyncio.create_task(ws_send())
-        task3 = asyncio.create_task(message(id))
-        await asyncio.gather(task, task2, task3)
-
-def one_thread(id,chunk_begin,chunk_size):
-   asyncio.get_event_loop().run_until_complete(ws_client(id,chunk_begin,chunk_size))
-   asyncio.get_event_loop().run_forever()
-
-
-if __name__ == '__main__':
-   # for microphone 
-   if  args.audio_in is  None:
-     p = Process(target=one_thread,args=(0, 0, 0))
-     p.start()
-     p.join()
-     print('end')
-   else:
-     # calculate the number of wavs for each process
-     if args.audio_in.endswith(".scp"):
-         f_scp = open(args.audio_in)
-         wavs = f_scp.readlines()
-     else:
-         wavs = [args.audio_in]
-     total_len=len(wavs)
-     if total_len>=args.test_thread_num:
-          chunk_size=int((total_len)/args.test_thread_num)
-          remain_wavs=total_len-chunk_size*args.test_thread_num
-     else:
-          chunk_size=1
-          remain_wavs=0
-
-     process_list = []
-     chunk_begin=0
-     for i in range(args.test_thread_num):
-         now_chunk_size= chunk_size
-         if remain_wavs>0:
-             now_chunk_size=chunk_size+1
-             remain_wavs=remain_wavs-1
-         # process i handles wavs starting at chunk_begin, taking now_chunk_size of them
-         p = Process(target=one_thread,args=(i,chunk_begin,now_chunk_size))
-         chunk_begin=chunk_begin+now_chunk_size
-         p.start()
-         process_list.append(p)
-
-     for i in process_list:
-         p.join()
-
-     print('end')
-
-
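
For reference, the stride arithmetic used by record_from_scp (both in this deleted
client and in its replacement below) maps the chunk configuration onto a byte count
per send. A minimal worked sketch, assuming the defaults above (chunk_size
"5, 10, 5", chunk_interval 10, 16 kHz 16-bit mono; the bracket meaning is assumed
to be [left context, current chunk, right context]):

    # Sketch only: reproduces the client's bytes-per-send computation.
    chunk_size = [5, 10, 5]   # assumed: [left context, current chunk, right context]
    chunk_interval = 10
    sample_rate = 16000       # Hz
    bytes_per_sample = 2      # 16-bit PCM

    chunk_ms = 60 * chunk_size[1] / chunk_interval              # 60.0 ms per send
    stride = int(chunk_ms / 1000 * sample_rate * bytes_per_sample)
    assert stride == 1920     # each binary websocket message carries 60 ms of audio
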
diff --git a/funasr/runtime/python/websocket/ws_server_2pass.py b/funasr/runtime/python/websocket/ws_server_2pass.py
deleted file mode 100644
index df13ad9..0000000
--- a/funasr/runtime/python/websocket/ws_server_2pass.py
+++ /dev/null
@@ -1,206 +0,0 @@
-import asyncio
-import json
-import websockets
-import time
-import logging
-import tracemalloc
-import numpy as np
-import ssl
-from parse_args import args
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
-
-tracemalloc.start()
-
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-
-websocket_users = set()
-
-print("model loading")
-# asr
-inference_pipeline_asr = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model=args.asr_model,
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-    model_revision=None)
-
-
-# vad
-inference_pipeline_vad = pipeline(
-    task=Tasks.voice_activity_detection,
-    model=args.vad_model,
-    model_revision=None,
-    output_dir=None,
-    batch_size=1,
-    mode='online',
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-)
-
-if args.punc_model != "":
-    inference_pipeline_punc = pipeline(
-        task=Tasks.punctuation,
-        model=args.punc_model,
-        model_revision="v1.0.2",
-        ngpu=args.ngpu,
-        ncpu=args.ncpu,
-    )
-else:
-    inference_pipeline_punc = None
-
-inference_pipeline_asr_online = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model=args.asr_model_online,
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-    model_revision='v1.0.4')
-
-print("model loaded")
-
-async def ws_serve(websocket, path):
-    frames = []
-    frames_asr = []
-    frames_asr_online = []
-    global websocket_users
-    websocket_users.add(websocket)
-    websocket.param_dict_asr = {}
-    websocket.param_dict_asr_online = {"cache": dict()}
-    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
-    websocket.param_dict_punc = {'cache': list()}
-    websocket.vad_pre_idx = 0
-    speech_start = False
-    speech_end_i = False
-    websocket.wav_name = "microphone"
-    print("new user connected", flush=True)
-
-    try:
-        async for message in websocket:
-            if isinstance(message, str):
-                messagejson = json.loads(message)
-        
-                if "is_speaking" in messagejson:
-                    websocket.is_speaking = messagejson["is_speaking"]
-                    websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
-                if "chunk_interval" in messagejson:
-                    websocket.chunk_interval = messagejson["chunk_interval"]
-                if "wav_name" in messagejson:
-                    websocket.wav_name = messagejson.get("wav_name")
-                if "chunk_size" in messagejson:
-                    websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
-            if len(frames_asr_online) > 0 or len(frames_asr) > 0 or not isinstance(message, str):
-                if not isinstance(message, str):
-                    frames.append(message)
-                    duration_ms = len(message)//32
-                    websocket.vad_pre_idx += duration_ms
-        
-                    # asr online
-                    frames_asr_online.append(message)
-                    websocket.param_dict_asr_online["is_final"] = speech_end_i
-                    if len(frames_asr_online) % websocket.chunk_interval == 0 or websocket.param_dict_asr_online["is_final"]:
-                        
-                        audio_in = b"".join(frames_asr_online)
-                        await async_asr_online(websocket, audio_in)
-                        frames_asr_online = []
-                    if speech_start:
-                        frames_asr.append(message)
-                    # vad online
-                    speech_start_i, speech_end_i = await async_vad(websocket, message)
-                    if speech_start_i:
-                        speech_start = True
-                        beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
-                        frames_pre = frames[-beg_bias:]
-                        frames_asr = []
-                        frames_asr.extend(frames_pre)
-                # asr punc offline
-                if speech_end_i or not websocket.is_speaking:
-                    # print("vad end point")
-                    audio_in = b"".join(frames_asr)
-                    await async_asr(websocket, audio_in)
-                    frames_asr = []
-                    speech_start = False
-                    # frames_asr_online = []
-                    # websocket.param_dict_asr_online = {"cache": dict()}
-                    if not websocket.is_speaking:
-                        websocket.vad_pre_idx = 0
-                        frames = []
-                        websocket.param_dict_vad = {'in_cache': dict()}
-                    else:
-                        frames = frames[-20:]
-
-     
-    except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)
-        websocket_users.remove(websocket)
-    except websockets.InvalidState:
-        print("InvalidState...")
-    except Exception as e:
-        print("Exception:", e)
-
-
-async def async_vad(websocket, audio_in):
-
-    segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
-
-    speech_start = False
-    speech_end = False
-    
-    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
-        return speech_start, speech_end
-    if segments_result["text"][0][0] != -1:
-        speech_start = segments_result["text"][0][0]
-    if segments_result["text"][0][1] != -1:
-        speech_end = True
-    return speech_start, speech_end
-
-
-async def async_asr(websocket, audio_in):
-    if len(audio_in) > 0:
-        audio_in = load_bytes(audio_in)
-        rec_result = inference_pipeline_asr(audio_in=audio_in,
-                                            param_dict=websocket.param_dict_asr)
-        if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"]) > 0:
-            rec_result = inference_pipeline_punc(text_in=rec_result['text'],
-                                                 param_dict=websocket.param_dict_punc)
-        if 'text' in rec_result:
-            message = json.dumps({"mode": "2pass-offline", "text": rec_result["text"], "wav_name": websocket.wav_name})
-            await websocket.send(message)
-
-
-async def async_asr_online(websocket, audio_in):
-    if len(audio_in) > 0:
-        audio_in = load_bytes(audio_in)
-        # print(websocket.param_dict_asr_online.get("is_final", False))
-        rec_result = inference_pipeline_asr_online(audio_in=audio_in,
-                                                   param_dict=websocket.param_dict_asr_online)
-        # print(rec_result)
-        if websocket.param_dict_asr_online.get("is_final", False):
-            return
-            #     websocket.param_dict_asr_online["cache"] = dict()
-        if "text" in rec_result:
-            if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
-                # print("online", rec_result)
-                message = json.dumps({"mode": "2pass-online", "text": rec_result["text"], "wav_name": websocket.wav_name})
-                await websocket.send(message)
-
-if len(args.certfile)>0:
-	ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-	
-	# Generated with Let's Encrypt; copied to this location, chowned to the current user, permissions 400
-	ssl_cert = args.certfile
-	ssl_key = args.keyfile
-
-	ssl_context.load_cert_chain(ssl_cert, keyfile=ssl_key)
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None,ssl=ssl_context)
-else:
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
-asyncio.get_event_loop().run_until_complete(start_server)
-asyncio.get_event_loop().run_forever()
\ No newline at end of file
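
The server above (and its replacement later in this patch) speaks a simple wire
protocol: a JSON handshake, then binary PCM frames, then an end-of-stream JSON
marker, with results multiplexed by a "mode" field. A minimal sketch of the
message shapes, with field values assumed from the defaults in this patch:

    import json

    # client -> server: handshake sent before any audio bytes
    handshake = json.dumps({"chunk_size": [5, 10, 5], "chunk_interval": 10,
                            "wav_name": "demo", "is_speaking": True})
    # client -> server: raw 16 kHz s16le PCM as binary websocket messages
    # client -> server: end-of-stream marker once all audio has been sent
    eos = json.dumps({"is_speaking": False})

    # server -> client: partial and final results, distinguished by "mode"
    partial = {"mode": "2pass-online", "text": "partial hypothesis", "wav_name": "demo"}
    final = {"mode": "2pass-offline", "text": "final punctuated text", "wav_name": "demo"}
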
diff --git a/funasr/runtime/python/websocket/ws_server_offline.py b/funasr/runtime/python/websocket/ws_server_offline.py
deleted file mode 100644
index 1ea1ff7..0000000
--- a/funasr/runtime/python/websocket/ws_server_offline.py
+++ /dev/null
@@ -1,163 +0,0 @@
-import asyncio
-import json
-import websockets
-import time
-import logging
-import tracemalloc
-import numpy as np
-import ssl
-
-from parse_args import args
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
-
-tracemalloc.start()
-
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-
-websocket_users = set()
-
-print("model loading")
-# asr
-inference_pipeline_asr = pipeline(
-    task=Tasks.auto_speech_recognition,
-    model=args.asr_model,
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-    model_revision=None)
-
-
-# vad
-inference_pipeline_vad = pipeline(
-    task=Tasks.voice_activity_detection,
-    model=args.vad_model,
-    model_revision=None,
-    output_dir=None,
-    batch_size=1,
-    mode='online',
-    ngpu=args.ngpu,
-    ncpu=args.ncpu,
-)
-
-if args.punc_model != "":
-    inference_pipeline_punc = pipeline(
-        task=Tasks.punctuation,
-        model=args.punc_model,
-        model_revision=None,
-        ngpu=args.ngpu,
-        ncpu=args.ncpu,
-    )
-else:
-    inference_pipeline_punc = None
-
-print("model loaded")
-
-async def ws_serve(websocket, path):
-    frames = []
-    frames_asr = []
-    global websocket_users
-    websocket_users.add(websocket)
-    websocket.param_dict_asr = {}
-    websocket.param_dict_vad = {'in_cache': dict(), "is_final": False}
-    websocket.param_dict_punc = {'cache': list()}
-    websocket.vad_pre_idx = 0
-    speech_start = False
-    websocket.wav_name = "microphone"
-    print("new user connected", flush=True)
-
-    try:
-        async for message in websocket:
-            if isinstance(message, str):
-                messagejson = json.loads(message)
-                if "is_speaking" in messagejson:
-                    websocket.is_speaking = messagejson["is_speaking"]
-                    websocket.param_dict_vad["is_final"] = not websocket.is_speaking
-                if "wav_name" in messagejson:
-                    websocket.wav_name = messagejson.get("wav_name")
-            
-            if len(frames_asr) > 0 or not isinstance(message, str):
-                if not isinstance(message, str):
-                    frames.append(message)
-                    duration_ms = len(message)//32
-                    websocket.vad_pre_idx += duration_ms
-    
-                    if speech_start:
-                        frames_asr.append(message)
-                    speech_start_i, speech_end_i = await async_vad(websocket, message)
-                    if speech_start_i:
-                        speech_start = True
-                        beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
-                        frames_pre = frames[-beg_bias:]
-                        frames_asr = []
-                        frames_asr.extend(frames_pre)
-                if speech_end_i or not websocket.is_speaking:
-                    audio_in = b"".join(frames_asr)
-                    await async_asr(websocket, audio_in)
-                    frames_asr = []
-                    speech_start = False
-                    if not websocket.is_speaking:
-                        websocket.vad_pre_idx = 0
-                        frames = []
-                        websocket.param_dict_vad = {'in_cache': dict()}
-                    else:
-                        frames = frames[-20:]
-
-     
-    except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)
-        websocket_users.remove(websocket)
-    except websockets.InvalidState:
-        print("InvalidState...")
-    except Exception as e:
-        print("Exception:", e)
-
-
-async def async_vad(websocket, audio_in):
-
-    segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
-
-    speech_start = False
-    speech_end = False
-    
-    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
-        return speech_start, speech_end
-    if segments_result["text"][0][0] != -1:
-        speech_start = segments_result["text"][0][0]
-    if segments_result["text"][0][1] != -1:
-        speech_end = True
-    return speech_start, speech_end
-
-
-async def async_asr(websocket, audio_in):
-    if len(audio_in) > 0:
-        audio_in = load_bytes(audio_in)
-        rec_result = inference_pipeline_asr(audio_in=audio_in,
-                                            param_dict=websocket.param_dict_asr)
-        print(rec_result)
-        if inference_pipeline_punc is not None and 'text' in rec_result and len(rec_result["text"]) > 0:
-            rec_result = inference_pipeline_punc(text_in=rec_result['text'],
-                                                 param_dict=websocket.param_dict_punc)
-        if 'text' in rec_result:
-            message = json.dumps({"mode": "offline", "text": rec_result["text"], "wav_name": websocket.wav_name})
-            await websocket.send(message)
-
-
-if len(args.certfile)>0:
-	ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-	
-	# Generated with Let's Encrypt; copied to this location, chowned to the current user, permissions 400
-	ssl_cert = args.certfile
-	ssl_key = args.keyfile
-
-	ssl_context.load_cert_chain(ssl_cert, keyfile=ssl_key)
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None,ssl=ssl_context)
-else:
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
-asyncio.get_event_loop().run_until_complete(start_server)
-asyncio.get_event_loop().run_forever()
\ No newline at end of file
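
All of these servers share the same async_vad contract: the streaming VAD
pipeline yields at most one [start_ms, end_ms] pair per chunk, with -1 standing
for a boundary that has not been observed yet. A sketch of that decoding, under
the same assumed result layout:

    def decode_vad_segment(segments_result):
        # Sketch only: segments_result["text"] is assumed to hold at most one
        # [start_ms, end_ms] pair, where -1 means "boundary not seen yet".
        speech_start, speech_end = False, False
        if len(segments_result) == 0 or len(segments_result["text"]) > 1:
            return speech_start, speech_end
        start_ms, end_ms = segments_result["text"][0]
        if start_ms != -1:
            speech_start = start_ms  # ms offset of the detected speech onset
        if end_ms != -1:
            speech_end = True        # an endpoint fell inside this chunk
        return speech_start, speech_end

    # e.g. decode_vad_segment({"text": [[120, -1]]}) -> (120, False)
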
diff --git a/funasr/runtime/python/websocket/ws_server_online.py b/funasr/runtime/python/websocket/ws_server_online.py
deleted file mode 100644
index 4cecd5f..0000000
--- a/funasr/runtime/python/websocket/ws_server_online.py
+++ /dev/null
@@ -1,147 +0,0 @@
-import asyncio
-import json
-import websockets
-import time
-from queue import Queue
-import threading
-import logging
-import tracemalloc
-import numpy as np
-import ssl
-from parse_args import args
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.utils.logger import get_logger
-from funasr.runtime.python.onnxruntime.funasr_onnx.utils.frontend import load_bytes
-
-tracemalloc.start()
-
-logger = get_logger(log_level=logging.CRITICAL)
-logger.setLevel(logging.CRITICAL)
-
-
-websocket_users = set()
-
-
-print("model loading")
-
-inference_pipeline_asr_online = pipeline(
-	task=Tasks.auto_speech_recognition,
-	model=args.asr_model_online,
-	ngpu=args.ngpu,
-	ncpu=args.ncpu,
-	model_revision='v1.0.4')
-
-# vad
-inference_pipeline_vad = pipeline(
-    task=Tasks.voice_activity_detection,
-    model=args.vad_model,
-    model_revision=None,
-    output_dir=None,
-    batch_size=1,
-    mode='online',
-    ngpu=args.ngpu,
-    ncpu=1,
-)
-
-print("model loaded")
-
-
-
-async def ws_serve(websocket, path):
-	frames = []
-	frames_asr_online = []
-	global websocket_users
-	websocket_users.add(websocket)
-	websocket.param_dict_asr_online = {"cache": dict()}
-	websocket.param_dict_vad = {'in_cache': dict()}
-	websocket.wav_name = "microphone"
-	print("new user connected",flush=True)
-	try:
-		async for message in websocket:
-			
-			
-			if isinstance(message, str):
-				messagejson = json.loads(message)
-				
-				if "is_speaking" in messagejson:
-					websocket.is_speaking = messagejson["is_speaking"]
-					websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
-					websocket.param_dict_vad["is_final"] = not websocket.is_speaking
-					# need to fire engine manually if no data received any more
-					if not websocket.is_speaking:
-						await async_asr_online(websocket, b"")
-				if "chunk_interval" in messagejson:
-					websocket.chunk_interval=messagejson["chunk_interval"]
-				if "wav_name" in messagejson:
-					websocket.wav_name = messagejson.get("wav_name")
-				if "chunk_size" in messagejson:
-					websocket.param_dict_asr_online["chunk_size"] = messagejson["chunk_size"]
-			# if has bytes in buffer or message is bytes
-			if len(frames_asr_online) > 0 or not isinstance(message, str):
-				if not isinstance(message, str):
-					frames_asr_online.append(message)
-					# frames.append(message)
-					# duration_ms = len(message) // 32
-					# websocket.vad_pre_idx += duration_ms
-					speech_start_i, speech_end_i = await async_vad(websocket, message)
-					websocket.is_speaking = not speech_end_i
-					
-				if len(frames_asr_online) % websocket.chunk_interval == 0 or not websocket.is_speaking:
-					websocket.param_dict_asr_online["is_final"] = not websocket.is_speaking
-					audio_in = b"".join(frames_asr_online)
-					await async_asr_online(websocket, audio_in)
-					frames_asr_online = []
-	
-	
-	except websockets.ConnectionClosed:
-		print("ConnectionClosed...", websocket_users)
-		websocket_users.remove(websocket)
-	except websockets.InvalidState:
-		print("InvalidState...")
-	except Exception as e:
-		print("Exception:", e)
-
-
-async def async_asr_online(websocket,audio_in):
-	if len(audio_in) >= 0:  # intentionally always true: an empty final chunk still flushes the engine
-		audio_in = load_bytes(audio_in)
-		# print(websocket.param_dict_asr_online.get("is_final", False))
-		rec_result = inference_pipeline_asr_online(audio_in=audio_in,
-		                                           param_dict=websocket.param_dict_asr_online)
-		# print(rec_result)
-		if websocket.param_dict_asr_online.get("is_final", False):
-			websocket.param_dict_asr_online["cache"] = dict()
-		if "text" in rec_result:
-			if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
-				message = json.dumps({"mode": "online", "text": rec_result["text"], "wav_name": websocket.wav_name})
-				await websocket.send(message)
-
-
-async def async_vad(websocket, audio_in):
-	segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
-	
-	speech_start = False
-	speech_end = False
-	
-	if len(segments_result) == 0 or len(segments_result["text"]) > 1:
-		return speech_start, speech_end
-	if segments_result["text"][0][0] != -1:
-		speech_start = segments_result["text"][0][0]
-	if segments_result["text"][0][1] != -1:
-		speech_end = True
-	return speech_start, speech_end
-
-if len(args.certfile)>0:
-	ssl_context = ssl.SSLContext(ssl.PROTOCOL_TLS_SERVER)
-	
-	# Generated with Let's Encrypt; copied to this location, chowned to the current user, permissions 400
-	ssl_cert = args.certfile
-	ssl_key = args.keyfile
-	
-	ssl_context.load_cert_chain(ssl_cert, keyfile=ssl_key)
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None,ssl=ssl_context)
-else:
-	start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
-asyncio.get_event_loop().run_until_complete(start_server)
-asyncio.get_event_loop().run_forever()
\ No newline at end of file
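
The online server's contract is that is_final=True both flushes the streaming
decoder and resets its incremental cache, which is why async_asr_online is fired
once more with empty audio when the client reports is_speaking=False. A condensed
sketch of that lifecycle, assuming the pipeline call signature used above:

    param_dict = {"cache": dict(), "chunk_size": [5, 10, 5]}

    def on_audio_chunk(pcm_bytes, is_last):
        param_dict["is_final"] = is_last
        rec_result = inference_pipeline_asr_online(audio_in=load_bytes(pcm_bytes),
                                                   param_dict=param_dict)
        if is_last:
            param_dict["cache"] = dict()  # start fresh for the next utterance
        return rec_result.get("text", "")
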
diff --git a/funasr/runtime/python/websocket/wss_client_asr.py b/funasr/runtime/python/websocket/wss_client_asr.py
index 586e0a4..2ea8a16 100644
--- a/funasr/runtime/python/websocket/wss_client_asr.py
+++ b/funasr/runtime/python/websocket/wss_client_asr.py
@@ -1,7 +1,7 @@
 # -*- encoding: utf-8 -*-
 import os
 import time
-import websockets,ssl
+import websockets, ssl
 import asyncio
 # import threading
 import argparse
@@ -12,6 +12,7 @@
 
 import logging
 
+SUPPORT_AUDIO_TYPE_SETS = ['.wav', '.pcm']
 logging.basicConfig(level=logging.ERROR)
 
 parser = argparse.ArgumentParser()
@@ -53,7 +54,7 @@
                     type=str,
                     default=None,
                     help="output_dir")
-                    
+
 parser.add_argument("--ssl",
                     type=int,
                     default=1,
@@ -68,22 +69,25 @@
 print(args)
 # voices = asyncio.Queue()
 from queue import Queue
-voices = Queue()
 
+voices = Queue()
+offline_msg_done = False
+
 ibest_writer = None
 if args.output_dir is not None:
     writer = DatadirWriter(args.output_dir)
     ibest_writer = writer[f"1best_recog"]
 
+
 async def record_microphone():
     is_finished = False
     import pyaudio
-    #print("2")
-    global voices 
+    # print("2")
+    global voices
     FORMAT = pyaudio.paInt16
     CHANNELS = 1
     RATE = 16000
-    chunk_size = 60*args.chunk_size[1]/args.chunk_interval
+    chunk_size = 60 * args.chunk_size[1] / args.chunk_interval
     CHUNK = int(RATE / 1000 * chunk_size)
 
     p = pyaudio.PyAudio()
@@ -94,19 +98,16 @@
                     input=True,
                     frames_per_buffer=CHUNK)
 
-    message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": "microphone", "is_speaking": True})
+    message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval,
+                          "wav_name": "microphone", "is_speaking": True})
     voices.put(message)
     while True:
-
         data = stream.read(CHUNK)
-        message = data  
-        
+        message = data
         voices.put(message)
-
         await asyncio.sleep(0.005)
 
-async def record_from_scp(chunk_begin,chunk_size):
-    import wave
+async def record_from_scp(chunk_begin, chunk_size):
     global voices
     is_finished = False
     if args.audio_in.endswith(".scp"):
@@ -114,91 +115,98 @@
         wavs = f_scp.readlines()
     else:
         wavs = [args.audio_in]
-    if chunk_size>0:
-        wavs=wavs[chunk_begin:chunk_begin+chunk_size]
+    if chunk_size > 0:
+        wavs = wavs[chunk_begin:chunk_begin + chunk_size]
     for wav in wavs:
         wav_splits = wav.strip().split()
+ 
         wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
         wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
-        
-        # bytes_f = open(wav_path, "rb")
-        # bytes_data = bytes_f.read()
-        with wave.open(wav_path, "rb") as wav_file:
-            params = wav_file.getparams()
-            # header_length = wav_file.getheaders()[0][1]
-            # wav_file.setpos(header_length)
-            frames = wav_file.readframes(wav_file.getnframes())
+        if not wav_path.strip():
+            continue
+        if wav_path.endswith(".pcm"):
+            with open(wav_path, "rb") as f:
+                audio_bytes = f.read()
+        elif wav_path.endswith(".wav"):
+            import wave
+            with wave.open(wav_path, "rb") as wav_file:
+                params = wav_file.getparams()
+                frames = wav_file.readframes(wav_file.getnframes())
+                audio_bytes = bytes(frames)
+        else:
+            raise NotImplementedError(
+                f'Unsupported audio type: {wav_path}')
 
-        audio_bytes = bytes(frames)
         # stride = int(args.chunk_size/1000*16000*2)
-        stride = int(60*args.chunk_size[1]/args.chunk_interval/1000*16000*2)
-        chunk_num = (len(audio_bytes)-1)//stride + 1
+        stride = int(60 * args.chunk_size[1] / args.chunk_interval / 1000 * 16000 * 2)
+        chunk_num = (len(audio_bytes) - 1) // stride + 1
         # print(stride)
-        
+
         # send first time
-        message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval, "wav_name": wav_name,"is_speaking": True})
-        voices.put(message)
+        message = json.dumps({"mode": args.mode, "chunk_size": args.chunk_size, "chunk_interval": args.chunk_interval,
+                              "wav_name": wav_name, "is_speaking": True})
+        await websocket.send(message)
         is_speaking = True
         for i in range(chunk_num):
 
-            beg = i*stride
-            data = audio_bytes[beg:beg+stride]
-            message = data  
-            voices.put(message)
-            if i == chunk_num-1:
+            beg = i * stride
+            data = audio_bytes[beg:beg + stride]
+            message = data
+            await websocket.send(message)
+            if i == chunk_num - 1:
                 is_speaking = False
                 message = json.dumps({"is_speaking": is_speaking})
-                voices.put(message)
-            # print("data_chunk: ", len(data_chunk))
-            # print(voices.qsize())
-            sleep_duration = 0.001 if args.send_without_sleep else 60*args.chunk_size[1]/args.chunk_interval/1000
+                await websocket.send(message)
+ 
+            sleep_duration = 0.001 if args.send_without_sleep else 60 * args.chunk_size[1] / args.chunk_interval / 1000
             await asyncio.sleep(sleep_duration)
+    # once all audio has been sent, wait for the queue to drain, then close the websocket
+    while not voices.empty():
+        await asyncio.sleep(1)
+    await asyncio.sleep(3)
+    # the offline model needs to wait until the final result has been received
+    if args.mode == "offline":
+        global offline_msg_done
+        while not offline_msg_done:
+            await asyncio.sleep(1)
+
+    await websocket.close()
 
-
-async def ws_send():
-    global voices
-    global websocket
-    print("started to sending data!")
-    while True:
-        while not voices.empty():
-            data = voices.get()
-            voices.task_done()
-            try:
-                await websocket.send(data)
-            except Exception as e:
-                print('Exception occurred:', e)
-                traceback.print_exc()
-                exit(0)
-            await asyncio.sleep(0.005)
-        await asyncio.sleep(0.005)
-
-
-
+
+
 async def message(id):
-    global websocket
+    global websocket,voices,offline_msg_done
     text_print = ""
     text_print_2pass_online = ""
     text_print_2pass_offline = ""
-    while True:
-        try:
+    try:
+        while True:
             meg = await websocket.recv()
             meg = json.loads(meg)
             wav_name = meg.get("wav_name", "demo")
-            # print(wav_name)
             text = meg["text"]
             if ibest_writer is not None:
                 ibest_writer["text"][wav_name] = text
-            
+
             if meg["mode"] == "online":
                 text_print += "{}".format(text)
                 text_print = text_print[-args.words_max_print:]
                 os.system('clear')
-                print("\rpid"+str(id)+": "+text_print)
-            elif meg["mode"] == "online":
+                print("\rpid" + str(id) + ": " + text_print)
+            elif meg["mode"] == "offline":
                 text_print += "{}".format(text)
                 text_print = text_print[-args.words_max_print:]
                 os.system('clear')
-                print("\rpid"+str(id)+": "+text_print)
+                print("\rpid" + str(id) + ": " + text_print)
+                offline_msg_done=True
             else:
                 if meg["mode"] == "2pass-online":
                     text_print_2pass_online += "{}".format(text)
@@ -211,10 +219,12 @@
                 os.system('clear')
                 print("\rpid" + str(id) + ": " + text_print)
 
-        except Exception as e:
+    except Exception as e:
             print("Exception:", e)
-            traceback.print_exc()
-            exit(0)
+            #traceback.print_exc()
+            #await websocket.close()
+ 
+
 
 async def print_message():
     global websocket
@@ -225,72 +235,87 @@
             print(meg)
         except Exception as e:
             print("Exception:", e)
-            traceback.print_exc()
+            #traceback.print_exc()
             exit(0)
 
-async def ws_client(id,chunk_begin,chunk_size):
-    global websocket
-    if  args.ssl==1:
-       ssl_context = ssl.SSLContext()
-       ssl_context.check_hostname = False
-       ssl_context.verify_mode = ssl.CERT_NONE
-       uri = "wss://{}:{}".format(args.host, args.port)
+async def ws_client(id, chunk_begin, chunk_size):
+  if args.audio_in is None:
+      chunk_begin = 0
+      chunk_size = 1
+  global websocket, voices, offline_msg_done
+
+  for i in range(chunk_begin, chunk_begin + chunk_size):
+    offline_msg_done = False
+    voices = Queue()
+    if args.ssl == 1:
+        ssl_context = ssl.SSLContext()
+        ssl_context.check_hostname = False
+        ssl_context.verify_mode = ssl.CERT_NONE
+        uri = "wss://{}:{}".format(args.host, args.port)
     else:
-       uri = "ws://{}:{}".format(args.host, args.port)
-       ssl_context=None
-    print("connect to",uri)
-    async for websocket in websockets.connect(uri, subprotocols=["binary"], ping_interval=None,ssl=ssl_context):
+        uri = "ws://{}:{}".format(args.host, args.port)
+        ssl_context = None
+    print("connect to", uri)
+    async with websockets.connect(uri, subprotocols=["binary"], ping_interval=None, ssl=ssl_context) as websocket:
         if args.audio_in is not None:
-            task = asyncio.create_task(record_from_scp(chunk_begin,chunk_size))
+            task = asyncio.create_task(record_from_scp(i, 1))
         else:
             task = asyncio.create_task(record_microphone())
-        task2 = asyncio.create_task(ws_send())
-        task3 = asyncio.create_task(message(id))
-        await asyncio.gather(task, task2, task3)
+        task3 = asyncio.create_task(message(str(id) + "_" + str(i)))  # process id + file id
+        await asyncio.gather(task, task3)
+  exit(0)
+    
 
-def one_thread(id,chunk_begin,chunk_size):
-   asyncio.get_event_loop().run_until_complete(ws_client(id,chunk_begin,chunk_size))
-   asyncio.get_event_loop().run_forever()
-
+def one_thread(id, chunk_begin, chunk_size):
+    asyncio.get_event_loop().run_until_complete(ws_client(id, chunk_begin, chunk_size))
+    asyncio.get_event_loop().run_forever()
 
 if __name__ == '__main__':
-   # for microphone 
-   if  args.audio_in is  None:
-     p = Process(target=one_thread,args=(0, 0, 0))
-     p.start()
-     p.join()
-     print('end')
-   else:
-     # calculate the number of wavs for each preocess
-     if args.audio_in.endswith(".scp"):
-         f_scp = open(args.audio_in)
-         wavs = f_scp.readlines()
-     else:
-         wavs = [args.audio_in]
-     total_len=len(wavs)
-     if total_len>=args.test_thread_num:
-          chunk_size=int((total_len)/args.test_thread_num)
-          remain_wavs=total_len-chunk_size*args.test_thread_num
-     else:
-          chunk_size=1
-          remain_wavs=0
+    # for microphone
+    if args.audio_in is None:
+        p = Process(target=one_thread, args=(0, 0, 0))
+        p.start()
+        p.join()
+        print('end')
+    else:
+        # calculate the number of wavs for each process
+        if args.audio_in.endswith(".scp"):
+            f_scp = open(args.audio_in)
+            wavs = f_scp.readlines()
+        else:
+            wavs = [args.audio_in]
+        for wav in wavs:
+            wav_splits = wav.strip().split()
+            wav_name = wav_splits[0] if len(wav_splits) > 1 else "demo"
+            wav_path = wav_splits[1] if len(wav_splits) > 1 else wav_splits[0]
+            audio_type = os.path.splitext(wav_path)[-1].lower()
+            if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
+                raise NotImplementedError(
+                    f'Unsupported audio type: {audio_type}')
 
-     process_list = []
-     chunk_begin=0
-     for i in range(args.test_thread_num):
-         now_chunk_size= chunk_size
-         if remain_wavs>0:
-             now_chunk_size=chunk_size+1
-             remain_wavs=remain_wavs-1
-         # process i handle wavs at chunk_begin and size of now_chunk_size
-         p = Process(target=one_thread,args=(i,chunk_begin,now_chunk_size))
-         chunk_begin=chunk_begin+now_chunk_size
-         p.start()
-         process_list.append(p)
+        total_len = len(wavs)
+        if total_len >= args.test_thread_num:
+            chunk_size = int(total_len / args.test_thread_num)
+            remain_wavs = total_len - chunk_size * args.test_thread_num
+        else:
+            chunk_size = 1
+            remain_wavs = 0
 
-     for i in process_list:
-         p.join()
+        process_list = []
+        chunk_begin = 0
+        for i in range(args.test_thread_num):
+            now_chunk_size = chunk_size
+            if remain_wavs > 0:
+                now_chunk_size = chunk_size + 1
+                remain_wavs = remain_wavs - 1
+            # process i handles wavs starting at chunk_begin, taking now_chunk_size of them
+            p = Process(target=one_thread, args=(i, chunk_begin, now_chunk_size))
+            chunk_begin = chunk_begin + now_chunk_size
+            p.start()
+            process_list.append(p)
 
-     print('end')
+        for i in process_list:
+            p.join()
 
-
+        print('end')
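
The partitioning loop above hands each of test_thread_num processes a contiguous
slice of the wav list, with the remainder spread one file at a time over the
first processes. A standalone sketch of the same split:

    def split_wavs(total_len, num_workers):
        # Sketch of the partitioning above: contiguous (begin, size) chunks.
        if total_len >= num_workers:
            base, remain = total_len // num_workers, total_len % num_workers
        else:
            base, remain = 1, 0
        chunks, begin = [], 0
        for _ in range(num_workers):
            size = base + (1 if remain > 0 else 0)
            remain = max(remain - 1, 0)
            chunks.append((begin, size))
            begin += size
        return chunks

    # e.g. split_wavs(10, 3) -> [(0, 4), (4, 3), (7, 3)]
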
diff --git a/funasr/runtime/python/websocket/wss_srv_asr.py b/funasr/runtime/python/websocket/wss_srv_asr.py
index 71c97e6..09f2305 100644
--- a/funasr/runtime/python/websocket/wss_srv_asr.py
+++ b/funasr/runtime/python/websocket/wss_srv_asr.py
@@ -35,8 +35,6 @@
     task=Tasks.voice_activity_detection,
     model=args.vad_model,
     model_revision=None,
-    output_dir=None,
-    batch_size=1,
     mode='online',
     ngpu=args.ngpu,
     ncpu=args.ncpu,
@@ -58,15 +56,36 @@
     model=args.asr_model_online,
     ngpu=args.ngpu,
     ncpu=args.ncpu,
-    model_revision='v1.0.4')
+    model_revision='v1.0.4',
+    update_model='v1.0.4',
+    mode='paraformer_streaming')
 
-print("model loaded")
+print("model loaded! only support one client at the same time now!!!!")
 
+async def ws_reset(websocket):
+    print("ws reset now, total num is ",len(websocket_users))
+    websocket.param_dict_asr_online = {"cache": dict()}
+    websocket.param_dict_vad = {'in_cache': dict(), "is_final": True}
+    websocket.param_dict_asr_online["is_final"]=True
+    # audio_in=b''.join(np.zeros(int(16000),dtype=np.int16))
+    # inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
+    # inference_pipeline_asr_online(audio_in=audio_in, param_dict=websocket.param_dict_asr_online)
+    await websocket.close()
+
+
+async def clear_websocket():
+    for websocket in websocket_users:
+        await ws_reset(websocket)
+    websocket_users.clear()
+
+
 async def ws_serve(websocket, path):
     frames = []
     frames_asr = []
     frames_asr_online = []
     global websocket_users
+    await clear_websocket()
     websocket_users.add(websocket)
     websocket.param_dict_asr = {}
     websocket.param_dict_asr_online = {"cache": dict()}
@@ -74,7 +93,7 @@
     websocket.param_dict_punc = {'cache': list()}
     websocket.vad_pre_idx = 0
     speech_start = False
-    speech_end_i = False
+    speech_end_i = -1
     websocket.wav_name = "microphone"
     websocket.mode = "2pass"
     print("new user connected", flush=True)
@@ -103,7 +122,7 @@
         
                     # asr online
                     frames_asr_online.append(message)
-                    websocket.param_dict_asr_online["is_final"] = speech_end_i
+                    websocket.param_dict_asr_online["is_final"] = speech_end_i != -1
                     if len(frames_asr_online) % websocket.chunk_interval == 0 or websocket.param_dict_asr_online["is_final"]:
                         if websocket.mode == "2pass" or websocket.mode == "online":
                             audio_in = b"".join(frames_asr_online)
@@ -113,14 +132,14 @@
                         frames_asr.append(message)
                     # vad online
                     speech_start_i, speech_end_i = await async_vad(websocket, message)
-                    if speech_start_i:
+                    if speech_start_i != -1:
                         speech_start = True
                         beg_bias = (websocket.vad_pre_idx-speech_start_i)//duration_ms
                         frames_pre = frames[-beg_bias:]
                         frames_asr = []
                         frames_asr.extend(frames_pre)
                 # asr punc offline
-                if speech_end_i or not websocket.is_speaking:
+                if speech_end_i != -1 or not websocket.is_speaking:
                     # print("vad end point")
                     if websocket.mode == "2pass" or websocket.mode == "offline":
                         audio_in = b"".join(frames_asr)
@@ -138,7 +157,8 @@
 
      
     except websockets.ConnectionClosed:
-        print("ConnectionClosed...", websocket_users)
+        print("ConnectionClosed...", websocket_users,flush=True)
+        await ws_reset(websocket)
         websocket_users.remove(websocket)
     except websockets.InvalidState:
         print("InvalidState...")
@@ -150,15 +170,15 @@
 
     segments_result = inference_pipeline_vad(audio_in=audio_in, param_dict=websocket.param_dict_vad)
 
-    speech_start = False
-    speech_end = False
+    speech_start = -1
+    speech_end = -1
     
     if len(segments_result) == 0 or len(segments_result["text"]) > 1:
         return speech_start, speech_end
     if segments_result["text"][0][0] != -1:
         speech_start = segments_result["text"][0][0]
     if segments_result["text"][0][1] != -1:
-        speech_end = True
+        speech_end = segments_result["text"][0][1]
     return speech_start, speech_end
 
 
@@ -207,4 +227,4 @@
 else:
     start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
 asyncio.get_event_loop().run_until_complete(start_server)
-asyncio.get_event_loop().run_forever()
\ No newline at end of file
+asyncio.get_event_loop().run_forever()
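
This file's hunks also change the VAD return convention from booleans to
millisecond offsets, with -1 as the "no boundary in this chunk" sentinel, so the
caller can rewind into its frame buffer to recover audio from before the detected
onset. A sketch of the consumer side under that convention:

    def on_vad_result(start_ms, end_ms, vad_pre_idx, duration_ms, frames):
        # Sketch only: mirrors the buffer-rewind logic in ws_serve above.
        frames_asr = []
        if start_ms != -1:  # onset seen: recover frames back to the onset
            beg_bias = (vad_pre_idx - start_ms) // duration_ms
            frames_asr = frames[-beg_bias:]
        utterance_complete = end_ms != -1
        return frames_asr, utterance_complete
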
diff --git a/funasr/runtime/websocket/CMakeLists.txt b/funasr/runtime/websocket/CMakeLists.txt
index e89537b..c1715d8 100644
--- a/funasr/runtime/websocket/CMakeLists.txt
+++ b/funasr/runtime/websocket/CMakeLists.txt
@@ -2,14 +2,14 @@
 
 project(FunASRWebscoket) 
 
+set(CMAKE_CXX_STANDARD 14 CACHE STRING "The C++ version to be used.")
+set(CMAKE_POSITION_INDEPENDENT_CODE ON)
 set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}/bin)
-
 
 option(ENABLE_WEBSOCKET "Whether to build websocket server" ON)
  
 if(ENABLE_WEBSOCKET)
   # cmake_policy(SET CMP0135 NEW)
-
   include(FetchContent)
   FetchContent_Declare(websocketpp
   GIT_REPOSITORY https://github.com/zaphoyd/websocketpp.git
@@ -20,7 +20,6 @@
   FetchContent_MakeAvailable(websocketpp)
   include_directories(${PROJECT_SOURCE_DIR}/third_party/websocket)
    
-
   FetchContent_Declare(asio
      URL   https://github.com/chriskohlhoff/asio/archive/refs/tags/asio-1-24-0.tar.gz
    SOURCE_DIR ${PROJECT_SOURCE_DIR}/third_party/asio
@@ -36,8 +35,6 @@
   
   FetchContent_MakeAvailable(json)
   include_directories(${PROJECT_SOURCE_DIR}/third_party/json/include)
- 
- 
 
 endif()
 
@@ -55,10 +52,12 @@
 include_directories(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog)
 set(BUILD_TESTING OFF)
 add_subdirectory(${PROJECT_SOURCE_DIR}/../onnxruntime/third_party/glog glog)
- 
 
-add_executable(websocketmain "websocketmain.cpp" "websocketsrv.cpp")
-add_executable(websocketclient "websocketclient.cpp")
+# OpenSSL is required: install it first, e.g. apt-get install libssl-dev
+find_package(OpenSSL REQUIRED)
 
-target_link_libraries(websocketclient PUBLIC funasr)
-target_link_libraries(websocketmain PUBLIC funasr)
+add_executable(funasr-ws-server "funasr-ws-server.cpp" "websocket-server.cpp")
+add_executable(funasr-ws-client "funasr-ws-client.cpp")
+
+target_link_libraries(funasr-ws-client PUBLIC funasr ssl crypto)
+target_link_libraries(funasr-ws-server PUBLIC funasr ssl crypto)
diff --git a/funasr/runtime/websocket/funasr-ws-client.cpp b/funasr/runtime/websocket/funasr-ws-client.cpp
new file mode 100644
index 0000000..4a3c751
--- /dev/null
+++ b/funasr/runtime/websocket/funasr-ws-client.cpp
@@ -0,0 +1,366 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// client for websocket, supports multiple threads
+// ./funasr-ws-client  --server-ip <string>
+//                     --port <string>
+//                     --wav-path <string>
+//                     [--thread-num <int>] 
+//                     [--is-ssl <int>]  [--]
+//                     [--version] [-h]
+// example:
+// ./funasr-ws-client --server-ip 127.0.0.1 --port 8889 --wav-path test.wav --thread-num 1 --is-ssl 0
+
+#define ASIO_STANDALONE 1
+#include <websocketpp/client.hpp>
+#include <websocketpp/common/thread.hpp>
+#include <websocketpp/config/asio_client.hpp>
+#include <fstream>
+#include <atomic>
+#include <vector>
+#include <glog/logging.h>
+
+#include "audio.h"
+#include "nlohmann/json.hpp"
+#include "tclap/CmdLine.h"
+
+/**
+ * Define a semi-cross platform helper method that waits/sleeps for a bit.
+ */
+void WaitABit() {
+    #ifdef WIN32
+        Sleep(1000);
+    #else
+        sleep(1);
+    #endif
+}
+std::atomic<int> wav_index(0);
+
+bool IsTargetFile(const std::string& filename, const std::string target) {
+    std::size_t pos = filename.find_last_of(".");
+    if (pos == std::string::npos) {
+        return false;
+    }
+    std::string extension = filename.substr(pos + 1);
+    return (extension == target);
+}
+
+typedef websocketpp::config::asio_client::message_type::ptr message_ptr;
+typedef websocketpp::lib::shared_ptr<websocketpp::lib::asio::ssl::context> context_ptr;
+using websocketpp::lib::bind;
+using websocketpp::lib::placeholders::_1;
+using websocketpp::lib::placeholders::_2;
+context_ptr OnTlsInit(websocketpp::connection_hdl) {
+    context_ptr ctx = websocketpp::lib::make_shared<asio::ssl::context>(
+        asio::ssl::context::sslv23);
+
+    try {
+        ctx->set_options(
+            asio::ssl::context::default_workarounds | asio::ssl::context::no_sslv2 |
+            asio::ssl::context::no_sslv3 | asio::ssl::context::single_dh_use);
+
+    } catch (std::exception& e) {
+        LOG(ERROR) << e.what();
+    }
+    return ctx;
+}
+
+// template for tls or not config
+template <typename T>
+class WebsocketClient {
+  public:
+    // typedef websocketpp::client<T> client;
+    // typedef websocketpp::client<websocketpp::config::asio_tls_client>
+    // wss_client;
+    typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
+
+    WebsocketClient(int is_ssl) : m_open(false), m_done(false) {
+        // set up access channels to only log interesting things
+        m_client.clear_access_channels(websocketpp::log::alevel::all);
+        m_client.set_access_channels(websocketpp::log::alevel::connect);
+        m_client.set_access_channels(websocketpp::log::alevel::disconnect);
+        m_client.set_access_channels(websocketpp::log::alevel::app);
+
+        // Initialize the Asio transport policy
+        m_client.init_asio();
+
+        // Bind the handlers we are using
+        using websocketpp::lib::bind;
+        using websocketpp::lib::placeholders::_1;
+        m_client.set_open_handler(bind(&WebsocketClient::on_open, this, _1));
+        m_client.set_close_handler(bind(&WebsocketClient::on_close, this, _1));
+        // m_client.set_close_handler(bind(&WebsocketClient::on_close, this, _1));
+
+        m_client.set_message_handler(
+            [this](websocketpp::connection_hdl hdl, message_ptr msg) {
+              on_message(hdl, msg);
+            });
+
+        m_client.set_fail_handler(bind(&WebsocketClient::on_fail, this, _1));
+        m_client.clear_access_channels(websocketpp::log::alevel::all);
+    }
+
+    void on_message(websocketpp::connection_hdl hdl, message_ptr msg) {
+        const std::string& payload = msg->get_payload();
+        switch (msg->get_opcode()) {
+            case websocketpp::frame::opcode::text:
+                total_num = total_num + 1;
+                LOG(INFO) << total_num << ", on_message = " << payload;
+                // close once replies for all dispatched wavs appear to have arrived
+                if ((total_num + 1) == wav_index) {
+                    websocketpp::lib::error_code ec;
+                    m_client.close(m_hdl, websocketpp::close::status::going_away, "", ec);
+                    if (ec) {
+                        LOG(ERROR) << "Error closing connection " << ec.message();
+                    }
+                }
+        }
+    }
+
+    // This method will block until the connection is complete  
+    void run(const std::string& uri, const std::vector<string>& wav_list, const std::vector<string>& wav_ids) {
+        // Create a new connection to the given URI
+        websocketpp::lib::error_code ec;
+        typename websocketpp::client<T>::connection_ptr con =
+            m_client.get_connection(uri, ec);
+        if (ec) {
+            m_client.get_alog().write(websocketpp::log::alevel::app,
+                                    "Get Connection Error: " + ec.message());
+            return;
+        }
+        // Grab a handle for this connection so we can talk to it in a thread
+        // safe manner after the event loop starts.
+        m_hdl = con->get_handle();
+
+        // Queue the connection. No DNS queries or network connections will be
+        // made until the io_service event loop is run.
+        m_client.connect(con);
+
+        // Create a thread to run the ASIO io_service event loop
+        websocketpp::lib::thread asio_thread(&websocketpp::client<T>::run,
+                                            &m_client);
+        while(true){
+            int i = wav_index.fetch_add(1);
+            if (i >= wav_list.size()) {
+                break;
+            }
+            send_wav_data(wav_list[i], wav_ids[i]);
+        }
+        WaitABit(); 
+
+        asio_thread.join();
+
+    }
+
+    // The open handler will signal that we are ready to start sending data
+    void on_open(websocketpp::connection_hdl) {
+        m_client.get_alog().write(websocketpp::log::alevel::app,
+                                "Connection opened, starting data!");
+
+        scoped_lock guard(m_lock);
+        m_open = true;
+    }
+
+    // The close handler will signal that we should stop sending data
+    void on_close(websocketpp::connection_hdl) {
+        m_client.get_alog().write(websocketpp::log::alevel::app,
+                                  "Connection closed, stopping data!");
+
+        scoped_lock guard(m_lock);
+        m_done = true;
+    }
+
+    // The fail handler will signal that we should stop sending data
+    void on_fail(websocketpp::connection_hdl) {
+        m_client.get_alog().write(websocketpp::log::alevel::app,
+                                  "Connection failed, stopping data!");
+
+        scoped_lock guard(m_lock);
+        m_done = true;
+    }
+    // send wav to server
+    void send_wav_data(string wav_path, string wav_id) {
+        uint64_t count = 0;
+        std::stringstream val;
+
+        funasr::Audio audio(1);
+        int32_t sampling_rate = 16000;
+        if (IsTargetFile(wav_path.c_str(), "wav")) {
+            // reset so LoadWav can fill in the rate from the wav header
+            sampling_rate = -1;
+            if (!audio.LoadWav(wav_path.c_str(), &sampling_rate))
+                return;
+        } else if (IsTargetFile(wav_path.c_str(), "pcm")) {
+            if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate))
+                return;
+        } else {
+            printf("Unsupported audio extension\n");
+            exit(-1);
+        }
+
+        float* buff;
+        int len;
+        int flag = 0;
+        bool wait = false;
+        while (1) {
+            {
+                scoped_lock guard(m_lock);
+                // If the connection has been closed, stop generating data
+                if (m_done) {
+                  break;
+                }
+                // If the connection hasn't been opened yet wait a bit and retry
+                if (!m_open) {
+                  wait = true;
+                } else {
+                  break;
+                }
+            }
+            if (wait) {
+                LOG(INFO) << "wait.." << m_open;
+                WaitABit();
+                continue;
+            }
+        }
+        websocketpp::lib::error_code ec;
+
+        nlohmann::json jsonbegin;
+        nlohmann::json chunk_size = nlohmann::json::array();
+        chunk_size.push_back(5);
+        chunk_size.push_back(0);
+        chunk_size.push_back(5);
+        jsonbegin["chunk_size"] = chunk_size;
+        jsonbegin["chunk_interval"] = 10;
+        jsonbegin["wav_name"] = wav_id;
+        jsonbegin["is_speaking"] = true;
+        m_client.send(m_hdl, jsonbegin.dump(), websocketpp::frame::opcode::text,
+                      ec);
+
+        // fetch wav data via the asr engine api
+        while (audio.Fetch(buff, len, flag) > 0) {
+            // convert float samples in [-1, 1] to 16-bit PCM
+            std::vector<short> samples(len);
+            for (int i = 0; i < len; ++i) {
+                samples[i] = (short)(buff[i] * 32767);
+            }
+            // send data to server
+            m_client.send(m_hdl, samples.data(), len * sizeof(short),
+                          websocketpp::frame::opcode::binary, ec);
+            LOG(INFO) << "sent data len=" << len * sizeof(short);
+            // The most likely error that we will get is that the connection is
+            // not in the right state. Usually this means we tried to send a
+            // message to a connection that was closed or in the process of
+            // closing. While many errors here can be easily recovered from,
+            // in this simple example, we'll stop the data loop.
+            if (ec) {
+              m_client.get_alog().write(websocketpp::log::alevel::app,
+                                        "Send Error: " + ec.message());
+              break;
+            }
+            // WaitABit();
+        }
+        nlohmann::json jsonresult;
+        jsonresult["is_speaking"] = false;
+        m_client.send(m_hdl, jsonresult.dump(), websocketpp::frame::opcode::text,
+                      ec);
+        // WaitABit();
+    }
+    websocketpp::client<T> m_client;
+
+  private:
+    websocketpp::connection_hdl m_hdl;
+    websocketpp::lib::mutex m_lock;
+    bool m_open;
+    bool m_done;
+    int total_num = 0;
+};
+
+int main(int argc, char* argv[]) {
+    google::InitGoogleLogging(argv[0]);
+    FLAGS_logtostderr = true;
+
+    TCLAP::CmdLine cmd("funasr-ws-client", ' ', "1.0");
+    TCLAP::ValueArg<std::string> server_ip_("", "server-ip", "server-ip", true,
+                                           "127.0.0.1", "string");
+    TCLAP::ValueArg<std::string> port_("", "port", "port", true, "8889", "string");
+    TCLAP::ValueArg<std::string> wav_path_("", "wav-path", 
+        "the input could be: wav_path, e.g.: asr_example.wav; pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)", 
+        true, "", "string");
+    TCLAP::ValueArg<int> thread_num_("", "thread-num",
+                                     "number of client threads", false, 1, "int");
+    TCLAP::ValueArg<int> is_ssl_(
+        "", "is-ssl", "set is-ssl to 1 to use a wss connection; otherwise a ws connection is used",
+        false, 0, "int");
+
+    cmd.add(server_ip_);
+    cmd.add(port_);
+    cmd.add(wav_path_);
+    cmd.add(thread_num_);
+    cmd.add(is_ssl_);
+    cmd.parse(argc, argv);
+
+    std::string server_ip = server_ip_.getValue();
+    std::string port = port_.getValue();
+    std::string wav_path = wav_path_.getValue();
+    int threads_num = thread_num_.getValue();
+    int is_ssl = is_ssl_.getValue();
+
+    std::vector<websocketpp::lib::thread> client_threads;
+    std::string uri = "";
+    if (is_ssl == 1) {
+        uri = "wss://" + server_ip + ":" + port;
+    } else {
+        uri = "ws://" + server_ip + ":" + port;
+    }
+
+    // read wav_path
+    std::vector<string> wav_list;
+    std::vector<string> wav_ids;
+    string default_id = "wav_default_id";
+    if(IsTargetFile(wav_path, "wav") || IsTargetFile(wav_path, "pcm")){
+        wav_list.emplace_back(wav_path);
+        wav_ids.emplace_back(default_id);
+    }
+    else if(IsTargetFile(wav_path, "scp")){
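+        // kaldi style wav.scp: one "<wav_id> <wav_path>" pair per line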
+        ifstream in(wav_path);
+        if (!in.is_open()) {
+            printf("Failed to open scp file\n");
+            return -1;
+        }
+        string line;
+        while(getline(in, line))
+        {
+            istringstream iss(line);
+            string column1, column2;
+            iss >> column1 >> column2;
+            wav_list.emplace_back(column2);
+            wav_ids.emplace_back(column1);
+        }
+        in.close();
+    }else{
+        printf("Please check the wav extension!\n");
+        exit(-1);
+    }
+    
+    for (int i = 0; i < threads_num; i++) {
+        client_threads.emplace_back([uri, wav_list, wav_ids, is_ssl]() {
+          if (is_ssl == 1) {
+            WebsocketClient<websocketpp::config::asio_tls_client> c(is_ssl);
+
+            c.m_client.set_tls_init_handler(bind(&OnTlsInit, ::_1));
+
+            c.run(uri, wav_list, wav_ids);
+          } else {
+            WebsocketClient<websocketpp::config::asio_client> c(is_ssl);
+
+            c.run(uri, wav_list, wav_ids);
+          }
+        });
+    }
+
+    for (auto& t : client_threads) {
+        t.join();
+    }
+}
\ No newline at end of file
diff --git a/funasr/runtime/websocket/websocketmain.cpp b/funasr/runtime/websocket/funasr-ws-server.cpp
similarity index 66%
rename from funasr/runtime/websocket/websocketmain.cpp
rename to funasr/runtime/websocket/funasr-ws-server.cpp
index 4614b51..872f6a1 100644
--- a/funasr/runtime/websocket/websocketmain.cpp
+++ b/funasr/runtime/websocket/funasr-ws-server.cpp
@@ -5,12 +5,12 @@
 /* 2022-2023 by zhaomingwork */
 
 // io server
-// Usage:websocketmain  [--model_thread_num <int>] [--decoder_thread_num <int>]
+// Usage:funasr-ws-server  [--model_thread_num <int>] [--decoder_thread_num <int>]
 //                    [--io_thread_num <int>] [--port <int>] [--listen_ip
 //                    <string>] [--punc-quant <string>] [--punc-dir <string>]
 //                    [--vad-quant <string>] [--vad-dir <string>] [--quantize
 //                    <string>] --model-dir <string> [--] [--version] [-h]
-#include "websocketsrv.h"
+#include "websocket-server.h"
 
 using namespace std;
 void GetValue(TCLAP::ValueArg<std::string>& value_arg, string key,
@@ -25,7 +25,7 @@
     google::InitGoogleLogging(argv[0]);
     FLAGS_logtostderr = true;
 
-    TCLAP::CmdLine cmd("websocketmain", ' ', "1.0");
+    TCLAP::CmdLine cmd("funasr-ws-server", ' ', "1.0");
     TCLAP::ValueArg<std::string> model_dir(
         "", MODEL_DIR,
         "the asr model path, which contains model.onnx, config.yaml, am.mvn",
@@ -64,6 +64,14 @@
     TCLAP::ValueArg<int> model_thread_num("", "model_thread_num",
                                           "model_thread_num", false, 1, "int");
 
+    TCLAP::ValueArg<std::string> certfile("", "certfile", "certfile", false, "",
+                                          "string");
+    TCLAP::ValueArg<std::string> keyfile("", "keyfile", "keyfile", false, "",
+                                         "string");
+
+    cmd.add(certfile);
+    cmd.add(keyfile);
+
     cmd.add(model_dir);
     cmd.add(quantize);
     cmd.add(vad_dir);
@@ -94,46 +102,66 @@
     int s_model_thread_num = model_thread_num.getValue();
 
     asio::io_context io_decoder;  // context for decoding
+    asio::io_context io_server;   // context for server
 
     std::vector<std::thread> decoder_threads;
 
+    std::string s_certfile = certfile.getValue();
+    std::string s_keyfile = keyfile.getValue();
+
+    bool is_ssl = false;
+    if (!s_certfile.empty()) {
+      is_ssl = true;
+    }
+
     auto conn_guard = asio::make_work_guard(
         io_decoder);  // make sure threads can wait in the queue
-
+    auto server_guard = asio::make_work_guard(
+        io_server);  // make sure threads can wait in the queue
     // create threads pool
     for (int32_t i = 0; i < s_decoder_thread_num; ++i) {
       decoder_threads.emplace_back([&io_decoder]() { io_decoder.run(); });
     }
 
-    server server_;       // server for websocket
-    server_.init_asio();  // init asio
-    server_.set_reuse_addr(
-        true);  // reuse address as we create multiple threads
+    server server_;  // server for websocket
+    wss_server wss_server_;
+    // keep the websocket server alive for the whole io loop; constructing it
+    // inside the if/else branch would destroy it before io_server.run()
+    std::unique_ptr<WebSocketServer> websocket_srv;
+    if (is_ssl) {
+      wss_server_.init_asio(&io_server);  // init asio
+      wss_server_.set_reuse_addr(
+          true);  // reuse address as we create multiple threads
 
-    // list on port for accept
-    server_.listen(asio::ip::address::from_string(s_listen_ip), s_port);
+      // listen on port for accept
+      wss_server_.listen(asio::ip::address::from_string(s_listen_ip), s_port);
+      websocket_srv = std::make_unique<WebSocketServer>(
+          io_decoder, is_ssl, nullptr, &wss_server_, s_certfile,
+          s_keyfile);  // websocket server for asr engine
+      websocket_srv->initAsr(model_path, s_model_thread_num);  // init asr model
 
-    WebSocketServer websocket_srv(io_decoder,
-                                  &server_);  // websocket server for asr engine
-    websocket_srv.initAsr(model_path, s_model_thread_num);  // init asr model
+    } else {
+      server_.init_asio(&io_server);  // init asio
+      server_.set_reuse_addr(
+          true);  // reuse address as we create multiple threads
+
+      // listen on port for accept
+      server_.listen(asio::ip::address::from_string(s_listen_ip), s_port);
+      websocket_srv = std::make_unique<WebSocketServer>(
+          io_decoder, is_ssl, &server_, nullptr, s_certfile,
+          s_keyfile);  // websocket server for asr engine
+      websocket_srv->initAsr(model_path, s_model_thread_num);  // init asr model
+    }
+
     std::cout << "asr model init finished. listen on port:" << s_port
               << std::endl;
 
     // Start the ASIO network io_service run loop
-    if (s_io_thread_num == 1) {
-      server_.run();
-    } else {
-      typedef websocketpp::lib::shared_ptr<websocketpp::lib::thread> thread_ptr;
-      std::vector<thread_ptr> ts;
-      // create threads for io network
-      for (size_t i = 0; i < s_io_thread_num; i++) {
-        ts.push_back(websocketpp::lib::make_shared<websocketpp::lib::thread>(
-            &server::run, &server_));
-      }
-      // wait for theads
-      for (size_t i = 0; i < s_io_thread_num; i++) {
-        ts[i]->join();
-      }
+    std::vector<std::thread> ts;
+    // create threads for io network
+    for (size_t i = 0; i < s_io_thread_num; i++) {
+      ts.emplace_back([&io_server]() { io_server.run(); });
+    }
+    // wait for threads
+    for (size_t i = 0; i < s_io_thread_num; i++) {
+      ts[i].join();
     }
 
     // wait for theads
diff --git a/funasr/runtime/websocket/readme.md b/funasr/runtime/websocket/readme.md
index 078184e..4a1a9d4 100644
--- a/funasr/runtime/websocket/readme.md
+++ b/funasr/runtime/websocket/readme.md
@@ -33,7 +33,15 @@
 ```
 
 ### Build runtime
+Requires the OpenSSL development library.
+
 ```shell
+# install OpenSSL dev package on Ubuntu
+apt-get install libssl-dev
+# install OpenSSL dev package on CentOS
+yum install openssl-devel
+
 git clone https://github.com/alibaba-damo-academy/FunASR.git && cd funasr/runtime/websocket
 mkdir build && cd build
 cmake  -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=/path/to/onnxruntime-linux-x64-1.14.0
@@ -43,11 +51,12 @@
 
 ```shell
 cd bin
-./websocketmain  [--model_thread_num <int>] [--decoder_thread_num <int>]
+./funasr-ws-server  [--model_thread_num <int>] [--decoder_thread_num <int>]
                     [--io_thread_num <int>] [--port <int>] [--listen_ip
                     <string>] [--punc-quant <string>] [--punc-dir <string>]
                     [--vad-quant <string>] [--vad-dir <string>] [--quantize
-                    <string>] --model-dir <string> [--] [--version] [-h]
+                    <string>] --model-dir <string> [--keyfile <string>]
+                    [--certfile <string>] [--] [--version] [-h]
 Where:
    --model-dir <string>
      (required)  the asr model path, which contains model.onnx, config.yaml, am.mvn
@@ -70,25 +79,50 @@
      number of threads for network io, default:8
    --port <int>
      listen port, default:8889
+   --certfile <string>
+     path of the certificate for a WSS connection. If empty, the server runs in WS mode.
+   --keyfile <string>
+     path of the private key file for a WSS connection
   
    Required:  --model-dir <string>
    If use vad, please add: --vad-dir <string>
    If use punc, please add: --punc-dir <string>
 example:
-   websocketmain --model-dir /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+   funasr-ws-server --model-dir /FunASR/funasr/runtime/onnxruntime/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
 ```
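+
+Note: the server runs in WSS mode when both `--certfile` and `--keyfile` are
+given; leaving `--certfile` empty falls back to plain WS.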
 
 ## Run websocket client test
 
 ```shell
-Usage: websocketclient server_ip port wav_path threads_num
+./funasr-ws-client  --server-ip <string>
+                    --port <string>
+                    --wav-path <string>
+                    [--thread-num <int>] 
+                    [--is-ssl <int>]  [--]
+                    [--version] [-h]
+
+Where:
+   --server-ip <string>
+     (required)  server-ip
+
+   --port <string>
+     (required)  port
+
+   --wav-path <string>
+     (required)  the input could be: wav_path, e.g.: asr_example.wav;
+     pcm_path, e.g.: asr_example.pcm; wav.scp, kaldi style wav list (wav_id \t wav_path)
+
+   --thread-num <int>
+     number of client threads, default: 1
+
+   --is-ssl <int>
+     set is-ssl to 1 to use a wss connection; otherwise a ws connection is used
 
 example:
-
-websocketclient 127.0.0.1 8889 funasr/runtime/websocket/test.pcm.wav 64
+./funasr-ws-client --server-ip 127.0.0.1 --port 8889 --wav-path test.wav --thread-num 1 --is-ssl 0
 
 result json, example like:
-{"text":"一二三四五六七八九十一二三四五六七八九十"}
+{"mode":"offline","text":"欢迎大家来体验达摩院推出的语音识别模型","wav_name":"wav2"}
 ```
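+
+For reference, the same wire protocol can also be exercised from Python. The
+sketch below is illustrative only: it assumes the third-party `websockets`
+package and a 16kHz/16bit mono pcm file, and mirrors the JSON fields sent by
+the C++ client above.
+
+```python
+import asyncio
+import json
+
+import websockets  # pip install websockets (assumed dependency)
+
+
+async def recognize(pcm_path: str, uri: str = "ws://127.0.0.1:8889") -> None:
+    async with websockets.connect(uri) as ws:
+        # handshake: same fields the C++ client sends
+        await ws.send(json.dumps({
+            "chunk_size": [5, 0, 5],
+            "chunk_interval": 10,
+            "wav_name": "demo",
+            "is_speaking": True,
+        }))
+        # stream raw 16bit pcm in binary frames
+        with open(pcm_path, "rb") as f:
+            while chunk := f.read(3200):  # ~100ms of 16kHz/16bit audio
+                await ws.send(chunk)
+        # end-of-stream marker, then wait for the result json
+        await ws.send(json.dumps({"is_speaking": False}))
+        print(await ws.recv())
+
+
+asyncio.run(recognize("test.pcm"))
+```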
 
 
diff --git a/funasr/runtime/websocket/websocketsrv.cpp b/funasr/runtime/websocket/websocket-server.cpp
similarity index 68%
rename from funasr/runtime/websocket/websocketsrv.cpp
rename to funasr/runtime/websocket/websocket-server.cpp
index b81442c..a311c23 100644
--- a/funasr/runtime/websocket/websocketsrv.cpp
+++ b/funasr/runtime/websocket/websocket-server.cpp
@@ -10,11 +10,48 @@
 // pools, one for handle network data and one for asr decoder.
 // now only support offline engine.
 
-#include "websocketsrv.h"
+#include "websocket-server.h"
 
 #include <thread>
 #include <utility>
 #include <vector>
+
+context_ptr WebSocketServer::on_tls_init(tls_mode mode,
+                                         websocketpp::connection_hdl hdl,
+                                         std::string& s_certfile,
+                                         std::string& s_keyfile) {
+  namespace asio = websocketpp::lib::asio;
+
+  LOG(INFO) << "on_tls_init called with hdl: " << hdl.lock().get();
+  LOG(INFO) << "using TLS mode: "
+            << (mode == MOZILLA_MODERN ? "Mozilla Modern"
+                                       : "Mozilla Intermediate");
+
+  context_ptr ctx = websocketpp::lib::make_shared<asio::ssl::context>(
+      asio::ssl::context::sslv23);
+
+  try {
+    if (mode == MOZILLA_MODERN) {
+      // Modern disables TLSv1
+      ctx->set_options(
+          asio::ssl::context::default_workarounds |
+          asio::ssl::context::no_sslv2 | asio::ssl::context::no_sslv3 |
+          asio::ssl::context::no_tlsv1 | asio::ssl::context::single_dh_use);
+    } else {
+      ctx->set_options(asio::ssl::context::default_workarounds |
+                       asio::ssl::context::no_sslv2 |
+                       asio::ssl::context::no_sslv3 |
+                       asio::ssl::context::single_dh_use);
+    }
+
+    ctx->use_certificate_chain_file(s_certfile);
+    ctx->use_private_key_file(s_keyfile, asio::ssl::context::pem);
+
+  } catch (std::exception& e) {
+    LOG(INFO) << "Exception: " << e.what();
+  }
+  return ctx;
+}
 
 // feed buffer to asr engine for decoder
 void WebSocketServer::do_decoder(const std::vector<char>& buffer,
@@ -40,11 +77,15 @@
       jsonresult["wav_name"] = msg["wav_name"];
 
       // send the json to client
-      server_->send(hdl, jsonresult.dump(), websocketpp::frame::opcode::text,
-                    ec);
+      if (is_ssl) {
+        wss_server_->send(hdl, jsonresult.dump(),
+                          websocketpp::frame::opcode::text, ec);
+      } else {
+        server_->send(hdl, jsonresult.dump(), websocketpp::frame::opcode::text,
+                      ec);
+      }
 
-      std::cout << "buffer.size=" << buffer.size()
-                << ",result json=" << jsonresult.dump() << std::endl;
+      LOG(INFO) << "buffer.size=" << buffer.size() << ",result json=" << jsonresult.dump();
       if (!isonline) {
         //  close the client if it is not online asr
         // server_->close(hdl, websocketpp::close::status::normal, "DONE", ec);
@@ -67,14 +108,14 @@
   data_msg->samples = std::make_shared<std::vector<char>>();
   data_msg->msg = nlohmann::json::parse("{}");
   data_map.emplace(hdl, data_msg);
-  std::cout << "on_open, active connections: " << data_map.size() << std::endl;
+  LOG(INFO) << "on_open, active connections: " << data_map.size();
 }
 
 void WebSocketServer::on_close(websocketpp::connection_hdl hdl) {
   scoped_lock guard(m_lock);
   data_map.erase(hdl);  // remove data vector when  connection is closed
 
-  std::cout << "on_close, active connections: " << data_map.size() << std::endl;
+  LOG(INFO) << "on_close, active connections: " << data_map.size();
 }
 
 // remove closed connection
@@ -83,15 +124,24 @@
   auto iter = data_map.begin();
   while (iter != data_map.end()) {  // loop to find closed connection
     websocketpp::connection_hdl hdl = iter->first;
-    server::connection_ptr con = server_->get_con_from_hdl(hdl);
-    if (con->get_state() != 1) {  // session::state::open ==1
-      to_remove.push_back(hdl);
+
+    if (is_ssl) {
+      wss_server::connection_ptr con = wss_server_->get_con_from_hdl(hdl);
+      if (con->get_state() != 1) {  // session::state::open ==1
+        to_remove.push_back(hdl);
+      }
+    } else {
+      server::connection_ptr con = server_->get_con_from_hdl(hdl);
+      if (con->get_state() != 1) {  // session::state::open ==1
+        to_remove.push_back(hdl);
+      }
     }
+
     iter++;
   }
   for (auto hdl : to_remove) {
     data_map.erase(hdl);
-    std::cout << "remove one connection " << std::endl;
+    LOG(INFO) << "remove one connection";
   }
 }
 void WebSocketServer::on_message(websocketpp::connection_hdl hdl,
@@ -109,7 +159,7 @@
 
   lock.unlock();
   if (sample_data_p == nullptr) {
-    std::cout << "error when fetch sample data vector" << std::endl;
+    LOG(INFO) << "error when fetching sample data vector";
     return;
   }
 
@@ -124,7 +174,7 @@
 
       if (jsonresult["is_speaking"] == false ||
           jsonresult["is_finished"] == true) {
-        std::cout << "client done" << std::endl;
+        LOG(INFO) << "client done";
 
         if (isonline) {
           // do_close(ws);
@@ -173,9 +223,9 @@
     // init model with api
 
     asr_hanlde = FunOfflineInit(model_path, thread_num);
-    std::cout << "model ready" << std::endl;
+    LOG(INFO) << "model initialized successfully";
 
   } catch (const std::exception& e) {
-    std::cout << e.what() << std::endl;
+    LOG(INFO) << e.what();
   }
 }
diff --git a/funasr/runtime/websocket/websocket-server.h b/funasr/runtime/websocket/websocket-server.h
new file mode 100644
index 0000000..198af1c
--- /dev/null
+++ b/funasr/runtime/websocket/websocket-server.h
@@ -0,0 +1,137 @@
+/**
+ * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
+ * Reserved. MIT License  (https://opensource.org/licenses/MIT)
+ */
+/* 2022-2023 by zhaomingwork */
+
+// websocket server for asr engine
+// take some ideas from https://github.com/k2-fsa/sherpa-onnx
+// online-websocket-server-impl.cc, thanks. The websocket server has two thread
+// pools, one for handling network data and one for the asr decoder.
+// now only the offline engine is supported.
+
+#ifndef WEBSOCKET_SERVER_H_
+#define WEBSOCKET_SERVER_H_
+
+#include <iostream>
+#include <map>
+#include <memory>
+#include <string>
+#include <thread>
+#include <utility>
+#define ASIO_STANDALONE 1  // not boost
+#include <glog/logging.h>
+
+#include <fstream>
+#include <functional>
+#include <websocketpp/common/thread.hpp>
+#include <websocketpp/config/asio.hpp>
+#include <websocketpp/server.hpp>
+
+#include "asio.hpp"
+#include "com-define.h"
+#include "funasrruntime.h"
+#include "nlohmann/json.hpp"
+#include "tclap/CmdLine.h"
+typedef websocketpp::server<websocketpp::config::asio> server;
+typedef websocketpp::server<websocketpp::config::asio_tls> wss_server;
+typedef server::message_ptr message_ptr;
+using websocketpp::lib::bind;
+using websocketpp::lib::placeholders::_1;
+using websocketpp::lib::placeholders::_2;
+
+typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
+typedef websocketpp::lib::unique_lock<websocketpp::lib::mutex> unique_lock;
+typedef websocketpp::lib::shared_ptr<websocketpp::lib::asio::ssl::context>
+    context_ptr;
+
+typedef struct {
+  std::string msg;
+  float snippet_time;
+} FUNASR_RECOG_RESULT;
+
+typedef struct {
+  nlohmann::json msg;
+  std::shared_ptr<std::vector<char>> samples;
+} FUNASR_MESSAGE;
+
+// See https://wiki.mozilla.org/Security/Server_Side_TLS for more details about
+// the TLS modes. The code below demonstrates how to implement both the modern
+// and intermediate modes.
+enum tls_mode { MOZILLA_INTERMEDIATE = 1, MOZILLA_MODERN = 2 };
+class WebSocketServer {
+ public:
+  WebSocketServer(asio::io_context& io_decoder, bool is_ssl, server* server,
+                  wss_server* wss_server, std::string& s_certfile,
+                  std::string& s_keyfile)
+      : io_decoder_(io_decoder),
+        is_ssl(is_ssl),
+        server_(server),
+        wss_server_(wss_server) {
+    if (is_ssl) {
+      LOG(INFO) << "certfile path is " << s_certfile;
+      wss_server->set_tls_init_handler(
+          bind<context_ptr>(&WebSocketServer::on_tls_init, this,
+                            MOZILLA_INTERMEDIATE, ::_1, s_certfile, s_keyfile));
+      wss_server_->set_message_handler(
+          [this](websocketpp::connection_hdl hdl, message_ptr msg) {
+            on_message(hdl, msg);
+          });
+      // set open handle
+      wss_server_->set_open_handler(
+          [this](websocketpp::connection_hdl hdl) { on_open(hdl); });
+      // set close handle
+      wss_server_->set_close_handler(
+          [this](websocketpp::connection_hdl hdl) { on_close(hdl); });
+      // begin accept
+      wss_server_->start_accept();
+      // not print log
+      wss_server_->clear_access_channels(websocketpp::log::alevel::all);
+
+    } else {
+      // set message handle
+      server_->set_message_handler(
+          [this](websocketpp::connection_hdl hdl, message_ptr msg) {
+            on_message(hdl, msg);
+          });
+      // set open handle
+      server_->set_open_handler(
+          [this](websocketpp::connection_hdl hdl) { on_open(hdl); });
+      // set close handle
+      server_->set_close_handler(
+          [this](websocketpp::connection_hdl hdl) { on_close(hdl); });
+      // begin accept
+      server_->start_accept();
+      // not print log
+      server_->clear_access_channels(websocketpp::log::alevel::all);
+    }
+  }
+  void do_decoder(const std::vector<char>& buffer,
+                  websocketpp::connection_hdl& hdl, const nlohmann::json& msg);
+
+  void initAsr(std::map<std::string, std::string>& model_path, int thread_num);
+  void on_message(websocketpp::connection_hdl hdl, message_ptr msg);
+  void on_open(websocketpp::connection_hdl hdl);
+  void on_close(websocketpp::connection_hdl hdl);
+  context_ptr on_tls_init(tls_mode mode, websocketpp::connection_hdl hdl,
+                          std::string& s_certfile, std::string& s_keyfile);
+
+ private:
+  void check_and_clean_connection();
+  asio::io_context& io_decoder_;  // threads for asr decoder
+  // std::ofstream fout;
+  FUNASR_HANDLE asr_hanlde;  // asr engine handle
+  bool isonline = false;  // online or offline engine; only offline is supported for now
+  bool is_ssl = true;
+  server* server_;          // websocket server
+  wss_server* wss_server_;  // websocket server
+
+  // use map to keep the received samples data from one connection in offline
+  // engine. for the online engine, a dedicated data struct is needed (TODO)
+
+  std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,
+           std::owner_less<websocketpp::connection_hdl>>
+      data_map;
+  websocketpp::lib::mutex m_lock;  // mutex for data_map
+};
+
+#endif  // WEBSOCKET_SERVER_H_
diff --git a/funasr/runtime/websocket/websocketclient.cpp b/funasr/runtime/websocket/websocketclient.cpp
deleted file mode 100644
index 078fc5a..0000000
--- a/funasr/runtime/websocket/websocketclient.cpp
+++ /dev/null
@@ -1,237 +0,0 @@
-/**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
- * Reserved. MIT License  (https://opensource.org/licenses/MIT)
- */
-/* 2022-2023 by zhaomingwork */
-
-// client for websocket, support multiple threads
-// Usage: websocketclient server_ip port wav_path threads_num
-
-#define ASIO_STANDALONE 1
-#include <websocketpp/client.hpp>
-#include <websocketpp/common/thread.hpp>
-#include <websocketpp/config/asio_no_tls_client.hpp>
-
-#include "audio.h"
-#include "nlohmann/json.hpp"
-
-/**
- * Define a semi-cross platform helper method that waits/sleeps for a bit.
- */
-void wait_a_bit() {
-#ifdef WIN32
-  Sleep(1000);
-#else
-  sleep(1);
-#endif
-}
-typedef websocketpp::config::asio_client::message_type::ptr message_ptr;
-
-class websocket_client {
- public:
-  typedef websocketpp::client<websocketpp::config::asio_client> client;
-  typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
-
-  websocket_client() : m_open(false), m_done(false) {
-    // set up access channels to only log interesting things
-    m_client.clear_access_channels(websocketpp::log::alevel::all);
-    m_client.set_access_channels(websocketpp::log::alevel::connect);
-    m_client.set_access_channels(websocketpp::log::alevel::disconnect);
-    m_client.set_access_channels(websocketpp::log::alevel::app);
-
-    // Initialize the Asio transport policy
-    m_client.init_asio();
-
-    // Bind the handlers we are using
-    using websocketpp::lib::bind;
-    using websocketpp::lib::placeholders::_1;
-    m_client.set_open_handler(bind(&websocket_client::on_open, this, _1));
-    m_client.set_close_handler(bind(&websocket_client::on_close, this, _1));
-    m_client.set_close_handler(bind(&websocket_client::on_close, this, _1));
-
-    m_client.set_message_handler(
-        [this](websocketpp::connection_hdl hdl, message_ptr msg) {
-          on_message(hdl, msg);
-        });
-
-    m_client.set_fail_handler(bind(&websocket_client::on_fail, this, _1));
-    m_client.clear_access_channels(websocketpp::log::alevel::all);
-  }
-  void on_message(websocketpp::connection_hdl hdl, message_ptr msg) {
-    const std::string& payload = msg->get_payload();
-    switch (msg->get_opcode()) {
-      case websocketpp::frame::opcode::text:
-        std::cout << "on_message=" << payload << std::endl;
-    }
-  }
-  // This method will block until the connection is complete
-  void run(const std::string& uri, const std::string& wav_path) {
-    // Create a new connection to the given URI
-    websocketpp::lib::error_code ec;
-    client::connection_ptr con = m_client.get_connection(uri, ec);
-    if (ec) {
-      m_client.get_alog().write(websocketpp::log::alevel::app,
-                                "Get Connection Error: " + ec.message());
-      return;
-    }
-    this->wav_path = std::move(wav_path);
-    // Grab a handle for this connection so we can talk to it in a thread
-    // safe manor after the event loop starts.
-    m_hdl = con->get_handle();
-
-    // Queue the connection. No DNS queries or network connections will be
-    // made until the io_service event loop is run.
-    m_client.connect(con);
-
-    // Create a thread to run the ASIO io_service event loop
-    websocketpp::lib::thread asio_thread(&client::run, &m_client);
-
-    send_wav_data();
-    asio_thread.join();
-  }
-
-  // The open handler will signal that we are ready to start sending data
-  void on_open(websocketpp::connection_hdl) {
-    m_client.get_alog().write(websocketpp::log::alevel::app,
-                              "Connection opened, starting data!");
-
-    scoped_lock guard(m_lock);
-    m_open = true;
-  }
-
-  // The close handler will signal that we should stop sending data
-  void on_close(websocketpp::connection_hdl) {
-    m_client.get_alog().write(websocketpp::log::alevel::app,
-                              "Connection closed, stopping data!");
-
-    scoped_lock guard(m_lock);
-    m_done = true;
-  }
-
-  // The fail handler will signal that we should stop sending data
-  void on_fail(websocketpp::connection_hdl) {
-    m_client.get_alog().write(websocketpp::log::alevel::app,
-                              "Connection failed, stopping data!");
-
-    scoped_lock guard(m_lock);
-    m_done = true;
-  }
-  // send wav to server
-  void send_wav_data() {
-    uint64_t count = 0;
-    std::stringstream val;
-
-    funasr::Audio audio(1);
-    int32_t sampling_rate = 16000;
-
-    if (!audio.LoadPcmwav(wav_path.c_str(), &sampling_rate)) {
-      std::cout << "error in load wav" << std::endl;
-      return;
-    }
-
-    float* buff;
-    int len;
-    int flag = 0;
-    bool wait = false;
-    while (1) {
-      {
-        scoped_lock guard(m_lock);
-        // If the connection has been closed, stop generating data
-        if (m_done) {
-          break;
-        }
-
-        // If the connection hasn't been opened yet wait a bit and retry
-        if (!m_open) {
-          wait = true;
-        } else {
-          break;
-        }
-      }
-
-      if (wait) {
-        std::cout << "wait.." << m_open << std::endl;
-        wait_a_bit();
-
-        continue;
-      }
-    }
-    websocketpp::lib::error_code ec;
-
-    nlohmann::json jsonbegin;
-    nlohmann::json chunk_size = nlohmann::json::array();
-    chunk_size.push_back(5);
-    chunk_size.push_back(0);
-    chunk_size.push_back(5);
-    jsonbegin["chunk_size"] = chunk_size;
-    jsonbegin["chunk_interval"] = 10;
-    jsonbegin["wav_name"] = "damo";
-    jsonbegin["is_speaking"] = true;
-    m_client.send(m_hdl, jsonbegin.dump(), websocketpp::frame::opcode::text,
-                  ec);
-
-    // fetch wav data use asr engine api
-    while (audio.Fetch(buff, len, flag) > 0) {
-      short iArray[len];
-
-      // convert float -1,1 to short -32768,32767
-      for (size_t i = 0; i < len; ++i) {
-        iArray[i] = (short)(buff[i] * 32767);
-      }
-      // send data to server
-      m_client.send(m_hdl, iArray, len * sizeof(short),
-                    websocketpp::frame::opcode::binary, ec);
-      std::cout << "sended data len=" << len * sizeof(short) << std::endl;
-      // The most likely error that we will get is that the connection is
-      // not in the right state. Usually this means we tried to send a
-      // message to a connection that was closed or in the process of
-      // closing. While many errors here can be easily recovered from,
-      // in this simple example, we'll stop the data loop.
-      if (ec) {
-        m_client.get_alog().write(websocketpp::log::alevel::app,
-                                  "Send Error: " + ec.message());
-        break;
-      }
-
-      wait_a_bit();
-    }
-    nlohmann::json jsonresult;
-    jsonresult["is_speaking"] = false;
-    m_client.send(m_hdl, jsonresult.dump(), websocketpp::frame::opcode::text,
-                  ec);
-    wait_a_bit();
-  }
-
- private:
-  client m_client;
-  websocketpp::connection_hdl m_hdl;
-  websocketpp::lib::mutex m_lock;
-  std::string wav_path;
-  bool m_open;
-  bool m_done;
-};
-
-int main(int argc, char* argv[]) {
-  if (argc < 5) {
-    printf("Usage: %s server_ip port wav_path threads_num\n", argv[0]);
-    exit(-1);
-  }
-  std::string server_ip = argv[1];
-  std::string port = argv[2];
-  std::string wav_path = argv[3];
-  int threads_num = atoi(argv[4]);
-  std::vector<websocketpp::lib::thread> client_threads;
-
-  std::string uri = "ws://" + server_ip + ":" + port;
-
-  for (size_t i = 0; i < threads_num; i++) {
-    client_threads.emplace_back([uri, wav_path]() {
-      websocket_client c;
-      c.run(uri, wav_path);
-    });
-  }
-
-  for (auto& t : client_threads) {
-    t.join();
-  }
-}
\ No newline at end of file
diff --git a/funasr/runtime/websocket/websocketsrv.h b/funasr/runtime/websocket/websocketsrv.h
deleted file mode 100644
index 82d717e..0000000
--- a/funasr/runtime/websocket/websocketsrv.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/**
- * Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights
- * Reserved. MIT License  (https://opensource.org/licenses/MIT)
- */
-/* 2022-2023 by zhaomingwork */
-
-// websocket server for asr engine
-// take some ideas from https://github.com/k2-fsa/sherpa-onnx
-// online-websocket-server-impl.cc, thanks. The websocket server has two threads
-// pools, one for handle network data and one for asr decoder.
-// now only support offline engine.
-
-#ifndef WEBSOCKETSRV_SERVER_H_
-#define WEBSOCKETSRV_SERVER_H_
-
-#include <iostream>
-#include <map>
-#include <memory>
-#include <string>
-#include <thread>
-#include <utility>
-#define ASIO_STANDALONE 1  // not boost
-#include <glog/logging.h>
-
-#include <fstream>
-#include <functional>
-#include <websocketpp/common/thread.hpp>
-#include <websocketpp/config/asio_no_tls.hpp>
-#include <websocketpp/server.hpp>
-
-#include "asio.hpp"
-#include "com-define.h"
-#include "funasrruntime.h"
-#include "nlohmann/json.hpp"
-#include "tclap/CmdLine.h"
-typedef websocketpp::server<websocketpp::config::asio> server;
-typedef server::message_ptr message_ptr;
-using websocketpp::lib::bind;
-using websocketpp::lib::placeholders::_1;
-using websocketpp::lib::placeholders::_2;
-typedef websocketpp::lib::lock_guard<websocketpp::lib::mutex> scoped_lock;
-typedef websocketpp::lib::unique_lock<websocketpp::lib::mutex> unique_lock;
-
-typedef struct {
-  std::string msg;
-  float snippet_time;
-} FUNASR_RECOG_RESULT;
-
-typedef struct {
-  nlohmann::json msg;
-  std::shared_ptr<std::vector<char>> samples;
-} FUNASR_MESSAGE;
-
-class WebSocketServer {
- public:
-  WebSocketServer(asio::io_context& io_decoder, server* server_)
-      : io_decoder_(io_decoder), server_(server_) {
-    // set message handle
-    server_->set_message_handler(
-        [this](websocketpp::connection_hdl hdl, message_ptr msg) {
-          on_message(hdl, msg);
-        });
-    // set open handle
-    server_->set_open_handler(
-        [this](websocketpp::connection_hdl hdl) { on_open(hdl); });
-    // set close handle
-    server_->set_close_handler(
-        [this](websocketpp::connection_hdl hdl) { on_close(hdl); });
-    // begin accept
-    server_->start_accept();
-    // not print log
-    server_->clear_access_channels(websocketpp::log::alevel::all);
-  }
-  void do_decoder(const std::vector<char>& buffer,
-                  websocketpp::connection_hdl& hdl, const nlohmann::json& msg);
-
-  void initAsr(std::map<std::string, std::string>& model_path, int thread_num);
-  void on_message(websocketpp::connection_hdl hdl, message_ptr msg);
-  void on_open(websocketpp::connection_hdl hdl);
-  void on_close(websocketpp::connection_hdl hdl);
-
- private:
-  void check_and_clean_connection();
-  asio::io_context& io_decoder_;  // threads for asr decoder
-  // std::ofstream fout;
-  FUNASR_HANDLE asr_hanlde;  // asr engine handle
-  bool isonline = false;  // online or offline engine, now only support offline
-  server* server_;        // websocket server
-
-  // use map to keep the received samples data from one connection in offline
-  // engine. if for online engline, a data struct is needed(TODO)
-
-  std::map<websocketpp::connection_hdl, std::shared_ptr<FUNASR_MESSAGE>,
-           std::owner_less<websocketpp::connection_hdl>>
-      data_map;
-  websocketpp::lib::mutex m_lock;  // mutex for sample_map
-};
-
-#endif  // WEBSOCKETSRV_SERVER_H_
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 4f83ace..f066909 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -44,6 +44,7 @@
 from funasr.train.reporter import Reporter
 from funasr.train.reporter import SubReporter
 from funasr.utils.build_dataclass import build_dataclass
+from funasr.utils.kwargs2args import kwargs2args
 
 if torch.distributed.is_available():
     from torch.distributed import ReduceOp
@@ -620,6 +621,24 @@
                 all_steps_are_invalid = False
                 continue
 
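+            # on the first iteration, try to export the model graph to tensorboard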
+            if iiter == 1 and summary_writer is not None:
+                try:
+                    args = kwargs2args(model.forward, batch)
+                except (ValueError, TypeError):
+                    logging.warning(
+                        "inspect.signature() failed for the model. "
+                        "The graph can't be added to tensorboard."
+                    )
+                else:
+                    try:
+                        summary_writer.add_graph(model, args, use_strict_trace=False)
+                    except Exception:
+                        logging.warning(
+                            "summary_writer.add_graph() failed for the model. "
+                            "The graph can't be added to tensorboard."
+                        )
+                    del args
+
             with autocast(scaler is not None):
                 with reporter.measure_time("forward_time"):
                     retval = model(**batch)
diff --git a/funasr/utils/kwargs2args.py b/funasr/utils/kwargs2args.py
new file mode 100644
index 0000000..4e0cdba
--- /dev/null
+++ b/funasr/utils/kwargs2args.py
@@ -0,0 +1,19 @@
+import inspect
+
+
+def kwargs2args(func, kwargs):
+    """Map kwargs onto the positional order of func's signature and return
+    the longest leading run of arguments that are all present."""
+    parameters = inspect.signature(func).parameters
+    d = {k: i for i, k in enumerate(parameters)}
+    args = [None for i in range(len(parameters))]
+    for k, v in kwargs.items():
+        if k in d:
+            args[d[k]] = v
+
+    # keep only the leading run of positions that were actually filled
+    n = len(args)
+    for i, v in enumerate(args):
+        if v is None:
+            n = i
+            break
+
+    return tuple(args[:n])
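+
+
+if __name__ == "__main__":
+    # tiny illustrative self-check; `demo` is a hypothetical function, not
+    # part of the training code
+    def demo(a, b, c=None):
+        pass
+
+    assert kwargs2args(demo, {"a": 1, "c": 3}) == (1,)
+    assert kwargs2args(demo, {"a": 1, "b": 2, "c": 3}) == (1, 2, 3)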
diff --git a/funasr/utils/prepare_data.py b/funasr/utils/prepare_data.py
index 36795b4..7602740 100644
--- a/funasr/utils/prepare_data.py
+++ b/funasr/utils/prepare_data.py
@@ -185,7 +185,7 @@
         for i in range(nj):
             path = ""
             for file_name in file_names:
-                path = path + os.path.join(split_path, str(i + 1), file_name)
+                path = (path + " " if path else "") + os.path.join(split_path, str(i + 1), file_name)
             f_data.write(path + "\n")
 
 
diff --git a/funasr/version.txt b/funasr/version.txt
index b49b253..ee6cdce 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.5.6
+0.6.1
diff --git a/setup.py b/setup.py
index ed891af..0e787ab 100644
--- a/setup.py
+++ b/setup.py
@@ -81,7 +81,7 @@
         # "gtn==0.0.0",
     ],
     "setup": [
-        "numpy<=1.21.3",
+        "numpy",
         "pytest-runner",
     ],
     "test": [
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 9098ea6..2b21acf 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -87,6 +87,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_hotword.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "国务院发展研究中心市场经济研究所副所长邓郁松认为"
 
     def test_paraformer_large_aishell1(self):
         inference_pipeline = pipeline(
@@ -95,6 +96,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎大家来体验达摩院推出的语音识别模型"
 
     def test_paraformer_large_aishell2(self):
         inference_pipeline = pipeline(
@@ -103,6 +105,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎大家来体验达摩院推出的语音识别模型"
 
     def test_paraformer_large_common(self):
         inference_pipeline = pipeline(
@@ -111,6 +114,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎大家来体验达摩院推出的语音识别模型"
 
     def test_paraformer_large_online_common(self):
         inference_pipeline = pipeline(
@@ -119,6 +123,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎大 家来 体验达 摩院推 出的 语音识 别模 型"
 
     def test_paraformer_online_common(self):
         inference_pipeline = pipeline(
@@ -127,6 +132,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎 大家来 体验达 摩院推 出的 语音识 别模 型"
 
     def test_paraformer_tiny_commandword(self):
         inference_pipeline = pipeline(
diff --git a/tests/test_asr_vad_punc_inference_pipeline.py b/tests/test_asr_vad_punc_inference_pipeline.py
index 628b256..f86f23d 100644
--- a/tests/test_asr_vad_punc_inference_pipeline.py
+++ b/tests/test_asr_vad_punc_inference_pipeline.py
@@ -26,6 +26,7 @@
         rec_result = inference_pipeline(
             audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
         logger.info("asr_vad_punc inference result: {0}".format(rec_result))
+        assert rec_result["text"] == "欢迎大家来体验达摩院推出的语音识别模型。"
 
 
 if __name__ == '__main__':

--
Gitblit v1.9.1