From 17e8f5b889be2ad31608b5203dc5fbc5fd5c0f8a Mon Sep 17 00:00:00 2001
From: nichongjia-2007 <nichongjia@gmail.com>
Date: 星期四, 20 七月 2023 21:26:58 +0800
Subject: [PATCH] Merge branch 'main' of https://github.com/alibaba-damo-academy/FunASR

---
 funasr/models/encoder/e_branchformer_encoder.py                           |  465 ++++++++
 funasr/datasets/large_datasets/build_dataloader.py                        |    9 
 funasr/train/trainer.py                                                   |    2 
 setup.py                                                                  |    3 
 funasr/runtime/websocket/readme_zh.md                                     |  190 +++
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py |    4 
 egs/aishell/branchformer/conf/decode_asr_transformer.yaml                 |    6 
 egs/aishell/e_branchformer/path.sh                                        |    5 
 funasr/runtime/websocket/readme.md                                        |    2 
 .github/workflows/UnitTest.yml                                            |    2 
 egs/aishell/e_branchformer/local/aishell_data_prep.sh                     |   66 +
 funasr/runtime/onnxruntime/third_party/download_ffmpeg.sh                 |    5 
 funasr/build_utils/build_asr_model.py                                     |    4 
 egs/aishell/e_branchformer/run.sh                                         |  225 ++++
 egs/aishell/branchformer/local/download_and_untar.sh                      |  105 ++
 funasr/runtime/html5/readme.md                                            |    2 
 egs/aishell/branchformer/utils                                            |    1 
 README_zh.md                                                              |   30 
 README.md                                                                 |  123 +
 funasr/datasets/large_datasets/dataset.py                                 |    8 
 egs/aishell/branchformer/path.sh                                          |    5 
 egs/aishell/branchformer/run.sh                                           |  225 ++++
 egs/aishell/e_branchformer/conf/decode_asr_transformer.yaml               |    6 
 funasr/utils/timestamp_tools.py                                           |   17 
 funasr/runtime/html5/readme_zh.md                                         |   93 +
 funasr/datasets/large_datasets/utils/tokenize.py                          |    2 
 funasr/version.txt                                                        |    2 
 funasr/runtime/python/websocket/funasr_wss_client.py                      |    1 
 funasr/bin/punc_infer.py                                                  |    8 
 egs/aishell/e_branchformer/local/download_and_untar.sh                    |  105 ++
 funasr/modules/fastformer.py                                              |  153 ++
 funasr/bin/punc_train.py                                                  |    6 
 egs_modelscope/speaker_diarization/TEMPLATE/README.md                     |    2 
 .github/workflows/main.yml                                                |    1 
 funasr/datasets/preprocessor.py                                           |   70 +
 docs/README.md                                                            |    4 
 funasr/models/encoder/branchformer_encoder.py                             |  545 ++++++++++
 funasr/bin/vad_inference_launch.py                                        |    2 
 funasr/tasks/asr.py                                                       |    1 
 egs/aishell/e_branchformer/conf/train_asr_e_branchformer.yaml             |  101 +
 /dev/null                                                                 |  135 --
 funasr/runtime/python/websocket/funasr_wss_server.py                      |    6 
 funasr/modules/cgmlp.py                                                   |  124 ++
 funasr/runtime/onnxruntime/third_party/download_onnxruntime.sh            |    5 
 egs/aishell/branchformer/conf/train_asr_branchformer.yaml                 |  104 ++
 egs/aishell/branchformer/local/aishell_data_prep.sh                       |   66 +
 funasr/modules/repeat.py                                                  |   21 
 egs/aishell/e_branchformer/utils                                          |    1 
 funasr/bin/asr_inference_launch.py                                        |   16 
 49 files changed, 2,868 insertions(+), 216 deletions(-)

diff --git a/.github/workflows/UnitTest.yml b/.github/workflows/UnitTest.yml
index 8ced9e4..ae7542b 100644
--- a/.github/workflows/UnitTest.yml
+++ b/.github/workflows/UnitTest.yml
@@ -6,9 +6,7 @@
         - main
   push:
     branches:
-      - dev_wjm
       - dev_jy
-      - dev_wjm_infer
 
 jobs:
   build:
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 26a47c4..332d35e 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,7 +5,6 @@
       - main
   push:
     branches:
-      - dev_wjm
       - main
       - dev_lyh
 
diff --git a/README.md b/README.md
index 3fbbbb4..e304da0 100644
--- a/README.md
+++ b/README.md
@@ -14,33 +14,60 @@
 [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new) 
 | [**Highlights**](#highlights)
 | [**Installation**](#installation)
-| [**Usage**](#usage)
-| [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
-| [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
-| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md)
+| [**Quick Start**](#quick-start)
+| [**Runtime**](./funasr/runtime/readme.md)
+| [**Model Zoo**](./docs/model_zoo/modelscope_models.md)
 | [**Contact**](#contact)
-| [**M2MET2.0 Challenge**](https://github.com/alibaba-damo-academy/FunASR#multi-channel-multi-party-meeting-transcription-20-m2met20-challenge)
 
+
+<a name="whats-new"></a>
 ## What's new: 
 
-### FunASR runtime-SDK
+### FunASR runtime
 
 - 2023.07.03: 
 We have release the FunASR runtime-SDK-0.1.0, file transcription service (Mandarin) is now supported ([ZH](funasr/runtime/readme_cn.md)/[EN](funasr/runtime/readme.md))
 
 ### Multi-Channel Multi-Party Meeting Transcription 2.0 (M2MeT2.0) Challenge
 
-We are pleased to announce that the M2MeT2.0 challenge has been accepted by the ASRU 2023 challenge special session. The registration is now open. The baseline system is conducted on FunASR and is provided as a receipe of AliMeeting corpus. For more details you can see the guidence of M2MET2.0 ([CN](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)/[EN](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)).
+Challenge details ref to ([CN](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)/[EN](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html))
 
-### Release notes
+### Speech Recognition
+ 
+- Academic Models
+  - Encoder-Decoder Models (AED): [Transformer](egs/aishell/transformer), [Conformer](egs/aishell/conformer), [Branchformer](egs/aishell/branchformer)
+  - Transducer Models (RNNT): [RNNT streaming](egs/aishell/rnnt), [BAT streaming/non-streaming](egs/aishell/bat)
+  - Non-autoregressive Model (NAR): [Paraformer](egs/aishell/paraformer)
+  - Multi-speaker recognition model: [MFCCA](egs_modelscope/asr/mfcca)
 
-For the release notes, please ref to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
 
+- Industrial-level Models
+  - Paraformer Models (Mandarin): [Paraformer-large](egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch), [Paraformer-large-long](egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch), [Paraformer-large streaming](egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online), [Paraformer-large-contextual](egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404)
+  - Conformer Models (English): [Conformer]()
+  - UniASR streaming offline unifying models: [16k UniASR Burmese](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary), [16k UniASR Hebrew](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary), [16k UniASR Urdu](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary), [8k UniASR Mandarin financial domain](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-finance-vocab3445-online/summary), [16k UniASR Mandarin audio-visual domain](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-audio_and_video-vocab3445-online/summary),
+  [Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary), [French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary),  [German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary),  [Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary),  [Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary)
+
+- Speaker Recognition
+  - Speaker Verification Model: [xvector](egs_modelscope/speaker_verification)
+  - Speaker Diarization Model: [SOND](egs/callhome/diarization/sond)
+
+- Punctuation Restoration
+  - Chinese Punctuation Model: [CT-Transformer](egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch), [CT-Transformer streaming](egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727)
+
+- Endpoint Detection
+  - [FSMN-VAD](egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common)
+
+- Timestamp Prediction
+  - Character-level FA Model: [TP-Aligner](egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline)
+
+
+<a name="highlights"></a>
 ## Highlights
 - FunASR is a fundamental speech recognition toolkit that offers a variety of features, including speech recognition (ASR), Voice Activity Detection (VAD), Punctuation Restoration, Language Models, Speaker Verification, Speaker diarization and multi-talker ASR.
 - We have released a vast collection of academic and industrial pretrained models on the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition), which can be accessed through our [Model Zoo](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/model_zoo/modelscope_models.md). The representative [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model has achieved SOTA performance in many speech recognition tasks. 
 - FunASR offers a user-friendly pipeline for fine-tuning pretrained models from the [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition). Additionally, the optimized dataloader in FunASR enables faster training speeds for large-scale datasets. This feature enhances the efficiency of the speech recognition process for researchers and practitioners.
 
+<a name="Installation"></a>
 ## Installation
 
 Install from pip
@@ -70,24 +97,60 @@
 
 For more details, please ref to [installation](https://alibaba-damo-academy.github.io/FunASR/en/installation/installation.html)
 
-## Usage
+<a name="quick-start"></a>
+## Quick Start
 
-You could use FunASR by:
+You can use FunASR in the following ways:
 
-- egs
-- egs_modelscope
-- runtime
+- Service Deployment SDK
+- Industrial model egs
+- Academic model egs
 
-### egs
-If you want to train the model from scratch, you could use funasr directly by recipe, as the following:
+### Service Deployment SDK
+
+#### Python version Example
+Supports real-time streaming speech recognition, uses non-streaming models for error correction, and outputs text with punctuation. Currently, only single client is supported. For multi-concurrency, please refer to the C++ version service deployment SDK below.
+
+##### Server Deployment
+
 ```shell
-cd egs/aishell/paraformer
-. ./run.sh --CUDA_VISIBLE_DEVICES="0,1" --gpu_num=2
+cd funasr/runtime/python/websocket
+python funasr_wss_server.py --port 10095
 ```
-More examples could be found in [docs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)
 
-### egs_modelscope
-If you want to infer or finetune pretraining models from modelscope, you could use funasr by modelscope pipeline, as the following:
+##### Client Testing
+
+```shell
+python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "5,10,5"
+```
+
+For more examples, please refer to [docs](https://alibaba-damo-academy.github.io/FunASR/en/runtime/websocket_python.html#id2).
+
+#### C++ version Example
+
+Currently, offline file transcription service (CPU) is supported, and concurrent requests of hundreds of channels are supported.
+
+##### Server Deployment
+
+You can use the following command to complete the deployment with one click:
+
+```shell
+curl -O https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/shell/funasr-runtime-deploy-offline-cpu-zh.sh
+sudo bash funasr-runtime-deploy-offline-cpu-zh.sh install --workspace ./funasr-runtime-resources
+```
+
+##### Client Testing
+
+```shell
+python3 funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"
+```
+
+For more examples, please refer to [docs](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/docs/SDK_tutorial_zh.md)
+
+
+### Industrial Model Egs
+
+If you want to use the pre-trained industrial models in ModelScope for inference or fine-tuning training, you can refer to the following command:
 
 ```python
 from modelscope.pipelines import pipeline
@@ -102,24 +165,20 @@
 print(rec_result)
 # {'text': '娆㈣繋澶у鏉ヤ綋楠岃揪鎽╅櫌鎺ㄥ嚭鐨勮闊宠瘑鍒ā鍨�'}
 ```
+
 More examples could be found in [docs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)
 
-### runtime
+### Academic model egs
 
-An example with websocket:
+If you want to train from scratch, usually for academic models, you can start training and inference with the following command:
 
-For the server:
 ```shell
-cd funasr/runtime/python/websocket
-python funasr_wss_server.py --port 10095
+cd egs/aishell/paraformer
+. ./run.sh --CUDA_VISIBLE_DEVICES="0,1" --gpu_num=2
 ```
+More examples could be found in [docs](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_pipeline/quick_start.html)
 
-For the client:
-```shell
-python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "5,10,5"
-#python funasr_wss_client.py --host "127.0.0.1" --port 10095 --mode 2pass --chunk_size "8,8,4" --audio_in "./data/wav.scp" --output_dir "./results"
-```
-More examples could be found in [docs](https://alibaba-damo-academy.github.io/FunASR/en/runtime/websocket_python.html#id2)
+<a name="contact"></a>
 ## Contact
 
 If you have any questions about FunASR, please contact us by
diff --git a/README_zh.md b/README_zh.md
index af7096c..ee9342d 100644
--- a/README_zh.md
+++ b/README_zh.md
@@ -37,11 +37,33 @@
 璇︽儏璇峰弬鑰冩枃妗o紙[鐐瑰嚮姝ゅ](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)锛�
 
 
-### 瀛︽湳妯″瀷鏇存柊
+### 璇煶璇嗗埆
 
-### 宸ヤ笟妯″瀷鏇存柊
+- 瀛︽湳妯″瀷锛�
+  - Encoder-Decoder妯″瀷锛歔Transformer](egs/aishell/transformer)锛孾Conformer](egs/aishell/conformer)锛孾Branchformer](egs/aishell/branchformer)
+  - Transducer妯″瀷锛歔RNNT锛堟祦寮忥級](egs/aishell/rnnt)锛孾BAT](egs/aishell/bat)
+  - 闈炶嚜鍥炲綊妯″瀷锛歔Paraformer](egs/aishell/paraformer)
+  - 澶氳璇濅汉璇嗗埆妯″瀷锛歔MFCCA](egs_modelscope/asr/mfcca)
+    
+- 宸ヤ笟妯″瀷锛�
+  - 涓枃閫氱敤妯″瀷锛歔Paraformer-large](egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch)锛孾Paraformer-large闀块煶棰戠増鏈琞(egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch)锛孾Paraformer-large娴佸紡鐗堟湰](egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-online)
+  - 涓枃閫氱敤鐑瘝妯″瀷锛歔Paraformer-large-contextual](egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404)锛�
+  - 鑻辨枃閫氱敤妯″瀷锛歔Conformer]()
+  - 娴佸紡绂荤嚎涓�浣撳寲妯″瀷锛� [16k UniASR闂藉崡璇璢(https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary)銆� [16k UniASR娉曡](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary)銆� [16k UniASR寰疯](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary)銆� [16k UniASR瓒婂崡璇璢(https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary)銆� [16k UniASR娉㈡柉璇璢(https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary),
+  [16k UniASR缂呯敻璇璢(https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/summary)銆�      [16k UniASR甯屼集鏉ヨ](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/summary)銆�      [16k UniASR涔屽皵閮借](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/summary)銆�      [8k UniASR涓枃閲戣瀺棰嗗煙](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-finance-vocab3445-online/summary)銆乕16k UniASR涓枃闊宠棰戦鍩焆(https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-audio_and_video-vocab3445-online/summary)
+    
+### 璇磋瘽浜鸿瘑鍒�
+  - 璇磋瘽浜虹‘璁ゆā鍨嬶細[xvector](egs_modelscope/speaker_verification)
+  - 璇磋瘽浜烘棩蹇楁ā鍨嬶細[SOND](egs/callhome/diarization/sond)
 
-- 2023/07/06 
+### 鏍囩偣鎭㈠
+  - 涓枃鏍囩偣妯″瀷锛歔CT-Transformer](egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch)锛孾CT-Transformer娴佸紡](egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727)
+
+### 绔偣妫�娴�
+  - [FSMN-VAD](egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common)
+
+### 鏃堕棿鎴抽娴�
+  - 瀛楃骇鍒ā鍨嬶細[TP-Aligner](egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline)
 
 <a name="鏍稿績鍔熻兘"></a>
 ## 鏍稿績鍔熻兘
@@ -180,7 +202,7 @@
 
 
 ## 璁稿彲鍗忚
-椤圭洰閬靛惊[The MIT License](https://opensource.org/licenses/MIT)寮�婧愬崗璁�. 宸ヤ笟妯″瀷璁稿彲鍗忚璇峰弬鑰冿紙[鐐瑰嚮姝ゅ](./MODEL_LICENSE)锛�
+椤圭洰閬靛惊[The MIT License](https://opensource.org/licenses/MIT)寮�婧愬崗璁�� 宸ヤ笟妯″瀷璁稿彲鍗忚璇峰弬鑰冿紙[鐐瑰嚮姝ゅ](./MODEL_LICENSE)锛�
 
 
 ## Stargazers over time
diff --git a/docs/README.md b/docs/README.md
index 4e16b04..df9b556 100644
--- a/docs/README.md
+++ b/docs/README.md
@@ -4,9 +4,9 @@
 For convenience, we provide users with the ability to generate local HTML manually.
 
 First, you should install the following packages, which is required for building HTML:
+
 ```sh
-conda activate funasr
-pip install requests sphinx nbsphinx sphinx_markdown_tables sphinx_rtd_theme recommonmark
+pip3 install -U "funasr[docs]"
 ```
 
 Then you can generate HTML manually.
diff --git a/egs/aishell/branchformer/conf/decode_asr_transformer.yaml b/egs/aishell/branchformer/conf/decode_asr_transformer.yaml
new file mode 100644
index 0000000..e87a293
--- /dev/null
+++ b/egs/aishell/branchformer/conf/decode_asr_transformer.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.4
+lm_weight: 0.0
diff --git a/egs/aishell/branchformer/conf/train_asr_branchformer.yaml b/egs/aishell/branchformer/conf/train_asr_branchformer.yaml
new file mode 100644
index 0000000..f35c897
--- /dev/null
+++ b/egs/aishell/branchformer/conf/train_asr_branchformer.yaml
@@ -0,0 +1,104 @@
+# network architecture
+# encoder related
+encoder: branchformer
+encoder_conf:
+    output_size: 256
+    use_attn: true
+    attention_heads: 4
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    use_cgmlp: true
+    cgmlp_linear_units: 2048
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    merge_method: concat
+    cgmlp_weight: 0.5               # used only if merge_method is "fixed_ave"
+    attn_branch_drop_rate: 0.0      # used only if merge_method is "learned_ave"
+    num_blocks: 24
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    stochastic_depth_rate: 0.0
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.
+    src_attention_dropout_rate: 0.
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 180
+val_scheduler_criterion:
+    - valid
+    - acc
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+   lr: 0.001
+   weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 35000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 10
+
+dataset_conf:
+    data_names: speech,text
+    data_types: sound,text
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 2048
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 10000
+    num_workers: 8
+
+log_interval: 50
+normalize: None
\ No newline at end of file
diff --git a/egs/aishell/branchformer/local/aishell_data_prep.sh b/egs/aishell/branchformer/local/aishell_data_prep.sh
new file mode 100755
index 0000000..83f489b
--- /dev/null
+++ b/egs/aishell/branchformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# data directory check
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires two directory arguments"
+  exit 1;
+fi
+
+# find wav audio file for train, dev and test resp.
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=`cat $tmp_dir/wav.flist | wc -l`
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
diff --git a/egs/aishell/branchformer/local/download_and_untar.sh b/egs/aishell/branchformer/local/download_and_untar.sh
new file mode 100755
index 0000000..d982559
--- /dev/null
+++ b/egs/aishell/branchformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."
+fi
+
+data=$1
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/aishell/branchformer/path.sh b/egs/aishell/branchformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/aishell/branchformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
diff --git a/egs/aishell/branchformer/run.sh b/egs/aishell/branchformer/run.sh
new file mode 100755
index 0000000..6bb4a0c
--- /dev/null
+++ b/egs/aishell/branchformer/run.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3"
+gpu_num=4
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" #feature output dictionary
+exp_dir="."
+lang=zh
+token_type=char
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=0
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+asr_config=conf/train_asr_branchformer.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer.yaml
+inference_asr_model=valid.acc.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$[${ngpu}*${njob}]
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi 
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$[$i+1])
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --token_type $token_type \
+                --token_list $token_list \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --data_file_names "wav.scp,text" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        done
+        wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} already exists. If you want to decode again, please delete this dir first."
+            continue
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+        python utils/proce_text.py ${_data}/text ${_data}/text.proc
+        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
+
+# Prepare files for ModelScope fine-tuning and inference
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "stage 6: ModelScope Preparation"
+    cp ${feats_dir}/data/${train_set}/cmvn/am.mvn ${exp_dir}/exp/${model_dir}/am.mvn
+    vocab_size=$(cat ${token_list} | wc -l)
+    python utils/gen_modelscope_configuration.py \
+        --am_model_name $inference_asr_model \
+        --mode asr \
+        --model_name conformer \
+        --dataset aishell \
+        --output_dir $exp_dir/exp/$model_dir \
+        --vocab_size $vocab_size \
+        --tag $tag
+fi
\ No newline at end of file
diff --git a/egs/aishell/branchformer/utils b/egs/aishell/branchformer/utils
new file mode 120000
index 0000000..4072eac
--- /dev/null
+++ b/egs/aishell/branchformer/utils
@@ -0,0 +1 @@
+../transformer/utils
\ No newline at end of file
diff --git a/egs/aishell/e_branchformer/conf/decode_asr_transformer.yaml b/egs/aishell/e_branchformer/conf/decode_asr_transformer.yaml
new file mode 100644
index 0000000..e87a293
--- /dev/null
+++ b/egs/aishell/e_branchformer/conf/decode_asr_transformer.yaml
@@ -0,0 +1,6 @@
+beam_size: 10
+penalty: 0.0
+maxlenratio: 0.0
+minlenratio: 0.0
+ctc_weight: 0.4
+lm_weight: 0.0
diff --git a/egs/aishell/e_branchformer/conf/train_asr_e_branchformer.yaml b/egs/aishell/e_branchformer/conf/train_asr_e_branchformer.yaml
new file mode 100644
index 0000000..a30e9a2
--- /dev/null
+++ b/egs/aishell/e_branchformer/conf/train_asr_e_branchformer.yaml
@@ -0,0 +1,101 @@
+# network architecture
+# encoder related
+encoder: e_branchformer
+encoder_conf:
+    output_size: 256
+    attention_heads: 4
+    attention_layer_type: rel_selfattn
+    pos_enc_layer_type: rel_pos
+    rel_pos_type: latest
+    cgmlp_linear_units: 1024
+    cgmlp_conv_kernel: 31
+    use_linear_after_conv: false
+    gate_activation: identity
+    num_blocks: 12
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    attention_dropout_rate: 0.1
+    input_layer: conv2d
+    layer_drop_rate: 0.0
+    linear_units: 1024
+    positionwise_layer_type: linear
+    use_ffn: true
+    macaron_ffn: true
+    merge_conv_kernel: 31
+
+# decoder related
+decoder: transformer
+decoder_conf:
+    attention_heads: 4
+    linear_units: 2048
+    num_blocks: 6
+    dropout_rate: 0.1
+    positional_dropout_rate: 0.1
+    self_attention_dropout_rate: 0.
+    src_attention_dropout_rate: 0.
+
+# frontend related
+frontend: wav_frontend
+frontend_conf:
+    fs: 16000
+    window: hamming
+    n_mels: 80
+    frame_length: 25
+    frame_shift: 10
+    lfr_m: 1
+    lfr_n: 1
+
+# hybrid CTC/attention
+model_conf:
+    ctc_weight: 0.3
+    lsm_weight: 0.1     # label smoothing option
+    length_normalized_loss: false
+
+# optimization related
+accum_grad: 1
+grad_clip: 5
+max_epoch: 180
+best_model_criterion:
+-   - valid
+    - acc
+    - max
+keep_nbest_models: 10
+
+optim: adam
+optim_conf:
+   lr: 0.001
+   weight_decay: 0.000001
+scheduler: warmuplr
+scheduler_conf:
+   warmup_steps: 35000
+
+specaug: specaug
+specaug_conf:
+    apply_time_warp: true
+    time_warp_window: 5
+    time_warp_mode: bicubic
+    apply_freq_mask: true
+    freq_mask_width_range:
+    - 0
+    - 27
+    num_freq_mask: 2
+    apply_time_mask: true
+    time_mask_width_ratio_range:
+    - 0.
+    - 0.05
+    num_time_mask: 10
+
+dataset_conf:
+    data_names: speech,text
+    data_types: sound,text
+    shuffle: True
+    shuffle_conf:
+        shuffle_size: 2048
+        sort_size: 500
+    batch_conf:
+        batch_type: token
+        batch_size: 10000
+    num_workers: 8
+
+log_interval: 50
+normalize: None
\ No newline at end of file
diff --git a/egs/aishell/e_branchformer/local/aishell_data_prep.sh b/egs/aishell/e_branchformer/local/aishell_data_prep.sh
new file mode 100755
index 0000000..83f489b
--- /dev/null
+++ b/egs/aishell/e_branchformer/local/aishell_data_prep.sh
@@ -0,0 +1,66 @@
+#!/bin/bash
+
+# Copyright 2017 Xingyu Na
+# Apache 2.0
+
+#. ./path.sh || exit 1;
+
+if [ $# != 3 ]; then
+  echo "Usage: $0 <audio-path> <text-path> <output-path>"
+  echo " $0 /export/a05/xna/data/data_aishell/wav /export/a05/xna/data/data_aishell/transcript data"
+  exit 1;
+fi
+
+aishell_audio_dir=$1
+aishell_text=$2/aishell_transcript_v0.8.txt
+output_dir=$3
+
+train_dir=$output_dir/data/local/train
+dev_dir=$output_dir/data/local/dev
+test_dir=$output_dir/data/local/test
+tmp_dir=$output_dir/data/local/tmp
+
+mkdir -p $train_dir
+mkdir -p $dev_dir
+mkdir -p $test_dir
+mkdir -p $tmp_dir
+
+# sanity check: the audio directory and the transcript file must both exist
+if [ ! -d $aishell_audio_dir ] || [ ! -f $aishell_text ]; then
+  echo "Error: $0 requires the audio directory and the transcript file to exist"
+  exit 1;
+fi
+
+# find wav audio files for train, dev and test respectively
+find $aishell_audio_dir -iname "*.wav" > $tmp_dir/wav.flist
+n=$(wc -l < $tmp_dir/wav.flist)
+[ $n -ne 141925 ] && \
+  echo Warning: expected 141925 data files, found $n
+
+grep -i "wav/train" $tmp_dir/wav.flist > $train_dir/wav.flist || exit 1;
+grep -i "wav/dev" $tmp_dir/wav.flist > $dev_dir/wav.flist || exit 1;
+grep -i "wav/test" $tmp_dir/wav.flist > $test_dir/wav.flist || exit 1;
+
+rm -r $tmp_dir
+
+# Transcriptions preparation
+for dir in $train_dir $dev_dir $test_dir; do
+  echo Preparing $dir transcriptions
+  sed -e 's/\.wav//' $dir/wav.flist | awk -F '/' '{print $NF}' > $dir/utt.list
+  paste -d' ' $dir/utt.list $dir/wav.flist > $dir/wav.scp_all
+  utils/filter_scp.pl -f 1 $dir/utt.list $aishell_text > $dir/transcripts.txt
+  awk '{print $1}' $dir/transcripts.txt > $dir/utt.list
+  utils/filter_scp.pl -f 1 $dir/utt.list $dir/wav.scp_all | sort -u > $dir/wav.scp
+  sort -u $dir/transcripts.txt > $dir/text
+done
+
+mkdir -p $output_dir/data/train $output_dir/data/dev $output_dir/data/test
+
+for f in wav.scp text; do
+  cp $train_dir/$f $output_dir/data/train/$f || exit 1;
+  cp $dev_dir/$f $output_dir/data/dev/$f || exit 1;
+  cp $test_dir/$f $output_dir/data/test/$f || exit 1;
+done
+
+echo "$0: AISHELL data preparation succeeded"
+exit 0;
diff --git a/egs/aishell/e_branchformer/local/download_and_untar.sh b/egs/aishell/e_branchformer/local/download_and_untar.sh
new file mode 100755
index 0000000..d982559
--- /dev/null
+++ b/egs/aishell/e_branchformer/local/download_and_untar.sh
@@ -0,0 +1,105 @@
+#!/usr/bin/env bash
+
+# Copyright   2014  Johns Hopkins University (author: Daniel Povey)
+#             2017  Xingyu Na
+# Apache 2.0
+
+remove_archive=false
+
+if [ "$1" == --remove-archive ]; then
+  remove_archive=true
+  shift
+fi
+
+if [ $# -ne 3 ]; then
+  echo "Usage: $0 [--remove-archive] <data-base> <url-base> <corpus-part>"
+  echo "e.g.: $0 /export/a05/xna/data www.openslr.org/resources/33 data_aishell"
+  echo "With --remove-archive it will remove the archive after successfully un-tarring it."
+  echo "<corpus-part> can be one of: data_aishell, resource_aishell."; exit 1;
+fi
+
+data=$(readlink -f "$1")  # make absolute: later cd's would break a relative path
+url=$2
+part=$3
+
+if [ ! -d "$data" ]; then
+  echo "$0: no such directory $data"
+  exit 1;
+fi
+
+part_ok=false
+list="data_aishell resource_aishell"
+for x in $list; do
+  if [ "$part" == $x ]; then part_ok=true; fi
+done
+if ! $part_ok; then
+  echo "$0: expected <corpus-part> to be one of $list, but got '$part'"
+  exit 1;
+fi
+
+if [ -z "$url" ]; then
+  echo "$0: empty URL base."
+  exit 1;
+fi
+
+if [ -f $data/$part/.complete ]; then
+  echo "$0: data part $part was already successfully extracted, nothing to do."
+  exit 0;
+fi
+
+# sizes of the archive files in bytes.
+sizes="15582913665 1246920"
+
+if [ -f $data/$part.tgz ]; then
+  size=$(/bin/ls -l $data/$part.tgz | awk '{print $5}')
+  size_ok=false
+  for s in $sizes; do if [ $s == $size ]; then size_ok=true; fi; done
+  if ! $size_ok; then
+    echo "$0: removing existing file $data/$part.tgz because its size in bytes $size"
+    echo "does not equal the size of one of the archives."
+    rm $data/$part.tgz
+  else
+    echo "$data/$part.tgz exists and appears to be complete."
+  fi
+fi
+
+if [ ! -f $data/$part.tgz ]; then
+  if ! command -v wget >/dev/null; then
+    echo "$0: wget is not installed."
+    exit 1;
+  fi
+  full_url=$url/$part.tgz
+  echo "$0: downloading data from $full_url.  This may take some time, please be patient."
+
+  cd $data || exit 1
+  if ! wget --no-check-certificate $full_url; then
+    echo "$0: error executing wget $full_url"
+    exit 1;
+  fi
+fi
+
+cd $data || exit 1
+
+if ! tar -xvzf $part.tgz; then
+  echo "$0: error un-tarring archive $data/$part.tgz"
+  exit 1;
+fi
+
+touch $data/$part/.complete
+
+if [ $part == "data_aishell" ]; then
+  cd $data/$part/wav || exit 1
+  for wav in ./*.tar.gz; do
+    echo "Extracting wav from $wav"
+    tar -zxf $wav && rm $wav
+  done
+fi
+
+echo "$0: Successfully downloaded and un-tarred $data/$part.tgz"
+
+if $remove_archive; then
+  echo "$0: removing $data/$part.tgz file since --remove-archive option was supplied."
+  rm $data/$part.tgz
+fi
+
+exit 0;
diff --git a/egs/aishell/e_branchformer/path.sh b/egs/aishell/e_branchformer/path.sh
new file mode 100755
index 0000000..7972642
--- /dev/null
+++ b/egs/aishell/e_branchformer/path.sh
@@ -0,0 +1,5 @@
+export FUNASR_DIR=$PWD/../../..
+export PATH=$FUNASR_DIR/funasr/bin:$PATH
+
+# NOTE(kan-bayashi): Use UTF-8 in Python to avoid UnicodeDecodeError when LC_ALL=C
+export PYTHONIOENCODING=UTF-8
diff --git a/egs/aishell/e_branchformer/run.sh b/egs/aishell/e_branchformer/run.sh
new file mode 100755
index 0000000..bcba2d7
--- /dev/null
+++ b/egs/aishell/e_branchformer/run.sh
@@ -0,0 +1,225 @@
+#!/usr/bin/env bash
+
+. ./path.sh || exit 1;
+
+# machines configuration
+CUDA_VISIBLE_DEVICES="0,1,2,3"
+gpu_num=4
+count=1
+gpu_inference=true  # Whether to perform gpu decoding, set false for cpu decoding
+# for gpu decoding, inference_nj=ngpu*njob; for cpu decoding, inference_nj=njob
+njob=5
+train_cmd=utils/run.pl
+infer_cmd=utils/run.pl
+
+# general configuration
+feats_dir="../DATA" # feature output directory
+exp_dir="."
+lang=zh
+token_type=char
+type=sound
+scp=wav.scp
+speed_perturb="0.9 1.0 1.1"
+stage=0
+stop_stage=5
+
+# feature configuration
+feats_dim=80
+nj=64
+
+# data
+raw_data=../raw_data
+data_url=www.openslr.org/resources/33
+
+# exp tag
+tag="exp1"
+
+. utils/parse_options.sh || exit 1;
+
+# Set bash to 'debug' mode, it will exit on :
+# -e 'error', -u 'undefined variable', -o ... 'error in pipeline', -x 'print commands',
+set -e
+set -u
+set -o pipefail
+
+train_set=train
+valid_set=dev
+test_sets="dev test"
+
+asr_config=conf/train_asr_e_branchformer.yaml
+model_dir="baseline_$(basename "${asr_config}" .yaml)_${lang}_${token_type}_${tag}"
+
+inference_config=conf/decode_asr_transformer.yaml
+inference_asr_model=valid.acc.ave_10best.pb
+
+# you can set gpu num for decoding here
+gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
+ngpu=$(echo $gpuid_list | awk -F "," '{print NF}')
+
+if ${gpu_inference}; then
+    inference_nj=$((ngpu*njob))
+    _ngpu=1
+else
+    inference_nj=$njob
+    _ngpu=0
+fi
+
+if [ ${stage} -le -1 ] && [ ${stop_stage} -ge -1 ]; then
+    echo "stage -1: Data Download"
+    local/download_and_untar.sh ${raw_data} ${data_url} data_aishell
+    local/download_and_untar.sh ${raw_data} ${data_url} resource_aishell
+fi
+
+if [ ${stage} -le 0 ] && [ ${stop_stage} -ge 0 ]; then
+    echo "stage 0: Data preparation"
+    # Data preparation
+    local/aishell_data_prep.sh ${raw_data}/data_aishell/wav ${raw_data}/data_aishell/transcript ${feats_dir}
+    for x in train dev test; do
+        cp ${feats_dir}/data/${x}/text ${feats_dir}/data/${x}/text.org
+        paste -d " " <(cut -f 1 -d" " ${feats_dir}/data/${x}/text.org) <(cut -f 2- -d" " ${feats_dir}/data/${x}/text.org | tr -d " ") \
+            > ${feats_dir}/data/${x}/text
+        utils/text2token.py -n 1 -s 1 ${feats_dir}/data/${x}/text > ${feats_dir}/data/${x}/text.org
+        mv ${feats_dir}/data/${x}/text.org ${feats_dir}/data/${x}/text
+    done
+fi
+
+if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
+    echo "stage 1: Feature and CMVN Generation"
+    utils/compute_cmvn.sh --fbankdir ${feats_dir}/data/${train_set} --cmd "$train_cmd" --nj $nj --feats_dim ${feats_dim} --config_file "$asr_config" --scale 1.0
+fi
+
+token_list=${feats_dir}/data/${lang}_token_list/$token_type/tokens.txt
+echo "dictionary: ${token_list}"
+if [ ${stage} -le 2 ] && [ ${stop_stage} -ge 2 ]; then
+    echo "stage 2: Dictionary Preparation"
+    mkdir -p ${feats_dir}/data/${lang}_token_list/$token_type/
+
+    echo "make a dictionary"
+    echo "<blank>" > ${token_list}
+    echo "<s>" >> ${token_list}
+    echo "</s>" >> ${token_list}
+    utils/text2token.py -s 1 -n 1 --space "" ${feats_dir}/data/$train_set/text | cut -f 2- -d" " | tr " " "\n" \
+        | sort | uniq | grep -a -v -e '^\s*$' | awk '{print $0}' >> ${token_list}
+    echo "<unk>" >> ${token_list}
+fi
+
+# LM Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 3 ] && [ ${stop_stage} -ge 3 ]; then
+    echo "stage 3: LM Training"
+fi
+
+# ASR Training Stage
+world_size=$gpu_num  # run on one machine
+if [ ${stage} -le 4 ] && [ ${stop_stage} -ge 4 ]; then
+    echo "stage 4: ASR Training"
+    mkdir -p ${exp_dir}/exp/${model_dir}
+    mkdir -p ${exp_dir}/exp/${model_dir}/log
+    INIT_FILE=${exp_dir}/exp/${model_dir}/ddp_init
+    if [ -f $INIT_FILE ];then
+        rm -f $INIT_FILE
+    fi
+    init_method=file://$(readlink -f $INIT_FILE)
+    echo "$0: init method is $init_method"
+    for ((i = 0; i < $gpu_num; ++i)); do
+        {
+            rank=$i
+            local_rank=$i
+            gpu_id=$(echo $CUDA_VISIBLE_DEVICES | cut -d',' -f$((i+1)))
+            train.py \
+                --task_name asr \
+                --gpu_id $gpu_id \
+                --use_preprocessor true \
+                --token_type $token_type \
+                --token_list $token_list \
+                --data_dir ${feats_dir}/data \
+                --train_set ${train_set} \
+                --valid_set ${valid_set} \
+                --data_file_names "wav.scp,text" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --speed_perturb ${speed_perturb} \
+                --resume true \
+                --output_dir ${exp_dir}/exp/${model_dir} \
+                --config $asr_config \
+                --ngpu $gpu_num \
+                --num_worker_count $count \
+                --dist_init_method $init_method \
+                --dist_world_size $world_size \
+                --dist_rank $rank \
+                --local_rank $local_rank 1> ${exp_dir}/exp/${model_dir}/log/train.log.$i 2>&1
+        } &
+        done
+        wait
+fi
+
+# Testing Stage
+if [ ${stage} -le 5 ] && [ ${stop_stage} -ge 5 ]; then
+    echo "stage 5: Inference"
+    for dset in ${test_sets}; do
+        asr_exp=${exp_dir}/exp/${model_dir}
+        inference_tag="$(basename "${inference_config}" .yaml)"
+        _dir="${asr_exp}/${inference_tag}/${inference_asr_model}/${dset}"
+        _logdir="${_dir}/logdir"
+        if [ -d ${_dir} ]; then
+            echo "${_dir} already exists. If you want to decode again, please delete this dir first."
+            continue
+        fi
+        mkdir -p "${_logdir}"
+        _data="${feats_dir}/data/${dset}"
+        key_file=${_data}/${scp}
+        num_scp_file="$(<${key_file} wc -l)"
+        _nj=$([ $inference_nj -le $num_scp_file ] && echo "$inference_nj" || echo "$num_scp_file")
+        split_scps=
+        for n in $(seq "${_nj}"); do
+            split_scps+=" ${_logdir}/keys.${n}.scp"
+        done
+        # shellcheck disable=SC2086
+        utils/split_scp.pl "${key_file}" ${split_scps}
+        _opts=
+        if [ -n "${inference_config}" ]; then
+            _opts+="--config ${inference_config} "
+        fi
+        ${infer_cmd} --gpu "${_ngpu}" --max-jobs-run "${_nj}" JOB=1:"${_nj}" "${_logdir}"/asr_inference.JOB.log \
+            python -m funasr.bin.asr_inference_launch \
+                --batch_size 1 \
+                --ngpu "${_ngpu}" \
+                --njob ${njob} \
+                --gpuid_list ${gpuid_list} \
+                --data_path_and_name_and_type "${_data}/${scp},speech,${type}" \
+                --cmvn_file ${feats_dir}/data/${train_set}/cmvn/am.mvn \
+                --key_file "${_logdir}"/keys.JOB.scp \
+                --asr_train_config "${asr_exp}"/config.yaml \
+                --asr_model_file "${asr_exp}"/"${inference_asr_model}" \
+                --output_dir "${_logdir}"/output.JOB \
+                --mode asr \
+                ${_opts}
+
+        for f in token token_int score text; do
+            if [ -f "${_logdir}/output.1/1best_recog/${f}" ]; then
+                for i in $(seq "${_nj}"); do
+                    cat "${_logdir}/output.${i}/1best_recog/${f}"
+                done | sort -k1 >"${_dir}/${f}"
+            fi
+        done
+        python utils/proce_text.py ${_dir}/text ${_dir}/text.proc
+        python utils/proce_text.py ${_data}/text ${_data}/text.proc
+        python utils/compute_wer.py ${_data}/text.proc ${_dir}/text.proc ${_dir}/text.cer
+        tail -n 3 ${_dir}/text.cer > ${_dir}/text.cer.txt
+        cat ${_dir}/text.cer.txt
+    done
+fi
+
+# Prepare files for ModelScope fine-tuning and inference
+if [ ${stage} -le 6 ] && [ ${stop_stage} -ge 6 ]; then
+    echo "stage 6: ModelScope Preparation"
+    cp ${feats_dir}/data/${train_set}/cmvn/am.mvn ${exp_dir}/exp/${model_dir}/am.mvn
+    vocab_size=$(cat ${token_list} | wc -l)
+    python utils/gen_modelscope_configuration.py \
+        --am_model_name $inference_asr_model \
+        --mode asr \
+        --model_name conformer \
+        --dataset aishell \
+        --output_dir $exp_dir/exp/$model_dir \
+        --vocab_size $vocab_size \
+        --tag $tag
+fi
\ No newline at end of file
diff --git a/egs/aishell/e_branchformer/utils b/egs/aishell/e_branchformer/utils
new file mode 120000
index 0000000..4072eac
--- /dev/null
+++ b/egs/aishell/e_branchformer/utils
@@ -0,0 +1 @@
+../transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/TEMPLATE/README.md b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
index ba179ed..0b354ad 100644
--- a/egs_modelscope/speaker_diarization/TEMPLATE/README.md
+++ b/egs_modelscope/speaker_diarization/TEMPLATE/README.md
@@ -3,7 +3,7 @@
 > **Note**: 
 > The modelscope pipeline supports all the models in 
 [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/model_zoo/modelscope_models.html#pretrained-models-on-modelscope) 
-to inference and finetine. Here we take the model of xvector_sv as example to demonstrate the usage.
+to inference and finetune. Here we take the model of xvector_sv as example to demonstrate the usage.
 
 ## Inference with pipeline
 ### Quick start
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py
index c04d985..679782e 100644
--- a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo_long.py
@@ -8,9 +8,9 @@
 import soundfile as sf
 
 param_dict = dict()
-param_dict['hotword'] = "淇¤"
+param_dict['hotword'] = "浣犵殑鐑瘝"
 
-test_wav = '/Users/shixian/Downloads/tpdebug.wav'
+test_wav = 'YOUR_LONG_WAV.wav'
 output_dir = './tmp'
 os.system("mkdir -p {}".format(output_dir))
 
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 10f8e50..36c6d76 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -370,7 +370,7 @@
             results = speech2text(**batch)
             if len(results) < 1:
                 hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
+                results = [[" ", ["sil"], [2], hyp, 10, 6, []]] * nbest
             time_end = time.time()
             forward_time = time_end - time_beg
             lfr_factor = results[0][-1]
@@ -439,6 +439,7 @@
         logging.info(rtf_avg)
         if writer is not None:
             ibest_writer["rtf"]["rtf_avf"] = rtf_avg
+        torch.cuda.empty_cache()
         return asr_result_list
 
     return _forward
@@ -564,6 +565,7 @@
         if 'hotword' in kwargs:
             hotword_list_or_file = kwargs['hotword']
 
+        speech2vadsegment.vad_model.vad_opts.max_single_segment_time = kwargs.get("max_single_segment_time", 60000)
         batch_size_token = kwargs.get("batch_size_token", 6000)
         print("batch_size_token: ", batch_size_token)
 
@@ -646,8 +648,7 @@
             beg_idx = 0
             for j, _ in enumerate(range(0, n)):
                 batch_size_token_ms_cum += (sorted_data[j][0][1] - sorted_data[j][0][0])
-                if j < n - 1 and (batch_size_token_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][
-                    0]) < batch_size_token_ms:
+                if j < n - 1 and (batch_size_token_ms_cum + sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < batch_size_token_ms and (sorted_data[j + 1][0][1] - sorted_data[j + 1][0][0]) < speech2vadsegment.vad_model.vad_opts.max_single_segment_time:
                     continue
                 batch_size_token_ms_cum = 0
                 end_idx = j + 1
@@ -730,6 +731,7 @@
                     ibest_writer["time_stamp"][key] = "{}".format(time_stamp_postprocessed)
 
             logging.info("decoding, utt: {}, predictions: {}".format(key, text_postprocessed_punc))
+        torch.cuda.empty_cache()
         return asr_result_list
 
     return _forward
@@ -1327,7 +1329,6 @@
         right_context: Number of frames in right context AFTER subsampling.
         display_partial_hypotheses: Whether to display partial hypotheses.
     """
-    # assert check_argument_types()
 
     if batch_size > 1:
         raise NotImplementedError("batch decoding is not implemented")
@@ -1339,7 +1340,7 @@
         format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
     )
 
-    if ngpu >= 1:
+    if ngpu >= 1 and torch.cuda.is_available():
         device = "cuda"
     else:
         device = "cpu"
@@ -1370,10 +1371,7 @@
         left_context=left_context,
         right_context=right_context,
     )
-    speech2text = Speech2TextTransducer.from_pretrained(
-        model_tag=model_tag,
-        **speech2text_kwargs,
-    )
+    speech2text = Speech2TextTransducer(**speech2text_kwargs)
 
     def _forward(data_path_and_name_and_type,
                  raw_inputs: Union[np.ndarray, torch.Tensor] = None,
diff --git a/funasr/bin/punc_infer.py b/funasr/bin/punc_infer.py
index ac96811..7b61717 100644
--- a/funasr/bin/punc_infer.py
+++ b/funasr/bin/punc_infer.py
@@ -8,6 +8,7 @@
 
 import numpy as np
 import torch
+import os
 
 from funasr.build_utils.build_model_from_file import build_model_from_file
 from funasr.datasets.preprocessor import CodeMixTokenizerCommonPreprocessor
@@ -41,6 +42,11 @@
                 self.punc_list[i] = "锛�"
             elif self.punc_list[i] == "銆�":
                 self.period = i
+        self.seg_dict_file = None
+        self.seg_jieba = False
+        if "seg_jieba" in train_args:
+            self.seg_jieba = train_args.seg_jieba
+            self.seg_dict_file = os.path.dirname(model_file)+"/"+ "jieba_usr_dict"
         self.preprocessor = CodeMixTokenizerCommonPreprocessor(
             train=False,
             token_type=train_args.token_type,
@@ -50,6 +56,8 @@
             g2p_type=train_args.g2p,
             text_name="text",
             non_linguistic_symbols=train_args.non_linguistic_symbols,
+            seg_jieba=self.seg_jieba,
+            seg_dict_file=self.seg_dict_file
         )
 
     @torch.no_grad()
diff --git a/funasr/bin/punc_train.py b/funasr/bin/punc_train.py
index aeded7b..c3cbee9 100644
--- a/funasr/bin/punc_train.py
+++ b/funasr/bin/punc_train.py
@@ -44,4 +44,10 @@
     else:
         args.distributed = False
 
+    if args.dataset_type == "small":
+        if args.batch_size is not None:
+            args.batch_size = args.batch_size * args.ngpu * args.num_worker_count
+        if args.batch_bins is not None:
+            args.batch_bins = args.batch_bins * args.ngpu * args.num_worker_count
+
     main(args=args)
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 47af011..0cde570 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -123,7 +123,7 @@
                 vad_results.append(item)
                 if writer is not None:
                     ibest_writer["text"][keys[i]] = "{}".format(results[i])
-
+        torch.cuda.empty_cache()
         return vad_results
 
     return _forward
diff --git a/funasr/build_utils/build_asr_model.py b/funasr/build_utils/build_asr_model.py
index 6606d30..5e93444 100644
--- a/funasr/build_utils/build_asr_model.py
+++ b/funasr/build_utils/build_asr_model.py
@@ -39,6 +39,8 @@
 from funasr.models.encoder.resnet34_encoder import ResNet34Diar
 from funasr.models.encoder.rnn_encoder import RNNEncoder
 from funasr.models.encoder.sanm_encoder import SANMEncoder, SANMEncoderChunkOpt
+from funasr.models.encoder.branchformer_encoder import BranchformerEncoder
+from funasr.models.encoder.e_branchformer_encoder import EBranchformerEncoder
 from funasr.models.encoder.transformer_encoder import TransformerEncoder
 from funasr.models.frontend.default import DefaultFrontend
 from funasr.models.frontend.default import MultiChannelFrontend
@@ -113,6 +115,8 @@
         sanm=SANMEncoder,
         sanm_chunk_opt=SANMEncoderChunkOpt,
         data2vec_encoder=Data2VecEncoder,
+        branchformer=BranchformerEncoder,
+        e_branchformer=EBranchformerEncoder,
         mfcca_enc=MFCCAEncoder,
         chunk_conformer=ConformerChunkEncoder,
     ),
diff --git a/funasr/datasets/large_datasets/build_dataloader.py b/funasr/datasets/large_datasets/build_dataloader.py
index 7a1a906..6c2da2a 100644
--- a/funasr/datasets/large_datasets/build_dataloader.py
+++ b/funasr/datasets/large_datasets/build_dataloader.py
@@ -69,12 +69,15 @@
             symbol_table = read_symbol_table(args.token_list)
         if hasattr(args, "seg_dict_file") and args.seg_dict_file is not None:
             seg_dict = load_seg_dict(args.seg_dict_file)
-        if hasattr(args, "punc_dict_file") and args.punc_dict_file is not None:
-            punc_dict = read_symbol_table(args.punc_dict_file)
+        if hasattr(args, "punc_list") and args.punc_list is not None:
+            punc_dict = read_symbol_table(args.punc_list)
         if hasattr(args, "bpemodel") and args.bpemodel is not None:
             bpe_tokenizer = SentencepiecesTokenizer(args.bpemodel)
         self.dataset_conf = args.dataset_conf
-        self.frontend_conf = args.frontend_conf
+        if "frontend_conf" not in args:
+            self.frontend_conf =  None
+        else:
+            self.frontend_conf = args.frontend_conf
         self.speed_perturb = args.speed_perturb if hasattr(args, "speed_perturb") else None 
         logging.info("dataloader config: {}".format(self.dataset_conf))
         batch_mode = self.dataset_conf.get("batch_mode", "padding")
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 1e9bb26..6c166a5 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -229,15 +229,15 @@
                            mode=mode, 
                            )
 
-    filter_conf = conf.get('filter_conf', {})
-    filter_fn = partial(filter, **filter_conf)
-    dataset = FilterIterDataPipe(dataset, fn=filter_fn)
-
     if "text" in data_names:
         vocab = {'vocab': dict, 'seg_dict': seg_dict, 'punc_dict': punc_dict, 'bpe_tokenizer': bpe_tokenizer, 'hw_config': hw_config}
         tokenize_fn = partial(tokenize, **vocab)
         dataset = MapperIterDataPipe(dataset, fn=tokenize_fn)
 
+    filter_conf = conf.get('filter_conf', {})
+    filter_fn = partial(filter, **filter_conf)
+    dataset = FilterIterDataPipe(dataset, fn=filter_fn)
+
     if shuffle:
         buffer_conf = conf.get('shuffle_conf', {})
         buffer_size = buffer_conf['shuffle_size']
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index c16e1dc..34a97c1 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -54,9 +54,9 @@
 
     length = len(text)
     if 'hw_tag' in data:
+        pre_index = None
         if hw_config['pre_hwlist'] is not None and hw_config['pre_prob'] > 0:
             # enable preset hotword detect in sampling
-            pre_index = None
             for hw in hw_config['pre_hwlist']:
                 hw = " ".join(seg_tokenize(hw, seg_dict))
                 _find = " ".join(text).find(hw)
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index cb4288c..bd2c972 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -12,6 +12,7 @@
 import scipy.signal
 import soundfile
 
+
 from funasr.text.build_tokenizer import build_tokenizer
 from funasr.text.cleaner import TextCleaner
 from funasr.text.token_id_converter import TokenIDConverter
@@ -628,6 +629,7 @@
             text_name: str = "text",
             split_text_name: str = "split_text",
             split_with_space: bool = False,
+            seg_jieba: bool = False,
             seg_dict_file: str = None,
     ):
         super().__init__(
@@ -655,6 +657,10 @@
         )
         # The data field name for split text.
         self.split_text_name = split_text_name
+        self.seg_jieba = seg_jieba
+        if self.seg_jieba:
+            import jieba
+            jieba.load_userdict(seg_dict_file)
 
     @classmethod
     def split_words(cls, text: str):
@@ -677,12 +683,73 @@
                 words.append(current_word)
         return words
 
+    @classmethod
+    def isEnglish(cls, text:str):
+        if re.search('^[a-zA-Z\']+$', text):
+            return True
+        else:
+            return False
+
+    @classmethod
+    def join_chinese_and_english(cls, input_list):
+        line = ''
+        for token in input_list:
+            if cls.isEnglish(token):
+                line = line + ' ' + token
+            else:
+                line = line + token
+
+        line = line.strip()
+        return line   
+
+    @classmethod
+    def split_words_jieba(cls, text: str):
+        input_list = text.split()
+        token_list_all = []
+        langauge_list = []
+        token_list_tmp = []
+        language_flag = None
+        for token in input_list:
+            if cls.isEnglish(token) and language_flag == 'Chinese':
+                token_list_all.append(token_list_tmp)
+                langauge_list.append('Chinese')
+                token_list_tmp = []
+            elif not cls.isEnglish(token) and language_flag == 'English':
+                token_list_all.append(token_list_tmp)
+                langauge_list.append('English')
+                token_list_tmp = []
+
+            token_list_tmp.append(token)
+
+            if cls.isEnglish(token):
+                language_flag = 'English'
+            else:
+                language_flag = 'Chinese'
+
+        if token_list_tmp:
+            token_list_all.append(token_list_tmp)
+            langauge_list.append(language_flag)
+
+        result_list = []
+        for token_list_tmp, language_flag in zip(token_list_all, langauge_list):
+            if language_flag == 'English':
+                result_list.extend(token_list_tmp)
+            else:
+                seg_list = jieba.cut(cls.join_chinese_and_english(token_list_tmp), HMM=False)
+                result_list.extend(seg_list)
+
+        return result_list
+
     def __call__(
             self, uid: str, data: Dict[str, Union[list, str, np.ndarray]]
     ) -> Dict[str, Union[list, np.ndarray]]:
         # Split words.
         if isinstance(data[self.text_name], str):
-            split_text = self.split_words(data[self.text_name])
+            if self.seg_jieba:
+  #              jieba.load_userdict(seg_dict_file)
+                split_text = self.split_words_jieba(data[self.text_name])
+            else:
+                split_text = self.split_words(data[self.text_name])
         else:
             split_text = data[self.text_name]
         data[self.text_name] = " ".join(split_text)
@@ -782,7 +849,6 @@
     ) -> Dict[str, np.ndarray]:
         for i in range(self.num_tokenizer):
             text_name = self.text_name[i]
-            #import pdb; pdb.set_trace()
             if text_name in data and self.tokenizer[i] is not None:
                 text = data[text_name]
                 text = self.text_cleaner(text)
diff --git a/funasr/models/encoder/branchformer_encoder.py b/funasr/models/encoder/branchformer_encoder.py
new file mode 100644
index 0000000..70bd2c9
--- /dev/null
+++ b/funasr/models/encoder/branchformer_encoder.py
@@ -0,0 +1,545 @@
+# Copyright 2022 Yifan Peng (Carnegie Mellon University)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Branchformer encoder definition.
+
+Reference:
+    Yifan Peng, Siddharth Dalmia, Ian Lane, and Shinji Watanabe,
+    鈥淏ranchformer: Parallel MLP-Attention Architectures to Capture
+    Local and Global Context for Speech Recognition and Understanding,鈥�
+    in Proceedings of ICML, 2022.
+
+"""
+
+import logging
+from typing import List, Optional, Tuple, Union
+
+import numpy
+import torch
+
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.modules.cgmlp import ConvolutionalGatingMLP
+from funasr.modules.fastformer import FastSelfAttention
+from funasr.modules.nets_utils import make_pad_mask
+from funasr.modules.attention import (  # noqa: H301
+    LegacyRelPositionMultiHeadedAttention,
+    MultiHeadedAttention,
+    RelPositionMultiHeadedAttention,
+)
+from funasr.modules.embedding import (  # noqa: H301
+    LegacyRelPositionalEncoding,
+    PositionalEncoding,
+    RelPositionalEncoding,
+    ScaledPositionalEncoding,
+)
+from funasr.modules.layer_norm import LayerNorm
+from funasr.modules.repeat import repeat
+from funasr.modules.subsampling import (
+    Conv2dSubsampling,
+    Conv2dSubsampling2,
+    Conv2dSubsampling6,
+    Conv2dSubsampling8,
+    TooShortUttError,
+    check_short_utt,
+)
+
+
+class BranchformerEncoderLayer(torch.nn.Module):
+    """Branchformer encoder layer module.
+
+    Args:
+        size (int): model dimension
+        attn: standard self-attention or efficient attention, optional
+        cgmlp: ConvolutionalGatingMLP, optional
+        dropout_rate (float): dropout probability
+        merge_method (str): concat, learned_ave, fixed_ave
+        cgmlp_weight (float): weight of the cgmlp branch, between 0 and 1,
+            used if merge_method is fixed_ave
+        attn_branch_drop_rate (float): probability of dropping the attn branch,
+            used if merge_method is learned_ave
+        stochastic_depth_rate (float): stochastic depth probability
+    """
+
+    def __init__(
+        self,
+        size: int,
+        attn: Optional[torch.nn.Module],
+        cgmlp: Optional[torch.nn.Module],
+        dropout_rate: float,
+        merge_method: str,
+        cgmlp_weight: float = 0.5,
+        attn_branch_drop_rate: float = 0.0,
+        stochastic_depth_rate: float = 0.0,
+    ):
+        super().__init__()
+        assert (attn is not None) or (
+            cgmlp is not None
+        ), "At least one branch should be valid"
+
+        self.size = size
+        self.attn = attn
+        self.cgmlp = cgmlp
+        self.merge_method = merge_method
+        self.cgmlp_weight = cgmlp_weight
+        self.attn_branch_drop_rate = attn_branch_drop_rate
+        self.stochastic_depth_rate = stochastic_depth_rate
+        self.use_two_branches = (attn is not None) and (cgmlp is not None)
+
+        if attn is not None:
+            self.norm_mha = LayerNorm(size)  # for the MHA module
+        if cgmlp is not None:
+            self.norm_mlp = LayerNorm(size)  # for the MLP module
+        self.norm_final = LayerNorm(size)  # for the final output of the block
+
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+        if self.use_two_branches:
+            if merge_method == "concat":
+                self.merge_proj = torch.nn.Linear(size + size, size)
+
+            elif merge_method == "learned_ave":
+                # attention-based pooling for two branches
+                self.pooling_proj1 = torch.nn.Linear(size, 1)
+                self.pooling_proj2 = torch.nn.Linear(size, 1)
+
+                # linear projections for calculating merging weights
+                self.weight_proj1 = torch.nn.Linear(size, 1)
+                self.weight_proj2 = torch.nn.Linear(size, 1)
+
+                # linear projection after weighted average
+                self.merge_proj = torch.nn.Linear(size, size)
+
+            elif merge_method == "fixed_ave":
+                assert (
+                    0.0 <= cgmlp_weight <= 1.0
+                ), "cgmlp weight should be between 0.0 and 1.0"
+
+                # remove the other branch if only one branch is used
+                if cgmlp_weight == 0.0:
+                    self.use_two_branches = False
+                    self.cgmlp = None
+                    self.norm_mlp = None
+                elif cgmlp_weight == 1.0:
+                    self.use_two_branches = False
+                    self.attn = None
+                    self.norm_mha = None
+
+                # linear projection after weighted average
+                self.merge_proj = torch.nn.Linear(size, size)
+
+            else:
+                raise ValueError(f"unknown merge method: {merge_method}")
+
+        else:
+            self.merge_proj = torch.nn.Identity()
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
+            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size).
+            torch.Tensor: Mask tensor (#batch, time).
+        """
+
+        if cache is not None:
+            raise NotImplementedError("cache is not None, which is not tested")
+
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        skip_layer = False
+        # with stochastic depth, residual connection `x + f(x)` becomes
+        # `x <- x + 1 / (1 - p) * f(x)` at training time.
+        stoch_layer_coeff = 1.0
+        if self.training and self.stochastic_depth_rate > 0:
+            skip_layer = torch.rand(1).item() < self.stochastic_depth_rate
+            stoch_layer_coeff = 1.0 / (1 - self.stochastic_depth_rate)
+
+        if skip_layer:
+            if cache is not None:
+                x = torch.cat([cache, x], dim=1)
+            if pos_emb is not None:
+                return (x, pos_emb), mask
+            return x, mask
+
+        # Two branches
+        x1 = x
+        x2 = x
+
+        # Branch 1: multi-headed attention module
+        if self.attn is not None:
+            x1 = self.norm_mha(x1)
+
+            if isinstance(self.attn, FastSelfAttention):
+                x_att = self.attn(x1, mask)
+            else:
+                if pos_emb is not None:
+                    x_att = self.attn(x1, x1, x1, pos_emb, mask)
+                else:
+                    x_att = self.attn(x1, x1, x1, mask)
+
+            x1 = self.dropout(x_att)
+
+        # Branch 2: convolutional gating mlp
+        if self.cgmlp is not None:
+            x2 = self.norm_mlp(x2)
+
+            if pos_emb is not None:
+                x2 = (x2, pos_emb)
+            x2 = self.cgmlp(x2, mask)
+            if isinstance(x2, tuple):
+                x2 = x2[0]
+
+            x2 = self.dropout(x2)
+
+        # Merge two branches
+        if self.use_two_branches:
+            if self.merge_method == "concat":
+                x = x + stoch_layer_coeff * self.dropout(
+                    self.merge_proj(torch.cat([x1, x2], dim=-1))
+                )
+            elif self.merge_method == "learned_ave":
+                if (
+                    self.training
+                    and self.attn_branch_drop_rate > 0
+                    and torch.rand(1).item() < self.attn_branch_drop_rate
+                ):
+                    # Drop the attn branch
+                    w1, w2 = 0.0, 1.0
+                else:
+                    # branch1
+                    score1 = (
+                        self.pooling_proj1(x1).transpose(1, 2) / self.size**0.5
+                    )  # (batch, 1, time)
+                    if mask is not None:
+                        min_value = float(
+                            numpy.finfo(
+                                torch.tensor(0, dtype=score1.dtype).numpy().dtype
+                            ).min
+                        )
+                        score1 = score1.masked_fill(mask.eq(0), min_value)
+                        score1 = torch.softmax(score1, dim=-1).masked_fill(
+                            mask.eq(0), 0.0
+                        )
+                    else:
+                        score1 = torch.softmax(score1, dim=-1)
+                    pooled1 = torch.matmul(score1, x1).squeeze(1)  # (batch, size)
+                    weight1 = self.weight_proj1(pooled1)  # (batch, 1)
+
+                    # branch2
+                    score2 = (
+                        self.pooling_proj2(x2).transpose(1, 2) / self.size**0.5
+                    )  # (batch, 1, time)
+                    if mask is not None:
+                        min_value = float(
+                            numpy.finfo(
+                                torch.tensor(0, dtype=score2.dtype).numpy().dtype
+                            ).min
+                        )
+                        score2 = score2.masked_fill(mask.eq(0), min_value)
+                        score2 = torch.softmax(score2, dim=-1).masked_fill(
+                            mask.eq(0), 0.0
+                        )
+                    else:
+                        score2 = torch.softmax(score2, dim=-1)
+                    pooled2 = torch.matmul(score2, x2).squeeze(1)  # (batch, size)
+                    weight2 = self.weight_proj2(pooled2)  # (batch, 1)
+
+                    # normalize weights of two branches
+                    merge_weights = torch.softmax(
+                        torch.cat([weight1, weight2], dim=-1), dim=-1
+                    )  # (batch, 2)
+                    merge_weights = merge_weights.unsqueeze(-1).unsqueeze(
+                        -1
+                    )  # (batch, 2, 1, 1)
+                    w1, w2 = merge_weights[:, 0], merge_weights[:, 1]  # (batch, 1, 1)
+
+                x = x + stoch_layer_coeff * self.dropout(
+                    self.merge_proj(w1 * x1 + w2 * x2)
+                )
+            elif self.merge_method == "fixed_ave":
+                x = x + stoch_layer_coeff * self.dropout(
+                    self.merge_proj(
+                        (1.0 - self.cgmlp_weight) * x1 + self.cgmlp_weight * x2
+                    )
+                )
+            else:
+                raise RuntimeError(f"unknown merge method: {self.merge_method}")
+        else:
+            if self.attn is None:
+                x = x + stoch_layer_coeff * self.dropout(self.merge_proj(x2))
+            elif self.cgmlp is None:
+                x = x + stoch_layer_coeff * self.dropout(self.merge_proj(x1))
+            else:
+                # This should not happen
+                raise RuntimeError("Both branches are not None, which is unexpected.")
+
+        x = self.norm_final(x)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
+
+
+class BranchformerEncoder(AbsEncoder):
+    """Branchformer encoder module."""
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        use_attn: bool = True,
+        attention_heads: int = 4,
+        attention_layer_type: str = "rel_selfattn",
+        pos_enc_layer_type: str = "rel_pos",
+        rel_pos_type: str = "latest",
+        use_cgmlp: bool = True,
+        cgmlp_linear_units: int = 2048,
+        cgmlp_conv_kernel: int = 31,
+        use_linear_after_conv: bool = False,
+        gate_activation: str = "identity",
+        merge_method: str = "concat",
+        cgmlp_weight: Union[float, List[float]] = 0.5,
+        attn_branch_drop_rate: Union[float, List[float]] = 0.0,
+        num_blocks: int = 12,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: Optional[str] = "conv2d",
+        zero_triu: bool = False,
+        padding_idx: int = -1,
+        stochastic_depth_rate: Union[float, List[float]] = 0.0,
+    ):
+        super().__init__()
+        self._output_size = output_size
+
+        if rel_pos_type == "legacy":
+            if pos_enc_layer_type == "rel_pos":
+                pos_enc_layer_type = "legacy_rel_pos"
+            if attention_layer_type == "rel_selfattn":
+                attention_layer_type = "legacy_rel_selfattn"
+        elif rel_pos_type == "latest":
+            assert attention_layer_type != "legacy_rel_selfattn"
+            assert pos_enc_layer_type != "legacy_rel_pos"
+        else:
+            raise ValueError("unknown rel_pos_type: " + rel_pos_type)
+
+        if pos_enc_layer_type == "abs_pos":
+            pos_enc_class = PositionalEncoding
+        elif pos_enc_layer_type == "scaled_abs_pos":
+            pos_enc_class = ScaledPositionalEncoding
+        elif pos_enc_layer_type == "rel_pos":
+            assert attention_layer_type == "rel_selfattn"
+            pos_enc_class = RelPositionalEncoding
+        elif pos_enc_layer_type == "legacy_rel_pos":
+            assert attention_layer_type == "legacy_rel_selfattn"
+            pos_enc_class = LegacyRelPositionalEncoding
+            logging.warning(
+                "Using legacy_rel_pos and it will be deprecated in the future."
+            )
+        else:
+            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+
+        if input_layer == "linear":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Linear(input_size, output_size),
+                torch.nn.LayerNorm(output_size),
+                torch.nn.Dropout(dropout_rate),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d":
+            self.embed = Conv2dSubsampling(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d2":
+            self.embed = Conv2dSubsampling2(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d6":
+            self.embed = Conv2dSubsampling6(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d8":
+            self.embed = Conv2dSubsampling8(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "embed":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif isinstance(input_layer, torch.nn.Module):
+            self.embed = torch.nn.Sequential(
+                input_layer,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer is None:
+            if input_size == output_size:
+                self.embed = None
+            else:
+                self.embed = torch.nn.Linear(input_size, output_size)
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+
+        if attention_layer_type == "selfattn":
+            encoder_selfattn_layer = MultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+            )
+        elif attention_layer_type == "legacy_rel_selfattn":
+            assert pos_enc_layer_type == "legacy_rel_pos"
+            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+            )
+            logging.warning(
+                "Using legacy_rel_selfattn and it will be deprecated in the future."
+            )
+        elif attention_layer_type == "rel_selfattn":
+            assert pos_enc_layer_type == "rel_pos"
+            encoder_selfattn_layer = RelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+                zero_triu,
+            )
+        elif attention_layer_type == "fast_selfattn":
+            assert pos_enc_layer_type in ["abs_pos", "scaled_abs_pos"]
+            encoder_selfattn_layer = FastSelfAttention
+            encoder_selfattn_layer_args = (
+                output_size,
+                attention_heads,
+                attention_dropout_rate,
+            )
+        else:
+            raise ValueError("unknown encoder_attn_layer: " + attention_layer_type)
+
+        cgmlp_layer = ConvolutionalGatingMLP
+        cgmlp_layer_args = (
+            output_size,
+            cgmlp_linear_units,
+            cgmlp_conv_kernel,
+            dropout_rate,
+            use_linear_after_conv,
+            gate_activation,
+        )
+
+        if isinstance(stochastic_depth_rate, float):
+            stochastic_depth_rate = [stochastic_depth_rate] * num_blocks
+        if len(stochastic_depth_rate) != num_blocks:
+            raise ValueError(
+                f"Length of stochastic_depth_rate ({len(stochastic_depth_rate)}) "
+                f"should be equal to num_blocks ({num_blocks})"
+            )
+
+        if isinstance(cgmlp_weight, float):
+            cgmlp_weight = [cgmlp_weight] * num_blocks
+        if len(cgmlp_weight) != num_blocks:
+            raise ValueError(
+                f"Length of cgmlp_weight ({len(cgmlp_weight)}) should be equal to "
+                f"num_blocks ({num_blocks})"
+            )
+
+        if isinstance(attn_branch_drop_rate, float):
+            attn_branch_drop_rate = [attn_branch_drop_rate] * num_blocks
+        if len(attn_branch_drop_rate) != num_blocks:
+            raise ValueError(
+                f"Length of attn_branch_drop_rate ({len(attn_branch_drop_rate)}) "
+                f"should be equal to num_blocks ({num_blocks})"
+            )
+
+        self.encoders = repeat(
+            num_blocks,
+            lambda lnum: BranchformerEncoderLayer(
+                output_size,
+                encoder_selfattn_layer(*encoder_selfattn_layer_args)
+                if use_attn
+                else None,
+                cgmlp_layer(*cgmlp_layer_args) if use_cgmlp else None,
+                dropout_rate,
+                merge_method,
+                cgmlp_weight[lnum],
+                attn_branch_drop_rate[lnum],
+                stochastic_depth_rate[lnum],
+            ),
+        )
+        self.after_norm = LayerNorm(output_size)
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    def forward(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+        prev_states: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Calculate forward propagation.
+
+        Args:
+            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
+            ilens (torch.Tensor): Input length (#batch).
+            prev_states (torch.Tensor): Not to be used now.
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, L, output_size).
+            torch.Tensor: Output length (#batch).
+            torch.Tensor: Not to be used now.
+
+        """
+
+        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
+
+        if (
+            isinstance(self.embed, Conv2dSubsampling)
+            or isinstance(self.embed, Conv2dSubsampling2)
+            or isinstance(self.embed, Conv2dSubsampling6)
+            or isinstance(self.embed, Conv2dSubsampling8)
+        ):
+            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
+            if short_status:
+                raise TooShortUttError(
+                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
+                    + f"(it needs more than {limit_size} frames), return empty results",
+                    xs_pad.size(1),
+                    limit_size,
+                )
+            xs_pad, masks = self.embed(xs_pad, masks)
+        elif self.embed is not None:
+            xs_pad = self.embed(xs_pad)
+
+        xs_pad, masks = self.encoders(xs_pad, masks)
+
+        if isinstance(xs_pad, tuple):
+            xs_pad = xs_pad[0]
+
+        xs_pad = self.after_norm(xs_pad)
+        olens = masks.squeeze(1).sum(1)
+        return xs_pad, olens, None
diff --git a/funasr/models/encoder/e_branchformer_encoder.py b/funasr/models/encoder/e_branchformer_encoder.py
new file mode 100644
index 0000000..14028ed
--- /dev/null
+++ b/funasr/models/encoder/e_branchformer_encoder.py
@@ -0,0 +1,465 @@
+# Copyright 2022 Kwangyoun Kim (ASAPP inc.)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""E-Branchformer encoder definition.
+Reference:
+    Kwangyoun Kim, Felix Wu, Yifan Peng, Jing Pan,
+    Prashant Sridhar, Kyu J. Han, Shinji Watanabe,
+    "E-Branchformer: Branchformer with Enhanced merging
+    for speech recognition," in SLT 2022.
+"""
+
+import logging
+from typing import List, Optional, Tuple
+
+import torch
+
+from funasr.models.ctc import CTC
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.modules.cgmlp import ConvolutionalGatingMLP
+from funasr.modules.fastformer import FastSelfAttention
+from funasr.modules.nets_utils import get_activation, make_pad_mask
+from funasr.modules.attention import (  # noqa: H301
+    LegacyRelPositionMultiHeadedAttention,
+    MultiHeadedAttention,
+    RelPositionMultiHeadedAttention,
+)
+from funasr.modules.embedding import (  # noqa: H301
+    LegacyRelPositionalEncoding,
+    PositionalEncoding,
+    RelPositionalEncoding,
+    ScaledPositionalEncoding,
+)
+from funasr.modules.layer_norm import LayerNorm
+from funasr.modules.positionwise_feed_forward import (
+    PositionwiseFeedForward,
+)
+from funasr.modules.repeat import repeat
+from funasr.modules.subsampling import (
+    Conv2dSubsampling,
+    Conv2dSubsampling2,
+    Conv2dSubsampling6,
+    Conv2dSubsampling8,
+    TooShortUttError,
+    check_short_utt,
+)
+
+
+class EBranchformerEncoderLayer(torch.nn.Module):
+    """E-Branchformer encoder layer module.
+
+    Args:
+        size (int): model dimension
+        attn: standard self-attention or efficient attention
+        cgmlp: ConvolutionalGatingMLP
+        feed_forward: feed-forward module, optional
+        feed_forward_macaron: macaron-style feed-forward module, optional
+        dropout_rate (float): dropout probability
+        merge_conv_kernel (int): kernel size of the depth-wise conv in merge module
+    """
+
+    def __init__(
+        self,
+        size: int,
+        attn: torch.nn.Module,
+        cgmlp: torch.nn.Module,
+        feed_forward: Optional[torch.nn.Module],
+        feed_forward_macaron: Optional[torch.nn.Module],
+        dropout_rate: float,
+        merge_conv_kernel: int = 3,
+    ):
+        super().__init__()
+
+        self.size = size
+        self.attn = attn
+        self.cgmlp = cgmlp
+
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.ff_scale = 1.0
+        if self.feed_forward is not None:
+            self.norm_ff = LayerNorm(size)
+        if self.feed_forward_macaron is not None:
+            self.ff_scale = 0.5
+            self.norm_ff_macaron = LayerNorm(size)
+
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        self.norm_mlp = LayerNorm(size)  # for the MLP module
+        self.norm_final = LayerNorm(size)  # for the final output of the block
+
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+        self.depthwise_conv_fusion = torch.nn.Conv1d(
+            size + size,
+            size + size,
+            kernel_size=merge_conv_kernel,
+            stride=1,
+            padding=(merge_conv_kernel - 1) // 2,
+            groups=size + size,
+            bias=True,
+        )
+        self.merge_proj = torch.nn.Linear(size + size, size)
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, 1, time).
+            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size).
+            torch.Tensor: Mask tensor (#batch, time).
+        """
+
+        if cache is not None:
+            raise NotImplementedError("cache is not None, which is not tested")
+
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+
+        if self.feed_forward_macaron is not None:
+            residual = x
+            x = self.norm_ff_macaron(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
+
+        # Two branches
+        x1 = x
+        x2 = x
+
+        # Branch 1: multi-headed attention module
+        x1 = self.norm_mha(x1)
+
+        if isinstance(self.attn, FastSelfAttention):
+            x_att = self.attn(x1, mask)
+        else:
+            if pos_emb is not None:
+                x_att = self.attn(x1, x1, x1, pos_emb, mask)
+            else:
+                x_att = self.attn(x1, x1, x1, mask)
+
+        x1 = self.dropout(x_att)
+
+        # Branch 2: convolutional gating mlp
+        x2 = self.norm_mlp(x2)
+
+        if pos_emb is not None:
+            x2 = (x2, pos_emb)
+        x2 = self.cgmlp(x2, mask)
+        if isinstance(x2, tuple):
+            x2 = x2[0]
+
+        x2 = self.dropout(x2)
+
+        # Merge two branches
+        x_concat = torch.cat([x1, x2], dim=-1)
+        x_tmp = x_concat.transpose(1, 2)
+        x_tmp = self.depthwise_conv_fusion(x_tmp)
+        x_tmp = x_tmp.transpose(1, 2)
+        x = x + self.dropout(self.merge_proj(x_concat + x_tmp))
+
+        if self.feed_forward is not None:
+            # feed forward module
+            residual = x
+            x = self.norm_ff(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
+
+        x = self.norm_final(x)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
+
+
class EBranchformerEncoder(AbsEncoder):
    """E-Branchformer encoder module.

    Every layer runs a self-attention branch (global context) and a cgMLP
    branch (local context) in parallel and merges them with a depthwise
    convolution, following Kim et al., "E-Branchformer: Branchformer with
    Enhanced merging for speech recognition" (https://arxiv.org/abs/2210.00077).
    """

    def __init__(
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        attention_layer_type: str = "rel_selfattn",
        pos_enc_layer_type: str = "rel_pos",
        rel_pos_type: str = "latest",
        cgmlp_linear_units: int = 2048,
        cgmlp_conv_kernel: int = 31,
        use_linear_after_conv: bool = False,
        gate_activation: str = "identity",
        num_blocks: int = 12,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: Optional[str] = "conv2d",
        zero_triu: bool = False,
        padding_idx: int = -1,
        layer_drop_rate: float = 0.0,
        max_pos_emb_len: int = 5000,
        use_ffn: bool = False,
        macaron_ffn: bool = False,
        ffn_activation_type: str = "swish",
        linear_units: int = 2048,
        positionwise_layer_type: str = "linear",
        merge_conv_kernel: int = 3,
        interctc_layer_idx=None,
        interctc_use_conditioning: bool = False,
    ):
        super().__init__()
        self._output_size = output_size

        # Map "legacy" relative-position requests onto the legacy classes and
        # forbid mixing legacy and latest variants.
        if rel_pos_type == "legacy":
            if pos_enc_layer_type == "rel_pos":
                pos_enc_layer_type = "legacy_rel_pos"
            if attention_layer_type == "rel_selfattn":
                attention_layer_type = "legacy_rel_selfattn"
        elif rel_pos_type == "latest":
            assert attention_layer_type != "legacy_rel_selfattn"
            assert pos_enc_layer_type != "legacy_rel_pos"
        else:
            raise ValueError("unknown rel_pos_type: " + rel_pos_type)

        # Positional-encoding class; rel_pos variants must pair with the
        # matching attention type (asserted below).
        if pos_enc_layer_type == "abs_pos":
            pos_enc_class = PositionalEncoding
        elif pos_enc_layer_type == "scaled_abs_pos":
            pos_enc_class = ScaledPositionalEncoding
        elif pos_enc_layer_type == "rel_pos":
            assert attention_layer_type == "rel_selfattn"
            pos_enc_class = RelPositionalEncoding
        elif pos_enc_layer_type == "legacy_rel_pos":
            assert attention_layer_type == "legacy_rel_selfattn"
            pos_enc_class = LegacyRelPositionalEncoding
            logging.warning(
                "Using legacy_rel_pos and it will be deprecated in the future."
            )
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)

        # Input frontend: linear projection, conv subsampling, token
        # embedding, a user-provided module, or none (with an optional
        # dimension adapter).
        if input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(input_size, output_size),
                torch.nn.LayerNorm(output_size),
                torch.nn.Dropout(dropout_rate),
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer == "conv2d":
            self.embed = Conv2dSubsampling(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer == "conv2d2":
            self.embed = Conv2dSubsampling2(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer == "conv2d6":
            self.embed = Conv2dSubsampling6(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer == "conv2d8":
            self.embed = Conv2dSubsampling8(
                input_size,
                output_size,
                dropout_rate,
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif isinstance(input_layer, torch.nn.Module):
            self.embed = torch.nn.Sequential(
                input_layer,
                pos_enc_class(output_size, positional_dropout_rate, max_pos_emb_len),
            )
        elif input_layer is None:
            if input_size == output_size:
                self.embed = None
            else:
                self.embed = torch.nn.Linear(input_size, output_size)
        else:
            raise ValueError(f"unknown input_layer: {input_layer}")

        activation = get_activation(ffn_activation_type)
        # FIX: previously these names were unbound when
        # positionwise_layer_type was None, so use_ffn=True raised NameError
        # in the repeat() lambda below. Default them to None and guard usage.
        positionwise_layer = None
        positionwise_layer_args = None
        if positionwise_layer_type == "linear":
            positionwise_layer = PositionwiseFeedForward
            positionwise_layer_args = (
                output_size,
                linear_units,
                dropout_rate,
                activation,
            )
        elif positionwise_layer_type is None:
            logging.warning("no macaron ffn")
        else:
            raise ValueError("Support only linear.")

        if attention_layer_type == "selfattn":
            encoder_selfattn_layer = MultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                output_size,
                attention_dropout_rate,
            )
        elif attention_layer_type == "legacy_rel_selfattn":
            assert pos_enc_layer_type == "legacy_rel_pos"
            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                output_size,
                attention_dropout_rate,
            )
            logging.warning(
                "Using legacy_rel_selfattn and it will be deprecated in the future."
            )
        elif attention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
            encoder_selfattn_layer_args = (
                attention_heads,
                output_size,
                attention_dropout_rate,
                zero_triu,
            )
        elif attention_layer_type == "fast_selfattn":
            # Fastformer additive attention only works with absolute
            # positional encodings.
            assert pos_enc_layer_type in ["abs_pos", "scaled_abs_pos"]
            encoder_selfattn_layer = FastSelfAttention
            encoder_selfattn_layer_args = (
                output_size,
                attention_heads,
                attention_dropout_rate,
            )
        else:
            raise ValueError("unknown encoder_attn_layer: " + attention_layer_type)

        cgmlp_layer = ConvolutionalGatingMLP
        cgmlp_layer_args = (
            output_size,
            cgmlp_linear_units,
            cgmlp_conv_kernel,
            dropout_rate,
            use_linear_after_conv,
            gate_activation,
        )

        # Feed-forward modules are only built when requested AND a
        # positionwise layer type was configured (see FIX above).
        self.encoders = repeat(
            num_blocks,
            lambda lnum: EBranchformerEncoderLayer(
                output_size,
                encoder_selfattn_layer(*encoder_selfattn_layer_args),
                cgmlp_layer(*cgmlp_layer_args),
                positionwise_layer(*positionwise_layer_args)
                if use_ffn and positionwise_layer is not None
                else None,
                positionwise_layer(*positionwise_layer_args)
                if use_ffn and macaron_ffn and positionwise_layer is not None
                else None,
                dropout_rate,
                merge_conv_kernel,
            ),
            layer_drop_rate,
        )
        self.after_norm = LayerNorm(output_size)

        # Intermediate-CTC bookkeeping (self-conditioned CTC); the
        # conditioning layer itself is attached later by the model builder.
        if interctc_layer_idx is None:
            interctc_layer_idx = []
        self.interctc_layer_idx = interctc_layer_idx
        if len(interctc_layer_idx) > 0:
            assert 0 < min(interctc_layer_idx) and max(interctc_layer_idx) < num_blocks
        self.interctc_use_conditioning = interctc_use_conditioning
        self.conditioning_layer = None

    def output_size(self) -> int:
        """Return the encoder output dimension."""
        return self._output_size

    def forward(
        self,
        xs_pad: torch.Tensor,
        ilens: torch.Tensor,
        prev_states: torch.Tensor = None,
        ctc: CTC = None,
        max_layer: int = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """Calculate forward propagation.

        Args:
            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
            ilens (torch.Tensor): Input length (#batch).
            prev_states (torch.Tensor): Not to be used now.
            ctc (CTC): Intermediate CTC module; required only when
                ``interctc_use_conditioning`` is enabled.
            max_layer (int): If set (and valid), stop after this layer index.

        Returns:
            torch.Tensor: Output tensor (#batch, L', output_size), or a
                ``(output, intermediate_outs)`` pair when InterCTC is active.
            torch.Tensor: Output length (#batch).
            torch.Tensor: Not to be used now (always None).

        Raises:
            TooShortUttError: If the input is too short for conv subsampling.
        """
        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)

        if isinstance(
            self.embed,
            (
                Conv2dSubsampling,
                Conv2dSubsampling2,
                Conv2dSubsampling6,
                Conv2dSubsampling8,
            ),
        ):
            # Conv subsampling shrinks the time axis; reject inputs that are
            # too short to survive it.
            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
            if short_status:
                raise TooShortUttError(
                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
                    + f"(it needs more than {limit_size} frames), return empty results",
                    xs_pad.size(1),
                    limit_size,
                )
            xs_pad, masks = self.embed(xs_pad, masks)
        elif self.embed is not None:
            xs_pad = self.embed(xs_pad)

        intermediate_outs = []
        if len(self.interctc_layer_idx) == 0:
            if max_layer is not None and 0 <= max_layer < len(self.encoders):
                # Early exit after layer max_layer (inclusive).
                for layer_idx, encoder_layer in enumerate(self.encoders):
                    xs_pad, masks = encoder_layer(xs_pad, masks)
                    if layer_idx >= max_layer:
                        break
            else:
                xs_pad, masks = self.encoders(xs_pad, masks)
        else:
            for layer_idx, encoder_layer in enumerate(self.encoders):
                xs_pad, masks = encoder_layer(xs_pad, masks)

                if layer_idx + 1 in self.interctc_layer_idx:
                    encoder_out = xs_pad
                    # Layers may carry (x, pos_emb) tuples; CTC only sees x.
                    if isinstance(encoder_out, tuple):
                        encoder_out = encoder_out[0]
                    intermediate_outs.append((layer_idx + 1, encoder_out))

                    if self.interctc_use_conditioning:
                        # Self-conditioned CTC: feed the intermediate CTC
                        # posterior back into the encoder stream.
                        ctc_out = ctc.softmax(encoder_out)
                        if isinstance(xs_pad, tuple):
                            xs_pad = list(xs_pad)
                            xs_pad[0] = xs_pad[0] + self.conditioning_layer(ctc_out)
                            xs_pad = tuple(xs_pad)
                        else:
                            xs_pad = xs_pad + self.conditioning_layer(ctc_out)

        if isinstance(xs_pad, tuple):
            xs_pad = xs_pad[0]

        xs_pad = self.after_norm(xs_pad)
        olens = masks.squeeze(1).sum(1)
        if len(intermediate_outs) > 0:
            return (xs_pad, intermediate_outs), olens, None
        return xs_pad, olens, None
diff --git a/funasr/modules/cgmlp.py b/funasr/modules/cgmlp.py
new file mode 100644
index 0000000..dcd085b
--- /dev/null
+++ b/funasr/modules/cgmlp.py
@@ -0,0 +1,124 @@
+"""MLP with convolutional gating (cgMLP) definition.
+
+References:
+    https://openreview.net/forum?id=RA-zVvZLYIy
+    https://arxiv.org/abs/2105.08050
+
+"""
+
+import torch
+
+from funasr.modules.nets_utils import get_activation
+from funasr.modules.layer_norm import LayerNorm
+
+
+class ConvolutionalSpatialGatingUnit(torch.nn.Module):
+    """Convolutional Spatial Gating Unit (CSGU)."""
+
+    def __init__(
+        self,
+        size: int,
+        kernel_size: int,
+        dropout_rate: float,
+        use_linear_after_conv: bool,
+        gate_activation: str,
+    ):
+        super().__init__()
+
+        n_channels = size // 2  # split input channels
+        self.norm = LayerNorm(n_channels)
+        self.conv = torch.nn.Conv1d(
+            n_channels,
+            n_channels,
+            kernel_size,
+            1,
+            (kernel_size - 1) // 2,
+            groups=n_channels,
+        )
+        if use_linear_after_conv:
+            self.linear = torch.nn.Linear(n_channels, n_channels)
+        else:
+            self.linear = None
+
+        if gate_activation == "identity":
+            self.act = torch.nn.Identity()
+        else:
+            self.act = get_activation(gate_activation)
+
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+    def espnet_initialization_fn(self):
+        torch.nn.init.normal_(self.conv.weight, std=1e-6)
+        torch.nn.init.ones_(self.conv.bias)
+        if self.linear is not None:
+            torch.nn.init.normal_(self.linear.weight, std=1e-6)
+            torch.nn.init.ones_(self.linear.bias)
+
+    def forward(self, x, gate_add=None):
+        """Forward method
+
+        Args:
+            x (torch.Tensor): (N, T, D)
+            gate_add (torch.Tensor): (N, T, D/2)
+
+        Returns:
+            out (torch.Tensor): (N, T, D/2)
+        """
+
+        x_r, x_g = x.chunk(2, dim=-1)
+
+        x_g = self.norm(x_g)  # (N, T, D/2)
+        x_g = self.conv(x_g.transpose(1, 2)).transpose(1, 2)  # (N, T, D/2)
+        if self.linear is not None:
+            x_g = self.linear(x_g)
+
+        if gate_add is not None:
+            x_g = x_g + gate_add
+
+        x_g = self.act(x_g)
+        out = x_r * x_g  # (N, T, D/2)
+        out = self.dropout(out)
+        return out
+
+
+class ConvolutionalGatingMLP(torch.nn.Module):
+    """Convolutional Gating MLP (cgMLP)."""
+
+    def __init__(
+        self,
+        size: int,
+        linear_units: int,
+        kernel_size: int,
+        dropout_rate: float,
+        use_linear_after_conv: bool,
+        gate_activation: str,
+    ):
+        super().__init__()
+
+        self.channel_proj1 = torch.nn.Sequential(
+            torch.nn.Linear(size, linear_units), torch.nn.GELU()
+        )
+        self.csgu = ConvolutionalSpatialGatingUnit(
+            size=linear_units,
+            kernel_size=kernel_size,
+            dropout_rate=dropout_rate,
+            use_linear_after_conv=use_linear_after_conv,
+            gate_activation=gate_activation,
+        )
+        self.channel_proj2 = torch.nn.Linear(linear_units // 2, size)
+
+    def forward(self, x, mask):
+        if isinstance(x, tuple):
+            xs_pad, pos_emb = x
+        else:
+            xs_pad, pos_emb = x, None
+
+        xs_pad = self.channel_proj1(xs_pad)  # size -> linear_units
+        xs_pad = self.csgu(xs_pad)  # linear_units -> linear_units/2
+        xs_pad = self.channel_proj2(xs_pad)  # linear_units/2 -> size
+
+        if pos_emb is not None:
+            out = (xs_pad, pos_emb)
+        else:
+            out = xs_pad
+        return out
diff --git a/funasr/modules/fastformer.py b/funasr/modules/fastformer.py
new file mode 100644
index 0000000..24ca947
--- /dev/null
+++ b/funasr/modules/fastformer.py
@@ -0,0 +1,153 @@
+"""Fastformer attention definition.
+
+Reference:
+    Wu et al., "Fastformer: Additive Attention Can Be All You Need"
+    https://arxiv.org/abs/2108.09084
+    https://github.com/wuch15/Fastformer
+
+"""
+
+import numpy
+import torch
+
+
+class FastSelfAttention(torch.nn.Module):
+    """Fast self-attention used in Fastformer."""
+
+    def __init__(
+        self,
+        size,
+        attention_heads,
+        dropout_rate,
+    ):
+        super().__init__()
+        if size % attention_heads != 0:
+            raise ValueError(
+                f"Hidden size ({size}) is not an integer multiple "
+                f"of attention heads ({attention_heads})"
+            )
+        self.attention_head_size = size // attention_heads
+        self.num_attention_heads = attention_heads
+
+        self.query = torch.nn.Linear(size, size)
+        self.query_att = torch.nn.Linear(size, attention_heads)
+        self.key = torch.nn.Linear(size, size)
+        self.key_att = torch.nn.Linear(size, attention_heads)
+        self.transform = torch.nn.Linear(size, size)
+        self.dropout = torch.nn.Dropout(dropout_rate)
+
+    def espnet_initialization_fn(self):
+        self.apply(self.init_weights)
+
+    def init_weights(self, module):
+        if isinstance(module, torch.nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+        if isinstance(module, torch.nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def transpose_for_scores(self, x):
+        """Reshape and transpose to compute scores.
+
+        Args:
+            x: (batch, time, size = n_heads * attn_dim)
+
+        Returns:
+            (batch, n_heads, time, attn_dim)
+        """
+
+        new_x_shape = x.shape[:-1] + (
+            self.num_attention_heads,
+            self.attention_head_size,
+        )
+        return x.reshape(*new_x_shape).transpose(1, 2)
+
+    def forward(self, xs_pad, mask):
+        """Forward method.
+
+        Args:
+            xs_pad: (batch, time, size = n_heads * attn_dim)
+            mask: (batch, 1, time), nonpadding is 1, padding is 0
+
+        Returns:
+            torch.Tensor: (batch, time, size)
+        """
+
+        batch_size, seq_len, _ = xs_pad.shape
+        mixed_query_layer = self.query(xs_pad)  # (batch, time, size)
+        mixed_key_layer = self.key(xs_pad)  # (batch, time, size)
+
+        if mask is not None:
+            mask = mask.eq(0)  # padding is 1, nonpadding is 0
+
+        # (batch, n_heads, time)
+        query_for_score = (
+            self.query_att(mixed_query_layer).transpose(1, 2)
+            / self.attention_head_size**0.5
+        )
+        if mask is not None:
+            min_value = float(
+                numpy.finfo(
+                    torch.tensor(0, dtype=query_for_score.dtype).numpy().dtype
+                ).min
+            )
+            query_for_score = query_for_score.masked_fill(mask, min_value)
+            query_weight = torch.softmax(query_for_score, dim=-1).masked_fill(mask, 0.0)
+        else:
+            query_weight = torch.softmax(query_for_score, dim=-1)
+
+        query_weight = query_weight.unsqueeze(2)  # (batch, n_heads, 1, time)
+        query_layer = self.transpose_for_scores(
+            mixed_query_layer
+        )  # (batch, n_heads, time, attn_dim)
+
+        pooled_query = (
+            torch.matmul(query_weight, query_layer)
+            .transpose(1, 2)
+            .reshape(-1, 1, self.num_attention_heads * self.attention_head_size)
+        )  # (batch, 1, size = n_heads * attn_dim)
+        pooled_query = self.dropout(pooled_query)
+        pooled_query_repeat = pooled_query.repeat(1, seq_len, 1)  # (batch, time, size)
+
+        mixed_query_key_layer = (
+            mixed_key_layer * pooled_query_repeat
+        )  # (batch, time, size)
+
+        # (batch, n_heads, time)
+        query_key_score = (
+            self.key_att(mixed_query_key_layer) / self.attention_head_size**0.5
+        ).transpose(1, 2)
+        if mask is not None:
+            min_value = float(
+                numpy.finfo(
+                    torch.tensor(0, dtype=query_key_score.dtype).numpy().dtype
+                ).min
+            )
+            query_key_score = query_key_score.masked_fill(mask, min_value)
+            query_key_weight = torch.softmax(query_key_score, dim=-1).masked_fill(
+                mask, 0.0
+            )
+        else:
+            query_key_weight = torch.softmax(query_key_score, dim=-1)
+
+        query_key_weight = query_key_weight.unsqueeze(2)  # (batch, n_heads, 1, time)
+        key_layer = self.transpose_for_scores(
+            mixed_query_key_layer
+        )  # (batch, n_heads, time, attn_dim)
+        pooled_key = torch.matmul(
+            query_key_weight, key_layer
+        )  # (batch, n_heads, 1, attn_dim)
+        pooled_key = self.dropout(pooled_key)
+
+        # NOTE: value = query, due to param sharing
+        weighted_value = (pooled_key * query_layer).transpose(
+            1, 2
+        )  # (batch, time, n_heads, attn_dim)
+        weighted_value = weighted_value.reshape(
+            weighted_value.shape[:-2]
+            + (self.num_attention_heads * self.attention_head_size,)
+        )  # (batch, time, size)
+        weighted_value = (
+            self.dropout(self.transform(weighted_value)) + mixed_query_layer
+        )
+
+        return weighted_value
diff --git a/funasr/modules/repeat.py b/funasr/modules/repeat.py
index ff1e182..7e16066 100644
--- a/funasr/modules/repeat.py
+++ b/funasr/modules/repeat.py
@@ -14,25 +14,38 @@
 class MultiSequential(torch.nn.Sequential):
     """Multi-input multi-output torch.nn.Sequential."""
 
+    def __init__(self, *args, layer_drop_rate=0.0):
+        """Initialize MultiSequential with layer_drop.
+
+        Args:
+            layer_drop_rate (float): Probability of dropping out each fn (layer).
+
+        """
+        super(MultiSequential, self).__init__(*args)
+        self.layer_drop_rate = layer_drop_rate
+
     def forward(self, *args):
         """Repeat."""
-        for m in self:
-            args = m(*args)
+        _probs = torch.empty(len(self)).uniform_()
+        for idx, m in enumerate(self):
+            if not self.training or (_probs[idx] >= self.layer_drop_rate):
+                args = m(*args)
         return args
 
 
def repeat(N, fn, layer_drop_rate=0.0):
    """Repeat module N times.

    Args:
        N (int): Number of repeat time.
        fn (Callable): Function to generate module.
        layer_drop_rate (float): Probability of dropping out each fn (layer).

    Returns:
        MultiSequential: Repeated model instance.

    """
    layers = [fn(layer_index) for layer_index in range(N)]
    return MultiSequential(*layers, layer_drop_rate=layer_drop_rate)
 
 
 class MultiBlocks(torch.nn.Module):
diff --git a/funasr/runtime/html5/demo.gif b/funasr/runtime/html5/demo.gif
deleted file mode 100644
index f487f2c..0000000
--- a/funasr/runtime/html5/demo.gif
+++ /dev/null
Binary files differ
diff --git a/funasr/runtime/html5/readme.md b/funasr/runtime/html5/readme.md
index c85641c..e46ab92 100644
--- a/funasr/runtime/html5/readme.md
+++ b/funasr/runtime/html5/readme.md
@@ -1,3 +1,5 @@
+([简体中文](./readme_zh.md)|English)
+
 # Html5 server for asr service
 
 ## Requirement
diff --git a/funasr/runtime/html5/readme_cn.md b/funasr/runtime/html5/readme_cn.md
deleted file mode 100644
index dfc27c9..0000000
--- a/funasr/runtime/html5/readme_cn.md
+++ /dev/null
@@ -1,135 +0,0 @@
-# Html5 server for asr service
-
-## Requirement
-#### Install the modelscope and funasr
-```shell
-pip install -U modelscope funasr
-# For the users in China, you could install with the command:
-# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
-git clone https://github.com/alibaba/FunASR.git && cd FunASR
-```
-#### Install the requirements for server
-```shell
-pip install flask
-# pip install gevent (Optional)
-# pip install pyOpenSSL (Optional)
-```
-
-### javascript (Optional)
-[html5褰曢煶](https://github.com/xiangyuecn/Recorder)
-```shell
-Recorder 
-```
-
-## demo椤甸潰濡備笅
-<div align="center"><img src="./demo.gif" width="150"/> </div>
-
-[//]: # (## 涓ょws_server杩炴帴妯″紡)
-
-[//]: # (### 1&#41;鐩存帴杩炴帴妯″紡锛屾祻瑙堝櫒https楹﹀厠椋� --> html5 demo鏈嶅姟 --> js wss鎺ュ彛 --> wss asr online srv&#40;璇佷功鐢熸垚璇峰線鍚庣湅&#41;)
-
-[//]: # (### 2&#41;nginx涓浆锛屾祻瑙堝櫒https楹﹀厠椋� --> html5 demo鏈嶅姟 --> js wss鎺ュ彛 --> nginx鏈嶅姟 --> ws asr online srv)
-
-## 鎿嶄綔姝ラ
-### html5 demo鏈嶅姟鍚姩
-鍚姩html5鏈嶅姟锛岄渶瑕乻sl璇佷功(宸茬敓鎴愶紝濡傞渶瑕佽嚜宸辩敓鎴愯寰�鍚庣湅)
-```shell
-h5Server.py [-h] [--host HOST] [--port PORT] [--certfile CERTFILE] [--keyfile KEYFILE]             
-```
-渚嬪瓙濡備笅锛岄渶瑕佹敞鎰廼p鍦板潃锛屽鏋滀粠鍏朵粬璁惧璁块棶闇�姹傦紙渚嬪鎵嬫満绔級锛岄渶瑕佸皢ip鍦板潃璁句负鐪熷疄ip 
-```shell
-cd funasr/runtime/html5
-python h5Server.py --host 0.0.0.0 --port 1337
-# python h5Server.py --host 30.220.136.139 --port 1337
-```
-### 鍚姩ASR鏈嶅姟
-[鍏蜂綋璇风湅online asr](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/websocket)
-
-`Tips:` asr 鏈嶅姟闇�瑕佷笌html5鏈嶅姟閮ㄧ讲鍒板悓涓�涓墿鐞嗘満鍣ㄤ笂
-#### wss鏂瑰紡
-```shell
-cd ../python/websocket
-python funasr_wss_server.py --port 10095
-```
-
-### 娴忚鍣ㄦ墦寮�鍦板潃
-ip鍦板潃闇�瑕佷笌html5 server淇濇寔涓�鑷达紝濡傛灉鏄湰鍦版満鍣紝鍙互鐢�127.0.0.1
-```shell
-https://127.0.0.1:1337/static/index.html
-# https://30.220.136.139:1337/static/index.html
-```
-
-### 淇敼缃戦〉閲宎sr鎺ュ彛鍦板潃
-淇敼缃戦〉涓紝asr鏈嶅姟鍣ㄥ湴鍧�锛坵ebsocket srv鐨刬p涓庣鍙o級锛岀偣鍑诲紑濮嬪嵆鍙娇鐢ㄣ�傛敞鎰廻5鏈嶅姟鍜宎sr鏈嶅姟闇�瑕佸湪鍚屼竴涓湇鍔″櫒涓婏紝鍚﹀垯瀛樺湪璺ㄥ煙闂銆�
-
-
-
-[//]: # (## nginx閰嶇疆璇存槑&#40;浜嗚В鐨勫彲浠ヨ烦杩�&#41;)
-
-[//]: # (h5鎵撳紑楹﹀厠椋庨渶瑕乭ttps鍗忚锛屽悓鏃跺悗绔殑asr websocket涔熷繀椤绘槸wss鍗忚锛屽鏋淸online asr]&#40;https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/websocket&#41;浠s鏂瑰紡杩愯锛屾垜浠彲浠ラ�氳繃nginx閰嶇疆瀹炵幇wss鍗忚鍒皐s鍗忚鐨勮浆鎹€��)
-
-[//]: # ()
-[//]: # (### nginx杞彂閰嶇疆绀轰緥)
-
-[//]: # (```shell)
-
-[//]: # (events {                                                                                                            [0/1548])
-
-[//]: # (    worker_connections  1024;)
-
-[//]: # (    accept_mutex on;)
-
-[//]: # (  })
-
-[//]: # (http {)
-
-[//]: # (  error_log  error.log;)
-
-[//]: # (  access_log  access.log;)
-
-[//]: # (  server {)
-
-[//]: # ()
-[//]: # (    listen 5921 ssl http2;  # nginx listen port for wss)
-
-[//]: # (    server_name www.test.com;)
-
-[//]: # ()
-[//]: # (    ssl_certificate     /funasr/server.crt;)
-
-[//]: # (    ssl_certificate_key /funasr/server.key;)
-
-[//]: # (    ssl_protocols       TLSv1 TLSv1.1 TLSv1.2;)
-
-[//]: # (    ssl_ciphers         HIGH:!aNULL:!MD5;)
-
-[//]: # ()
-[//]: # (    location /wss/ {)
-
-[//]: # ()
-[//]: # ()
-[//]: # (      proxy_pass http://127.0.0.1:1111/;  # asr online model ws address and port)
-
-[//]: # (      proxy_http_version 1.1;)
-
-[//]: # (      proxy_set_header Upgrade $http_upgrade;)
-
-[//]: # (      proxy_set_header Connection "upgrade";)
-
-[//]: # (      proxy_read_timeout 600s;)
-
-[//]: # ()
-[//]: # (    })
-
-[//]: # (  })
-
-[//]: # (```)
-
-[//]: # (### 淇敼wsconnecter.js閲宎sr鎺ュ彛鍦板潃)
-
-[//]: # (wsconnecter.js閲岄厤缃畂nline asr鏈嶅姟鍦板潃璺緞锛岃繖閲岄厤缃殑鏄痺ss绔彛)
-
-[//]: # (var Uri = "wss://xxx:xxx/wss/" )
-## Acknowledge
-1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
-2. We acknowledge [AiHealthx](http://www.aihealthx.com/) for contributing the html5 demo.
\ No newline at end of file
diff --git a/funasr/runtime/html5/readme_zh.md b/funasr/runtime/html5/readme_zh.md
new file mode 100644
index 0000000..1010838
--- /dev/null
+++ b/funasr/runtime/html5/readme_zh.md
@@ -0,0 +1,93 @@
+(简体中文|[English](./readme.md))
+
+# 璇煶璇嗗埆鏈嶅姟Html5瀹㈡埛绔闂晫闈�
+
+鏈嶅姟绔儴缃查噰鐢╳ebsocket鍗忚锛屽鎴风鍙互鏀寔html5缃戦〉璁块棶锛屾敮鎸侀害鍏嬮杈撳叆涓庢枃浠惰緭鍏ワ紝鍙互閫氳繃濡備笅2绉嶆柟寮忚闂細
+- 鏂瑰紡涓�锛�
+
+   html瀹㈡埛绔洿杩烇紝鎵嬪姩涓嬭浇瀹㈡埛绔紙[鐐瑰嚮姝ゅ](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/html5/static)锛夎嚦鏈湴锛屾墦寮�`index.html`缃戦〉锛岃緭鍏ss鍦板潃涓庣鍙e彿
+
+- 鏂瑰紡浜岋細
+
+   html5鏈嶅姟绔紝鑷姩涓嬭浇瀹㈡埛绔嚦鏈湴锛屾敮鎸佹墜鏈虹瓑绔笂璁块棶
+
+## 璇煶璇嗗埆鏈嶅姟鍚姩
+
+鏀寔python鐗堟湰涓巆++鐗堟湰鏈嶅姟閮ㄧ讲锛屽叾涓�
+
+- python鐗堟湰
+  
+  鐩存帴閮ㄧ讲python pipeline锛屾敮鎸佹祦寮忓疄鏃惰闊宠瘑鍒ā鍨嬶紝绂荤嚎璇煶璇嗗埆妯″瀷锛屾祦寮忕绾夸竴浣撳寲绾犻敊妯″瀷锛岃緭鍑哄甫鏍囩偣鏂囧瓧銆傚崟涓猻erver锛屾敮鎸佸崟涓猚lient銆�
+
+- c++鐗堟湰
+  
+  funasr-runtime-sdk锛屾敮鎸佷竴閿儴缃诧紝0.1.0鐗堟湰锛屾敮鎸佺绾挎枃浠惰浆鍐欍�傚崟涓猻erver锛屾敮鎸佷笂鐧捐矾client璇锋眰銆�
+
+### python鐗堟湰鏈嶅姟鍚姩
+
+#### 瀹夎渚濊禆鐜
+
+```shell
+pip3 install -U modelscope funasr flask
+# 涓浗澶ч檰鐢ㄦ埛锛屽鏋滈亣鍒扮綉缁滈棶棰橈紝鍙互閫氳繃涓嬮潰鎸囦护瀹夎锛�
+# pip3 install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+```
+
+#### 鍚姩ASR鏈嶅姟
+
+#### wss鏂瑰紡
+
+```shell
+cd funasr/runtime/python/websocket
+python funasr_wss_server.py --port 10095
+```
+
+璇︾粏鍙傛暟閰嶇疆涓庤В鏋愶紙[鐐瑰嚮姝ゅ](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/websocket)锛�
+
+#### html5鏈嶅姟锛堝彲閫夛級
+
+濡傛灉闇�瑕佷娇鐢ㄤ笂闈㈡墍璇寸殑瀹㈡埛绔柟寮忎簩锛岃繘琛岃闂紝鍙互鍚姩html5鏈嶅姟
+```shell
+h5Server.py [-h] [--host HOST] [--port PORT] [--certfile CERTFILE] [--keyfile KEYFILE]             
+```
+渚嬪瓙濡備笅锛岄渶瑕佹敞鎰廼p鍦板潃锛屽鏋滀粠鍏朵粬璁惧璁块棶闇�姹傦紙渚嬪鎵嬫満绔級锛岄渶瑕佸皢ip鍦板潃璁句负鐪熷疄鍏綉ip 
+```shell
+cd funasr/runtime/html5
+python h5Server.py --host 0.0.0.0 --port 1337
+```
+
+鍚姩鍚庯紝鍦ㄦ祻瑙堝櫒涓緭鍏ワ紙[https://127.0.0.1:1337/static/index.html](https://127.0.0.1:1337/static/index.html)锛夊嵆鍙闂�
+
+### c++ 鐗堟湰鏈嶅姟鍚姩
+
+鐢变簬c++渚濊禆鐜杈冨锛屽缓璁噰鐢╠ocker閮ㄧ讲锛屾敮鎸佷竴閿惎鍔ㄦ湇鍔�
+
+```shell
+curl -O https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/shell/funasr-runtime-deploy-offline-cpu-zh.sh;
+sudo bash funasr-runtime-deploy-offline-cpu-zh.sh install --workspace /root/funasr-runtime-resources
+```
+璇︾粏鍙傛暟閰嶇疆涓庤В鏋愶紙[鐐瑰嚮姝ゅ](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/docs/SDK_tutorial_zh.md)锛�
+
+
+## 瀹㈡埛绔祴璇�
+
+### 鏂瑰紡涓�
+
+html瀹㈡埛绔洿杩烇紝鎵嬪姩涓嬭浇瀹㈡埛绔紙[鐐瑰嚮姝ゅ](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/html5/static)锛夎嚦鏈湴锛屾墦寮�`index.html`缃戦〉锛岃緭鍏ss鍦板潃涓庣鍙e彿鍗冲彲浣跨敤
+
+### 鏂瑰紡浜�
+
+html5鏈嶅姟绔紝鑷姩涓嬭浇瀹㈡埛绔嚦鏈湴锛屾敮鎸佹墜鏈虹瓑绔笂璁块棶锛宨p鍦板潃闇�瑕佷笌html5 server淇濇寔涓�鑷达紝濡傛灉鏄湰鍦版満鍣紝鍙互鐢�127.0.0.1
+
+
+```shell
+https://127.0.0.1:1337/static/index.html
+```
+
+杈撳叆wss鍦板潃涓庣鍙e彿鍗冲彲浣跨敤
+
+
+## Acknowledge
+1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We acknowledge [AiHealthx](http://www.aihealthx.com/) for contributing the html5 demo.
diff --git a/funasr/runtime/onnxruntime/third_party/download_ffmpeg.sh b/funasr/runtime/onnxruntime/third_party/download_ffmpeg.sh
new file mode 100644
index 0000000..4a52184
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/download_ffmpeg.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Fetch a prebuilt FFmpeg (linux64, GPL, shared libs) and unpack it.
# Fail fast so tar never runs on a missing/partial download.
set -euo pipefail

wget https://github.com/BtbN/FFmpeg-Builds/releases/download/autobuild-2023-07-09-12-50/ffmpeg-N-111383-g20b8688092-linux64-gpl-shared.tar.xz
tar -xvf ffmpeg-N-111383-g20b8688092-linux64-gpl-shared.tar.xz
# Mirror for users in mainland China:
# wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/dep_libs/ffmpeg-N-111383-g20b8688092-linux64-gpl-shared.tar.xz
# tar -xvf ffmpeg-N-111383-g20b8688092-linux64-gpl-shared.tar.xz
diff --git a/funasr/runtime/onnxruntime/third_party/download_onnxruntime.sh b/funasr/runtime/onnxruntime/third_party/download_onnxruntime.sh
new file mode 100644
index 0000000..adaddc5
--- /dev/null
+++ b/funasr/runtime/onnxruntime/third_party/download_onnxruntime.sh
@@ -0,0 +1,5 @@
#!/usr/bin/env bash
# Download an appropriate onnxruntime from
# https://github.com/microsoft/onnxruntime/releases/tag/v1.14.0
# (here we get a copy of onnxruntime for linux x64).
# Fail fast so tar never runs on a missing/partial download.
set -euo pipefail

wget https://github.com/microsoft/onnxruntime/releases/download/v1.14.0/onnxruntime-linux-x64-1.14.0.tgz
tar -zxvf onnxruntime-linux-x64-1.14.0.tgz
\ No newline at end of file
diff --git a/funasr/runtime/python/websocket/funasr_wss_client.py b/funasr/runtime/python/websocket/funasr_wss_client.py
index 72121f7..264014f 100644
--- a/funasr/runtime/python/websocket/funasr_wss_client.py
+++ b/funasr/runtime/python/websocket/funasr_wss_client.py
@@ -204,6 +204,7 @@
         
             meg = await websocket.recv()
             meg = json.loads(meg)
+            print(meg)
             wav_name = meg.get("wav_name", "demo")
             text = meg["text"]
 
diff --git a/funasr/runtime/python/websocket/funasr_wss_server.py b/funasr/runtime/python/websocket/funasr_wss_server.py
index 4929090..2aff9ba 100644
--- a/funasr/runtime/python/websocket/funasr_wss_server.py
+++ b/funasr/runtime/python/websocket/funasr_wss_server.py
@@ -240,7 +240,8 @@
                                                          param_dict=websocket.param_dict_punc)
                     # print("offline", rec_result)
                 if 'text' in rec_result:
-                    message = json.dumps({"mode": websocket.mode, "text": rec_result["text"], "wav_name": websocket.wav_name})
+                    mode = "2pass-offline" if "2pass" in websocket.mode else websocket.mode
+                    message = json.dumps({"mode": mode, "text": rec_result["text"], "wav_name": websocket.wav_name})
                     await websocket.send(message)
 
 
@@ -256,7 +257,8 @@
         if "text" in rec_result:
             if rec_result["text"] != "sil" and rec_result["text"] != "waiting_for_more_voice":
                 # print("online", rec_result)
-                message = json.dumps({"mode": websocket.mode, "text": rec_result["text"], "wav_name": websocket.wav_name})
+                mode = "2pass-online" if "2pass" in websocket.mode else websocket.mode
+                message = json.dumps({"mode": mode, "text": rec_result["text"], "wav_name": websocket.wav_name})
                 await websocket.send(message)
 
 if len(args.certfile)>0:
diff --git a/funasr/runtime/websocket/readme.md b/funasr/runtime/websocket/readme.md
index 378f478..414d6b8 100644
--- a/funasr/runtime/websocket/readme.md
+++ b/funasr/runtime/websocket/readme.md
@@ -1,3 +1,5 @@
+([简体中文](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/websocket/readme_zh.md)|English)
+
 # Service with websocket-cpp
 
 ## Export the model
diff --git a/funasr/runtime/websocket/readme_zh.md b/funasr/runtime/websocket/readme_zh.md
new file mode 100644
index 0000000..6535069
--- /dev/null
+++ b/funasr/runtime/websocket/readme_zh.md
@@ -0,0 +1,190 @@
+(简体中文|[English](https://github.com/alibaba-damo-academy/FunASR/blob/main/funasr/runtime/websocket/readme.md))
+
+# 閲囩敤websocket鍗忚鐨刢++閮ㄧ讲鏂规
+
+## 蹇�熶笂鎵�
+### 闀滃儚鍚姩
+
+閫氳繃涓嬭堪鍛戒护鎷夊彇骞跺惎鍔‵unASR runtime-SDK鐨刣ocker闀滃儚锛�
+
+```shell
+sudo docker pull registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.1.0
+
+sudo docker run -p 10095:10095 -it --privileged=true -v /root:/workspace/models registry.cn-hangzhou.aliyuncs.com/funasr_repo/funasr:funasr-runtime-sdk-cpu-0.1.0
+```
+如果您没有安装docker，可参考[Docker安装](#Docker安装)
+
+### 鏈嶅姟绔惎鍔�
+
+docker鍚姩涔嬪悗锛屽惎鍔� funasr-wss-server鏈嶅姟绋嬪簭锛�
+```shell
+cd FunASR/funasr/runtime
+./run_server.sh \
+  --download-model-dir /workspace/models \
+  --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
+  --model-dir damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx  \
+  --punc-dir damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx
+```
+鏈嶅姟绔缁嗗弬鏁颁粙缁嶅彲鍙傝�僛鏈嶅姟绔弬鏁颁粙缁峕(#鏈嶅姟绔弬鏁颁粙缁�)
+
+### 瀹㈡埛绔祴璇曚笌浣跨敤
+
+涓嬭浇瀹㈡埛绔祴璇曞伐鍏风洰褰晄amples
+```shell
+wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/sample/funasr_samples.tar.gz
+```
+鎴戜滑浠ython璇█瀹㈡埛绔负渚嬶紝杩涜璇存槑锛屾敮鎸佸绉嶉煶棰戞牸寮忚緭鍏ワ紙.wav, .pcm, .mp3绛夛級锛屼篃鏀寔瑙嗛杈撳叆(.mp4绛�)锛屼互鍙婂鏂囦欢鍒楄〃wav.scp杈撳叆锛屽叾浠栫増鏈鎴风璇峰弬鑰冩枃妗o紙[鐐瑰嚮姝ゅ](#瀹㈡埛绔敤娉曡瑙�)锛夛紝瀹氬埗鏈嶅姟閮ㄧ讲璇峰弬鑰僛濡備綍瀹氬埗鏈嶅姟閮ㄧ讲](#濡備綍瀹氬埗鏈嶅姟閮ㄧ讲)
+```shell
+python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav"
+```
+
+------------------
+
+## 鎿嶄綔姝ラ璇﹁В
+
+### 渚濊禆搴撲笅杞�
+
+#### Download onnxruntime
+```shell
+bash third_party/download_onnxruntime.sh
+```
+
+#### Download ffmpeg
+```shell
+bash third_party/download_ffmpeg.sh
+```
+
+#### Install openblas and openssl
+```shell
+sudo apt-get install libopenblas-dev libssl-dev #ubuntu
+# sudo yum -y install openblas-devel openssl-devel #centos
+
+```
+
+### 缂栬瘧
+
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR/funasr/runtime/websocket
+mkdir build && cd build
+cmake  -DCMAKE_BUILD_TYPE=release .. -DONNXRUNTIME_DIR=/path/to/onnxruntime-linux-x64-1.14.0 -DFFMPEG_DIR=/path/to/ffmpeg-N-111383-g20b8688092-linux64-gpl-shared
+make
+```
+
+### 鍚姩鏈嶅姟閮ㄧ讲
+
+#### 浠巑odelscope涓ā鍨嬪惎鍔ㄧず渚�
+```shell
+./funasr-wss-server  \
+  --download-model-dir /workspace/models \
+  --model-dir damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx \
+  --vad-dir damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
+  --punc-dir damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx
+```
+
+娉ㄦ剰锛氫笂闈㈢ず渚嬩腑锛宍model-dir`锛宍vad-dir`锛宍punc-dir`涓烘ā鍨嬪湪modelscope涓ā鍨嬪悕瀛楋紝鐩存帴浠巑odelscope涓嬭浇妯″瀷骞朵笖瀵煎嚭閲忓寲鍚庣殑onnx銆傚鏋滈渶瑕佷粠鏈湴鍚姩锛岄渶瑕佹敼鎴愭湰鍦扮粷瀵硅矾寰勩��
+
+#### 浠庢湰鍦版ā鍨嬪惎鍔ㄧず渚�
+
+##### 瀵煎嚭妯″瀷
+
+```shell
+python -m funasr.export.export_model \
+--export-dir ./export \
+--type onnx \
+--quantize True \
+--model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch \
+--model-name damo/speech_fsmn_vad_zh-cn-16k-common-pytorch \
+--model-name damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch
+```
+瀵煎嚭杩囩▼璇︾粏浠嬬粛锛圼鐐瑰嚮姝ゅ](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)锛�
+
+##### 鍚姩鏈嶅姟
+```shell
+./funasr-wss-server  \
+  --download-model-dir /workspace/models \
+  --model-dir ./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-onnx \
+  --vad-dir ./export/damo/speech_fsmn_vad_zh-cn-16k-common-onnx \
+  --punc-dir ./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-onnx
+```
+
+#### 鍛戒护鍙傛暟浠嬬粛锛�
+```text
+--download-model-dir 妯″瀷涓嬭浇鍦板潃锛岄�氳繃璁剧疆model ID浠嶮odelscope涓嬭浇妯″瀷銆傚鏋滀粠鏈湴妯″瀷鍚姩锛屽彲浠ヤ笉璁剧疆銆�
+--model-dir  modelscope 涓� ASR model ID锛屾垨鑰呮湰鍦版ā鍨嬬粷瀵硅矾寰�
+--quantize  True涓洪噺鍖朅SR妯″瀷锛孎alse涓洪潪閲忓寲ASR妯″瀷锛岄粯璁ゆ槸True
+--vad-dir  modelscope 涓� VAD model ID锛屾垨鑰呮湰鍦版ā鍨嬬粷瀵硅矾寰�
+--vad-quant   True涓洪噺鍖朧AD妯″瀷锛孎alse涓洪潪閲忓寲VAD妯″瀷锛岄粯璁ゆ槸True
+--punc-dir  modelscope 涓� 鏍囩偣 model ID锛屾垨鑰呮湰鍦版ā鍨嬬粷瀵硅矾寰�
+--punc-quant   True涓洪噺鍖朠UNC妯″瀷锛孎alse涓洪潪閲忓寲PUNC妯″瀷锛岄粯璁ゆ槸True
+--port  鏈嶅姟绔洃鍚殑绔彛鍙凤紝榛樿涓� 10095
+--decoder-thread-num  鏈嶅姟绔惎鍔ㄧ殑鎺ㄧ悊绾跨▼鏁帮紝榛樿涓� 8
+--io-thread-num  鏈嶅姟绔惎鍔ㄧ殑IO绾跨▼鏁帮紝榛樿涓� 1
+--certfile  ssl鐨勮瘉涔︽枃浠讹紝榛樿涓猴細../../../ssl_key/server.crt
+--keyfile   ssl鐨勫瘑閽ユ枃浠讹紝榛樿涓猴細../../../ssl_key/server.key
+```
+
+### 瀹㈡埛绔敤娉曡瑙�
+
+涓嬭浇瀹㈡埛绔祴璇曞伐鍏风洰褰晄amples
+```shell
+wget https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/sample/funasr_samples.tar.gz
+```
+
+鍦ㄦ湇鍔″櫒涓婂畬鎴怓unASR鏈嶅姟閮ㄧ讲浠ュ悗锛屽彲浠ラ�氳繃濡備笅鐨勬楠ゆ潵娴嬭瘯鍜屼娇鐢ㄧ绾挎枃浠惰浆鍐欐湇鍔°��
+鐩墠鍒嗗埆鏀寔浠ヤ笅鍑犵缂栫▼璇█瀹㈡埛绔�
+
+- [Python](#python-client)
+- [CPP](#cpp-client)
+- [html缃戦〉鐗堟湰](#Html缃戦〉鐗�)
+- [Java](#Java-client)
+
+#### python-client
+鑻ユ兂鐩存帴杩愯client杩涜娴嬭瘯锛屽彲鍙傝�冨涓嬬畝鏄撹鏄庯紝浠ython鐗堟湰涓轰緥锛�
+
+```shell
+python3 wss_client_asr.py --host "127.0.0.1" --port 10095 --mode offline --audio_in "../audio/asr_example.wav" --output_dir "./results"
+```
+
+鍛戒护鍙傛暟璇存槑锛�
+```text
+--host 涓篎unASR runtime-SDK鏈嶅姟閮ㄧ讲鏈哄櫒ip锛岄粯璁や负鏈満ip锛�127.0.0.1锛夛紝濡傛灉client涓庢湇鍔′笉鍦ㄥ悓涓�鍙版湇鍔″櫒锛岄渶瑕佹敼涓洪儴缃叉満鍣╥p
+--port 10095 閮ㄧ讲绔彛鍙�
+--mode offline琛ㄧず绂荤嚎鏂囦欢杞啓
+--audio_in 闇�瑕佽繘琛岃浆鍐欑殑闊抽鏂囦欢锛屾敮鎸佹枃浠惰矾寰勶紝鏂囦欢鍒楄〃wav.scp
+--output_dir 璇嗗埆缁撴灉淇濆瓨璺緞
+```
+
+### cpp-client
+杩涘叆samples/cpp鐩綍鍚庯紝鍙互鐢╟pp杩涜娴嬭瘯锛屾寚浠ゅ涓嬶細
+```shell
+./funasr-wss-client --server-ip 127.0.0.1 --port 10095 --wav-path ../audio/asr_example.wav
+```
+
+鍛戒护鍙傛暟璇存槑锛�
+
+```text
+--server-ip 涓篎unASR runtime-SDK鏈嶅姟閮ㄧ讲鏈哄櫒ip锛岄粯璁や负鏈満ip锛�127.0.0.1锛夛紝濡傛灉client涓庢湇鍔′笉鍦ㄥ悓涓�鍙版湇鍔″櫒锛岄渶瑕佹敼涓洪儴缃叉満鍣╥p
+--port 10095 閮ㄧ讲绔彛鍙�
+--wav-path 闇�瑕佽繘琛岃浆鍐欑殑闊抽鏂囦欢锛屾敮鎸佹枃浠惰矾寰�
+```
+
+### Html缃戦〉鐗�
+
+鍦ㄦ祻瑙堝櫒涓墦寮� html/static/index.html锛屽嵆鍙嚭鐜板涓嬮〉闈紝鏀寔楹﹀厠椋庤緭鍏ヤ笌鏂囦欢涓婁紶锛岀洿鎺ヨ繘琛屼綋楠�
+
+<img src="images/html.png"  width="900"/>
+
+### Java-client
+
+```shell
+FunasrWsClient --host localhost --port 10095 --audio_in ./asr_example.wav --mode offline
+```
+璇︾粏鍙互鍙傝�冩枃妗o紙[鐐瑰嚮姝ゅ](../java/readme.md)锛�
+
+
+
+## Acknowledge
+1. This project is maintained by [FunASR community](https://github.com/alibaba-damo-academy/FunASR).
+2. We acknowledge [zhaoming](https://github.com/zhaomingwork/FunASR/tree/add-offline-websocket-srv/funasr/runtime/websocket) for contributing the websocket(cpp-api).
+
+
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index a6f7eac..427f400 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -1538,7 +1538,6 @@
         Return:
             model: ASR BAT model.
         """
-        assert check_argument_types()
 
         if isinstance(args.token_list, str):
             with open(args.token_list, encoding="utf-8") as f:
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index a25f39a..27d6f9c 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -369,7 +369,7 @@
                             ],
                             "scaler": scaler.state_dict() if scaler is not None else None,
                             "ema_model": model.encoder.ema.model.state_dict()
-                            if hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
+                            if hasattr(model, "encoder") and hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
                         },
                         buffer,
                     )
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 4e7a8a9..5787f1d 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,14 +1,10 @@
-from itertools import zip_longest
-
 import torch
-import copy
 import codecs
 import logging
-import edit_distance
 import argparse
-import pdb
 import numpy as np
-from typing import Any, List, Tuple, Union
+import edit_distance
+from itertools import zip_longest
 
 
 def ts_prediction_lfr6_standard(us_alphas, 
@@ -36,7 +32,14 @@
     # so treat the frames between two peaks as the duration of the former token
     fire_place = torch.where(peaks>1.0-1e-4)[0].cpu().numpy() + force_time_shift  # total offset
     num_peak = len(fire_place)
-    assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
+    if num_peak != len(char_list) + 1:
+        logging.warning("length mismatch, result might be incorrect.")
+        logging.warning("num_peaks: {}, num_chars+1: {}, which is supposed to be same.".format(num_peak, len(char_list)+1))
+    if num_peak > len(char_list) + 1:
+        fire_place = fire_place[:len(char_list) - 1]
+    elif num_peak < len(char_list) + 1:
+        char_list = char_list[:num_peak + 1]
+    # assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
     # begin silence
     if fire_place[0] > START_END_THRESHOLD:
         # char_list.insert(0, '<sil>')
diff --git a/funasr/version.txt b/funasr/version.txt
index 1a5ac0d..faef31a 100644
--- a/funasr/version.txt
+++ b/funasr/version.txt
@@ -1 +1 @@
-0.6.9
+0.7.0
diff --git a/setup.py b/setup.py
index 6bb4bcd..9c36dd0 100644
--- a/setup.py
+++ b/setup.py
@@ -23,6 +23,7 @@
         "nltk>=3.4.5",
         # ASR
         "sentencepiece",
+        "jieba",
         # TTS
         "pypinyin>=0.44.0",
         "espnet_tts_frontend",
@@ -122,4 +123,4 @@
         "License :: OSI Approved :: Apache Software License",
         "Topic :: Software Development :: Libraries :: Python Modules",
     ],
-)
\ No newline at end of file
+)

--
Gitblit v1.9.1