From 544b798b32819fe2ffed1fccb44e8c2620449a53 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 16 Feb 2023 17:30:51 +0800
Subject: [PATCH] Merge branch 'dev_gzf' of github.com:alibaba-damo-academy/FunASR into dev_gzf add

---
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py                                                   |  144 +
 docs_cn/modelscope_usages.md                                                                                                       |   18 
 funasr/tasks/vad.py                                                                                                                |    7 
 funasr/bin/asr_inference_paraformer.py                                                                                             |   45 
 funasr/bin/asr_inference_uniasr_vad.py                                                                                             |   12 
 docs/installation.md                                                                                                               |   30 
 funasr/bin/vad_inference.py                                                                                                        |   48 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py                                  |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py                             |    2 
 docs/get_started.md                                                                                                                |   43 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py                               |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py                             |    2 
 funasr/runtime/python/torchscripts/paraformer/__init__.py                                                                          |    0 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py                             |    2 
 funasr/tasks/abs_task.py                                                                                                           |   10 
 funasr/bin/asr_inference_uniasr.py                                                                                                 |   14 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md                      |   19 
 funasr/models/e2e_vad.py                                                                                                           |  117 +
 funasr/utils/timestamp_tools.py                                                                                                    |   82 -
 docs_cn/build_task.md                                                                                                              |  105 +
 funasr/bin/build_trainer.py                                                                                                        |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py                                |    2 
 funasr/export/models/e2e_asr_paraformer.py                                                                                         |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py                              |    2 
 funasr/models/encoder/encoder_layer_mfcca.py                                                                                       |  270 +++
 docs/index.rst                                                                                                                     |    2 
 funasr/models/encoder/fsmn_encoder.py                                                                                              |  168 +-
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py                              |    2 
 funasr/datasets/iterable_dataset.py                                                                                                |  156 +
 docs/modelscope_usages.md                                                                                                          |   53 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/demo.py                                                              |    9 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py                                |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py                   |    2 
 funasr/runtime/python/onnxruntime/paraformer/__init__.py                                                                           |    0 
 .github/workflows/main.yml                                                                                                         |   17 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py                                               |   35 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py                  |    2 
 docs/images/wechat.png                                                                                                             |    0 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py                              |    2 
 funasr/bin/vad_inference_launch.py                                                                                                 |    6 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py                                 |    2 
 funasr/export/README.md                                                                                                            |   48 
 funasr/runtime/python/__init__.py                                                                                                  |    0 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py                                 |    2 
 egs/alimeeting/diarization/sond/README.md                                                                                          |   21 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py                                   |   67 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py                                                       |  256 +++
 docs_cn/get_started.md                                                                                                             |    3 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py                              |    2 
 funasr/export/export_model.py                                                                                                      |   20 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer.py                                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py                                  |    2 
 funasr/bin/asr_inference_paraformer_vad_punc.py                                                                                    |   12 
 funasr/bin/asr_inference_mfcca.py                                                                                                  |  764 +++++++++
 funasr/export/models/predictor/cif.py                                                                                              |   50 
 funasr/bin/asr_inference_paraformer_vad.py                                                                                         |    1 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/postprocess_utils.py                                           |  240 +++
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md                                            |    2 
 README.md                                                                                                                          |   38 
 funasr/datasets/large_datasets/dataset.py                                                                                          |    7 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py                              |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py                              |    2 
 funasr/bin/asr_inference_paraformer_timestamp.py                                                                                   |   10 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py                              |    2 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py                                                          |    3 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py                                                  |  103 +
 .gitignore                                                                                                                         |    1 
 funasr/models/e2e_asr_mfcca.py                                                                                                     |  322 ++++
 funasr/models/encoder/mfcca_encoder.py                                                                                             |  450 +++++
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py                |    2 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.py                       |   21 
 funasr/runtime/python/torchscripts/__init__.py                                                                                     |    0 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/README.md                                                            |   75 
 funasr/runtime/python/onnxruntime/__init__.py                                                                                      |    0 
 funasr/datasets/preprocessor.py                                                                                                    |    2 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt                                                     |    6 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/__init__.py                                                    |    0 
 docs/build_task.md                                                                                                                 |  106 +
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py                             |    2 
 funasr/runtime/__init__.py                                                                                                         |    0 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py |   36 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py                             |    2 
 docs/images/dingding.jpg                                                                                                           |    0 
 funasr/utils/wav_utils.py                                                                                                          |   32 
 docs_cn/index.rst                                                                                                                  |    1 
 funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py                                                    |  136 +
 funasr/tasks/asr.py                                                                                                                |  138 +
 funasr/models/frontend/wav_frontend.py                                                                                             |    5 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py                               |    2 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/RESULTS.md                                                |   40 
 funasr/models/frontend/default.py                                                                                                  |  125 +
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md                                                 |   53 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py                             |    2 
 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py                 |    2 
 funasr/bin/asr_inference_launch.py                                                                                                 |    6 
 99 files changed, 4,165 insertions(+), 509 deletions(-)

diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index cfb94f3..43d1f1d 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -5,7 +5,7 @@
       - main
   push:
     branches:
-      - dev
+      - dev_wjm
 
 jobs:
   docs:
@@ -14,18 +14,27 @@
       - uses: actions/checkout@v1
       - uses: ammaraskar/sphinx-action@master
         with:
+          docs-folder: "docs/"
+          pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
+      - uses: ammaraskar/sphinx-action@master
+        with:
           docs-folder: "docs_cn/"
           pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
 
       - name: deploy copy
-        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm'
         run: |
           mkdir public
           touch public/.nojekyll
-          cp -r docs_cn/_build/html/* public/
+          mkdir public/en
+          touch public/en/.nojekyll
+          cp -r docs/_build/html/* public/en/
+          mkdir public/cn
+          touch public/cn/.nojekyll
+          cp -r docs_cn/_build/html/* public/cn/
 
       - name: deploy github.io pages
-        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev'
+        if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm'
         uses: peaceiris/actions-gh-pages@v2.3.1
         env:
           GITHUB_TOKEN: ${{ secrets.ACCESS_TOKEN }}
diff --git a/.gitignore b/.gitignore
index f8b55d7..7a43407 100644
--- a/.gitignore
+++ b/.gitignore
@@ -6,3 +6,4 @@
 .DS_Store
 init_model/
 *.tar.gz
+test_local/
\ No newline at end of file
diff --git a/README.md b/README.md
index c759b79..eaeb9ba 100644
--- a/README.md
+++ b/README.md
@@ -1,4 +1,4 @@
-<div align="left"><img src="docs/images/funasr_logo.jpg" width="400"/></div>
+[//]: # (<div align="left"><img src="docs/images/funasr_logo.jpg" width="400"/></div>)
 
 # FunASR: A Fundamental End-to-End Speech Recognition Toolkit
 
@@ -7,7 +7,8 @@
 [**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new) 
 | [**Highlights**](#highlights)
 | [**Installation**](#installation)
-| [**Docs**](https://alibaba-damo-academy.github.io/FunASR/index.html)
+| [**Docs_CN**](https://alibaba-damo-academy.github.io/FunASR/cn/index.html)
+| [**Docs_EN**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
 | [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
 | [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
 | [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
@@ -15,14 +16,31 @@
 | [**Contact**](#contact)
 
 ## What's new: 
-### 2023.1.16, funasr-0.1.6
+
+### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
+- We support a new feature: exporting paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. Locally finetuned models are also supported.
+- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer): you could deploy the runtime without modelscope or funasr. For the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, onnxruntime gives a 3x RTF speedup (0.110 -> 0.038) on CPU ([details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed)).
+- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc): you could build an ASR service with grpc by deploying the modelscope pipeline or onnxruntime.
+- We release a new model, [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports hotword customization based on incentive enhancement and improves the recall and precision of hotwords.
+- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary): the timestamp prediction accuracy is much improved, achieving an accumulated average shift (AAS) of 74.7 ms ([details](https://arxiv.org/abs/2301.12343)).
+- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of non-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
+- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
+- We release several new UniASR models: 
+[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
+[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), 
+[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), 
+[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), 
+[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
+- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model trained on AISHELL-2, which is used to initialize a paraformer model that is then finetuned on AISHELL-1.
+- Various new audio input types are now supported by the modelscope inference pipeline, including mp3, flac, ogg, opus...
+### 2023.1.16, funasr-0.1.6, modelscope-1.2.0
 - We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrates the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
  [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp prediction together. The model could take inputs that are several hours long.
-- We release a new type model, [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
-- We release a new type model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
+- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of non-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
+- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models' results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
 - We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
 - We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
-- We release a new type model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version.
+- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in a future version.
 - We improve the modelscope pipeline to speed up inference by integrating the model building process into the pipeline building.
 - Various new audio input types are now supported by the modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
 
@@ -42,7 +60,7 @@
 For more details, please ref to [installation](https://github.com/alibaba-damo-academy/FunASR/wiki)
 
 ## Usage
-For users who are new to FunASR and ModelScope, please refer to [FunASR Docs](https://alibaba-damo-academy.github.io/FunASR/index.html).
+For users who are new to FunASR and ModelScope, please refer to the FunASR Docs ([CN](https://alibaba-damo-academy.github.io/FunASR/cn/index.html) / [EN](https://alibaba-damo-academy.github.io/FunASR/en/index.html)).
 
 ## Contact
 
@@ -85,4 +103,10 @@
   booktitle={INTERSPEECH},
   year={2022}
 }
+@inproceedings{Shi2023AchievingTP,
+  title={Achieving Timestamp Prediction While Recognizing with Non-Autoregressive End-to-End ASR Model},
+  author={Xian Shi and Yanni Chen and Shiliang Zhang and Zhijie Yan},
+  booktitle={arXiv preprint arXiv:2301.12343},
+  year={2023}
+}
 ```
\ No newline at end of file
diff --git a/docs/build_task.md b/docs/build_task.md
new file mode 100644
index 0000000..a45c820
--- /dev/null
+++ b/docs/build_task.md
@@ -0,0 +1,106 @@
+# Build custom tasks
+FunASR is similar to ESPNet, which applies `Task` as the general interface to achieve the training and inference of models. Each `Task` is a class inherited from `AbsTask` and its corresponding code can be seen in `funasr/tasks/abs_task.py`. The main functions of `AbsTask` are shown as follows:
+```python
+class AbsTask(ABC):
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        pass
+    
+    @classmethod
+    def build_preprocess_fn(cls, args, train):
+        (...)
+    
+    @classmethod
+    def build_collate_fn(cls, args: argparse.Namespace):
+        (...)
+
+    @classmethod
+    def build_model(cls, args):
+        (...)
+    
+    @classmethod
+    def main(cls, args):
+        (...)
+```
+- add_task_arguments: add parameters required by a specific `Task`
+- build_preprocess_fn: define how to preprocess samples
+- build_collate_fn: define how to combine multiple samples into a `batch`
+- build_model: define the model
+- main: training entry, starting training through `Task.main()`
+
+Next, we take speech recognition as an example to introduce how to define a new `Task`. For the corresponding code, please see `ASRTask` in `funasr/tasks/asr.py`. Defining a new `Task` is essentially the procedure of redefining the above functions according to the requirements of that `Task`.
+
+- add_task_arguments
+```python
+@classmethod
+def add_task_arguments(cls, parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(description="Task related")
+    group.add_argument(
+        "--token_list",
+        type=str_or_none,
+        default=None,
+        help="A text mapping int-id to token",
+    )
+    (...)
+```
+For speech recognition tasks, specific parameters required include `token_list`, etc. According to the specific requirements of different tasks, users can define corresponding parameters in this function.
+
+- build_preprocess_fn
+```python
+@classmethod
+def build_preprocess_fn(cls, args, train):
+    if args.use_preprocessor:
+        retval = CommonPreprocessor(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    ...
+                )
+    else:
+        retval = None
+    return retval
+```
+This function defines how to preprocess samples. Specifically, the input of speech recognition tasks includes speech and text. For speech, functions such as (optionally) adding noise and reverberation are supported. For text, functions such as (optionally) processing text according to BPE and mapping text to `tokenid` are supported. Users can choose the preprocessing operations that need to be performed on the samples. For the detailed implementation, please refer to `CommonPreprocessor`.
+
+- build_collate_fn
+```python
+@classmethod
+def build_collate_fn(cls, args, train):
+    return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+```
+This function defines how to combine multiple samples into a `batch`. For speech recognition tasks, `padding` is employed to obtain equal-length data from different speech and text. Specifically, we set `0.0` as the default padding value for speech and `-1` as the default padding value for text. Users can define different `batch` operations here. For the detailed implementation, please refer to `CommonCollateFn`.
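+
+As a rough illustration of this padding behaviour (this is not the actual `CommonCollateFn` code), `torch.nn.utils.rnn.pad_sequence` can build such a batch from variable-length samples:
+```python
+import torch
+from torch.nn.utils.rnn import pad_sequence
+
+# Two utterances of different lengths: 80-dim fbank features and token ids.
+feats = [torch.randn(120, 80), torch.randn(95, 80)]
+tokens = [torch.tensor([12, 7, 301]), torch.tensor([5, 88])]
+
+# Speech is padded with 0.0 and text with -1, matching the defaults above.
+batch_feats = pad_sequence(feats, batch_first=True, padding_value=0.0)   # shape (2, 120, 80)
+batch_tokens = pad_sequence(tokens, batch_first=True, padding_value=-1)  # shape (2, 3)
+```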
+
+- build_model
+```python
+@classmethod
+def build_model(cls, args, train):
+    with open(args.token_list, encoding="utf-8") as f:
+        token_list = [line.rstrip() for line in f]
+        vocab_size = len(token_list)
+        frontend = frontend_class(**args.frontend_conf)
+        specaug = specaug_class(**args.specaug_conf)
+        normalize = normalize_class(**args.normalize_conf)
+        preencoder = preencoder_class(**args.preencoder_conf)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
+        decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size,  **args.decoder_conf)
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+    return model
+```
+This function defines the details of the model. Different speech recognition models can usually share the same speech recognition `Task`; what remains is to define the specific model in this function. For example, a speech recognition model with a standard encoder-decoder structure is shown above. Specifically, it first defines each module of the model, including the encoder, decoder, etc., and then combines these modules to generate a complete model. In FunASR, the model needs to inherit `AbsESPnetModel` and the corresponding code can be seen in `funasr/train/abs_espnet_model.py`. The main function that needs to be implemented is the `forward` function.
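+
+A minimal sketch of such a model is given below. It assumes that the interface mirrors ESPnet's `AbsESPnetModel` (a `forward` returning the loss, a stats dict and a batch weight, plus a `collect_feats` method); please check `funasr/train/abs_espnet_model.py` for the exact signatures. A real model would of course wire up the frontend, encoder, decoder and loss modules built above instead of the dummy computation used here.
+```python
+from typing import Dict, Tuple
+
+import torch
+
+from funasr.train.abs_espnet_model import AbsESPnetModel
+
+
+class ToyASRModel(AbsESPnetModel):
+    """Illustrative skeleton only, not a usable ASR model."""
+
+    def __init__(self, vocab_size: int, feat_dim: int = 80):
+        super().__init__()
+        self.proj = torch.nn.Linear(feat_dim, vocab_size)
+
+    def forward(
+        self,
+        speech: torch.Tensor,         # (batch, frames, feat_dim)
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,           # (batch, tokens)
+        text_lengths: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        # A dummy loss; a real model would run frontend/encoder/decoder here.
+        loss = self.proj(speech).mean()
+        stats = {"loss": loss.detach()}
+        weight = speech.new_tensor(float(speech.size(0)))
+        return loss, stats, weight
+
+    def collect_feats(self, speech, speech_lengths, text, text_lengths):
+        return {"feats": speech, "feats_lengths": speech_lengths}
+```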
\ No newline at end of file
diff --git a/docs/get_started.md b/docs/get_started.md
index 9f01fdc..4a7d86e 100644
--- a/docs/get_started.md
+++ b/docs/get_started.md
@@ -1,21 +1,21 @@
 # Get Started
-This is an easy example which introduces how to train a paraformer model on AISHELL-1 data from scratch. According to this example, you can train other models (conformer, paraformer, etc.) on other datasets (AISHELL-1, AISHELL-2, etc.) similarly.
+Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
 
 ## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 data  . This recipe consists of five stages and support training on multiple GPUs and decoding by CPU or GPU. Before introduce each stage in detail, we first explain several variables which should be set by users.
+We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on the AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
 - `CUDA_VISIBLE_DEVICES`: visible gpu list
 - `gpu_num`: the number of GPUs used for training
 - `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU.
-- `feats_dir`: the path to save processed data
-- `exp_dir`: the path to save experimental results
-- `data_aishell`: the path of raw AISHELL-1 data
-- `tag`: the suffix of experimental result directory
+- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
+- `data_aishell`: the path of the raw AISHELL-1 dataset
+- `feats_dir`: the path for saving processed data
 - `nj`: the number of jobs for data preparation
 - `speed_perturb`: the range of speed perturbation
+- `exp_dir`: the path for saving experimental results
+- `tag`: the suffix of experimental result directory
 
 ## Stage 0: Data preparation
-This stage processes raw AISHELL-1 data `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx` and `xxx` means `train/dev/test`. Here we assume you have already downloaded AISHELL-1 data. If not, you can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. Here we show examples for `wav.scp` and `text`, separately.
+This stage processes the raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`, where `xxx` means `train/dev/test`. Here we assume users have already downloaded the AISHELL-1 dataset. If not, users can download the data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
 * `wav.scp`
 ```
 BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
@@ -30,17 +30,17 @@
 BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
 ...
 ```
-We can see that these two files both have two columns while the first column is the wav-id and the second column is the corresponding wav-path/label tokens.
+These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
 
 ## Stage 1: Feature Generation
-This stage extracts FBank feature from raw wav `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. You can set `nj` to control the number of jobs for feature generation. The output features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
+This stage extracts FBank features from `wav.scp` and applies speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
 * `feats.scp`
 ```
 ...
-BAC009S0002W0122_sp0.9 /nfs/haoneng.lhn/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
 ...
 ```
-Note that samples in this file have already been shuffled. This file contains two columns. The first column is the wav-id while the second column is the kaldi-ark feature path. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
+Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
 * `speech_shape`
 ```
 ...
@@ -53,10 +53,10 @@
 BAC009S0002W0122_sp0.9 15
 ...
 ```
-These two files have two columns. The first column is the wav-id and the second column is the corresponding speech feature shape and text length.
+These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
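+
+As a side note, if users want to inspect the generated features, the `ark:offset` entries in `feats.scp` can be loaded with the `kaldiio` package (mentioned here only for inspection; this step is not part of the recipe):
+```python
+import kaldiio
+
+# A "path.ark:offset" specifier copied from feats.scp.
+feats = kaldiio.load_mat(
+    "/nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055"
+)
+print(feats.shape)  # (num_frames, feat_dim), e.g. 80-dim fbank
+```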
 
 ## Stage 2: Dictionary Preparation
-This stage prepares a dictionary, which is used as a mapping between label characters and integer indices during ASR training. The output dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. Here we show an example of `tokens.txt` as follows:
+This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
 * `tokens.txt`
 ```
 <blank>
@@ -75,7 +75,7 @@
 * `<unk>`: indicates the out-of-vocabulary token
 
 ## Stage 3: Training
-This stage achieves the training of the specified model. To start training, you should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
+This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
 
 * DDP Training
 
@@ -83,30 +83,29 @@
 
 * DataLoader
 
-[comment]: <> (We support two types of DataLoaders for small and large datasets, respectively. By default, the small DataLoader is used and you can set `dataset_type=large` to enable large DataLoader. For small DataLoader, )
-We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and you can set `dataset_type=large` to enable it. 
+We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large datasets, and users can set `dataset_type=large` to enable it.
 
 * Configuration
 
-The parameters of the training, including model, optimization, dataset, etc., are specified by a YAML file in `conf` directory. Also, you can directly specify the parameters in `run.sh` recipe. Please avoid to specify the same parameters in both the YAML file and the recipe.
+The training parameters, including model, optimization, dataset, etc., can be set in a YAML file in the `conf` directory. Alternatively, users can set the parameters directly in the `run.sh` recipe. Please avoid setting the same parameters in both the YAML file and the recipe.
 
 * Training Steps
 
-We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of the two parameters, the training will be stopped.
+We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total number of training epochs, while `max_update` indicates the total number of training steps. If both are specified, training stops as soon as either limit is reached, as sketched below.
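+
+A rough sketch of this stopping rule (the names below are illustrative, not the actual variables used in the FunASR trainer):
+```python
+def should_stop(epoch: int, num_updates: int, max_epoch: int, max_update: int) -> bool:
+    # Training stops as soon as either limit is reached.
+    return epoch >= max_epoch or num_updates >= max_update
+```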
 
 * Tensorboard
 
-You can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
+Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
 ```
 tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
 ```
 
 ## Stage 4: Decoding
-This stage generates the recognition results with acoustic features as input and calculate the `CER` to verify the performance of the trained model. 
+This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model. 
 
 * Mode Selection
 
-As we support conformer, paraformer and uniasr in FunASR and they have different inference interfaces, a `mode` param is specified as `asr/paraformer/uniase` according to the trained model.
+As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
 
 * Configuration
 
diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index 5e6cace..2eabf19 100644
--- a/docs/images/dingding.jpg
+++ b/docs/images/dingding.jpg
Binary files differ
diff --git a/docs/images/wechat.png b/docs/images/wechat.png
index 4c1937b..8672127 100644
--- a/docs/images/wechat.png
+++ b/docs/images/wechat.png
Binary files differ
diff --git a/docs/index.rst b/docs/index.rst
index ce44488..d7fc96b 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,12 +16,14 @@
    ./installation.md
    ./papers.md
    ./get_started.md
+   ./build_task.md
 
 .. toctree::
    :maxdepth: 1
    :caption: ModelScope:
 
    ./modelscope_models.md
+   ./modelscope_usages.md
 
 Indices and tables
 ==================
diff --git a/docs/installation.md b/docs/installation.md
index 61d06b5..fb26913 100755
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,35 +1,35 @@
 # Installation
-FunASR is easy to install, which is mainly based on python packages.
+FunASR is easy to install. The detailed installation steps are as follows:
 
-- Clone the repo
-``` sh
-git clone https://github.com/alibaba/FunASR.git
-```
-
-- Install Conda
-``` sh
+- Install Conda and create a virtual environment:
+```sh
 wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
 sh Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
 conda create -n funasr python=3.7
 conda activate funasr
 ```
 
 - Install Pytorch (version >= 1.7.0):
-
-| cuda  | |
-|:-----:| --- |
-|  9.2  | conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=9.2 -c pytorch |
-| 10.2  | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch |
-| 11.1  | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch |
+```sh
+pip install torch torchaudio
+```
 
 For more versions, please see [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
 
 - Install ModelScope
+
+For users in China, the following mirror source can be configured to speed up downloading:
 ``` sh
+pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+Install or update ModelScope:
+```sh
 pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 ```
 
-- Install other packages
+- Clone the repo and install other packages
 ``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
 pip install --editable ./
 ```
\ No newline at end of file
diff --git a/docs/modelscope_usages.md b/docs/modelscope_usages.md
new file mode 100644
index 0000000..84c8e1d
--- /dev/null
+++ b/docs/modelscope_usages.md
@@ -0,0 +1,53 @@
+# ModelScope Usage
+ModelScope is an open-source model-as-a-service platform supported by Alibaba, which provides flexible and convenient model applications for users in academia and industry. For specific usages and open-source models, please refer to [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition). In the speech domain, we provide autoregressive/non-autoregressive speech recognition, speech pre-training, punctuation prediction and other models, which users can use conveniently.
+
+## Overall Introduction
+We provide the usages of different models under the `egs_modelscope` directory, which supports directly employing our provided models for inference, as well as finetuning our provided models as pre-trained initial models. Next, we will introduce the model provided in the `egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch` directory, including `infer.py`, `finetune.py` and `infer_after_finetune.py`. The corresponding functions are as follows:
+- `infer.py`: perform inference on the specified dataset based on our provided model
+- `finetune.py`: employ our provided model as the initial model for finetuning
+- `infer_after_finetune.py`: perform inference on the specified dataset based on the finetuned model
+
+## Inference
+We provide `infer.py` to achieve the inference. Based on this file, users can perform inference on the specified dataset with our provided model and obtain the corresponding recognition results. If the transcript is given, the `CER` will be calculated at the same time. Before performing inference, users can set the following parameters to modify the inference configuration:
+* `data_dir`: dataset directory. The directory should contain the wav list file `wav.scp` and the transcript file `text` (optional). For the format of these two files, please refer to the instructions in [Quick Start](./get_started.md). If the `text` file exists, the CER will be calculated accordingly, otherwise it will be skipped.
+* `output_dir`: the directory for saving the inference results
+* `batch_size`: batch size during inference
+* `ctc_weight`: some models contain a CTC module; users can set this parameter to specify the weight of the CTC module during inference
+
+In addition to directly setting parameters in `infer.py`, users can also manually set the parameters in the `decoding.yaml` file in the model download directory to modify the inference configuration.
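+
+For reference, our provided models can also be called directly through the ModelScope pipeline interface. The snippet below is a minimal sketch rather than the exact content of `infer.py`; the `audio_in` keyword and the model id follow the ModelScope examples and may differ across versions:
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+# Build an ASR inference pipeline; the model is downloaded on first use.
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+)
+
+# Run recognition on a single wav file; the result contains the recognized text.
+rec_result = inference_pipeline(audio_in="example.wav")
+print(rec_result)
+```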
+
+## Finetuning
+We provide `finetune.py` to achieve the finetuning. Based on this file, users can finetune our provided model, used as the initial model, on the specified dataset to achieve better performance in the specified domain. Before finetuning, users can set the following parameters to modify the finetuning configuration:
+* `data_path`: dataset directory. This directory should contain the `train` directory for saving the training set and the `dev` directory for saving the validation set. Each directory needs to contain the wav list file `wav.scp` and the transcript file `text`
+* `output_dir`: the directory for saving the finetuning results
+* `dataset_type`: for a small dataset, set it as `small`; for datasets larger than 1000 hours, set it as `large`
+* `batch_bins`: batch size. If `dataset_type` is set as `small`, the unit of `batch_bins` is the number of fbank feature frames; if `dataset_type` is set as `large`, the unit of `batch_bins` is milliseconds
+* `max_epoch`: the maximum number of training epochs
+
+The following parameters can also be set. However, if there is no special requirement, users can ignore them and directly use the default values we provide:
+* `accum_grad`: gradient accumulation
+* `keep_nbest_models`: select the `keep_nbest_models` models with the best performance and average their parameters to get a better model
+* `optim`: set the optimizer
+* `lr`: set the learning rate
+* `scheduler`: set the learning rate adjustment strategy
+* `scheduler_conf`: set the related parameters of the learning rate adjustment strategy
+* `specaug`: set the spectral augmentation
+* `specaug_conf`: set the related parameters of the spectral augmentation
+
+In addition to directly setting parameters in `finetune.py`, users can also manually set the parameters in the `finetune.yaml` file in the model download directory to modify the finetuning configuration.
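+
+As an illustration only (the real `finetune.py` in each model directory may organize these options differently), the parameters described above could be collected as follows:
+```python
+# Hypothetical parameter block mirroring the options listed above.
+finetune_params = {
+    "data_path": "./data",        # contains train/ and dev/, each with wav.scp and text
+    "output_dir": "./checkpoint",
+    "dataset_type": "small",      # use "large" for datasets over 1000 hours
+    "batch_bins": 2000,           # number of fbank frames when dataset_type == "small"
+    "max_epoch": 20,
+}
+```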
+
+## Inference after Finetuning
+We provide `infer_after_finetune.py` to achieve inference based on the model finetuned by users. Based on this file, users can perform inference on the specified dataset with the finetuned model and obtain the corresponding recognition results. If the transcript is given, the `CER` will be calculated at the same time. Before performing inference, users can set the following parameters to modify the inference configuration:
+* `data_dir`: dataset directory. The directory should contain the wav list file `wav.scp` and the transcript file `text` (optional). If the `text` file exists, the CER will be calculated accordingly, otherwise it will be skipped.
+* `output_dir`: the directory for saving the inference results
+* `batch_size`: batch size during inference
+* `ctc_weight`: some models contain a CTC module; users can set this parameter to specify the weight of the CTC module during inference
+* `decoding_model_name`: set the name of the model used for inference
+
+The following parameters can also be set. However, if there is no special requirement, users can ignore them and directly use the default values we provide:
+* `modelscope_model_name`: the name of the initial model used for finetuning
+* `required_files`: files required for inference when using the modelscope interface
+
+## Announcements
+Some models may have specific parameters for finetuning and inference. The usage of these parameters can be found in the `README.md` file in the corresponding directory.
\ No newline at end of file
diff --git a/docs_cn/build_task.md b/docs_cn/build_task.md
new file mode 100644
index 0000000..5d78993
--- /dev/null
+++ b/docs_cn/build_task.md
@@ -0,0 +1,105 @@
+# 鎼缓鑷畾涔変换鍔�
+FunASR绫讳技ESPNet锛屼互`Task`涓洪�氱敤鎺ュ彛锛屼粠鑰屽疄鐜版ā鍨嬬殑璁粌鍜屾帹鐞嗐�傛瘡涓�涓猔Task`鏄竴涓被锛屽叾闇�瑕佺户鎵縛AbsTask`锛屽叾瀵瑰簲鐨勫叿浣撲唬鐮佽`funasr/tasks/abs_task.py`銆備笅闈㈢粰鍑哄叾鍖呭惈鐨勪富瑕佸嚱鏁板強鍔熻兘浠嬬粛锛�
+```python
+class AbsTask(ABC):
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        pass
+    
+    @classmethod
+    def build_preprocess_fn(cls, args, train):
+        (...)
+    
+    @classmethod
+    def build_collate_fn(cls, args: argparse.Namespace):
+        (...)
+
+    @classmethod
+    def build_model(cls, args):
+        (...)
+    
+    @classmethod
+    def main(cls, args):
+        (...)
+```
+- add_task_arguments: add parameters required by a specific `Task`
+- build_preprocess_fn: define how to preprocess samples
+- build_collate_fn: define how to combine multiple samples into a `batch`
+- build_model: define the model
+- main: training entry, starting training through `Task.main()`
+
+Next, we take the speech recognition task as an example to introduce how to define a new `Task`; see `ASRTask` in `funasr/tasks/asr.py` for the corresponding code. Defining a new `Task` is essentially the process of redefining the above functions according to the task requirements.
+- add_task_arguments
+```python
+@classmethod
+def add_task_arguments(cls, parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(description="Task related")
+    group.add_argument(
+        "--token_list",
+        type=str_or_none,
+        default=None,
+        help="A text mapping int-id to token",
+    )
+    (...)
+```
+For the speech recognition task, the required task-specific parameters include `token_list`, etc. According to the specific requirements of different tasks, users can define the corresponding parameters in this function.
+
+- build_preprocess_fn
+```python
+@classmethod
+def build_preprocess_fn(cls, args, train):
+    if args.use_preprocessor:
+        retval = CommonPreprocessor(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    ...
+                )
+    else:
+        retval = None
+    return retval
+```
+This function defines how to preprocess samples. Specifically, the input of the speech recognition task includes audio and transcripts. For audio, functions such as (optionally) adding noise and reverberation are implemented here; for transcripts, functions such as (optionally) processing the transcript with BPE and mapping the transcript to `tokenid` are implemented here. Users can choose the preprocessing operations to be performed on the samples; for the implementation, please refer to `CommonPreprocessor`.
+
+- build_collate_fn
+```python
+@classmethod
+def build_collate_fn(cls, args, train):
+    return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+```
+This function defines how to combine multiple samples into a `batch`. For the speech recognition task, different audio and transcripts are padded to obtain equal-length data. Specifically, `0.0` is used as the default padding value for audio and `-1` as the default padding value for transcripts. Users can define different batching operations here; for the implementation, please refer to `CommonCollateFn`.
+
+- build_model
+```python
+@classmethod
+def build_model(cls, args, train):
+    with open(args.token_list, encoding="utf-8") as f:
+        token_list = [line.rstrip() for line in f]
+        vocab_size = len(token_list)
+        frontend = frontend_class(**args.frontend_conf)
+        specaug = specaug_class(**args.specaug_conf)
+        normalize = normalize_class(**args.normalize_conf)
+        preencoder = preencoder_class(**args.preencoder_conf)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
+        decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size,  **args.decoder_conf)
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+    return model
+```
+This function defines the concrete model. Different speech recognition models can usually share the same speech recognition `Task`; what additionally needs to be done is to define the specific model in this function. For example, a speech recognition model with a standard encoder-decoder structure is given here. Specifically, each module of the model, including the encoder, decoder, etc., is defined first, and then these modules are combined to obtain a complete model. In FunASR, the model needs to inherit `AbsESPnetModel`; the corresponding code can be seen in `funasr/train/abs_espnet_model.py`, and the main function that needs to be implemented is the `forward` function.
diff --git a/docs_cn/get_started.md b/docs_cn/get_started.md
index 6e077e0..9e1c236 100644
--- a/docs_cn/get_started.md
+++ b/docs_cn/get_started.md
@@ -106,7 +106,8 @@
 鏈樁娈电敤浜庤В鐮佸緱鍒拌瘑鍒粨鏋滐紝鍚屾椂璁$畻CER鏉ラ獙璇佽缁冨緱鍒扮殑妯″瀷鎬ц兘銆�
 
 * Mode Selection
-Since we provide paraformer, uniasr, conformer and other models, the corresponding decoding mode needs to be specified during decoding. The corresponding parameter is `mode`, and the available options are `asr/paraformer/uniase`, etc.
+
+Since we provide paraformer, uniasr, conformer and other models, the corresponding decoding mode needs to be specified during decoding. The corresponding parameter is `mode`, and the available options are `asr/paraformer/uniasr`, etc.
 
 * Configuration
 
diff --git a/docs_cn/index.rst b/docs_cn/index.rst
index e0283ae..4a898e9 100644
--- a/docs_cn/index.rst
+++ b/docs_cn/index.rst
@@ -16,6 +16,7 @@
    ./installation.md
    ./papers.md
    ./get_started.md
+   ./build_task.md
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md
index 6e1420a..e704e5e 100644
--- a/docs_cn/modelscope_usages.md
+++ b/docs_cn/modelscope_usages.md
@@ -1,14 +1,14 @@
-# Quick Start with ModelScope
+# ModelScope Usage
 ModelScope is an open-source model-as-a-service sharing platform launched by Alibaba, which provides flexible and convenient model application support for users in academia and industry. For specific usages and open-source models, please refer to [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition). In the speech domain, we provide autoregressive/non-autoregressive speech recognition, speech pre-training, punctuation prediction and other models, which users can use conveniently.
 
 ## Overall Introduction
-We provide the usage of related models under the egs_modelscope directory, which supports directly using our provided models for inference, as well as finetuning our provided models as pre-trained initial models. Next, we take the model provided in the egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch directory as an example, including `infer.py`, `finetune.py` and `infer_after_finetune.py`. The corresponding functions are as follows:
+We provide the usages of different models under the `egs_modelscope` directory, which supports directly using our provided models for inference, as well as finetuning our provided models as pre-trained initial models. Next, we take the model provided in the `egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch` directory as an example, including `infer.py`, `finetune.py` and `infer_after_finetune.py`. The corresponding functions are as follows:
 - `infer.py`: perform inference on the specified dataset based on our provided model
 - `finetune.py`: finetune with our provided model as the initial model
 - `infer_after_finetune.py`: perform inference on the specified dataset based on the finetuned model
 
 ## Model Inference
-We provide `infer.py` to achieve model inference. Based on this file, users can perform inference on the specified dataset with our provided model and obtain the corresponding recognition results. If the transcript is also given, the CER will be calculated at the same time. Before starting inference, users can specify the following parameters to modify the inference configuration:
+We provide `infer.py` to achieve model inference. Based on this file, users can perform inference on the specified dataset with our provided model and obtain the corresponding recognition results. If the transcript is given, the `CER` will be calculated at the same time. Before starting inference, users can specify the following parameters to modify the inference configuration:
 * `data_dir`: dataset directory. The directory should contain the wav list file `wav.scp` and the transcript file `text` (optional); for the format, please refer to the instructions in [Quick Start](./get_started.md). If the `text` file exists, the CER will be calculated accordingly, otherwise it will be skipped.
 * `output_dir`: the directory for saving the inference results
 * `batch_size`: batch size during inference
@@ -21,14 +21,14 @@
 * `data_path`: data directory. This directory should contain the `train` directory for the training set and the `dev` directory for the validation set. Each directory needs to contain the wav list file `wav.scp` and the transcript file `text`
 * `output_dir`: the directory for saving the finetuning results
 * `dataset_type`: for a small dataset, set it as `small`; when the data is larger than 1000 hours, set it as `large`
-* `batch_bins`: batch size. If dataset_type is set as `small`, the unit of batch_bins is the number of fbank feature frames; if dataset_type=`large`, the unit of batch_bins is milliseconds
+* `batch_bins`: batch size. If dataset_type is set as `small`, the unit of batch_bins is the number of fbank feature frames; if dataset_type is set as `large`, the unit of batch_bins is milliseconds
 * `max_epoch`: the maximum number of training epochs
 
 The following parameters can also be set. However, if there is no special requirement, they can be ignored and the default values we provide can be used directly:
 * `accum_grad`: gradient accumulation
 * `keep_nbest_models`: average the parameters of the best-performing `keep_nbest_models` models to obtain a better model
-* `optim`: set the optimizer for finetuning
-* `lr`: set the learning rate for finetuning
+* `optim`: set the optimizer
+* `lr`: set the learning rate
 * `scheduler`: set the learning rate adjustment strategy
 * `scheduler_conf`: related parameters of the learning rate adjustment strategy
 * `specaug`: set the spectral augmentation
@@ -37,7 +37,7 @@
 Besides setting parameters directly in `finetune.py`, users can also modify the fine-tuning configuration by manually editing the parameters in the `finetune.yaml` file under the model download directory.
 
 ## Inference with a Fine-tuned Model
-We provide `infer_after_finetune.py` for inference with a model fine-tuned by the user. With this file, users can run inference on a specified dataset with the fine-tuned model and obtain the corresponding recognition results. If a transcript is also given at the same time, CER will also be computed. Before starting inference, users can set the following parameters to modify the inference configuration:
+We provide `infer_after_finetune.py` for inference with a model fine-tuned by the user. With this file, users can run inference on a specified dataset with the fine-tuned model and obtain the corresponding recognition results. If a transcript is given, CER will also be computed. Before starting inference, users can set the following parameters to modify the inference configuration:
 * `data_dir`: dataset directory. The directory should contain the audio list file `wav.scp` and, optionally, the transcript file `text`. If the `text` file exists, CER will be computed accordingly; otherwise this step is skipped.
 * `output_dir`: directory where the inference results are saved
 * `batch_size`: batch size during inference
@@ -45,8 +45,8 @@
 * `decoding_model_name`: name of the model to use for inference
 
 The following parameters can also be set, but if there is no particular need they can be ignored and the default values we provide used directly:
-* `modelscope_model_name`: the initial model used for fine-tuning
+* `modelscope_model_name`: the name of the initial model used for fine-tuning
 * `required_files`: files needed when running inference through the modelscope interface
 
 ## Notes
-Some models may have model-specific parameters for fine-tuning and inference; their detailed usage can be found in the README.md file of the corresponding directory.
\ No newline at end of file
+Some models may have model-specific parameters for fine-tuning and inference; their detailed usage can be found in the `README.md` file of the corresponding directory.
\ No newline at end of file
diff --git a/egs/alimeeting/diarization/sond/README.md b/egs/alimeeting/diarization/sond/README.md
index 8bef142..b36210c 100644
--- a/egs/alimeeting/diarization/sond/README.md
+++ b/egs/alimeeting/diarization/sond/README.md
@@ -1,5 +1,24 @@
+# Get Started
+To use this example, please first execute stage 0 of run.sh to obtain the prepared data and pre-trained models:
+```shell
+sh run.sh --stage 0 --stop_stage 0
+```
+Then, you can execute unit_test.py to check the correctness of the code:
+```shell
+python unit_test.py
+# you will get the results:
+[{'key': 'R8002_M8002_MS802-S0000_0000000_0001600', 'value': 'spk1 [(0.0, 8.88), (10.72, 11.92), (12.64, 15.2)]\nspk2 [(8.8, 9.76)]\nspk3 [(9.6, 10.96), (15.12, 15.68)]\nspk4 [(11.12, 12.72)]'}]
+[{'key': 'R8002_M8002_MS802-S0000_0000000_0001600', 'value': 'spk1 [(0.0, 8.88), (10.72, 11.92), (12.64, 15.2)]\nspk2 [(8.8, 9.76)]\nspk3 [(9.6, 10.96), (15.12, 15.68)]\nspk4 [(11.12, 12.72)]'}]
+[{'key': 'R8002_M8002_MS802-S0000_0000000_0001600', 'value': 'spk1 [(0.0, 8.88), (10.72, 11.92), (12.64, 15.2)]\nspk2 [(8.8, 9.76)]\nspk3 [(9.6, 10.88), (15.12, 15.68)]\nspk4 [(11.12, 12.72)]'}]
+[{'key': 'test0', 'value': 'spk1 [(0.0, 8.88), (10.64, 15.2)]\nspk2 [(8.88, 9.84)]\nspk3 [(9.6, 11.04), (15.12, 15.68)]\nspk4 [(11.2, 11.76)]'}]
+```
+You can also execute run.sh to reproduce the diarization performance reported in [1]:
+```shell
+sh run.sh --stage 1 --stop_stage 2
+```
+
 # Results
-You will get a DER about 4.21%, which is reported in [1], Table 6, line "SOND Oracle Profile".
+After executing "run.sh", you will get a DER of about 4.21%, which is reported in [1], Table 6, line "SOND Oracle Profile".
 
 # Reference
 [1] Speaker Overlap-aware Neural Diarization for Multi-party Meeting Analysis, Zhihao Du, Shiliang Zhang, 
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
new file mode 100644
index 0000000..9097e7a
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -0,0 +1,53 @@
+# ModelScope Model
+
+## How to finetune and infer using a pretrained MFCCA Model
+
+### Finetune
+
+- Modify finetune training related parameters in `finetune.py`
+    - <strong>output_dir:</strong> # result dir
+    - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+    - <strong>dataset_type:</strong> # for datasets larger than 1000 hours, set it to `large`; otherwise set it to `small`
+    - <strong>batch_bins:</strong> # batch size. If dataset_type is `small`, `batch_bins` is the number of feature frames; if dataset_type is `large`, `batch_bins` is the duration in ms
+    - <strong>max_epoch:</strong> # number of training epochs
+    - <strong>lr:</strong> # learning rate
+
+- Then you can run the pipeline to finetune with:
+```python
+    python finetune.py
+```
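+For reference, these parameters are set directly on the `params` object in `finetune.py` (where the dataset directory is passed as `params.data_path`); the values below are purely illustrative:
+```python
+    params.output_dir = "./checkpoint"    # result dir
+    params.data_path = "./example_data/"  # dataset dir
+    params.dataset_type = "small"
+    params.batch_bins = 2000              # fbank frames, since dataset_type is "small"
+    params.max_epoch = 10
+    params.lr = 0.0001
+```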
+
+### Inference
+
+Or you can use the finetuned model for inference directly.
+
+- Setting parameters in `infer.py`
+    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
+    - <strong>output_dir:</strong> # result dir
+    - <strong>ngpu:</strong> # the number of GPUs for decoding
+    - <strong>njob:</strong> # the number of jobs for each GPU
+
+- Then you can run the pipeline to infer with:
+```python
+    python infer.py
+```
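+For reference, these settings correspond to the `params` dictionary at the bottom of `infer.py`; the values below are purely illustrative:
+```python
+    params = {}
+    params["data_dir"] = "./example_data/validation"  # must contain wav.scp (and optionally text)
+    params["output_dir"] = "./output_dir"
+    params["ngpu"] = 1
+    params["njob"] = 4
+```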
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.sp.cer` and `$output_dir/1best_recog/text.nosp.cer`, which include the recognition result of each sample with and without the separating character (src), together with the CER metric over the whole test set.
+
+### Inference using local finetuned model
+
+- Modify inference related parameters in `infer_after_finetune.py`
+    - <strong>output_dir:</strong> # result dir
+    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
+    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+
+- Then you can run the pipeline to infer with:
+```python
+    python infer_after_finetune.py
+```
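+For reference, these settings correspond to the `params` dictionary at the bottom of `infer_after_finetune.py`; the values below are purely illustrative:
+```python
+    params = {}
+    params["modelscope_model_name"] = "NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950"
+    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
+    params["output_dir"] = "./checkpoint"             # the finetune output dir
+    params["data_dir"] = "./example_data/validation"
+    params["decoding_model_name"] = "valid.acc.ave.pth"
+```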
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.sp.cer` and `$output_dir/1best_recog/text.nosp.cer`, which include the recognition result of each sample with and without the separating character (src), together with the CER metric over the whole test set.
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/RESULTS.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/RESULTS.md
new file mode 100644
index 0000000..716d44e
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/RESULTS.md
@@ -0,0 +1,40 @@
+# MFCCA
+- Model link: <https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary>
+- Model size: 45M
+
+# Environments
+- date: `Tue Feb 13 20:13:22 CST 2023`
+- python version: `3.7.12`
+- FunASR version: `0.1.0`
+- pytorch version: `pytorch 1.7.0`
+- Git hash: ``
+- Commit date: ``
+
+# Benchmark Results
+
+## result (paper)
+beam=20, CER tool: https://github.com/yufan-aslp/AliMeeting
+
+|        model        | Para (M) | Data (hrs) | Eval (CER%) | Test (CER%) |
+|:-------------------:|:---------:|:---------:|:---------:| :---------:|
+| MFCCA | 45   |   917  |   16.1   | 17.5   |
+
+## result (modelscope)
+
+beam=10
+
+with separating character (src)
+
+|        model        | Para (M) | Data (hrs) | Eval_sp (CER%) | Test_sp (CER%) | 
+|:-------------------:|:---------:|:---------:|:---------:| :---------:|
+| MFCCA | 45   |   917  |   17.1   | 18.6   |
+
+without separating character (src)
+
+|        model        | Para (M) | Data (hrs) | Eval_nosp (CER%) | Test_nosp (CER%) | 
+|:-------------------:|:---------:|:---------:|:---------:| :---------:|
+| MFCCA | 45   |   917  |   16.4   | 18.0   |
+
+## Deviation
+
+Considering the differences in the CER calculation tool and the decoding beam size, the CER results deviate slightly (<0.5%).
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py
new file mode 100755
index 0000000..bf8176e
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        model_revision=params.model_revision,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    
+    params = modelscope_args(model="NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950")
+    params.output_dir = "./checkpoint"              # model save path
+    params.data_path = "./example_data/"            # data path
+    params.dataset_type = "small"                   # set to "small" for small datasets; use "large" if the data volume exceeds 1000 hours
+    params.batch_bins = 1000                        # batch size: if dataset_type="small", batch_bins is in fbank feature frames; if dataset_type="large", batch_bins is in milliseconds
+    params.max_epoch = 10                           # maximum number of training epochs
+    params.lr = 0.0001                              # learning rate
+    params.model_revision = 'v1.0.0'
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
new file mode 100755
index 0000000..fa22aad
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -0,0 +1,103 @@
+import os
+import shutil
+from multiprocessing import Pool
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+def modelscope_infer_core(output_dir, split_dir, njob, idx):
+    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
+    gpu_id = (int(idx) - 1) // njob
+    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
+        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
+    else:
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+	    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
+        model_revision='v1.0.0',
+        output_dir=output_dir_job,
+        batch_size=1,
+    )
+    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
+    inference_pipline(audio_in=audio_in)
+
+
+def modelscope_infer(params):
+    # prepare for multi-GPU decoding
+    ngpu = params["ngpu"]
+    njob = params["njob"]
+    output_dir = params["output_dir"]
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+    os.mkdir(output_dir)
+    split_dir = os.path.join(output_dir, "split")
+    os.mkdir(split_dir)
+    nj = ngpu * njob
+    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
+    with open(wav_scp_file) as f:
+        lines = f.readlines()
+        num_lines = len(lines)
+        num_job_lines = num_lines // nj
+    start = 0
+    for i in range(nj):
+        end = start + num_job_lines
+        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
+        with open(file, "w") as f:
+            if i == nj - 1:
+                f.writelines(lines[start:])
+            else:
+                f.writelines(lines[start:end])
+        start = end
+    p = Pool(nj)
+    for i in range(nj):
+        p.apply_async(modelscope_infer_core,
+                      args=(output_dir, split_dir, njob, str(i + 1)))
+    p.close()
+    p.join()
+
+    # combine decoding results
+    best_recog_path = os.path.join(output_dir, "1best_recog")
+    os.mkdir(best_recog_path)
+    files = ["text", "token", "score"]
+    for file in files:
+        with open(os.path.join(best_recog_path, file), "w") as f:
+            for i in range(nj):
+                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
+                with open(job_file) as f_job:
+                    lines = f_job.readlines()
+                f.writelines(lines)
+
+    # If text exists, compute CER
+    text_in = os.path.join(params["data_dir"], "text")
+    if os.path.exists(text_in):
+        text_proc_file = os.path.join(best_recog_path, "token")
+        text_proc_file2 = os.path.join(best_recog_path, "token_nosep")
+        with open(text_proc_file, 'r') as hyp_reader:
+                with open(text_proc_file2, 'w') as hyp_writer:
+                    for line in hyp_reader:
+                        new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
+                        hyp_writer.write(new_context+'\n')
+        text_in2 = os.path.join(best_recog_path, "ref_text_nosep")
+        with open(text_in, 'r') as ref_reader:
+            with open(text_in2, 'w') as ref_writer:
+                for line in ref_reader:
+                    new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
+                    ref_writer.write(new_context+'\n')
+
+
+        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.sp.cer"))
+        compute_wer(text_in2, text_proc_file2, os.path.join(best_recog_path, "text.nosp.cer"))
+
+
+if __name__ == "__main__":
+    params = {}
+    params["data_dir"] = "./example_data/validation"
+    params["output_dir"] = "./output_dir"
+    params["ngpu"] = 1
+    params["njob"] = 1
+    modelscope_infer(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
new file mode 100755
index 0000000..e714a3d
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -0,0 +1,67 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+
+def modelscope_infer_after_finetune(params):
+    # prepare for decoding
+    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
+    for file_name in params["required_files"]:
+        if file_name == "configuration.json":
+            with open(os.path.join(pretrained_model_path, file_name)) as f:
+                config_dict = json.load(f)
+                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
+            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
+                json.dump(config_dict, f, indent=4, separators=(',', ': '))
+        else:
+            shutil.copy(os.path.join(pretrained_model_path, file_name),
+                        os.path.join(params["output_dir"], file_name))
+    decoding_path = os.path.join(params["output_dir"], "decode_results")
+    if os.path.exists(decoding_path):
+        shutil.rmtree(decoding_path)
+    os.mkdir(decoding_path)
+
+    # decoding
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model=params["output_dir"],
+        output_dir=decoding_path,
+        batch_size=1
+    )
+    audio_in = os.path.join(params["data_dir"], "wav.scp")
+    inference_pipeline(audio_in=audio_in)
+
+    # compute CER if the ground-truth text exists
+    text_in = os.path.join(params["data_dir"], "text")
+    if os.path.exists(text_in):
+        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        text_proc_file2 = os.path.join(decoding_path, "1best_recog/token_nosep")
+        with open(text_proc_file, 'r') as hyp_reader:
+                with open(text_proc_file2, 'w') as hyp_writer:
+                    for line in hyp_reader:
+                        new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
+                        hyp_writer.write(new_context+'\n')
+        text_in2 = os.path.join(decoding_path, "1best_recog/ref_text_nosep")
+        with open(text_in, 'r') as ref_reader:
+            with open(text_in2, 'w') as ref_writer:
+                for line in ref_reader:
+                    new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
+                    ref_writer.write(new_context+'\n')
+
+
+        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.sp.cer"))
+        compute_wer(text_in2, text_proc_file2, os.path.join(decoding_path, "text.nosp.cer"))
+
+if __name__ == '__main__':
+    params = {}
+    params["modelscope_model_name"] = "NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950"
+    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./example_data/validation"
+    params["decoding_model_name"] = "valid.acc.ave.pth"
+    modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
new file mode 100644
index 0000000..49c0aeb
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
@@ -0,0 +1,19 @@
+# ModelScope Model
+
+## How to infer using a pretrained Paraformer-large Model
+
+### Inference
+
+You can use the pretrained model for inference directly.
+
+- Setting parameters in `infer.py`
+    - <strong>audio_in:</strong> # Supports wav, url, bytes, and parsed audio formats.
+    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
+    - <strong>batch_size:</strong> # Set batch size in inference.
+    - <strong>param_dict:</strong> # Set the hotword list in inference.
+
+- Then you can run the pipeline to infer with:
+```python
+    python infer.py
+```
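+For reference, `param_dict` is a plain dictionary whose `hotword` entry points to a hotword list, as in `infer.py`; a local text file path is presumably also accepted here (not verified):
+```python
+    param_dict = dict()
+    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"
+```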
+
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.py
new file mode 100644
index 0000000..16a57e9
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.py
@@ -0,0 +1,21 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+
+if __name__ == '__main__':
+    param_dict = dict()
+    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"
+
+    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_hotword.wav"
+    output_dir = None
+    batch_size = 1
+
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+        output_dir=output_dir,
+        batch_size=batch_size,
+        param_dict=param_dict)
+
+    rec_result = inference_pipeline(audio_in=audio_in)
+    print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
new file mode 100644
index 0000000..18897b1
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
@@ -0,0 +1,36 @@
+import os
+import tempfile
+import codecs
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.msdatasets import MsDataset
+
+if __name__ == '__main__':
+    param_dict = dict()
+    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"
+
+    output_dir = "./output"
+    batch_size = 1
+
+    # dataset split ['test']
+    ds_dict = MsDataset.load(dataset_name='speech_asr_aishell1_hotwords_testsets', namespace='speech_asr')
+    work_dir = tempfile.TemporaryDirectory().name
+    if not os.path.exists(work_dir):
+        os.makedirs(work_dir)
+    wav_file_path = os.path.join(work_dir, "wav.scp")
+    
+    with codecs.open(wav_file_path, 'w') as fin: 
+        for line in ds_dict:
+            wav = line["Audio:FILE"]
+            idx = wav.split("/")[-1].split(".")[0]
+            fin.writelines(idx + " " + wav + "\n")
+    audio_in = wav_file_path         
+
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
+        output_dir=output_dir,
+        batch_size=batch_size,
+        param_dict=param_dict)
+
+    rec_result = inference_pipeline(audio_in=audio_in)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
index c06f934..c151149 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
index c06f934..ac73adf 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-cantonese-CHS-16k-common-vocab1468-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
index d23c7f4..5ace7e4 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
index d7840c2..f8d91b8 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
index 0cfe939..49b884b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
index 201f794..57a3afd 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-en-16k-common-vocab1080-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
index 6c416b2..510f008 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
index cfd9e9d..2ec5940 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-es-16k-common-vocab3445-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
index 3a89546..040265d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer.py
@@ -23,7 +23,7 @@
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
index ecb1381..055e4eb 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer.py
@@ -23,7 +23,7 @@
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
index e541f27..6aedeea 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
index f871665..2f3e833 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
index 82d19d0..219c9ec 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
index 82d19d0..ad2671a 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-id-16k-common-vocab1067-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
index f1e156b..a053957 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
index f1e156b..f15bc2d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ja-16k-common-vocab93-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
index 7aba7ee..618b3f6 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
index 5fbfdbb..135e8f8 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ko-16k-common-vocab6400-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dfd509d..dd947d3 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -1,6 +1,6 @@
 # ModelScope Model
 
-## How to finetune and infer using a pretrained Paraformer-large Model
+## How to finetune and infer using a pretrained UniASR Model
 
 ### Finetune
 
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer.py
index 06b172f..d28395b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer.py
@@ -23,7 +23,7 @@
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 96f202a..030c2e2 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -34,7 +34,7 @@
         batch_size=1
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
 
     # computer CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
index fdab5a3..30a11ff 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
index fdab5a3..aff2a9a 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-pt-16k-common-vocab1617-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
index 8b96ffb..95f447d 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
index 1051b1f..88c06b4 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-ru-16k-common-vocab1664-tensorflow1-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
index b7fcd59..9472104 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-offline",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
index 869082b..4a844fc 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/infer.py
@@ -9,5 +9,5 @@
         model="damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online",
         output_dir=output_dir,
     )
-    rec_result = inference_pipline(audio_in=audio_in)
+    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"normal"})
     print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
index e855032..5d74837 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer.py
@@ -23,7 +23,7 @@
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipline(audio_in=audio_in, param_dict={"decoding_model": "offline"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 6664c3d..3b39a16 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -34,7 +34,7 @@
         batch_size=1
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model": "offline"})
 
     # computer CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
index 8fd7513..5c62362 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer.py
@@ -23,7 +23,7 @@
         batch_size=1
     )
     audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
-    inference_pipline(audio_in=audio_in)
+    inference_pipline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
 
 
 def modelscope_infer(params):
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index b4dde60..4860cf7 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -34,7 +34,7 @@
         batch_size=1
     )
     audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
+    inference_pipeline(audio_in=audio_in, param_dict={"decoding_model": "normal"})
 
     # computer CER if GT text is set
     text_in = os.path.join(params["data_dir"], "text")
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index d2798b1..d8d5679 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -228,6 +228,9 @@
     elif mode == "vad":
         from funasr.bin.vad_inference import inference_modelscope
         return inference_modelscope(**kwargs)
+    elif mode == "mfcca":
+        from funasr.bin.asr_inference_mfcca import inference_modelscope
+        return inference_modelscope(**kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
@@ -253,6 +256,9 @@
     elif mode == "vad":
         from funasr.bin.vad_inference import inference
         return inference(**kwargs)
+    elif mode == "mfcca":
+        from funasr.bin.asr_inference_mfcca import inference_modelscope
+        return inference_modelscope(**kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
new file mode 100644
index 0000000..ba03449
--- /dev/null
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -0,0 +1,764 @@
+#!/usr/bin/env python3
+# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+import argparse
+import logging
+import sys
+from pathlib import Path
+from typing import Any
+from typing import List
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+from typeguard import check_return_type
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.beam_search.batch_beam_search import BatchBeamSearch
+from funasr.modules.beam_search.beam_search import BeamSearch
+from funasr.modules.beam_search.beam_search import Hypothesis
+from funasr.modules.scorers.ctc import CTCPrefixScorer
+from funasr.modules.scorers.length_bonus import LengthBonus
+from funasr.modules.scorers.scorer_interface import BatchScorerInterface
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.asr import ASRTaskMFCCA as ASRTask
+from funasr.tasks.lm import LMTask
+from funasr.text.build_tokenizer import build_tokenizer
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+
+header_colors = '\033[95m'
+end_colors = '\033[0m'
+
+global_asr_language: str = 'zh-cn'
+global_sample_rate: Union[int, Dict[Any, int]] = {
+    'audio_fs': 16000,
+    'model_fs': 16000
+}
+
+class Speech2Text:
+    """Speech2Text class
+
+    Examples:
+        >>> import soundfile
+        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+        >>> audio, rate = soundfile.read("speech.wav")
+        >>> speech2text(audio)
+        [(text, token, token_int, hypothesis object), ...]
+
+    """
+
+    def __init__(
+            self,
+            asr_train_config: Union[Path, str] = None,
+            asr_model_file: Union[Path, str] = None,
+            cmvn_file: Union[Path, str] = None,
+            lm_train_config: Union[Path, str] = None,
+            lm_file: Union[Path, str] = None,
+            token_type: str = None,
+            bpemodel: str = None,
+            device: str = "cpu",
+            maxlenratio: float = 0.0,
+            minlenratio: float = 0.0,
+            batch_size: int = 1,
+            dtype: str = "float32",
+            beam_size: int = 20,
+            ctc_weight: float = 0.5,
+            lm_weight: float = 1.0,
+            ngram_weight: float = 0.9,
+            penalty: float = 0.0,
+            nbest: int = 1,
+            streaming: bool = False,
+            **kwargs,
+    ):
+        assert check_argument_types()
+
+        # 1. Build ASR model
+        scorers = {}
+        asr_model, asr_train_args = ASRTask.build_model_from_file(
+            asr_train_config, asr_model_file, cmvn_file, device
+        )
+
+        logging.info("asr_model: {}".format(asr_model))
+        logging.info("asr_train_args: {}".format(asr_train_args))
+        asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+        decoder = asr_model.decoder
+
+        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+        token_list = asr_model.token_list
+        scorers.update(
+            decoder=decoder,
+            ctc=ctc,
+            length_bonus=LengthBonus(len(token_list)),
+        )
+
+        # 2. Build Language model
+        if lm_train_config is not None:
+            lm, lm_train_args = LMTask.build_model_from_file(
+                lm_train_config, lm_file, device
+            )
+            lm.to(device)
+            scorers["lm"] = lm.lm
+        # 3. Build ngram model
+        # ngram is not supported now
+        ngram = None
+        scorers["ngram"] = ngram
+
+        # 4. Build BeamSearch object
+        # transducer is not supported now
+        beam_search_transducer = None
+
+        weights = dict(
+            decoder=1.0 - ctc_weight,
+            ctc=ctc_weight,
+            lm=lm_weight,
+            ngram=ngram_weight,
+            length_bonus=penalty,
+        )
+        beam_search = BeamSearch(
+            beam_size=beam_size,
+            weights=weights,
+            scorers=scorers,
+            sos=asr_model.sos,
+            eos=asr_model.eos,
+            vocab_size=len(token_list),
+            token_list=token_list,
+            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
+        )
+        #beam_search.__class__ = BatchBeamSearch
+        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+        if token_type is None:
+            token_type = asr_train_args.token_type
+        if bpemodel is None:
+            bpemodel = asr_train_args.bpemodel
+        
+        if token_type is None:
+            tokenizer = None
+        elif token_type == "bpe":
+            if bpemodel is not None:
+                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+            else:
+                tokenizer = None
+        else:
+            tokenizer = build_tokenizer(token_type=token_type)
+        converter = TokenIDConverter(token_list=token_list)
+        logging.info(f"Text tokenizer: {tokenizer}")
+
+        self.asr_model = asr_model
+        self.asr_train_args = asr_train_args
+        self.converter = converter
+        self.tokenizer = tokenizer
+        self.beam_search = beam_search
+        self.beam_search_transducer = beam_search_transducer
+        self.maxlenratio = maxlenratio
+        self.minlenratio = minlenratio
+        self.device = device
+        self.dtype = dtype
+        self.nbest = nbest
+
+    @torch.no_grad()
+    def __call__(
+            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
+    ) -> List[
+        Tuple[
+            Optional[str],
+            List[str],
+            List[int],
+            Union[Hypothesis],
+        ]
+    ]:
+        """Inference
+
+        Args:
+            speech: Input speech data
+        Returns:
+            text, token, token_int, hyp
+
+        """
+        assert check_argument_types()
+        # Input as audio signal
+        if isinstance(speech, np.ndarray):
+            speech = torch.tensor(speech)
+
+
+        #speech = speech.unsqueeze(0).to(getattr(torch, self.dtype))
+        speech = speech.to(getattr(torch, self.dtype))
+        # lengths: (1,)
+        lengths = speech.new_full([1], dtype=torch.long, fill_value=speech.size(1))
+        batch = {"speech": speech, "speech_lengths": lengths}
+
+        # a. To device
+        batch = to_device(batch, device=self.device)
+        
+        # b. Forward Encoder
+        enc, _ = self.asr_model.encode(**batch)
+        
+        assert len(enc) == 1, len(enc)
+
+        # c. Pass the encoder output to the beam search
+        nbest_hyps = self.beam_search(
+            x=enc[0], maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+        )
+        
+        nbest_hyps = nbest_hyps[: self.nbest]
+        
+        results = []
+        for hyp in nbest_hyps:
+            assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+            # remove sos/eos and get results
+            last_pos = -1
+            if isinstance(hyp.yseq, list):
+                token_int = hyp.yseq[1:last_pos]
+            else:
+                token_int = hyp.yseq[1:last_pos].tolist()
+
+            # remove blank symbol id, which is assumed to be 0
+            token_int = list(filter(lambda x: x != 0, token_int))
+
+            # Change integer-ids to tokens
+            token = self.converter.ids2tokens(token_int)
+
+            if self.tokenizer is not None:
+                text = self.tokenizer.tokens2text(token)
+            else:
+                text = None
+            results.append((text, token, token_int, hyp))
+
+        assert check_return_type(results)
+        return results
+
+
+
+def inference(
+        maxlenratio: float,
+        minlenratio: float,
+        batch_size: int,
+        beam_size: int,
+        ngpu: int,
+        ctc_weight: float,
+        lm_weight: float,
+        penalty: float,
+        log_level: Union[int, str],
+        data_path_and_name_and_type,
+        asr_train_config: Optional[str],
+        asr_model_file: Optional[str],
+        cmvn_file: Optional[str] = None,
+        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+        lm_train_config: Optional[str] = None,
+        lm_file: Optional[str] = None,
+        token_type: Optional[str] = None,
+        key_file: Optional[str] = None,
+        word_lm_train_config: Optional[str] = None,
+        bpemodel: Optional[str] = None,
+        allow_variable_data_keys: bool = False,
+        streaming: bool = False,
+        output_dir: Optional[str] = None,
+        dtype: str = "float32",
+        seed: int = 0,
+        ngram_weight: float = 0.9,
+        nbest: int = 1,
+        num_workers: int = 1,
+        **kwargs,
+):
+    inference_pipeline = inference_modelscope(
+        maxlenratio=maxlenratio,
+        minlenratio=minlenratio,
+        batch_size=batch_size,
+        beam_size=beam_size,
+        ngpu=ngpu,
+        ctc_weight=ctc_weight,
+        lm_weight=lm_weight,
+        penalty=penalty,
+        log_level=log_level,
+        asr_train_config=asr_train_config,
+        asr_model_file=asr_model_file,
+        cmvn_file=cmvn_file,
+        raw_inputs=raw_inputs,
+        lm_train_config=lm_train_config,
+        lm_file=lm_file,
+        token_type=token_type,
+        key_file=key_file,
+        word_lm_train_config=word_lm_train_config,
+        bpemodel=bpemodel,
+        allow_variable_data_keys=allow_variable_data_keys,
+        streaming=streaming,
+        output_dir=output_dir,
+        dtype=dtype,
+        seed=seed,
+        ngram_weight=ngram_weight,
+        nbest=nbest,
+        num_workers=num_workers,
+        **kwargs,
+    )
+    return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+def inference_modelscope(
+    maxlenratio: float,
+    minlenratio: float,
+    batch_size: int,
+    beam_size: int,
+    ngpu: int,
+    ctc_weight: float,
+    lm_weight: float,
+    penalty: float,
+    log_level: Union[int, str],
+    # data_path_and_name_and_type,
+    asr_train_config: Optional[str],
+    asr_model_file: Optional[str],
+    cmvn_file: Optional[str] = None,
+    lm_train_config: Optional[str] = None,
+    lm_file: Optional[str] = None,
+    token_type: Optional[str] = None,
+    key_file: Optional[str] = None,
+    word_lm_train_config: Optional[str] = None,
+    bpemodel: Optional[str] = None,
+    allow_variable_data_keys: bool = False,
+    streaming: bool = False,
+    output_dir: Optional[str] = None,
+    dtype: str = "float32",
+    seed: int = 0,
+    ngram_weight: float = 0.9,
+    nbest: int = 1,
+    num_workers: int = 1,
+    param_dict: dict = None,
+    **kwargs,
+):
+    assert check_argument_types()
+    if batch_size > 1:
+        raise NotImplementedError("batch decoding is not implemented")
+    if word_lm_train_config is not None:
+        raise NotImplementedError("Word LM is not implemented")
+    if ngpu > 1:
+        raise NotImplementedError("only single GPU decoding is supported")
+    
+    logging.basicConfig(
+        level=log_level,
+        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+    )
+    
+    if ngpu >= 1 and torch.cuda.is_available():
+        device = "cuda"
+    else:
+        device = "cpu"
+    
+    # 1. Set random-seed
+    set_all_random_seed(seed)
+    
+    # 2. Build speech2text
+    speech2text_kwargs = dict(
+        asr_train_config=asr_train_config,
+        asr_model_file=asr_model_file,
+        cmvn_file=cmvn_file,
+        lm_train_config=lm_train_config,
+        lm_file=lm_file,
+        token_type=token_type,
+        bpemodel=bpemodel,
+        device=device,
+        maxlenratio=maxlenratio,
+        minlenratio=minlenratio,
+        dtype=dtype,
+        beam_size=beam_size,
+        ctc_weight=ctc_weight,
+        lm_weight=lm_weight,
+        ngram_weight=ngram_weight,
+        penalty=penalty,
+        nbest=nbest,
+        streaming=streaming,
+    )
+    logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
+    speech2text = Speech2Text(**speech2text_kwargs)
+    
+    def _forward(data_path_and_name_and_type,
+                 raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+                 output_dir_v2: Optional[str] = None,
+                 fs: dict = None,
+                 param_dict: dict = None,
+                 ):
+        # 3. Build data-iterator
+        if data_path_and_name_and_type is None and raw_inputs is not None:
+            if isinstance(raw_inputs, torch.Tensor):
+                raw_inputs = raw_inputs.numpy()
+            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        loader = ASRTask.build_streaming_iterator(
+            data_path_and_name_and_type,
+            dtype=dtype,
+            batch_size=batch_size,
+            key_file=key_file,
+            num_workers=num_workers,
+            preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+            collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+            allow_variable_data_keys=allow_variable_data_keys,
+            inference=True,
+        )
+        
+        finish_count = 0
+        file_count = 1
+        # 7. Start for-loop
+        # FIXME(kamo): The output format should be discussed
+        asr_result_list = []
+        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+        if output_path is not None:
+            writer = DatadirWriter(output_path)
+        else:
+            writer = None
+        
+        for keys, batch in loader:
+            assert isinstance(batch, dict), type(batch)
+            assert all(isinstance(s, str) for s in keys), keys
+            _bs = len(next(iter(batch.values())))
+            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+            # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
+            
+            # N-best list of (text, token, token_int, hyp_object)
+            try:
+                results = speech2text(**batch)
+            except TooShortUttError as e:
+                logging.warning(f"Utterance {keys} {e}")
+                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+                results = [[" ", ["<space>"], [2], hyp]] * nbest
+            
+            # Only supporting batch_size==1
+            key = keys[0]
+            for n, (text, token, token_int, hyp) in zip(range(1, nbest + 1), results):
+                # Create a directory: outdir/{n}best_recog
+                if writer is not None:
+                    ibest_writer = writer[f"{n}best_recog"]
+                    
+                    # Write the result to each file
+                    ibest_writer["token"][key] = " ".join(token)
+                    # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+                    ibest_writer["score"][key] = str(hyp.score)
+                
+                if text is not None:
+                    text_postprocessed = postprocess_utils.sentence_postprocess(token)
+                    item = {'key': key, 'value': text_postprocessed}
+                    asr_result_list.append(item)
+                    finish_count += 1
+                    asr_utils.print_progress(finish_count / file_count)
+                    if writer is not None:
+                        ibest_writer["text"][key] = text
+        return asr_result_list
+    
+    return _forward
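+
+# A minimal usage sketch (illustrative only, not invoked by this module):
+# inference_modelscope builds Speech2Text once and returns the `_forward` closure,
+# which can then be called repeatedly. The config/model file names and the
+# `waveform_ndarray` variable below are placeholders.
+#
+#   pipeline = inference_modelscope(
+#       maxlenratio=0.0, minlenratio=0.0, batch_size=1, beam_size=20, ngpu=0,
+#       ctc_weight=0.5, lm_weight=0.0, penalty=0.0, log_level="INFO",
+#       asr_train_config="exp/config.yaml", asr_model_file="exp/model.pb",
+#   )
+#   results = pipeline(data_path_and_name_and_type=None, raw_inputs=waveform_ndarray)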
+
+def get_parser():
+    parser = config_argparse.ArgumentParser(
+        description="ASR Decoding",
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+    )
+
+    # Note(kamo): Use '_' instead of '-' as separator.
+    # '-' is confusing if written in yaml.
+    parser.add_argument(
+        "--log_level",
+        type=lambda x: x.upper(),
+        default="INFO",
+        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+        help="The verbose level of logging",
+    )
+
+    parser.add_argument("--output_dir", type=str, required=True)
+    parser.add_argument(
+        "--ngpu",
+        type=int,
+        default=0,
+        help="The number of gpus. 0 indicates CPU mode",
+    )
+    parser.add_argument(
+        "--gpuid_list",
+        type=str,
+        default="",
+        help="The visible gpus",
+    )
+    parser.add_argument("--seed", type=int, default=0, help="Random seed")
+    parser.add_argument(
+        "--dtype",
+        default="float32",
+        choices=["float16", "float32", "float64"],
+        help="Data type",
+    )
+    parser.add_argument(
+        "--num_workers",
+        type=int,
+        default=1,
+        help="The number of workers used for DataLoader",
+    )
+
+    group = parser.add_argument_group("Input data related")
+    group.add_argument(
+        "--data_path_and_name_and_type",
+        type=str2triple_str,
+        required=False,
+        action="append",
+    )
+    group.add_argument("--raw_inputs", type=list, default=None)
+    # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+    group.add_argument("--key_file", type=str_or_none)
+    group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+    group = parser.add_argument_group("The model configuration related")
+    group.add_argument(
+        "--asr_train_config",
+        type=str,
+        help="ASR training configuration",
+    )
+    group.add_argument(
+        "--asr_model_file",
+        type=str,
+        help="ASR model parameter file",
+    )
+    group.add_argument(
+        "--cmvn_file",
+        type=str,
+        help="Global cmvn file",
+    )
+    group.add_argument(
+        "--lm_train_config",
+        type=str,
+        help="LM training configuration",
+    )
+    group.add_argument(
+        "--lm_file",
+        type=str,
+        help="LM parameter file",
+    )
+    group.add_argument(
+        "--word_lm_train_config",
+        type=str,
+        help="Word LM training configuration",
+    )
+    group.add_argument(
+        "--word_lm_file",
+        type=str,
+        help="Word LM parameter file",
+    )
+    group.add_argument(
+        "--ngram_file",
+        type=str,
+        help="N-gram parameter file",
+    )
+    group.add_argument(
+        "--model_tag",
+        type=str,
+        help="Pretrained model tag. If specify this option, *_train_config and "
+             "*_file will be overwritten",
+    )
+
+    group = parser.add_argument_group("Beam-search related")
+    group.add_argument(
+        "--batch_size",
+        type=int,
+        default=1,
+        help="The batch size for inference",
+    )
+    group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+    group.add_argument("--beam_size", type=int, default=20, help="Beam size")
+    group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+    group.add_argument(
+        "--maxlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain max output length. "
+             "If maxlenratio=0.0 (default), it uses a end-detect "
+             "function "
+             "to automatically find maximum hypothesis lengths."
+             "If maxlenratio<0.0, its absolute value is interpreted"
+             "as a constant max output length",
+    )
+    group.add_argument(
+        "--minlenratio",
+        type=float,
+        default=0.0,
+        help="Input length ratio to obtain min output length",
+    )
+    group.add_argument(
+        "--ctc_weight",
+        type=float,
+        default=0.5,
+        help="CTC weight in joint decoding",
+    )
+    group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
+    group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
+    group.add_argument("--streaming", type=str2bool, default=False)
+
+    group = parser.add_argument_group("Text converter related")
+    group.add_argument(
+        "--token_type",
+        type=str_or_none,
+        default=None,
+        choices=["char", "bpe", None],
+        help="The token type for ASR model. "
+             "If not given, refers from the training args",
+    )
+    group.add_argument(
+        "--bpemodel",
+        type=str_or_none,
+        default=None,
+        help="The model path of sentencepiece. "
+             "If not given, refers from the training args",
+    )
+
+    return parser
+
+
+def main(cmd=None):
+    print(get_commandline_args(), file=sys.stderr)
+    parser = get_parser()
+    args = parser.parse_args(cmd)
+    kwargs = vars(args)
+    kwargs.pop("config", None)
+    inference(**kwargs)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 6c5acfc..be35e78 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -6,6 +6,8 @@
 import copy
 import os
 import codecs
+import tempfile
+import requests
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -175,10 +177,24 @@
         self.converter = converter
         self.tokenizer = tokenizer
 
-        # 6. [Optional] Build hotword list from file or str
+        # 6. [Optional] Build hotword list from str, local file or url
+        # for None 
         if hotword_list_or_file is None:
             self.hotword_list = None
+        # for text str input
+        elif not os.path.exists(hotword_list_or_file) and not hotword_list_or_file.startswith('http'):
+            logging.info("Attempting to parse hotwords as str...")
+            self.hotword_list = []
+            hotword_str_list = []
+            for hw in hotword_list_or_file.strip().split():
+                hotword_str_list.append(hw)
+                self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+            self.hotword_list.append([self.asr_model.sos])
+            hotword_str_list.append('<s>')
+            logging.info("Hotword list: {}.".format(hotword_str_list))
+        # for local txt inputs
         elif os.path.exists(hotword_list_or_file):
+            logging.info("Attempting to parse hotwords from local txt...")
             self.hotword_list = []
             hotword_str_list = []
             with codecs.open(hotword_list_or_file, 'r') as fin:
@@ -186,20 +202,31 @@
                     hw = line.strip()
                     hotword_str_list.append(hw)
                     self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
-                self.hotword_list.append([1])
+                self.hotword_list.append([self.asr_model.sos])
                 hotword_str_list.append('<s>')
             logging.info("Initialized hotword list from file: {}, hotword list: {}."
                 .format(hotword_list_or_file, hotword_str_list))
+        # for url, download and generate txt
         else:
-            logging.info("Attempting to parse hotwords as str...")
+            logging.info("Attempting to parse hotwords from url...")
+            work_dir = tempfile.mkdtemp()
+            text_file_path = os.path.join(work_dir, os.path.basename(hotword_list_or_file))
+            local_file = requests.get(hotword_list_or_file)
+            with open(text_file_path, "wb") as fout:
+                fout.write(local_file.content)
+            hotword_list_or_file = text_file_path
             self.hotword_list = []
             hotword_str_list = []
-            for hw in hotword_list_or_file.strip().split():
-                hotword_str_list.append(hw)
-                self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
-            self.hotword_list.append([1])
-            hotword_str_list.append('<s>')
-            logging.info("Hotword list: {}.".format(hotword_str_list))
+            with codecs.open(hotword_list_or_file, 'r') as fin:
+                for line in fin.readlines():
+                    hw = line.strip()
+                    hotword_str_list.append(hw)
+                    self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+                self.hotword_list.append([self.asr_model.sos])
+                hotword_str_list.append('<s>')
+            logging.info("Initialized hotword list from file: {}, hotword list: {}."
+                .format(hotword_list_or_file, hotword_str_list))
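+        # Summary of accepted hotword inputs (sketch): a whitespace-separated string of
+        # hotwords, a local plain-text file with one hotword per line, or a URL to such a
+        # file; in every case '<s>' (sos) is appended as the last entry.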
 
 
         is_use_lm = lm_weight != 0.0 and lm_file is not None
diff --git a/funasr/bin/asr_inference_paraformer_timestamp.py b/funasr/bin/asr_inference_paraformer_timestamp.py
index 7da48e2..d0baa74 100644
--- a/funasr/bin/asr_inference_paraformer_timestamp.py
+++ b/funasr/bin/asr_inference_paraformer_timestamp.py
@@ -455,16 +455,6 @@
     return asr_result_list
 
 
-def set_parameters(language: str = None,
-                   sample_rate: Union[int, Dict[Any, int]] = None):
-    if language is not None:
-        global global_asr_language
-        global_asr_language = language
-    if sample_rate is not None:
-        global global_sample_rate
-        global_sample_rate = sample_rate
-
-
 def get_parser():
     parser = config_argparse.ArgumentParser(
         description="ASR Decoding",
diff --git a/funasr/bin/asr_inference_paraformer_vad.py b/funasr/bin/asr_inference_paraformer_vad.py
index dbb2719..c01c6ba 100644
--- a/funasr/bin/asr_inference_paraformer_vad.py
+++ b/funasr/bin/asr_inference_paraformer_vad.py
@@ -38,7 +38,6 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.tasks.vad import VADTask
-from funasr.utils.timestamp_tools import time_stamp_lfr6
 from funasr.bin.punctuation_infer import Text2Punc
 from funasr.bin.asr_inference_paraformer_vad_punc import Speech2Text
 from funasr.bin.asr_inference_paraformer_vad_punc import Speech2VadSegment
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index c4bb61b..755cc9c 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -39,7 +39,7 @@
 from funasr.utils import asr_utils, wav_utils, postprocess_utils
 from funasr.models.frontend.wav_frontend import WavFrontend
 from funasr.tasks.vad import VADTask
-from funasr.utils.timestamp_tools import time_stamp_lfr6, time_stamp_lfr6_pl
+from funasr.utils.timestamp_tools import time_stamp_lfr6_pl
 from funasr.bin.punctuation_infer import Text2Punc
 from funasr.models.e2e_asr_paraformer import BiCifParaformer
 
@@ -282,12 +282,8 @@
                 else:
                     text = None
 
-                if isinstance(self.asr_model, BiCifParaformer):
-                    timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time)
-                    results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))
-                else:
-                    time_stamp = time_stamp_lfr6(alphas[i:i + 1, ], enc_len[i:i + 1, ], copy.copy(token), begin_time, end_time)
-                    results.append((text, token, token_int, time_stamp, enc_len_batch_total, lfr_factor))
+                timestamp = time_stamp_lfr6_pl(us_alphas[i], us_cif_peak[i], copy.copy(token), begin_time, end_time)
+                results.append((text, token, token_int, timestamp, enc_len_batch_total, lfr_factor))
 
         # assert check_return_type(results)
         return results
@@ -617,7 +613,7 @@
                 result = result_segments[0]
                 text, token, token_int = result[0], result[1], result[2]
                 time_stamp = None if len(result) < 4 else result[3]
-   
+
                 if use_timestamp and time_stamp is not None: 
                     postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                 else:
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 0a5824c..db09d31 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -397,7 +397,7 @@
         device = "cuda"
     else:
         device = "cpu"
-
+    
     # 1. Set random-seed
     set_all_random_seed(seed)
 
@@ -439,6 +439,18 @@
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
             data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        if param_dict is not None and "decoding_model" in param_dict:
+            if param_dict["decoding_model"] == "fast":
+                speech2text.decoding_ind = 0
+                speech2text.decoding_mode = "model1"
+            elif param_dict["decoding_model"] == "normal":
+                speech2text.decoding_ind = 0
+                speech2text.decoding_mode = "model2"
+            elif param_dict["decoding_model"] == "offline":
+                speech2text.decoding_ind = 1
+                speech2text.decoding_mode = "model2"
+            else:
+                raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"]))
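+        # e.g. param_dict = {"decoding_model": "offline"}; "fast" selects model1 (ind 0),
+        # "normal" selects model2 (ind 0), and "offline" selects model2 (ind 1).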
         loader = ASRTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index 0a5824c..de32dcf 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -439,6 +439,18 @@
             if isinstance(raw_inputs, torch.Tensor):
                 raw_inputs = raw_inputs.numpy()
             data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+        if param_dict is not None and "decoding_model" in param_dict:
+            if param_dict["decoding_model"] == "fast":
+                speech2text.decoding_ind = 0
+                speech2text.decoding_mode = "model1"
+            elif param_dict["decoding_model"] == "normal":
+                speech2text.decoding_ind = 0
+                speech2text.decoding_mode = "model2"
+            elif param_dict["decoding_model"] == "offline":
+                speech2text.decoding_ind = 1
+                speech2text.decoding_mode = "model2"
+            else:
+                raise NotImplementedError("unsupported decoding model {}".format(param_dict["decoding_model"]))
         loader = ASRTask.build_streaming_iterator(
             data_path_and_name_and_type,
             dtype=dtype,
diff --git a/funasr/bin/build_trainer.py b/funasr/bin/build_trainer.py
index bb1d7a7..8dee758 100644
--- a/funasr/bin/build_trainer.py
+++ b/funasr/bin/build_trainer.py
@@ -27,6 +27,8 @@
         from funasr.tasks.asr import ASRTaskParaformer as ASRTask
     elif mode == "uniasr":
         from funasr.tasks.asr import ASRTaskUniASR as ASRTask
+    elif mode == "mfcca":
+        from funasr.tasks.asr import ASRTaskMFCCA as ASRTask   
     else:
         raise ValueError("Unknown mode: {}".format(mode))
     parser = ASRTask.get_parser()
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 9f1d0f3..0d96594 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -1,6 +1,7 @@
 import argparse
 import logging
 import sys
+import json
 from pathlib import Path
 from typing import Any
 from typing import List
@@ -105,17 +106,32 @@
             feats_len = feats_len.int()
         else:
             raise Exception("Need to extract feats first, please configure frontend configuration")
-        batch = {"feats": feats, "feats_lengths": feats_len, "waveform": speech}
+        # batch = {"feats": feats, "waveform": speech, "is_final_send": True}
+        # segments = self.vad_model(**batch)
 
-        # a. To device
-        batch = to_device(batch, device=self.device)
-
-        # b. Forward Encoder
-        segments = self.vad_model(**batch)
+        # b. Forward Encoder (streaming)
+        segments = []
+        step = 6000
+        t_offset = 0
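+        # Feed the VAD model in chunks of `step` feature frames; the waveform slice below
+        # assumes 160 samples per frame (10 ms hop at 16 kHz) plus a 400-sample window tail.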
+        for t_offset in range(0, feats_len, min(step, feats_len - t_offset)):
+            if t_offset + step >= feats_len - 1:
+                step = feats_len - t_offset
+                is_final_send = True
+            else:
+                is_final_send = False
+            batch = {
+                "feats": feats[:, t_offset:t_offset + step, :],
+                "waveform": speech[:, t_offset * 160:min(speech.shape[-1], (t_offset + step - 1) * 160 + 400)],
+                "is_final_send": is_final_send
+            }
+            # a. To device
+            batch = to_device(batch, device=self.device)
+            segments_part = self.vad_model(**batch)
+            if segments_part:
+                segments += segments_part
 
         return segments
-
-
 
 
 def inference(
@@ -152,11 +168,12 @@
     )
     return inference_pipeline(data_path_and_name_and_type, raw_inputs)
 
+
 def inference_modelscope(
         batch_size: int,
         ngpu: int,
         log_level: Union[int, str],
-        #data_path_and_name_and_type,
+        # data_path_and_name_and_type,
         vad_infer_config: Optional[str],
         vad_model_file: Optional[str],
         vad_cmvn_file: Optional[str] = None,
@@ -167,7 +184,6 @@
         dtype: str = "float32",
         seed: int = 0,
         num_workers: int = 1,
-        param_dict: dict = None,
         **kwargs,
 ):
     assert check_argument_types()
@@ -201,11 +217,11 @@
     speech2vadsegment = Speech2VadSegment(**speech2vadsegment_kwargs)
 
     def _forward(
-        data_path_and_name_and_type,
-        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
-        output_dir_v2: Optional[str] = None,
-        fs: dict = None,
-        param_dict: dict = None,
+            data_path_and_name_and_type,
+            raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+            output_dir_v2: Optional[str] = None,
+            fs: dict = None,
+            param_dict: dict = None,
     ):
         # 3. Build data-iterator
         loader = VADTask.build_streaming_iterator(
@@ -243,9 +259,11 @@
             # do vad segment
             results = speech2vadsegment(**batch)
             for i, _ in enumerate(keys):
+                results[i] = json.dumps(results[i])
                 item = {'key': keys[i], 'value': results[i]}
                 vad_results.append(item)
                 if writer is not None:
+                    results[i] = json.loads(results[i])
                     ibest_writer["text"][keys[i]] = "{}".format(results[i])
 
         return vad_results
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 54bf319..42c5c1e 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -107,13 +107,15 @@
 
 
 def inference_launch(mode, **kwargs):
-    if mode == "vad":
+    if mode == "offline":
         from funasr.bin.vad_inference import inference_modelscope
+        return inference_modelscope(**kwargs)
+    elif mode == "online":
+        from funasr.bin.vad_inference_online import inference_modelscope
         return inference_modelscope(**kwargs)
     else:
         logging.info("Unknown decoding mode: {}".format(mode))
         return None
-
 
 def main(cmd=None):
     print(get_commandline_args(), file=sys.stderr)
diff --git a/funasr/datasets/iterable_dataset.py b/funasr/datasets/iterable_dataset.py
index fa0adeb..2f97e78 100644
--- a/funasr/datasets/iterable_dataset.py
+++ b/funasr/datasets/iterable_dataset.py
@@ -174,90 +174,94 @@
     def __iter__(self) -> Iterator[Tuple[Union[str, int], Dict[str, np.ndarray]]]:
         count = 0
         if len(self.path_name_type_list) != 0 and (self.path_name_type_list[0][2] == "bytes" or self.path_name_type_list[0][2] == "waveform"):
+            linenum = len(self.path_name_type_list)
             data = {}
-            value = self.path_name_type_list[0][0]
-            uid = 'utt_id'
-            name = self.path_name_type_list[0][1]
-            _type = self.path_name_type_list[0][2]
-            func = DATA_TYPES[_type]
-            array = func(value)
-            if self.fs is not None and name == "speech":
-                audio_fs = self.fs["audio_fs"]
-                model_fs = self.fs["model_fs"]
-                if audio_fs is not None and model_fs is not None:
-                    array = torch.from_numpy(array)
-                    array = array.unsqueeze(0)
-                    array = torchaudio.transforms.Resample(orig_freq=audio_fs,
-                                                   new_freq=model_fs)(array)
-                    array = array.squeeze(0).numpy()
-            data[name] = array
+            for i in range(linenum):
+                value = self.path_name_type_list[i][0]
+                uid = 'utt_id'
+                name = self.path_name_type_list[i][1]
+                _type = self.path_name_type_list[i][2]
+                func = DATA_TYPES[_type]
+                array = func(value)
+                if self.fs is not None and (name == "speech" or name == "ref_speech"):
+                    audio_fs = self.fs["audio_fs"]
+                    model_fs = self.fs["model_fs"]
+                    if audio_fs is not None and model_fs is not None:
+                        array = torch.from_numpy(array)
+                        array = array.unsqueeze(0)
+                        array = torchaudio.transforms.Resample(orig_freq=audio_fs,
+                                                       new_freq=model_fs)(array)
+                        array = array.squeeze(0).numpy()
+                data[name] = array
 
-            if self.preprocess is not None:
-                data = self.preprocess(uid, data)
-            for name in data:
-                count += 1
-                value = data[name]
-                if not isinstance(value, np.ndarray):
-                    raise RuntimeError(
-                        f'All values must be converted to np.ndarray object '
-                        f'by preprocessing, but "{name}" is still {type(value)}.')
-                # Cast to desired type
-                if value.dtype.kind == 'f':
-                    value = value.astype(self.float_dtype)
-                elif value.dtype.kind == 'i':
-                    value = value.astype(self.int_dtype)
-                else:
-                    raise NotImplementedError(
-                        f'Not supported dtype: {value.dtype}')
-                data[name] = value
+                if self.preprocess is not None:
+                    data = self.preprocess(uid, data)
+                for name in data:
+                    count += 1
+                    value = data[name]
+                    if not isinstance(value, np.ndarray):
+                        raise RuntimeError(
+                            f'All values must be converted to np.ndarray object '
+                            f'by preprocessing, but "{name}" is still {type(value)}.')
+                    # Cast to desired type
+                    if value.dtype.kind == 'f':
+                        value = value.astype(self.float_dtype)
+                    elif value.dtype.kind == 'i':
+                        value = value.astype(self.int_dtype)
+                    else:
+                        raise NotImplementedError(
+                            f'Not supported dtype: {value.dtype}')
+                    data[name] = value
 
             yield uid, data
 
         elif len(self.path_name_type_list) != 0 and self.path_name_type_list[0][2] == "sound" and not self.path_name_type_list[0][0].lower().endswith(".scp"):
+            linenum = len(self.path_name_type_list)
             data = {}
-            value = self.path_name_type_list[0][0]
-            uid = os.path.basename(self.path_name_type_list[0][0]).split(".")[0]
-            name = self.path_name_type_list[0][1]
-            _type = self.path_name_type_list[0][2]
-            if _type == "sound":
-                audio_type = os.path.basename(value).split(".")[1].lower()
-                if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
-                    raise NotImplementedError(
-                        f'Not supported audio type: {audio_type}')
-                if audio_type == "pcm":
-                    _type = "pcm"
+            for i in range(linenum):
+                value = self.path_name_type_list[i][0]
+                uid = os.path.basename(self.path_name_type_list[i][0]).split(".")[0]
+                name = self.path_name_type_list[i][1]
+                _type = self.path_name_type_list[i][2]
+                if _type == "sound":
+                    audio_type = os.path.basename(value).split(".")[-1].lower()
+                    if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
+                        raise NotImplementedError(
+                            f'Not supported audio type: {audio_type}')
+                    if audio_type == "pcm":
+                        _type = "pcm"
 
-            func = DATA_TYPES[_type]
-            array = func(value)
-            if self.fs is not None and name == "speech":
-                audio_fs = self.fs["audio_fs"]
-                model_fs = self.fs["model_fs"]
-                if audio_fs is not None and model_fs is not None:
-                    array = torch.from_numpy(array)
-                    array = array.unsqueeze(0)
-                    array = torchaudio.transforms.Resample(orig_freq=audio_fs,
-                                                           new_freq=model_fs)(array)
-                    array = array.squeeze(0).numpy()
-            data[name] = array
+                func = DATA_TYPES[_type]
+                array = func(value)
+                if self.fs is not None and (name == "speech" or name == "ref_speech"):
+                    audio_fs = self.fs["audio_fs"]
+                    model_fs = self.fs["model_fs"]
+                    if audio_fs is not None and model_fs is not None:
+                        array = torch.from_numpy(array)
+                        array = array.unsqueeze(0)
+                        array = torchaudio.transforms.Resample(orig_freq=audio_fs,
+                                                               new_freq=model_fs)(array)
+                        array = array.squeeze(0).numpy()
+                data[name] = array
 
-            if self.preprocess is not None:
-                data = self.preprocess(uid, data)
-            for name in data:
-                count += 1
-                value = data[name]
-                if not isinstance(value, np.ndarray):
-                    raise RuntimeError(
-                        f'All values must be converted to np.ndarray object '
-                        f'by preprocessing, but "{name}" is still {type(value)}.')
-                # Cast to desired type
-                if value.dtype.kind == 'f':
-                    value = value.astype(self.float_dtype)
-                elif value.dtype.kind == 'i':
-                    value = value.astype(self.int_dtype)
-                else:
-                    raise NotImplementedError(
-                        f'Not supported dtype: {value.dtype}')
-                data[name] = value
+                if self.preprocess is not None:
+                    data = self.preprocess(uid, data)
+                for name in data:
+                    count += 1
+                    value = data[name]
+                    if not isinstance(value, np.ndarray):
+                        raise RuntimeError(
+                            f'All values must be converted to np.ndarray object '
+                            f'by preprocessing, but "{name}" is still {type(value)}.')
+                    # Cast to desired type
+                    if value.dtype.kind == 'f':
+                        value = value.astype(self.float_dtype)
+                    elif value.dtype.kind == 'i':
+                        value = value.astype(self.int_dtype)
+                    else:
+                        raise NotImplementedError(
+                            f'Not supported dtype: {value.dtype}')
+                    data[name] = value
 
             yield uid, data
 
@@ -322,7 +326,7 @@
                 # 2.a. Load data streamingly
                 for value, (path, name, _type) in zip(values, self.path_name_type_list):
                     if _type == "sound":
-                        audio_type = os.path.basename(value).split(".")[1].lower()
+                        audio_type = os.path.basename(value).split(".")[-1].lower()
                         if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
                             raise NotImplementedError(
                                 f'Not supported audio type: {audio_type}')
diff --git a/funasr/datasets/large_datasets/dataset.py b/funasr/datasets/large_datasets/dataset.py
index 55b0678..2123737 100644
--- a/funasr/datasets/large_datasets/dataset.py
+++ b/funasr/datasets/large_datasets/dataset.py
@@ -1,9 +1,10 @@
 import os
 import random
-import soundfile
+import numpy
 from functools import partial
 
 import torch
+import torchaudio
 import torch.distributed as dist
 from kaldiio import ReadHelper
 from torch.utils.data import IterableDataset
@@ -117,7 +118,9 @@
                             sample_dict["key"] = key
                     elif data_type == "sound":
                         key, path = item.strip().split()
-                        mat, sampling_rate = soundfile.read(path)
+                        waveform, sampling_rate = torchaudio.load(path)
+                        waveform = waveform.numpy()
+                        mat = waveform[0]
                         sample_dict[data_name] = mat
                         sample_dict["sampling_rate"] = sampling_rate
                         if data_name == "speech":
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 79540c1..8e86794 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -363,7 +363,7 @@
             if self.split_with_space:
                 tokens = text.strip().split(" ")
                 if self.seg_dict is not None:
-                    tokens = forward_segment("".join(tokens).lower(), self.seg_dict)
+                    tokens = forward_segment("".join(tokens), self.seg_dict)
                     tokens = seg_tokenize(tokens, self.seg_dict)
             else:
                 tokens = self.tokenizer.text2tokens(text)
diff --git a/funasr/export/README.md b/funasr/export/README.md
index 0ecf272..9946eeb 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -9,35 +9,35 @@
 
 The installation is the same as [funasr](../../README.md)
 
-## Export onnx format model
+## Export model
+   `Tips`: torch 1.11.0 is required.
+
+   ```shell
+   python -m funasr.export.export_model [model_name] [export_dir] [onnx]
+   ```
+   `model_name`: the model to export. It can be a model name from ModelScope, or a local finetuned model (named `model.pb`).
+   `export_dir`: the directory where the exported model is saved.
+   `onnx`: `true` exports an onnx format model; `false` exports a torchscripts format model.
+
+## For example
+### Export onnx format model
 Export model from modelscope
-```python
-from funasr.export.export_model import ASRModelExportParaformer
-
-output_dir = "../export"  # onnx/torchscripts model save path
-export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=True)
-export_model.export('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+```shell
+python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+```
+Export model from a local path; the model file must be named `model.pb`.
+```shell
+python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
 ```
 
-
-Export model from local path
-```python
-export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
-```
-
-## Export torchscripts format model
+### Export torchscripts format model
 Export model from modelscope
-```python
-from funasr.export.export_model import ASRModelExportParaformer
-
-output_dir = "../export"  # onnx/torchscripts model save path
-export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=False)
-export_model.export('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+```shell
+python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
 ```
 
-Export model from local path
-```python
-
-export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+Export model from a local path; the model file must be named `model.pb`.
+```shell
+python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
 ```
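+
+### Load the exported onnx model (sketch)
+A quick check that the exported model loads with onnxruntime. The file name `model.onnx`
+under the export dir is an assumption; adjust it to the actual exported file:
+```python
+import onnxruntime
+
+# path is hypothetical; point it at the file produced by the export step above
+session = onnxruntime.InferenceSession("./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/model.onnx")
+print([node.name for node in session.get_inputs()])  # inspect the expected input names
+```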
 
diff --git a/funasr/export/export_model.py b/funasr/export/export_model.py
index 239fd6c..b5c6fa8 100644
--- a/funasr/export/export_model.py
+++ b/funasr/export/export_model.py
@@ -24,7 +24,7 @@
             feats_dim=560,
             onnx=False,
         )
-        logging.info("output dir: {}".format(self.cache_dir))
+        print("output dir: {}".format(self.cache_dir))
         self.onnx = onnx
         
 
@@ -44,13 +44,13 @@
             model,
             self.export_config,
         )
-        self._export_onnx(model, verbose, export_dir)
+        # self._export_onnx(model, verbose, export_dir)
         if self.onnx:
             self._export_onnx(model, verbose, export_dir)
         else:
             self._export_torchscripts(model, verbose, export_dir)
 
-        logging.info("output dir: {}".format(export_dir))
+        print("output dir: {}".format(export_dir))
 
 
     def _export_torchscripts(self, model, verbose, path, enc_size=None):
@@ -117,7 +117,15 @@
         )
 
 if __name__ == '__main__':
-    output_dir = "../export"
-    export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=False)
-    export_model.export('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+    import sys
+    
+    model_path = sys.argv[1]
+    output_dir = sys.argv[2]
+    onnx = sys.argv[3]
+    onnx = onnx.lower()
+    onnx = onnx == 'true'
+    # model_path = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
+    # output_dir = "../export"
+    export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=onnx)
+    export_model.export(model_path)
     # export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
\ No newline at end of file
diff --git a/funasr/export/models/e2e_asr_paraformer.py b/funasr/export/models/e2e_asr_paraformer.py
index 84dd9d2..bf5ed1e 100644
--- a/funasr/export/models/e2e_asr_paraformer.py
+++ b/funasr/export/models/e2e_asr_paraformer.py
@@ -59,7 +59,7 @@
         enc, enc_len = self.encoder(**batch)
         mask = self.make_pad_mask(enc_len)[:, None, :]
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(enc, mask)
-        pre_token_length = pre_token_length.round().long()
+        pre_token_length = pre_token_length.round().type(torch.int32)
 
         decoder_out, _ = self.decoder(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out = torch.log_softmax(decoder_out, dim=-1)
diff --git a/funasr/export/models/predictor/cif.py b/funasr/export/models/predictor/cif.py
index 32a3c13..5518cb8 100644
--- a/funasr/export/models/predictor/cif.py
+++ b/funasr/export/models/predictor/cif.py
@@ -116,53 +116,3 @@
 		pad_l = torch.zeros([int(max_label_len - l.size(0)), int(hidden_size)], device=hidden.device)
 		list_ls.append(torch.cat([l, pad_l], 0))
 	return torch.stack(list_ls, 0), fires
-
-
-def CifPredictorV2_test():
-	x = torch.rand([2, 21, 2])
-	x_len = torch.IntTensor([6, 21])
-	
-	mask = sequence_mask(x_len, maxlen=x.size(1), dtype=x.dtype)
-	x = x * mask[:, :, None]
-	
-	predictor_scripts = torch.jit.script(CifPredictorV2(2, 1, 1))
-	# cif_output, cif_length, alphas, cif_peak = predictor_scripts(x, mask=mask[:, None, :])
-	predictor_scripts.save('test.pt')
-	loaded = torch.jit.load('test.pt')
-	cif_output, cif_length, alphas, cif_peak = loaded(x, mask=mask[:, None, :])
-	# print(cif_output)
-	print(predictor_scripts.code)
-	# predictor = CifPredictorV2(2, 1, 1)
-	# cif_output, cif_length, alphas, cif_peak = predictor(x, mask=mask[:, None, :])
-	print(cif_output)
-
-
-def CifPredictorV2_export_test():
-	x = torch.rand([2, 21, 2])
-	x_len = torch.IntTensor([6, 21])
-	
-	mask = sequence_mask(x_len, maxlen=x.size(1), dtype=x.dtype)
-	x = x * mask[:, :, None]
-	
-	# predictor_scripts = torch.jit.script(CifPredictorV2(2, 1, 1))
-	# cif_output, cif_length, alphas, cif_peak = predictor_scripts(x, mask=mask[:, None, :])
-	predictor = CifPredictorV2(2, 1, 1)
-	predictor_trace = torch.jit.trace(predictor, (x, mask[:, None, :]))
-	predictor_trace.save('test_trace.pt')
-	loaded = torch.jit.load('test_trace.pt')
-	
-	x = torch.rand([3, 30, 2])
-	x_len = torch.IntTensor([6, 20, 30])
-	mask = sequence_mask(x_len, maxlen=x.size(1), dtype=x.dtype)
-	x = x * mask[:, :, None]
-	cif_output, cif_length, alphas, cif_peak = loaded(x, mask=mask[:, None, :])
-	print(cif_output)
-	# print(predictor_trace.code)
-	# predictor = CifPredictorV2(2, 1, 1)
-	# cif_output, cif_length, alphas, cif_peak = predictor(x, mask=mask[:, None, :])
-	# print(cif_output)
-
-
-if __name__ == '__main__':
-	# CifPredictorV2_test()
-	CifPredictorV2_export_test()
\ No newline at end of file
diff --git a/funasr/models/e2e_asr_mfcca.py b/funasr/models/e2e_asr_mfcca.py
new file mode 100644
index 0000000..0336133
--- /dev/null
+++ b/funasr/models/e2e_asr_mfcca.py
@@ -0,0 +1,322 @@
+from contextlib import contextmanager
+from distutils.version import LooseVersion
+from typing import Dict
+from typing import List
+from typing import Optional
+from typing import Tuple
+from typing import Union
+import logging
+import torch
+from typeguard import check_argument_types
+
+from funasr.modules.e2e_asr_common import ErrorCalculator
+from funasr.modules.nets_utils import th_accuracy
+from funasr.modules.add_sos_eos import add_sos_eos
+from funasr.losses.label_smoothing_loss import (
+    LabelSmoothingLoss,  # noqa: H301
+)
+from funasr.models.ctc import CTC
+from funasr.models.decoder.abs_decoder import AbsDecoder
+from funasr.models.encoder.abs_encoder import AbsEncoder
+from funasr.models.frontend.abs_frontend import AbsFrontend
+from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
+from funasr.models.specaug.abs_specaug import AbsSpecAug
+from funasr.layers.abs_normalize import AbsNormalize
+from funasr.torch_utils.device_funcs import force_gatherable
+from funasr.train.abs_espnet_model import AbsESPnetModel
+
+if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
+    from torch.cuda.amp import autocast
+else:
+    # Nothing to do if torch<1.6.0
+    @contextmanager
+    def autocast(enabled=True):
+        yield
+
+import random
+import math
+
+
+class MFCCA(AbsESPnetModel):
+    """CTC-attention hybrid Encoder-Decoder model"""
+
+    def __init__(
+        self,
+        vocab_size: int,
+        token_list: Union[Tuple[str, ...], List[str]],
+        frontend: Optional[AbsFrontend],
+        specaug: Optional[AbsSpecAug],
+        normalize: Optional[AbsNormalize],
+        preencoder: Optional[AbsPreEncoder],
+        encoder: AbsEncoder,
+        decoder: AbsDecoder,
+        ctc: CTC,
+        rnnt_decoder: None,
+        ctc_weight: float = 0.5,
+        ignore_id: int = -1,
+        lsm_weight: float = 0.0,
+        mask_ratio: float = 0.0,
+        length_normalized_loss: bool = False,
+        report_cer: bool = True,
+        report_wer: bool = True,
+        sym_space: str = "<space>",
+        sym_blank: str = "<blank>",
+    ):
+        assert check_argument_types()
+        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
+        assert rnnt_decoder is None, "Not implemented"
+
+        super().__init__()
+        # note that eos is the same as sos (equivalent ID)
+        self.sos = vocab_size - 1
+        self.eos = vocab_size - 1
+        self.vocab_size = vocab_size
+        self.ignore_id = ignore_id
+        self.ctc_weight = ctc_weight
+        self.token_list = token_list.copy()
+        
+        self.mask_ratio = mask_ratio
+
+        
+        self.frontend = frontend
+        self.specaug = specaug
+        self.normalize = normalize
+        self.preencoder = preencoder
+        self.encoder = encoder
+        # we set self.decoder = None in the CTC mode since
+        # self.decoder parameters were never used and PyTorch complained
+        # and threw an Exception in the multi-GPU experiment.
+        # thanks Jeff Farris for pointing out the issue.
+        if ctc_weight == 1.0:
+            self.decoder = None
+        else:
+            self.decoder = decoder
+        if ctc_weight == 0.0:
+            self.ctc = None
+        else:
+            self.ctc = ctc
+        self.rnnt_decoder = rnnt_decoder
+        self.criterion_att = LabelSmoothingLoss(
+            size=vocab_size,
+            padding_idx=ignore_id,
+            smoothing=lsm_weight,
+            normalize_length=length_normalized_loss,
+        )
+
+        if report_cer or report_wer:
+            self.error_calculator = ErrorCalculator(
+                token_list, sym_space, sym_blank, report_cer, report_wer
+            )
+        else:
+            self.error_calculator = None
+
+    def forward(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+        """Frontend + Encoder + Decoder + Calc loss
+
+        Args:
+            speech: (Batch, Length, ...)
+            speech_lengths: (Batch, )
+            text: (Batch, Length)
+            text_lengths: (Batch,)
+        """
+        assert text_lengths.dim() == 1, text_lengths.shape
+        # Check that batch_size is unified
+        assert (
+            speech.shape[0]
+            == speech_lengths.shape[0]
+            == text.shape[0]
+            == text_lengths.shape[0]
+        ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+        # Channel masking: with probability mask_ratio, keep only a random subset
+        # of the 8 input channels.
+        if speech.dim() == 3 and speech.size(2) == 8 and self.mask_ratio != 0:
+            rate_num = random.random()
+            if rate_num <= self.mask_ratio:
+                retain_channel = math.ceil(random.random() * 8)
+                if retain_channel > 1:
+                    speech = speech[:, :, torch.randperm(8)[0:retain_channel].sort().values]
+                else:
+                    speech = speech[:, :, torch.randperm(8)[0]]
+        batch_size = speech.shape[0]
+        # for data-parallel
+        text = text[:, : text_lengths.max()]
+
+        # 1. Encoder
+        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+        # 2a. Attention-decoder branch
+        if self.ctc_weight == 1.0:
+            loss_att, acc_att, cer_att, wer_att = None, None, None, None
+        else:
+            loss_att, acc_att, cer_att, wer_att = self._calc_att_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+        # 2b. CTC branch
+        if self.ctc_weight == 0.0:
+            loss_ctc, cer_ctc = None, None
+        else:
+            loss_ctc, cer_ctc = self._calc_ctc_loss(
+                encoder_out, encoder_out_lens, text, text_lengths
+            )
+
+        # 2c. RNN-T branch
+        if self.rnnt_decoder is not None:
+            _ = self._calc_rnnt_loss(encoder_out, encoder_out_lens, text, text_lengths)
+
+        if self.ctc_weight == 0.0:
+            loss = loss_att
+        elif self.ctc_weight == 1.0:
+            loss = loss_ctc
+        else:
+            loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att
+
+        stats = dict(
+            loss=loss.detach(),
+            loss_att=loss_att.detach() if loss_att is not None else None,
+            loss_ctc=loss_ctc.detach() if loss_ctc is not None else None,
+            acc=acc_att,
+            cer=cer_att,
+            wer=wer_att,
+            cer_ctc=cer_ctc,
+        )
+
+        # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+        return loss, stats, weight
+
+    def collect_feats(
+        self,
+        speech: torch.Tensor,
+        speech_lengths: torch.Tensor,
+        text: torch.Tensor,
+        text_lengths: torch.Tensor,
+    ) -> Dict[str, torch.Tensor]:
+        feats, feats_lengths, channel_size = self._extract_feats(speech, speech_lengths)
+        return {"feats": feats, "feats_lengths": feats_lengths}
+
+    def encode(
+        self, speech: torch.Tensor, speech_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Frontend + Encoder. Note that this method is used by asr_inference.py
+
+        Args:
+            speech: (Batch, Length, ...)
+            speech_lengths: (Batch, )
+        """
+        with autocast(False):
+            # 1. Extract feats
+            feats, feats_lengths, channel_size = self._extract_feats(speech, speech_lengths)
+            # 2. Data augmentation
+            if self.specaug is not None and self.training:
+                feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+            # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+            if self.normalize is not None:
+                feats, feats_lengths = self.normalize(feats, feats_lengths)
+
+        # Pre-encoder, e.g. used for raw input data
+        if self.preencoder is not None:
+            feats, feats_lengths = self.preencoder(feats, feats_lengths)
+        encoder_out, encoder_out_lens, _ = self.encoder(feats, feats_lengths, channel_size)
+
+        assert encoder_out.size(0) == speech.size(0), (
+            encoder_out.size(),
+            speech.size(0),
+        )
+        if encoder_out.dim() == 4:
+            assert encoder_out.size(2) <= encoder_out_lens.max(), (
+                encoder_out.size(),
+                encoder_out_lens.max(),
+            )
+        else:
+            assert encoder_out.size(1) <= encoder_out_lens.max(), (
+                encoder_out.size(),
+                encoder_out_lens.max(),
+            )
+
+        return encoder_out, encoder_out_lens
+
+    def _extract_feats(
+        self, speech: torch.Tensor, speech_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor]:
+        assert speech_lengths.dim() == 1, speech_lengths.shape
+        # for data-parallel
+        speech = speech[:, : speech_lengths.max()]
+        if self.frontend is not None:
+            # Frontend
+            #  e.g. STFT and Feature extract
+            #       data_loader may send time-domain signal in this case
+            # speech (Batch, NSamples) -> feats: (Batch, NFrames, Dim)
+            feats, feats_lengths, channel_size = self.frontend(speech, speech_lengths)
+        else:
+            # No frontend and no feature extract
+            feats, feats_lengths = speech, speech_lengths
+            channel_size = 1
+        return feats, feats_lengths, channel_size
+
+    def _calc_att_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        ys_in_pad, ys_out_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
+        ys_in_lens = ys_pad_lens + 1
+
+        # 1. Forward decoder
+        decoder_out, _ = self.decoder(
+            encoder_out, encoder_out_lens, ys_in_pad, ys_in_lens
+        )
+
+        # 2. Compute attention loss
+        loss_att = self.criterion_att(decoder_out, ys_out_pad)
+        acc_att = th_accuracy(
+            decoder_out.view(-1, self.vocab_size),
+            ys_out_pad,
+            ignore_label=self.ignore_id,
+        )
+
+        # Compute cer/wer using attention-decoder
+        if self.training or self.error_calculator is None:
+            cer_att, wer_att = None, None
+        else:
+            ys_hat = decoder_out.argmax(dim=-1)
+            cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+
+        return loss_att, acc_att, cer_att, wer_att
+
+    def _calc_ctc_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        # Calc CTC loss
+        # average over the channel dimension if the encoder keeps per-channel outputs
+        if encoder_out.dim() == 4:
+            encoder_out = encoder_out.mean(1)
+        loss_ctc = self.ctc(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens)
+
+        # Calc CER using CTC
+        cer_ctc = None
+        if not self.training and self.error_calculator is not None:
+            ys_hat = self.ctc.argmax(encoder_out).data
+            cer_ctc = self.error_calculator(ys_hat.cpu(), ys_pad.cpu(), is_ctc=True)
+        return loss_ctc, cer_ctc
+
+    def _calc_rnnt_loss(
+        self,
+        encoder_out: torch.Tensor,
+        encoder_out_lens: torch.Tensor,
+        ys_pad: torch.Tensor,
+        ys_pad_lens: torch.Tensor,
+    ):
+        raise NotImplementedError
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 98504d6..8afc8db 100755
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -5,7 +5,6 @@
 from torch import nn
 import math
 from funasr.models.encoder.fsmn_encoder import FSMN
-# from checkpoint import load_checkpoint
 
 
 class VadStateMachine(Enum):
@@ -136,7 +135,7 @@
 
         self.win_size_frame = int(window_size_ms / frame_size_ms)
         self.win_sum = 0
-        self.win_state = [0 for i in range(0, self.win_size_frame)]  # 鍒濆鍖栫獥
+        self.win_state = [0] * self.win_size_frame  # initialize the window state
 
         self.cur_win_pos = 0
         self.pre_frame_state = FrameState.kFrameStateSil
@@ -151,7 +150,7 @@
     def Reset(self) -> None:
         self.cur_win_pos = 0
         self.win_sum = 0
-        self.win_state = [0 for i in range(0, self.win_size_frame)]
+        self.win_state = [0] * self.win_size_frame
         self.pre_frame_state = FrameState.kFrameStateSil
         self.cur_frame_state = FrameState.kFrameStateSil
         self.voice_last_frame_count = 0
@@ -192,8 +191,8 @@
         return int(self.frame_size_ms)
 
 
-class E2EVadModel(torch.nn.Module):
-    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any]):
+class E2EVadModel(nn.Module):
+    def __init__(self, encoder: FSMN, vad_post_args: Dict[str, Any], streaming=False):
         super(E2EVadModel, self).__init__()
         self.vad_opts = VADXOptions(**vad_post_args)
         self.windows_detector = WindowDetector(self.vad_opts.window_size_ms,
@@ -212,13 +211,13 @@
         self.confirmed_start_frame = -1
         self.confirmed_end_frame = -1
         self.number_end_time_detected = 0
-        self.is_callback_with_sign = False
         self.sil_frame = 0
         self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
         self.noise_average_decibel = -100.0
         self.pre_end_silence_detected = False
 
         self.output_data_buf = []
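+        # index of the first segment in output_data_buf that has not yet been returned by forward()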
+        self.output_data_buf_offset = 0
         self.frame_probs = []
         self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
         self.speech_noise_thres = self.vad_opts.speech_noise_thres
@@ -226,10 +225,13 @@
         self.max_time_out = False
         self.decibel = []
         self.data_buf = None
+        self.data_buf_all = None
         self.waveform = None
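+        # streaming=True: online VAD that flushes its state via DetectLastFrames on the final chunk;
+        # streaming=False: offline decoding via DetectAllFrames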
+        self.streaming = streaming
         self.ResetDetection()
 
     def AllResetDetection(self):
+        self.encoder.cache_reset()  # reset the in_cache in self.encoder for next query or next long sentence
         self.is_final_send = False
         self.data_buf_start_frame = 0
         self.frm_cnt = 0
@@ -240,13 +242,13 @@
         self.confirmed_start_frame = -1
         self.confirmed_end_frame = -1
         self.number_end_time_detected = 0
-        self.is_callback_with_sign = False
         self.sil_frame = 0
         self.sil_pdf_ids = self.vad_opts.sil_pdf_ids
         self.noise_average_decibel = -100.0
         self.pre_end_silence_detected = False
 
         self.output_data_buf = []
+        self.output_data_buf_offset = 0
         self.frame_probs = []
         self.max_end_sil_frame_cnt_thresh = self.vad_opts.max_end_silence_time - self.vad_opts.speech_to_sil_time_thres
         self.speech_noise_thres = self.vad_opts.speech_noise_thres
@@ -254,6 +256,7 @@
         self.max_time_out = False
         self.decibel = []
         self.data_buf = None
+        self.data_buf_all = None
         self.waveform = None
         self.ResetDetection()
 
@@ -271,26 +274,32 @@
     def ComputeDecibel(self) -> None:
         frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
         frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        self.data_buf = self.waveform[0]  # 鎸囧悜self.waveform[0]
+        if self.data_buf_all is None:
+            self.data_buf_all = self.waveform[0]  # self.data_buf_all points to self.waveform[0]
+            self.data_buf = self.data_buf_all
+        else:
+            self.data_buf_all = torch.cat((self.data_buf_all, self.waveform[0]))
         for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
             self.decibel.append(
                 10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
                                 0.000001))
 
-    def ComputeScores(self, feats: torch.Tensor, feats_lengths: int) -> None:
-        self.scores = self.encoder(feats)  # return B * T * D
-        self.frm_cnt = feats_lengths # frame
-        # return self.scores
+    def ComputeScores(self, feats: torch.Tensor) -> None:
+        scores = self.encoder(feats)  # return B * T * D
+        assert scores.shape[1] == feats.shape[1], "the frame counts of feats and scores do not match"
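+        # the state machine will evaluate exactly the frames produced by this chunk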
+        self.vad_opts.nn_eval_block_size = scores.shape[1]
+        self.frm_cnt += scores.shape[1]  # count total frames
+        if self.scores is None:
+            self.scores = scores  # the first calculation
+        else:
+            self.scores = torch.cat((self.scores, scores), dim=1)
 
     def PopDataBufTillFrame(self, frame_idx: int) -> None:  # need check again
         while self.data_buf_start_frame < frame_idx:
             if len(self.data_buf) >= int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):
                 self.data_buf_start_frame += 1
-                self.data_buf = self.waveform[0][self.data_buf_start_frame * int(
+                self.data_buf = self.data_buf_all[self.data_buf_start_frame * int(
                     self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000):]
-                # for i in range(0, int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)):
-                #     self.data_buf.popleft()
-                # self.data_buf_start_frame += 1
 
     def PopDataToOutputBuf(self, start_frm: int, frm_cnt: int, first_frm_is_start_point: bool,
                            last_frm_is_end_point: bool, end_point_is_sent_end: bool) -> None:
@@ -301,8 +310,9 @@
                                self.vad_opts.sample_rate * self.vad_opts.frame_in_ms / 1000))
             expected_sample_number += int(extra_sample)
         if end_point_is_sent_end:
-            # expected_sample_number = max(expected_sample_number, len(self.data_buf))
-            pass
+            expected_sample_number = max(expected_sample_number, len(self.data_buf))
+        if len(self.data_buf) < expected_sample_number:
+            print('error in calling pop data_buf: not enough samples in data_buf\n')
 
         if len(self.output_data_buf) == 0 or first_frm_is_start_point:
             self.output_data_buf.append(E2EVadSpeechBufWithDoa())
@@ -312,15 +322,18 @@
             self.output_data_buf[-1].doa = 0
         cur_seg = self.output_data_buf[-1]
         if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
-            print('warning')
+            print('warning\n')
         out_pos = len(cur_seg.buffer)  # nothing has been written to cur_seg.buffer yet
         data_to_pop = 0
         if end_point_is_sent_end:
             data_to_pop = expected_sample_number
         else:
             data_to_pop = int(frm_cnt * self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
-        # if data_to_pop > len(self.data_buf_)
-        #   pass
+        if data_to_pop > len(self.data_buf):
+            print('VAD data_to_pop is bigger than self.data_buf.size()!!!\n')
+            data_to_pop = len(self.data_buf)
+            expected_sample_number = len(self.data_buf)
+
         cur_seg.doa = 0
         for sample_cpy_out in range(0, data_to_pop):
             # cur_seg.buffer[out_pos ++] = data_buf_.back();
@@ -329,7 +342,7 @@
             # cur_seg.buffer[out_pos++] = data_buf_.back()
             out_pos += 1
         if cur_seg.end_ms != start_frm * self.vad_opts.frame_in_ms:
-            print('warning')
+            print('Something wrong with the VAD algorithm\n')
         self.data_buf_start_frame += frm_cnt
         cur_seg.end_ms = (start_frm + frm_cnt) * self.vad_opts.frame_in_ms
         if first_frm_is_start_point:
@@ -346,14 +359,13 @@
 
     def OnVoiceDetected(self, valid_frame: int) -> None:
         self.latest_confirmed_speech_frame = valid_frame
-        if True:  # is_new_api_enable_ = True
-            self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
+        self.PopDataToOutputBuf(valid_frame, 1, False, False, False)
 
     def OnVoiceStart(self, start_frame: int, fake_result: bool = False) -> None:
         if self.vad_opts.do_start_point_detection:
             pass
         if self.confirmed_start_frame != -1:
-            print('warning')
+            print('VAD was not reset properly\n')
         else:
             self.confirmed_start_frame = start_frame
 
@@ -366,7 +378,7 @@
         if self.vad_opts.do_end_point_detection:
             pass
         if self.confirmed_end_frame != -1:
-            print('warning')
+            print('VAD was not reset properly\n')
         else:
             self.confirmed_end_frame = end_frame
         if not fake_result:
@@ -406,7 +418,6 @@
             sil_pdf_scores = [self.scores[0][t][sil_pdf_id] for sil_pdf_id in self.sil_pdf_ids]
             sum_score = sum(sil_pdf_scores)
             noise_prob = math.log(sum_score) * self.vad_opts.speech_2_noise_ratio
-            # total_score = sum(self.scores[0][t][:])
             total_score = 1.0
             sum_score = total_score - sum_score
         speech_prob = math.log(sum_score)
@@ -433,25 +444,59 @@
 
         return frame_state
 
-    def forward(self, feats: torch.Tensor, feats_lengths: int, waveform: torch.tensor) -> List[List[List[int]]]:
-        self.AllResetDetection()
+    def forward(self, feats: torch.Tensor, waveform: torch.Tensor, is_final_send: bool = False) -> List[List[List[int]]]:
         self.waveform = waveform  # compute decibel for each frame
         self.ComputeDecibel()
-        self.ComputeScores(feats, feats_lengths)
-        assert len(self.decibel) == len(self.scores[0])  # 淇濊瘉甯ф暟涓�鑷�
-        self.DetectLastFrames()
+        self.ComputeScores(feats)
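+        # non-final chunks are detected incrementally; the final chunk either flushes the streaming
+        # state (DetectLastFrames) or re-runs detection over all accumulated frames (DetectAllFrames)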
+        if not is_final_send:
+            self.DetectCommonFrames()
+        else:
+            if self.streaming:
+                self.DetectLastFrames()
+            else:
+                self.AllResetDetection()
+                self.DetectAllFrames()  # offline decode and is_final_send == True
         segments = []
         for batch_num in range(0, feats.shape[0]):  # only support batch_size = 1 now
             segment_batch = []
-            for i in range(0, len(self.output_data_buf)):
-                segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
-                segment_batch.append(segment)
-            segments.append(segment_batch)
+            if len(self.output_data_buf) > 0:
+                for i in range(self.output_data_buf_offset, len(self.output_data_buf)):
+                    if self.output_data_buf[i].contain_seg_start_point and \
+                            self.output_data_buf[i].contain_seg_end_point:
+                        segment = [self.output_data_buf[i].start_ms, self.output_data_buf[i].end_ms]
+                        segment_batch.append(segment)
+                        self.output_data_buf_offset += 1  # advance past segments that have already been returned
+            if segment_batch:
+                segments.append(segment_batch)
+
         return segments
 
+    def DetectCommonFrames(self) -> int:
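+        # run the frame-level state machine over the frames of the current (non-final) chunk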
+        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+            return 0
+        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+            frame_state = FrameState.kFrameStateInvalid
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+
+        return 0
+
     def DetectLastFrames(self) -> int:
         if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
             return 0
+        for i in range(self.vad_opts.nn_eval_block_size - 1, -1, -1):
+            frame_state = FrameState.kFrameStateInvalid
+            frame_state = self.GetFrameState(self.frm_cnt - 1 - i)
+            if i != 0:
+                self.DetectOneFrame(frame_state, self.frm_cnt - 1 - i, False)
+            else:
+                self.DetectOneFrame(frame_state, self.frm_cnt - 1, True)
+
+        return 0
+
+    def DetectAllFrames(self) -> int:
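+        # offline mode: run the frame-level state machine over all accumulated frames at once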
+        if self.vad_state_machine == VadStateMachine.kVadInStateEndPointDetected:
+            return 0
         if self.vad_opts.nn_eval_block_size != self.vad_opts.dcd_block_size:
             frame_state = FrameState.kFrameStateInvalid
             for t in range(0, self.frm_cnt):
diff --git a/funasr/models/encoder/encoder_layer_mfcca.py b/funasr/models/encoder/encoder_layer_mfcca.py
new file mode 100644
index 0000000..e0bd006
--- /dev/null
+++ b/funasr/models/encoder/encoder_layer_mfcca.py
@@ -0,0 +1,270 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
+# Copyright 2020 Johns Hopkins University (Shinji Watanabe)
+#                Northwestern Polytechnical University (Pengcheng Guo)
+#  Apache 2.0  (http://www.apache.org/licenses/LICENSE-2.0)
+
+"""Encoder self-attention layer definition."""
+
+import torch
+
+from torch import nn
+
+from funasr.modules.layer_norm import LayerNorm
+from torch.autograd import Variable
+
+
+
+class Encoder_Conformer_Layer(nn.Module):
+    """Encoder layer module.
+
+    Args:
+        size (int): Input dimension.
+        self_attn (torch.nn.Module): Self-attention module instance.
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+            can be used as the argument.
+        feed_forward (torch.nn.Module): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        conv_module (torch.nn.Module): Convolution module instance.
+            `ConvolutionModule` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+
+    """
+
+    def __init__(
+        self,
+        size,
+        self_attn,
+        feed_forward,
+        feed_forward_macaron,
+        conv_module,
+        dropout_rate,
+        normalize_before=True,
+        concat_after=False,
+        cca_pos=0,
+    ):
+        """Construct an Encoder_Conformer_Layer object."""
+        super(Encoder_Conformer_Layer, self).__init__()
+        self.self_attn = self_attn
+        self.feed_forward = feed_forward
+        self.feed_forward_macaron = feed_forward_macaron
+        self.conv_module = conv_module
+        self.norm_ff = LayerNorm(size)  # for the FNN module
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        if feed_forward_macaron is not None:
+            self.norm_ff_macaron = LayerNorm(size)
+            self.ff_scale = 0.5
+        else:
+            self.ff_scale = 1.0
+        if self.conv_module is not None:
+            self.norm_conv = LayerNorm(size)  # for the CNN module
+            self.norm_final = LayerNorm(size)  # for the final output of the block
+        self.dropout = nn.Dropout(dropout_rate)
+        self.size = size
+        self.normalize_before = normalize_before
+        self.concat_after = concat_after
+        self.cca_pos = cca_pos
+
+        if self.concat_after:
+            self.concat_linear = nn.Linear(size + size, size)
+
+    def forward(self, x_input, mask, cache=None):
+        """Compute encoded features.
+
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, time).
+            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size).
+            torch.Tensor: Mask tensor (#batch, time).
+
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+        # whether to use macaron style
+        if self.feed_forward_macaron is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_ff_macaron(x)
+            x = residual + self.ff_scale * self.dropout(self.feed_forward_macaron(x))
+            if not self.normalize_before:
+                x = self.norm_ff_macaron(x)
+
+        # multi-headed self-attention module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_mha(x)
+
+
+        if cache is None:
+            x_q = x
+        else:
+            assert cache.shape == (x.shape[0], x.shape[1] - 1, self.size)
+            x_q = x[:, -1:, :]
+            residual = residual[:, -1:, :]
+            mask = None if mask is None else mask[:, -1:, :]
+
+        if self.cca_pos < 2:
+            if pos_emb is not None:
+                x_att = self.self_attn(x_q, x, x, pos_emb, mask)
+            else:
+                x_att = self.self_attn(x_q, x, x, mask)
+        else:
+            x_att = self.self_attn(x_q, x, x, mask)
+
+        if self.concat_after:
+            x_concat = torch.cat((x, x_att), dim=-1)
+            x = residual + self.concat_linear(x_concat)
+        else:
+            x = residual + self.dropout(x_att)
+        if not self.normalize_before:
+            x = self.norm_mha(x)
+
+        # convolution module
+        if self.conv_module is not None:
+            residual = x
+            if self.normalize_before:
+                x = self.norm_conv(x)
+            x = residual + self.dropout(self.conv_module(x))
+            if not self.normalize_before:
+                x = self.norm_conv(x)
+
+        # feed forward module
+        residual = x
+        if self.normalize_before:
+            x = self.norm_ff(x)
+        x = residual + self.ff_scale * self.dropout(self.feed_forward(x))
+        if not self.normalize_before:
+            x = self.norm_ff(x)
+
+        if self.conv_module is not None:
+            x = self.norm_final(x)
+
+        if cache is not None:
+            x = torch.cat([cache, x], dim=1)
+
+        if pos_emb is not None:
+            return (x, pos_emb), mask
+
+        return x, mask
+
+
+
+
+class EncoderLayer(nn.Module):
+    """Encoder layer module.
+
+    Args:
+        size (int): Input dimension.
+        self_attn (torch.nn.Module): Self-attention module instance.
+            `MultiHeadedAttention` or `RelPositionMultiHeadedAttention` instance
+            can be used as the argument.
+        feed_forward (torch.nn.Module): Feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        feed_forward_macaron (torch.nn.Module): Additional feed-forward module instance.
+            `PositionwiseFeedForward`, `MultiLayeredConv1d`, or `Conv1dLinear` instance
+            can be used as the argument.
+        conv_module (torch.nn.Module): Convolution module instance.
+            `ConvolutionModule` instance can be used as the argument.
+        dropout_rate (float): Dropout rate.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            if True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            if False, no additional linear will be applied. i.e. x -> x + att(x)
+
+    """
+
+    def __init__(
+        self,
+        size,
+        self_attn_cros_channel,
+        self_attn_conformer,
+        feed_forward_csa,
+        feed_forward_macaron_csa,
+        conv_module_csa,
+        dropout_rate,
+        normalize_before=True,
+        concat_after=False,
+    ):
+        """Construct an EncoderLayer object."""
+        super(EncoderLayer, self).__init__()
+
+        self.encoder_cros_channel_atten = self_attn_cros_channel
+        self.encoder_csa = Encoder_Conformer_Layer(
+                size,
+                self_attn_conformer,
+                feed_forward_csa,
+                feed_forward_macaron_csa,
+                conv_module_csa,
+                dropout_rate,
+                normalize_before,
+                concat_after,
+                cca_pos=0)
+        self.norm_mha = LayerNorm(size)  # for the MHA module
+        self.dropout = nn.Dropout(dropout_rate)
+
+
+    def forward(self, x_input, mask, channel_size, cache=None):
+        """Compute encoded features.
+
+        Args:
+            x_input (Union[Tuple, torch.Tensor]): Input tensor w/ or w/o pos emb.
+                - w/ pos emb: Tuple of tensors [(#batch, time, size), (1, time, size)].
+                - w/o pos emb: Tensor (#batch, time, size).
+            mask (torch.Tensor): Mask tensor for the input (#batch, time).
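+            channel_size (int): Number of input channels folded into the batch dimension of x_input.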
+            cache (torch.Tensor): Cache tensor of the input (#batch, time - 1, size).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, size).
+            torch.Tensor: Mask tensor (#batch, time).
+
+        """
+        if isinstance(x_input, tuple):
+            x, pos_emb = x_input[0], x_input[1]
+        else:
+            x, pos_emb = x_input, None
+        residual = x
+        x = self.norm_mha(x)
+        t_leng = x.size(1)
+        d_dim = x.size(2)
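+        # Cross-channel attention: for every frame, each channel attends to the channels of the five
+        # neighbouring frames (t-2 .. t+2); the time axis is zero-padded by two frames on each side.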
+        x_new = x.reshape(-1, channel_size, t_leng, d_dim).transpose(1, 2)  # (B, T, C, D)
+        x_k_v = x_new.new(x_new.size(0), x_new.size(1), 5, x_new.size(2), x_new.size(3))  # (B, T, 5, C, D)
+        pad_before = Variable(torch.zeros(x_new.size(0), 2, x_new.size(2), x_new.size(3))).type(x_new.type())
+        pad_after = Variable(torch.zeros(x_new.size(0), 2, x_new.size(2), x_new.size(3))).type(x_new.type())
+        x_pad = torch.cat([pad_before, x_new, pad_after], 1)  # pad two frames at each end of the time axis
+        x_k_v[:, :, 0, :, :] = x_pad[:, 0:-4, :, :]
+        x_k_v[:, :, 1, :, :] = x_pad[:, 1:-3, :, :]
+        x_k_v[:, :, 2, :, :] = x_pad[:, 2:-2, :, :]
+        x_k_v[:, :, 3, :, :] = x_pad[:, 3:-1, :, :]
+        x_k_v[:, :, 4, :, :] = x_pad[:, 4:, :, :]
+        x_new = x_new.reshape(-1, channel_size, d_dim)  # queries: (B*T, C, D)
+        x_k_v = x_k_v.reshape(-1, 5 * channel_size, d_dim)  # keys/values: (B*T, 5*C, D)
+        x_att = self.encoder_cros_channel_atten(x_new, x_k_v, x_k_v, None)
+        x_att = x_att.reshape(-1, t_leng, channel_size, d_dim).transpose(1, 2).reshape(-1, t_leng, d_dim)  # back to (B*C, T, D)
+        x = residual + self.dropout(x_att)
+        if pos_emb is not None:
+            x_input =  (x, pos_emb)
+        else:
+            x_input = x
+        x_input, mask = self.encoder_csa(x_input, mask)
+
+
+        return x_input, mask, channel_size
diff --git a/funasr/models/encoder/fsmn_encoder.py b/funasr/models/encoder/fsmn_encoder.py
index 643cefc..54a113d 100755
--- a/funasr/models/encoder/fsmn_encoder.py
+++ b/funasr/models/encoder/fsmn_encoder.py
@@ -1,55 +1,50 @@
+from typing import Tuple, Dict
+import copy
+
 import numpy as np
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
 
-from typing import Tuple
-
-
 class LinearTransform(nn.Module):
 
-    def __init__(self, input_dim, output_dim, quantize=0):
+    def __init__(self, input_dim, output_dim):
         super(LinearTransform, self).__init__()
         self.input_dim = input_dim
         self.output_dim = output_dim
         self.linear = nn.Linear(input_dim, output_dim, bias=False)
-        self.quantize = quantize
-        self.quant = torch.quantization.QuantStub()
-        self.dequant = torch.quantization.DeQuantStub()
 
     def forward(self, input):
-        if self.quantize:
-            output = self.quant(input)
-        else:
-            output = input
-        output = self.linear(output)
-        if self.quantize:
-            output = self.dequant(output)
+        output = self.linear(input)
 
         return output
 
 
 class AffineTransform(nn.Module):
 
-    def __init__(self, input_dim, output_dim, quantize=0):
+    def __init__(self, input_dim, output_dim):
         super(AffineTransform, self).__init__()
         self.input_dim = input_dim
         self.output_dim = output_dim
-        self.quantize = quantize
         self.linear = nn.Linear(input_dim, output_dim)
-        self.quant = torch.quantization.QuantStub()
-        self.dequant = torch.quantization.DeQuantStub()
 
     def forward(self, input):
-        if self.quantize:
-            output = self.quant(input)
-        else:
-            output = input
-        output = self.linear(output)
-        if self.quantize:
-            output = self.dequant(output)
+        output = self.linear(input)
 
         return output
+
+
+class RectifiedLinear(nn.Module):
+
+    def __init__(self, input_dim, output_dim):
+        super(RectifiedLinear, self).__init__()
+        self.dim = input_dim
+        self.relu = nn.ReLU()
+        self.dropout = nn.Dropout(0.1)
+
+    def forward(self, input):
+        out = self.relu(input)
+        return out
 
 
 class FSMNBlock(nn.Module):
@@ -62,7 +57,6 @@
             rorder=None,
             lstride=1,
             rstride=1,
-            quantize=0
     ):
         super(FSMNBlock, self).__init__()
 
@@ -84,71 +78,75 @@
                 self.dim, self.dim, [rorder, 1], dilation=[rstride, 1], groups=self.dim, bias=False)
         else:
             self.conv_right = None
-        self.quantize = quantize
-        self.quant = torch.quantization.QuantStub()
-        self.dequant = torch.quantization.DeQuantStub()
 
-    def forward(self, input):
+    def forward(self, input: torch.Tensor, in_cache=None):
         x = torch.unsqueeze(input, 1)
-        x_per = x.permute(0, 3, 2, 1)
-
-        y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])
-        if self.quantize:
-            y_left = self.quant(y_left)
+        x_per = x.permute(0, 3, 2, 1)  # B D T C
+        if in_cache is None:  # offline
+            y_left = F.pad(x_per, [0, 0, (self.lorder - 1) * self.lstride, 0])
+        else:
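+            # streaming: prepend the cached left context and keep the last (lorder - 1) * lstride frames as the new cache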
+            y_left = torch.cat((in_cache, x_per), dim=2)
+            in_cache = y_left[:, :, -(self.lorder - 1) * self.lstride:, :]
         y_left = self.conv_left(y_left)
-        if self.quantize:
-            y_left = self.dequant(y_left)
         out = x_per + y_left
 
         if self.conv_right is not None:
+            # maybe need to check
             y_right = F.pad(x_per, [0, 0, 0, self.rorder * self.rstride])
             y_right = y_right[:, :, self.rstride:, :]
-            if self.quantize:
-                y_right = self.quant(y_right)
             y_right = self.conv_right(y_right)
-            if self.quantize:
-                y_right = self.dequant(y_right)
             out += y_right
 
         out_per = out.permute(0, 3, 2, 1)
         output = out_per.squeeze(1)
 
-        return output
+        return output, in_cache
 
 
-class RectifiedLinear(nn.Module):
+class BasicBlock(nn.Sequential):
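+    """One FSMN layer: LinearTransform -> FSMNBlock -> AffineTransform -> ReLU, with an optional streaming cache."""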
+    def __init__(self,
+                 linear_dim: int,
+                 proj_dim: int,
+                 lorder: int,
+                 rorder: int,
+                 lstride: int,
+                 rstride: int,
+                 stack_layer: int
+                 ):
+        super(BasicBlock, self).__init__()
+        self.lorder = lorder
+        self.rorder = rorder
+        self.lstride = lstride
+        self.rstride = rstride
+        self.stack_layer = stack_layer
+        self.linear = LinearTransform(linear_dim, proj_dim)
+        self.fsmn_block = FSMNBlock(proj_dim, proj_dim, lorder, rorder, lstride, rstride)
+        self.affine = AffineTransform(proj_dim, linear_dim)
+        self.relu = RectifiedLinear(linear_dim, linear_dim)
 
-    def __init__(self, input_dim, output_dim):
-        super(RectifiedLinear, self).__init__()
-        self.dim = input_dim
-        self.relu = nn.ReLU()
-        self.dropout = nn.Dropout(0.1)
-
-    def forward(self, input):
-        out = self.relu(input)
-        # out = self.dropout(out)
-        return out
+    def forward(self, input: torch.Tensor, in_cache=None):
+        x1 = self.linear(input)  # B T D
+        if in_cache is not None:  # Dict[str, torch.Tensor]
+            cache_layer_name = 'cache_layer_{}'.format(self.stack_layer)
+            if cache_layer_name not in in_cache:
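+                # lazily allocate a zero left-context cache of shape (B, D, (lorder - 1) * lstride, 1)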
+                in_cache[cache_layer_name] = torch.zeros(x1.shape[0], x1.shape[-1], (self.lorder - 1) * self.lstride, 1)
+            x2, in_cache[cache_layer_name] = self.fsmn_block(x1, in_cache[cache_layer_name])
+        else:
+            x2, _ = self.fsmn_block(x1)
+        x3 = self.affine(x2)
+        x4 = self.relu(x3)
+        return x4, in_cache
 
 
-def _build_repeats(
-        fsmn_layers: int,
-        linear_dim: int,
-        proj_dim: int,
-        lorder: int,
-        rorder: int,
-        lstride=1,
-        rstride=1,
-):
-    repeats = [
-        nn.Sequential(
-            LinearTransform(linear_dim, proj_dim),
-            FSMNBlock(proj_dim, proj_dim, lorder, rorder, 1, 1),
-            AffineTransform(proj_dim, linear_dim),
-            RectifiedLinear(linear_dim, linear_dim))
-        for i in range(fsmn_layers)
-    ]
+class FsmnStack(nn.Sequential):
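+    """Sequential stack of BasicBlocks that threads the shared streaming cache dict through every layer."""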
+    def __init__(self, *args):
+        super(FsmnStack, self).__init__(*args)
 
-    return nn.Sequential(*repeats)
+    def forward(self, input: torch.Tensor, in_cache=None):
+        x = input
+        for module in self._modules.values():
+            x, in_cache = module(x, in_cache)
+        return x
 
 
 '''
@@ -177,6 +175,7 @@
             rstride: int,
             output_affine_dim: int,
             output_dim: int,
+            streaming=False
     ):
         super(FSMN, self).__init__()
 
@@ -185,23 +184,16 @@
         self.fsmn_layers = fsmn_layers
         self.linear_dim = linear_dim
         self.proj_dim = proj_dim
-        self.lorder = lorder
-        self.rorder = rorder
-        self.lstride = lstride
-        self.rstride = rstride
         self.output_affine_dim = output_affine_dim
         self.output_dim = output_dim
+        self.in_cache_original = dict() if streaming else None
+        self.in_cache = copy.deepcopy(self.in_cache_original)
 
         self.in_linear1 = AffineTransform(input_dim, input_affine_dim)
         self.in_linear2 = AffineTransform(input_affine_dim, linear_dim)
         self.relu = RectifiedLinear(linear_dim, linear_dim)
-
-        self.fsmn = _build_repeats(fsmn_layers,
-                                   linear_dim,
-                                   proj_dim,
-                                   lorder, rorder,
-                                   lstride, rstride)
-
+        self.fsmn = FsmnStack(*[BasicBlock(linear_dim, proj_dim, lorder, rorder, lstride, rstride, i) for i in
+                                range(fsmn_layers)])
         self.out_linear1 = AffineTransform(linear_dim, output_affine_dim)
         self.out_linear2 = AffineTransform(output_affine_dim, output_dim)
         self.softmax = nn.Softmax(dim=-1)
@@ -209,27 +201,29 @@
     def fuse_modules(self):
         pass
 
+    def cache_reset(self):
+        self.in_cache = copy.deepcopy(self.in_cache_original)
+
     def forward(
             self,
             input: torch.Tensor,
-            in_cache: torch.Tensor = torch.zeros(0, 0, 0, dtype=torch.float)
-    ) -> torch.Tensor:
+    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor]]:
         """
         Args:
             input (torch.Tensor): Input tensor (B, T, D)
-            in_cache(torhc.Tensor): (B, D, C), C is the accumulated cache size
+            self.in_cache: streaming is enabled when self.in_cache is not None. It is a dict such as
+                {'cache_layer_0': torch.Tensor(B, D, T1, 1)} with T1 = (lorder - 1) * lstride; it is empty for the first chunk
         """
 
         x1 = self.in_linear1(input)
         x2 = self.in_linear2(x1)
         x3 = self.relu(x2)
-        x4 = self.fsmn(x3)
+        x4 = self.fsmn(x3, self.in_cache)  # when self.in_cache is not None, the FSMN stack runs in streaming mode and updates the cache in place
         x5 = self.out_linear1(x4)
         x6 = self.out_linear2(x5)
         x7 = self.softmax(x6)
 
         return x7
-        # return x6, in_cache
 
 
 '''
diff --git a/funasr/models/encoder/mfcca_encoder.py b/funasr/models/encoder/mfcca_encoder.py
new file mode 100644
index 0000000..83d0b0e
--- /dev/null
+++ b/funasr/models/encoder/mfcca_encoder.py
@@ -0,0 +1,450 @@
+from typing import Optional
+from typing import Tuple
+
+import logging
+import torch
+from torch import nn
+
+from typeguard import check_argument_types
+
+from funasr.models.encoder.encoder_layer_mfcca import EncoderLayer
+from funasr.modules.nets_utils import get_activation
+from funasr.modules.nets_utils import make_pad_mask
+from funasr.modules.attention import (
+    MultiHeadedAttention,  # noqa: H301
+    RelPositionMultiHeadedAttention,  # noqa: H301
+    LegacyRelPositionMultiHeadedAttention,  # noqa: H301
+)
+from funasr.modules.embedding import (
+    PositionalEncoding,  # noqa: H301
+    ScaledPositionalEncoding,  # noqa: H301
+    RelPositionalEncoding,  # noqa: H301
+    LegacyRelPositionalEncoding,  # noqa: H301
+)
+from funasr.modules.layer_norm import LayerNorm
+from funasr.modules.multi_layer_conv import Conv1dLinear
+from funasr.modules.multi_layer_conv import MultiLayeredConv1d
+from funasr.modules.positionwise_feed_forward import (
+    PositionwiseFeedForward,  # noqa: H301
+)
+from funasr.modules.repeat import repeat
+from funasr.modules.subsampling import Conv2dSubsampling
+from funasr.modules.subsampling import Conv2dSubsampling2
+from funasr.modules.subsampling import Conv2dSubsampling6
+from funasr.modules.subsampling import Conv2dSubsampling8
+from funasr.modules.subsampling import TooShortUttError
+from funasr.modules.subsampling import check_short_utt
+from funasr.models.encoder.abs_encoder import AbsEncoder
+import math
+
+class ConvolutionModule(nn.Module):
+    """ConvolutionModule in Conformer model.
+
+    Args:
+        channels (int): The number of channels of conv layers.
+        kernel_size (int): Kernel size of conv layers.
+
+    """
+
+    def __init__(self, channels, kernel_size, activation=nn.ReLU(), bias=True):
+        """Construct an ConvolutionModule object."""
+        super(ConvolutionModule, self).__init__()
+        # kernel_size should be an odd number for 'SAME' padding
+        assert (kernel_size - 1) % 2 == 0
+
+        self.pointwise_conv1 = nn.Conv1d(
+            channels,
+            2 * channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.depthwise_conv = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size,
+            stride=1,
+            padding=(kernel_size - 1) // 2,
+            groups=channels,
+            bias=bias,
+        )
+        self.norm = nn.BatchNorm1d(channels)
+        self.pointwise_conv2 = nn.Conv1d(
+            channels,
+            channels,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=bias,
+        )
+        self.activation = activation
+
+    def forward(self, x):
+        """Compute convolution module.
+
+        Args:
+            x (torch.Tensor): Input tensor (#batch, time, channels).
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, time, channels).
+
+        """
+        # exchange the temporal dimension and the feature dimension
+        x = x.transpose(1, 2)
+
+        # GLU mechanism
+        x = self.pointwise_conv1(x)  # (batch, 2*channel, dim)
+        x = nn.functional.glu(x, dim=1)  # (batch, channel, dim)
+
+        # 1D Depthwise Conv
+        x = self.depthwise_conv(x)
+        x = self.activation(self.norm(x))
+
+        x = self.pointwise_conv2(x)
+
+        return x.transpose(1, 2)
+
+
+
+class MFCCAEncoder(AbsEncoder):
+    """Conformer encoder module.
+
+    Args:
+        input_size (int): Input dimension.
+        output_size (int): Dimension of attention.
+        attention_heads (int): The number of heads of multi head attention.
+        linear_units (int): The number of units of position-wise feed forward.
+        num_blocks (int): The number of decoder blocks.
+        dropout_rate (float): Dropout rate.
+        attention_dropout_rate (float): Dropout rate in attention.
+        positional_dropout_rate (float): Dropout rate after adding positional encoding.
+        input_layer (Union[str, torch.nn.Module]): Input layer type.
+        normalize_before (bool): Whether to use layer_norm before the first block.
+        concat_after (bool): Whether to concat attention layer's input and output.
+            If True, additional linear will be applied.
+            i.e. x -> x + linear(concat(x, att(x)))
+            If False, no additional linear will be applied. i.e. x -> x + att(x)
+        positionwise_layer_type (str): "linear", "conv1d", or "conv1d-linear".
+        positionwise_conv_kernel_size (int): Kernel size of positionwise conv1d layer.
+        rel_pos_type (str): Whether to use the latest relative positional encoding or
+            the legacy one. The legacy relative positional encoding will be deprecated
+            in the future. More Details can be found in
+            https://github.com/espnet/espnet/pull/2816.
+        encoder_pos_enc_layer_type (str): Encoder positional encoding layer type.
+        encoder_attn_layer_type (str): Encoder attention layer type.
+        activation_type (str): Encoder activation function type.
+        macaron_style (bool): Whether to use macaron style for positionwise layer.
+        use_cnn_module (bool): Whether to use convolution module.
+        zero_triu (bool): Whether to zero the upper triangular part of attention matrix.
+        cnn_module_kernel (int): Kernel size of convolution module.
+        padding_idx (int): Padding idx for input_layer=embed.
+
+    """
+
+    def __init__(
+        self,
+        input_size: int,
+        output_size: int = 256,
+        attention_heads: int = 4,
+        linear_units: int = 2048,
+        num_blocks: int = 6,
+        dropout_rate: float = 0.1,
+        positional_dropout_rate: float = 0.1,
+        attention_dropout_rate: float = 0.0,
+        input_layer: str = "conv2d",
+        normalize_before: bool = True,
+        concat_after: bool = False,
+        positionwise_layer_type: str = "linear",
+        positionwise_conv_kernel_size: int = 3,
+        macaron_style: bool = False,
+        rel_pos_type: str = "legacy",
+        pos_enc_layer_type: str = "rel_pos",
+        selfattention_layer_type: str = "rel_selfattn",
+        activation_type: str = "swish",
+        use_cnn_module: bool = True,
+        zero_triu: bool = False,
+        cnn_module_kernel: int = 31,
+        padding_idx: int = -1,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        self._output_size = output_size
+
+        if rel_pos_type == "legacy":
+            if pos_enc_layer_type == "rel_pos":
+                pos_enc_layer_type = "legacy_rel_pos"
+            if selfattention_layer_type == "rel_selfattn":
+                selfattention_layer_type = "legacy_rel_selfattn"
+        elif rel_pos_type == "latest":
+            assert selfattention_layer_type != "legacy_rel_selfattn"
+            assert pos_enc_layer_type != "legacy_rel_pos"
+        else:
+            raise ValueError("unknown rel_pos_type: " + rel_pos_type)
+
+        activation = get_activation(activation_type)
+        if pos_enc_layer_type == "abs_pos":
+            pos_enc_class = PositionalEncoding
+        elif pos_enc_layer_type == "scaled_abs_pos":
+            pos_enc_class = ScaledPositionalEncoding
+        elif pos_enc_layer_type == "rel_pos":
+            assert selfattention_layer_type == "rel_selfattn"
+            pos_enc_class = RelPositionalEncoding
+        elif pos_enc_layer_type == "legacy_rel_pos":
+            assert selfattention_layer_type == "legacy_rel_selfattn"
+            pos_enc_class = LegacyRelPositionalEncoding
+            logging.warning(
+                "Using legacy_rel_pos and it will be deprecated in the future."
+            )
+        else:
+            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
+        
+        if input_layer == "linear":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Linear(input_size, output_size),
+                torch.nn.LayerNorm(output_size),
+                torch.nn.Dropout(dropout_rate),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d":
+            self.embed = Conv2dSubsampling(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d6":
+            self.embed = Conv2dSubsampling6(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "conv2d8":
+            self.embed = Conv2dSubsampling8(
+                input_size,
+                output_size,
+                dropout_rate,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer == "embed":
+            self.embed = torch.nn.Sequential(
+                torch.nn.Embedding(input_size, output_size, padding_idx=padding_idx),
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif isinstance(input_layer, torch.nn.Module):
+            self.embed = torch.nn.Sequential(
+                input_layer,
+                pos_enc_class(output_size, positional_dropout_rate),
+            )
+        elif input_layer is None:
+            self.embed = torch.nn.Sequential(
+                pos_enc_class(output_size, positional_dropout_rate)
+            )
+        else:
+            raise ValueError("unknown input_layer: " + input_layer)
+        self.normalize_before = normalize_before
+        if positionwise_layer_type == "linear":
+            positionwise_layer = PositionwiseFeedForward
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                dropout_rate,
+                activation,
+            )
+        elif positionwise_layer_type == "conv1d":
+            positionwise_layer = MultiLayeredConv1d
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                positionwise_conv_kernel_size,
+                dropout_rate,
+            )
+        elif positionwise_layer_type == "conv1d-linear":
+            positionwise_layer = Conv1dLinear
+            positionwise_layer_args = (
+                output_size,
+                linear_units,
+                positionwise_conv_kernel_size,
+                dropout_rate,
+            )
+        else:
+            raise NotImplementedError("Support only linear or conv1d.")
+
+        if selfattention_layer_type == "selfattn":
+            encoder_selfattn_layer = MultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+            )
+        elif selfattention_layer_type == "legacy_rel_selfattn":
+            assert pos_enc_layer_type == "legacy_rel_pos"
+            encoder_selfattn_layer = LegacyRelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+            )
+            logging.warning(
+                "Using legacy_rel_selfattn and it will be deprecated in the future."
+            )
+        elif selfattention_layer_type == "rel_selfattn":
+            assert pos_enc_layer_type == "rel_pos"
+            encoder_selfattn_layer = RelPositionMultiHeadedAttention
+            encoder_selfattn_layer_args = (
+                attention_heads,
+                output_size,
+                attention_dropout_rate,
+                zero_triu,
+            )
+        else:
+            raise ValueError("unknown encoder_attn_layer: " + selfattention_layer_type)
+
+        convolution_layer = ConvolutionModule
+        convolution_layer_args = (output_size, cnn_module_kernel, activation)
+        encoder_selfattn_layer_raw = MultiHeadedAttention
+        encoder_selfattn_layer_args_raw = (
+            attention_heads,
+            output_size,
+            attention_dropout_rate,
+        )
+        self.encoders = repeat(
+            num_blocks,
+            lambda lnum: EncoderLayer(
+                output_size,
+                encoder_selfattn_layer_raw(*encoder_selfattn_layer_args_raw),
+                encoder_selfattn_layer(*encoder_selfattn_layer_args),
+                positionwise_layer(*positionwise_layer_args),
+                positionwise_layer(*positionwise_layer_args) if macaron_style else None,
+                convolution_layer(*convolution_layer_args) if use_cnn_module else None,
+                dropout_rate,
+                normalize_before,
+                concat_after,
+            ),
+        )
+        if self.normalize_before:
+            self.after_norm = LayerNorm(output_size)
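+        # 2D convolutions that fuse the (up to 8) channel planes of the encoder output into a single plane: 8 -> 16 -> 32 -> 16 -> 1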
+        self.conv1 = torch.nn.Conv2d(8, 16, [5,7], stride=[1,1], padding=(2,3))
+
+        self.conv2 = torch.nn.Conv2d(16, 32, [5,7], stride=[1,1], padding=(2,3))
+
+        self.conv3 = torch.nn.Conv2d(32, 16, [5,7], stride=[1,1], padding=(2,3))
+
+        self.conv4 = torch.nn.Conv2d(16, 1, [5,7], stride=[1,1], padding=(2,3))
+
+    def output_size(self) -> int:
+        return self._output_size
+
+    def forward(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+        channel_size: torch.Tensor,
+        prev_states: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Calculate forward propagation.
+
+        Args:
+            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
+            ilens (torch.Tensor): Input length (#batch).
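+            channel_size: Number of input channels folded into the batch dimension of xs_pad.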
+            prev_states (torch.Tensor): Not to be used now.
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, L, output_size).
+            torch.Tensor: Output length (#batch).
+            torch.Tensor: Not to be used now.
+
+        """
+        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
+        if (
+            isinstance(self.embed, Conv2dSubsampling)
+            or isinstance(self.embed, Conv2dSubsampling6)
+            or isinstance(self.embed, Conv2dSubsampling8)
+        ):
+            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
+            if short_status:
+                raise TooShortUttError(
+                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
+                    + f"(it needs more than {limit_size} frames), return empty results",
+                    xs_pad.size(1),
+                    limit_size,
+                )
+            xs_pad, masks = self.embed(xs_pad, masks)
+        else:
+            xs_pad = self.embed(xs_pad)
+        xs_pad, masks, channel_size = self.encoders(xs_pad, masks, channel_size)
+        if isinstance(xs_pad, tuple):
+            xs_pad = xs_pad[0]
+
+        t_leng = xs_pad.size(1)
+        d_dim = xs_pad.size(2)
+        xs_pad = xs_pad.reshape(-1, channel_size, t_leng, d_dim)  # (B*C, T, D) -> (B, C, T, D)
+        if channel_size < 8:
+            # replicate channels so that the fusion convolutions always see 8 input planes
+            repeat_num = math.ceil(8 / channel_size)
+            xs_pad = xs_pad.repeat(1, repeat_num, 1, 1)[:, 0:8, :, :]
+        xs_pad = self.conv1(xs_pad)
+        xs_pad = self.conv2(xs_pad)
+        xs_pad = self.conv3(xs_pad)
+        xs_pad = self.conv4(xs_pad)
+        xs_pad = xs_pad.squeeze().reshape(-1, t_leng, d_dim)
+        mask_tmp = masks.size(1)
+        masks = masks.reshape(-1, channel_size, mask_tmp, t_leng)[:, 0, :, :]  # keep the mask of the first channel
+
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+
+        olens = masks.squeeze(1).sum(1)
+        return xs_pad, olens, None
+
+    def forward_hidden(
+        self,
+        xs_pad: torch.Tensor,
+        ilens: torch.Tensor,
+        prev_states: torch.Tensor = None,
+    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
+        """Calculate forward propagation.
+
+        Args:
+            xs_pad (torch.Tensor): Input tensor (#batch, L, input_size).
+            ilens (torch.Tensor): Input length (#batch).
+            prev_states (torch.Tensor): Not to be used now.
+
+        Returns:
+            torch.Tensor: Output tensor (#batch, L, output_size).
+            torch.Tensor: Output length (#batch).
+            torch.Tensor: Not to be used now.
+
+        """
+        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
+        if (
+            isinstance(self.embed, Conv2dSubsampling)
+            or isinstance(self.embed, Conv2dSubsampling6)
+            or isinstance(self.embed, Conv2dSubsampling8)
+        ):
+            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
+            if short_status:
+                raise TooShortUttError(
+                    f"has {xs_pad.size(1)} frames and is too short for subsampling "
+                    + f"(it needs more than {limit_size} frames), return empty results",
+                    xs_pad.size(1),
+                    limit_size,
+                )
+            xs_pad, masks = self.embed(xs_pad, masks)
+        else:
+            xs_pad = self.embed(xs_pad)
+        num_layer = len(self.encoders)
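+        # keep the output of the middle encoder layer as an auxiliary hidden feature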
+        for idx, encoder in enumerate(self.encoders):
+            xs_pad, masks = encoder(xs_pad, masks)
+            if idx == num_layer // 2 - 1:
+                hidden_feature = xs_pad
+        if isinstance(xs_pad, tuple):
+            xs_pad = xs_pad[0]
+            hidden_feature = hidden_feature[0]
+        if self.normalize_before:
+            xs_pad = self.after_norm(xs_pad)
+            self.hidden_feature = self.after_norm(hidden_feature)
+
+        olens = masks.squeeze(1).sum(1)
+        return xs_pad, olens, None
diff --git a/funasr/models/frontend/default.py b/funasr/models/frontend/default.py
index fad6b70..9671fe9 100644
--- a/funasr/models/frontend/default.py
+++ b/funasr/models/frontend/default.py
@@ -131,3 +131,128 @@
         # input_stft: (..., F, 2) -> (..., F)
         input_stft = ComplexTensor(input_stft[..., 0], input_stft[..., 1])
         return input_stft, feats_lens
+
+
+
+
+class MultiChannelFrontend(AbsFrontend):
+    """Conventional frontend structure for ASR.
+
+    Stft -> WPE -> MVDR-Beamformer -> Power-spec -> Mel-Fbank -> CMVN
+    """
+
+    def __init__(
+            self,
+            fs: Union[int, str] = 16000,
+            n_fft: int = 512,
+            win_length: int = None,
+            hop_length: int = 128,
+            window: Optional[str] = "hann",
+            center: bool = True,
+            normalized: bool = False,
+            onesided: bool = True,
+            n_mels: int = 80,
+            fmin: int = None,
+            fmax: int = None,
+            htk: bool = False,
+            frontend_conf: Optional[dict] = get_default_kwargs(Frontend),
+            apply_stft: bool = True,
+            frame_length: int = None,
+            frame_shift: int = None,
+            lfr_m: int = None,
+            lfr_n: int = None,
+    ):
+        assert check_argument_types()
+        super().__init__()
+        if isinstance(fs, str):
+            fs = humanfriendly.parse_size(fs)
+
+        # Deepcopy (In general, dict shouldn't be used as default arg)
+        frontend_conf = copy.deepcopy(frontend_conf)
+        self.hop_length = hop_length
+
+        if apply_stft:
+            self.stft = Stft(
+                n_fft=n_fft,
+                win_length=win_length,
+                hop_length=hop_length,
+                center=center,
+                window=window,
+                normalized=normalized,
+                onesided=onesided,
+            )
+        else:
+            self.stft = None
+        self.apply_stft = apply_stft
+
+        if frontend_conf is not None:
+            self.frontend = Frontend(idim=n_fft // 2 + 1, **frontend_conf)
+        else:
+            self.frontend = None
+
+        self.logmel = LogMel(
+            fs=fs,
+            n_fft=n_fft,
+            n_mels=n_mels,
+            fmin=fmin,
+            fmax=fmax,
+            htk=htk,
+        )
+        self.n_mels = n_mels
+        self.frontend_type = "multichannelfrontend"
+
+    def output_size(self) -> int:
+        return self.n_mels
+
+    def forward(
+            self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> Tuple[torch.Tensor, torch.Tensor, int]:
+        # 1. Domain-conversion: e.g. Stft: time -> time-freq
+        #import pdb;pdb.set_trace()
+        if self.stft is not None:
+            input_stft, feats_lens = self._compute_stft(input, input_lengths)
+        else:
+            if isinstance(input, ComplexTensor):
+                input_stft = input
+            else:
+                input_stft = ComplexTensor(input[..., 0], input[..., 1])
+            feats_lens = input_lengths
+        # 2. [Option] Speech enhancement
+        if self.frontend is not None:
+            assert isinstance(input_stft, ComplexTensor), type(input_stft)
+            # input_stft: (Batch, Length, [Channel], Freq)
+            input_stft, _, mask = self.frontend(input_stft, feats_lens)
+        # 4. STFT -> Power spectrum
+        # h: ComplexTensor(B, T, F) -> torch.Tensor(B, T, F)
+        input_power = input_stft.real ** 2 + input_stft.imag ** 2
+
+        # 5. Feature transform e.g. Stft -> Log-Mel-Fbank
+        # input_power: (Batch, [Channel,] Length, Freq)
+        #       -> input_feats: (Batch, Length, Dim)
+        input_feats, _ = self.logmel(input_power, feats_lens)
+        bt = input_feats.size(0)
+        if input_feats.dim() == 4:
+            channel_size = input_feats.size(2)
+            # (batch, T, channel, n_mels) -> (batch * channel, T, n_mels)
+            input_feats = input_feats.transpose(1, 2).reshape(bt * channel_size, -1, self.n_mels).contiguous()
+            # duplicate the frame counts so that every channel carries its length: (batch,) -> (batch * channel,)
+            feats_lens = feats_lens.repeat(1, channel_size).squeeze()
+        else:
+            channel_size = 1
+        return input_feats, feats_lens, channel_size
+
+    def _compute_stft(
+            self, input: torch.Tensor, input_lengths: torch.Tensor
+    ) -> torch.Tensor:
+        input_stft, feats_lens = self.stft(input, input_lengths)
+
+        assert input_stft.dim() >= 4, input_stft.shape
+        # "2" refers to the real/imag parts of Complex
+        assert input_stft.shape[-1] == 2, input_stft.shape
+
+        # Change torch.Tensor to ComplexTensor
+        # input_stft: (..., F, 2) -> (..., F)
+        input_stft = ComplexTensor(input_stft[..., 0], input_stft[..., 1])
+        return input_stft, feats_lens
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 7a6425b..ed8cb36 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -171,10 +171,7 @@
                               window_type=self.window,
                               sample_frequency=self.fs)
 
-            # if self.lfr_m != 1 or self.lfr_n != 1:
-            #     mat = apply_lfr(mat, self.lfr_m, self.lfr_n)
-            # if self.cmvn_file is not None:
-            #     mat = apply_cmvn(mat, self.cmvn_file)
+
             feat_length = mat.size(0)
             feats.append(mat)
             feats_lens.append(feat_length)
diff --git a/funasr/runtime/__init__.py b/funasr/runtime/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/__init__.py
diff --git a/funasr/runtime/python/__init__.py b/funasr/runtime/python/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/__init__.py
diff --git a/funasr/runtime/python/onnxruntime/__init__.py b/funasr/runtime/python/onnxruntime/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/__init__.py
diff --git a/funasr/runtime/python/onnxruntime/paraformer/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/__init__.py
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/README.md b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/README.md
new file mode 100644
index 0000000..74a9d09
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/README.md
@@ -0,0 +1,75 @@
+## Using paraformer with ONNXRuntime
+
+<p align="left">
+    <a href=""><img src="https://img.shields.io/badge/Python->=3.7,<=3.10-aff.svg"></a>
+    <a href=""><img src="https://img.shields.io/badge/OS-Linux%2C%20Win%2C%20Mac-pink.svg"></a>
+</p>
+
+### Introduction
+- The model comes from [speech_paraformer](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
+
+
+### Steps:
+1. Clone the repository and enter the runtime directory
+```shell
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
+cd funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer
+```
+2. Install the related packages.
+   ```bash
+   pip install -r requirements.txt
+   ```
+3. Export the model.
+   
+   `Tip`: torch 1.11.0 is required.
+
+   ```shell
+   python -m funasr.export.export_model [model_name] [export_dir] [true]
+   ```
+   `model_name`: the model to export.
+
+   `export_dir`: the directory where the exported ONNX model is saved.
+
+   For more details, see the [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export).
+
+
+   - `e.g.`, Export model from modelscope
+      ```shell
+      python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+      ```
+   - `e.g.`, Export a model from a local path; the model file name must be `model.pb`.
+      ```shell
+      python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+      ```
+
+4. Run the demo.
+   - `model_dir`: the model path, which contains `model.onnx`, `config.yaml` and `am.mvn`.
+   - Input: wav file(s); supported types: `str`, `np.ndarray`, `List[str]`.
+   - Output: `List[str]`, the recognition results.
+   - Example:
+        ```python
+        from paraformer_onnx import Paraformer
+
+        model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+        model = Paraformer(model_dir, batch_size=1)
+
+        wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+        result = model(wav_path)
+        print(result)
+        ```
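+   - You can also pass an in-memory waveform instead of a file path. A minimal sketch (the model directory and wav name below are placeholders; any 16 kHz mono wav works):
+        ```python
+        import librosa
+
+        from paraformer_onnx import Paraformer
+
+        model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+        model = Paraformer(model_dir, batch_size=1)
+
+        waveform, _ = librosa.load("asr_example.wav", sr=16000)  # float32 mono samples
+        print(model(waveform))
+        ```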
+
+## Speed
+
+Environment: Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz
+
+Test audio: [a 5.3 s wav, averaged over 100 runs](https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav)
+
+| Backend |        RTF        |
+|:-------:|:-----------------:|
+| Pytorch |       0.110       |
+|  Onnx   |       0.038       |
+
+
+## Acknowledgements
+1. We acknowledge [SWHL](https://github.com/RapidAI/RapidASR) for contributing the ONNXRuntime (Python API) support.
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
new file mode 100644
index 0000000..d071b07
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/__init__.py
@@ -0,0 +1,3 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/demo.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/demo.py
new file mode 100644
index 0000000..5d658dd
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/demo.py
@@ -0,0 +1,9 @@
+from paraformer_onnx import Paraformer
+
+model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model = Paraformer(model_dir, batch_size=1)
+
+wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+result = model(wav_path)
+print(result)
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
new file mode 100644
index 0000000..d51c6bf
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/paraformer_onnx.py
@@ -0,0 +1,144 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import os.path
+import traceback
+from pathlib import Path
+from typing import List, Union, Tuple
+
+import librosa
+import numpy as np
+
+from utils.utils import (CharTokenizer, Hypothesis, ONNXRuntimeError,
+                    OrtInferSession, TokenIDConverter, get_logger,
+                    read_yaml)
+from utils.postprocess_utils import sentence_postprocess
+from utils.frontend import WavFrontend
+
+logging = get_logger()
+
+
+class Paraformer():
+    def __init__(self, model_dir: Union[str, Path]=None,
+                 batch_size: int = 1,
+                 device_id: Union[str, int]="-1",
+                 ):
+        
+        if not Path(model_dir).exists():
+            raise FileNotFoundError(f'{model_dir} does not exist.')
+
+        model_file = os.path.join(model_dir, 'model.onnx')
+        config_file = os.path.join(model_dir, 'config.yaml')
+        cmvn_file = os.path.join(model_dir, 'am.mvn')
+        config = read_yaml(config_file)
+
+        self.converter = TokenIDConverter(config['token_list'])
+        self.tokenizer = CharTokenizer()
+        self.frontend = WavFrontend(
+            cmvn_file=cmvn_file,
+            **config['frontend_conf']
+        )
+        self.ort_infer = OrtInferSession(model_file, device_id)
+        self.batch_size = batch_size
+
+    def __call__(self, wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
+        waveform_list = self.load_data(wav_content, fs)
+        waveform_nums = len(waveform_list)
+
+        asr_res = []
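+        # run inference over the waveforms in mini-batches of size self.batch_size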
+        for beg_idx in range(0, waveform_nums, self.batch_size):
+            end_idx = min(waveform_nums, beg_idx + self.batch_size)
+
+            feats, feats_len = self.extract_feat(waveform_list[beg_idx:end_idx])
+
+            try:
+                am_scores, valid_token_lens = self.infer(feats, feats_len)
+            except ONNXRuntimeError:
+                #logging.warning(traceback.format_exc())
+                logging.warning("input wav is silence or noise")
+                preds = ['']
+            else:
+                preds = self.decode(am_scores, valid_token_lens)
+
+            asr_res.extend(preds)
+        return asr_res
+
+    def load_data(self,
+                  wav_content: Union[str, np.ndarray, List[str]], fs: int = None) -> List:
+        def load_wav(path: str) -> np.ndarray:
+            waveform, _ = librosa.load(path, sr=fs)
+            return waveform
+
+        if isinstance(wav_content, np.ndarray):
+            return [wav_content]
+
+        if isinstance(wav_content, str):
+            return [load_wav(wav_content)]
+
+        if isinstance(wav_content, list):
+            return [load_wav(path) for path in wav_content]
+
+        raise TypeError(
+            f'The type of {wav_content} is not in [str, np.ndarray, list]')
+
+    def extract_feat(self,
+                     waveform_list: List[np.ndarray]
+                     ) -> Tuple[np.ndarray, np.ndarray]:
+        feats, feats_len = [], []
+        for waveform in waveform_list:
+            speech, _ = self.frontend.fbank(waveform)
+            feat, feat_len = self.frontend.lfr_cmvn(speech)
+            feats.append(feat)
+            feats_len.append(feat_len)
+
+        feats = self.pad_feats(feats, np.max(feats_len))
+        feats_len = np.array(feats_len).astype(np.int32)
+        return feats, feats_len
+
+    @staticmethod
+    def pad_feats(feats: List[np.ndarray], max_feat_len: int) -> np.ndarray:
+        def pad_feat(feat: np.ndarray, cur_len: int) -> np.ndarray:
+            pad_width = ((0, max_feat_len - cur_len), (0, 0))
+            return np.pad(feat, pad_width, 'constant', constant_values=0)
+
+        feat_res = [pad_feat(feat, feat.shape[0]) for feat in feats]
+        feats = np.array(feat_res).astype(np.float32)
+        return feats
+
+    def infer(self, feats: np.ndarray,
+              feats_len: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        am_scores, token_nums = self.ort_infer([feats, feats_len])
+        return am_scores, token_nums
+
+    def decode(self, am_scores: np.ndarray, token_nums: np.ndarray) -> List[str]:
+        return [self.decode_one(am_score, token_num)
+                for am_score, token_num in zip(am_scores, token_nums)]
+
+    def decode_one(self,
+                   am_score: np.ndarray,
+                   valid_token_num: int) -> str:
+        yseq = am_score.argmax(axis=-1)
+        score = am_score.max(axis=-1)
+        score = np.sum(score, axis=-1)
+
+        # pad with mask tokens to ensure compatibility with sos/eos tokens
+        # asr_model.sos:1  asr_model.eos:2
+        yseq = np.array([1] + yseq.tolist() + [2])
+        hyp = Hypothesis(yseq=yseq, score=score)
+
+        # remove sos/eos and get results
+        last_pos = -1
+        token_int = hyp.yseq[1:last_pos].tolist()
+
+        # remove blank (id 0) and eos (id 2) symbol ids
+        token_int = list(filter(lambda x: x not in (0, 2), token_int))
+
+        # Change integer-ids to tokens
+        token = self.converter.ids2tokens(token_int)
+        token = token[:valid_token_num-1]
+        texts = sentence_postprocess(token)
+        text = texts[0]
+        # text = self.tokenizer.tokens2text(token)
+        return text
+
+
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
new file mode 100644
index 0000000..aeb085a
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/requirements.txt
@@ -0,0 +1,6 @@
+librosa
+numpy
+onnxruntime
+scipy
+typeguard
+kaldi-native-fbank
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/__init__.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/__init__.py
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
new file mode 100644
index 0000000..eb8a7c8
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/frontend.py
@@ -0,0 +1,136 @@
+# -*- encoding: utf-8 -*-
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+from typeguard import check_argument_types
+import kaldi_native_fbank as knf
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class WavFrontend():
+    """Conventional frontend structure for ASR.
+    """
+
+    def __init__(
+            self,
+            cmvn_file: str = None,
+            fs: int = 16000,
+            window: str = 'hamming',
+            n_mels: int = 80,
+            frame_length: int = 25,
+            frame_shift: int = 10,
+            filter_length_min: int = -1,
+            filter_length_max: float = -1,
+            lfr_m: int = 1,
+            lfr_n: int = 1,
+            dither: float = 1.0
+    ) -> None:
+        check_argument_types()
+
+        opts = knf.FbankOptions()
+        opts.frame_opts.samp_freq = fs
+        opts.frame_opts.dither = dither
+        opts.frame_opts.window_type = window
+        opts.frame_opts.frame_shift_ms = float(frame_shift)
+        opts.frame_opts.frame_length_ms = float(frame_length)
+        opts.mel_opts.num_bins = n_mels
+        opts.energy_floor = 0
+        opts.frame_opts.snip_edges = True
+        opts.mel_opts.debug_mel = False
+        self.opts = opts
+
+        self.filter_length_min = filter_length_min
+        self.filter_length_max = filter_length_max
+        self.lfr_m = lfr_m
+        self.lfr_n = lfr_n
+        self.cmvn_file = cmvn_file
+
+        if self.cmvn_file:
+            self.cmvn = self.load_cmvn()
+
+    def fbank(self,
+              waveform: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
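+        # librosa yields float32 samples in [-1, 1]; scale to the 16-bit PCM range so the fbank matches Kaldi's convention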
+        waveform = waveform * (1 << 15)
+        fbank_fn = knf.OnlineFbank(self.opts)
+        fbank_fn.accept_waveform(self.opts.frame_opts.samp_freq, waveform.tolist())
+        frames = fbank_fn.num_frames_ready
+        mat = np.empty([frames, self.opts.mel_opts.num_bins])
+        for i in range(frames):
+            mat[i, :] = fbank_fn.get_frame(i)
+        feat = mat.astype(np.float32)
+        feat_len = np.array(mat.shape[0]).astype(np.int32)
+        return feat, feat_len
+
+    def lfr_cmvn(self, feat: np.ndarray) -> Tuple[np.ndarray, np.ndarray]:
+        if self.lfr_m != 1 or self.lfr_n != 1:
+            feat = self.apply_lfr(feat, self.lfr_m, self.lfr_n)
+
+        if self.cmvn_file:
+            feat = self.apply_cmvn(feat)
+
+        feat_len = np.array(feat.shape[0]).astype(np.int32)
+        return feat, feat_len
+
+    @staticmethod
+    def apply_lfr(inputs: np.ndarray, lfr_m: int, lfr_n: int) -> np.ndarray:
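+        # Low Frame Rate: stack lfr_m consecutive frames into one output frame and advance by lfr_n frames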
+        LFR_inputs = []
+
+        T = inputs.shape[0]
+        T_lfr = int(np.ceil(T / lfr_n))
+        left_padding = np.tile(inputs[0], ((lfr_m - 1) // 2, 1))
+        inputs = np.vstack((left_padding, inputs))
+        T = T + (lfr_m - 1) // 2
+        for i in range(T_lfr):
+            if lfr_m <= T - i * lfr_n:
+                LFR_inputs.append(
+                    (inputs[i * lfr_n:i * lfr_n + lfr_m]).reshape(1, -1))
+            else:
+                # process last LFR frame
+                num_padding = lfr_m - (T - i * lfr_n)
+                frame = inputs[i * lfr_n:].reshape(-1)
+                for _ in range(num_padding):
+                    frame = np.hstack((frame, inputs[-1]))
+
+                LFR_inputs.append(frame)
+        LFR_outputs = np.vstack(LFR_inputs).astype(np.float32)
+        return LFR_outputs
+
+    def apply_cmvn(self, inputs: np.ndarray) -> np.ndarray:
+        """
+        Apply CMVN with mvn data
+        """
+        frame, dim = inputs.shape
+        means = np.tile(self.cmvn[0:1, :dim], (frame, 1))
+        vars = np.tile(self.cmvn[1:2, :dim], (frame, 1))
+        inputs = (inputs + means) * vars
+        return inputs
+
+    def load_cmvn(self) -> np.ndarray:
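+        # parse a Kaldi nnet-style mvn file: the <AddShift> row gives the additive term and the <Rescale> row the multiplicative term used in apply_cmvn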
+        with open(self.cmvn_file, 'r', encoding='utf-8') as f:
+            lines = f.readlines()
+
+        means_list = []
+        vars_list = []
+        for i in range(len(lines)):
+            line_item = lines[i].split()
+            if line_item[0] == '<AddShift>':
+                line_item = lines[i + 1].split()
+                if line_item[0] == '<LearnRateCoef>':
+                    add_shift_line = line_item[3:(len(line_item) - 1)]
+                    means_list = list(add_shift_line)
+                    continue
+            elif line_item[0] == '<Rescale>':
+                line_item = lines[i + 1].split()
+                if line_item[0] == '<LearnRateCoef>':
+                    rescale_line = line_item[3:(len(line_item) - 1)]
+                    vars_list = list(rescale_line)
+                    continue
+
+        means = np.array(means_list).astype(np.float64)
+        vars = np.array(vars_list).astype(np.float64)
+        cmvn = np.array([means, vars])
+        return cmvn
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/postprocess_utils.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/postprocess_utils.py
new file mode 100644
index 0000000..575fb90
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/postprocess_utils.py
@@ -0,0 +1,240 @@
+# Copyright (c) Alibaba, Inc. and its affiliates.
+
+import string
+import logging
+from typing import Any, List, Union
+
+
+def isChinese(ch: str):
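+    # CJK ideographs and ASCII digits are both treated as single "Chinese" tokens here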
+    if '\u4e00' <= ch <= '\u9fff' or '\u0030' <= ch <= '\u0039':
+        return True
+    return False
+
+
+def isAllChinese(word: Union[List[Any], str]):
+    word_lists = []
+    for i in word:
+        cur = i.replace(' ', '')
+        cur = cur.replace('</s>', '')
+        cur = cur.replace('<s>', '')
+        word_lists.append(cur)
+
+    if len(word_lists) == 0:
+        return False
+
+    for ch in word_lists:
+        if isChinese(ch) is False:
+            return False
+    return True
+
+
+def isAllAlpha(word: Union[List[Any], str]):
+    word_lists = []
+    for i in word:
+        cur = i.replace(' ', '')
+        cur = cur.replace('</s>', '')
+        cur = cur.replace('<s>', '')
+        word_lists.append(cur)
+
+    if len(word_lists) == 0:
+        return False
+
+    for ch in word_lists:
+        if ch.isalpha() is False and ch != "'":
+            return False
+        elif ch.isalpha() is True and isChinese(ch) is True:
+            return False
+
+    return True
+
+
+# def abbr_dispose(words: List[Any]) -> List[Any]:
+def abbr_dispose(words: List[Any], time_stamp: List[List] = None) -> List[Any]:
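+    # merge runs of single letters separated by spaces (e.g. "a b c") into an upper-cased abbreviation ("ABC"), merging their timestamps when provided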
+    words_size = len(words)
+    word_lists = []
+    abbr_begin = []
+    abbr_end = []
+    last_num = -1
+    ts_lists = []
+    ts_nums = []
+    ts_index = 0
+    for num in range(words_size):
+        if num <= last_num:
+            continue
+
+        if len(words[num]) == 1 and words[num].encode('utf-8').isalpha():
+            if num + 1 < words_size and words[
+                    num + 1] == ' ' and num + 2 < words_size and len(
+                        words[num +
+                              2]) == 1 and words[num +
+                                                 2].encode('utf-8').isalpha():
+                # found the begin of abbr
+                abbr_begin.append(num)
+                num += 2
+                abbr_end.append(num)
+                # to find the end of abbr
+                while True:
+                    num += 1
+                    if num < words_size and words[num] == ' ':
+                        num += 1
+                        if num < words_size and len(
+                                words[num]) == 1 and words[num].encode(
+                                    'utf-8').isalpha():
+                            abbr_end.pop()
+                            abbr_end.append(num)
+                            last_num = num
+                        else:
+                            break
+                    else:
+                        break
+
+    for num in range(words_size):
+        if words[num] == ' ':
+            ts_nums.append(ts_index)
+        else:
+            ts_nums.append(ts_index)
+            ts_index += 1 
+    last_num = -1
+    for num in range(words_size):
+        if num <= last_num:
+            continue
+
+        if num in abbr_begin:
+            if time_stamp is not None:
+                begin = time_stamp[ts_nums[num]][0]
+            word_lists.append(words[num].upper())
+            num += 1
+            while num < words_size:
+                if num in abbr_end:
+                    word_lists.append(words[num].upper())
+                    last_num = num
+                    break
+                else:
+                    if words[num].encode('utf-8').isalpha():
+                        word_lists.append(words[num].upper())
+                num += 1
+            if time_stamp is not None:
+                end = time_stamp[ts_nums[num]][1]
+                ts_lists.append([begin, end])
+        else:
+            word_lists.append(words[num])
+            if time_stamp is not None and words[num] != ' ':
+                begin = time_stamp[ts_nums[num]][0]
+                end = time_stamp[ts_nums[num]][1]
+                ts_lists.append([begin, end])
+                begin = end
+
+    if time_stamp is not None:
+        return word_lists, ts_lists
+    else:
+        return word_lists
+
+
+def sentence_postprocess(words: List[Any], time_stamp: List[List] = None):
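+    # assemble recognized tokens into a sentence: Chinese characters are concatenated, alphabetic BPE pieces ("@@") are merged into space-separated words, and timestamps are carried along when given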
+    middle_lists = []
+    word_lists = []
+    word_item = ''
+    ts_lists = []
+
+    # wash words lists
+    for i in words:
+        word = ''
+        if isinstance(i, str):
+            word = i
+        else:
+            word = i.decode('utf-8')
+
+        if word in ['<s>', '</s>', '<unk>']:
+            continue
+        else:
+            middle_lists.append(word)
+
+    # all chinese characters
+    if isAllChinese(middle_lists):
+        for i, ch in enumerate(middle_lists):
+            word_lists.append(ch.replace(' ', ''))
+        if time_stamp is not None:
+            ts_lists = time_stamp
+
+    # all alpha characters
+    elif isAllAlpha(middle_lists):
+        ts_flag = True
+        for i, ch in enumerate(middle_lists):
+            if ts_flag and time_stamp is not None:
+                begin = time_stamp[i][0]
+                end = time_stamp[i][1]
+            word = ''
+            if '@@' in ch:
+                word = ch.replace('@@', '')
+                word_item += word
+                if time_stamp is not None:
+                    ts_flag = False
+                    end = time_stamp[i][1]
+            else:
+                word_item += ch
+                word_lists.append(word_item)
+                word_lists.append(' ')
+                word_item = ''
+                if time_stamp is not None:
+                    ts_flag = True
+                    end = time_stamp[i][1]
+                    ts_lists.append([begin, end])
+                    begin = end
+
+    # mix characters
+    else:
+        alpha_blank = False
+        ts_flag = True
+        begin = -1
+        end = -1
+        for i, ch in enumerate(middle_lists):
+            if ts_flag and time_stamp is not None:
+                begin = time_stamp[i][0]
+                end = time_stamp[i][1]
+            word = ''
+            if isAllChinese(ch):
+                if alpha_blank is True:
+                    word_lists.pop()
+                word_lists.append(ch)
+                alpha_blank = False
+                if time_stamp is not None:
+                    ts_flag = True
+                    ts_lists.append([begin, end])
+                    begin = end
+            elif '@@' in ch:
+                word = ch.replace('@@', '')
+                word_item += word
+                alpha_blank = False
+                if time_stamp is not None:
+                    ts_flag = False
+                    end = time_stamp[i][1]
+            elif isAllAlpha(ch):
+                word_item += ch
+                word_lists.append(word_item)
+                word_lists.append(' ')
+                word_item = ''
+                alpha_blank = True
+                if time_stamp is not None:
+                    ts_flag = True
+                    end = time_stamp[i][1] 
+                    ts_lists.append([begin, end])
+                    begin = end
+            else:
+                raise ValueError('invalid character: {}'.format(ch))
+
+    if time_stamp is not None: 
+        word_lists, ts_lists = abbr_dispose(word_lists, ts_lists)
+        real_word_lists = []
+        for ch in word_lists:
+            if ch != ' ':
+                real_word_lists.append(ch)
+        sentence = ' '.join(real_word_lists).strip()
+        return sentence, ts_lists, real_word_lists
+    else:
+        word_lists = abbr_dispose(word_lists)
+        real_word_lists = []
+        for ch in word_lists:
+            if ch != ' ':
+                real_word_lists.append(ch)
+        sentence = ''.join(word_lists).strip()
+        return sentence, real_word_lists
diff --git a/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
new file mode 100644
index 0000000..8e220e0
--- /dev/null
+++ b/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer/utils/utils.py
@@ -0,0 +1,256 @@
+# -*- encoding: utf-8 -*-
+# @Author: SWHL
+# @Contact: liekkaskono@163.com
+import functools
+import logging
+import pickle
+from pathlib import Path
+from typing import Any, Dict, Iterable, List, NamedTuple, Set, Tuple, Union
+
+import numpy as np
+import yaml
+from onnxruntime import (GraphOptimizationLevel, InferenceSession,
+                         SessionOptions, get_available_providers, get_device)
+from typeguard import check_argument_types
+
+import warnings
+
+root_dir = Path(__file__).resolve().parent
+
+logger_initialized = {}
+
+
+class TokenIDConverter():
+    def __init__(self, token_list: Union[List, str],
+                 ):
+        check_argument_types()
+
+        # self.token_list = self.load_token(token_path)
+        self.token_list = token_list
+        self.unk_symbol = token_list[-1]
+
+    # @staticmethod
+    # def load_token(file_path: Union[Path, str]) -> List:
+    #     if not Path(file_path).exists():
+    #         raise TokenIDConverterError(f'The {file_path} does not exist.')
+    #
+    #     with open(str(file_path), 'rb') as f:
+    #         token_list = pickle.load(f)
+    #
+    #     if len(token_list) != len(set(token_list)):
+    #         raise TokenIDConverterError('The Token exists duplicated symbol.')
+    #     return token_list
+
+    def get_num_vocabulary_size(self) -> int:
+        return len(self.token_list)
+
+    def ids2tokens(self,
+                   integers: Union[np.ndarray, Iterable[int]]) -> List[str]:
+        if isinstance(integers, np.ndarray) and integers.ndim != 1:
+            raise TokenIDConverterError(
+                f"Must be 1 dim ndarray, but got {integers.ndim}")
+        return [self.token_list[i] for i in integers]
+
+    def tokens2ids(self, tokens: Iterable[str]) -> List[int]:
+        token2id = {v: i for i, v in enumerate(self.token_list)}
+        if self.unk_symbol not in token2id:
+            raise TokenIDConverterError(
+                f"Unknown symbol '{self.unk_symbol}' doesn't exist in the token_list"
+            )
+        unk_id = token2id[self.unk_symbol]
+        return [token2id.get(i, unk_id) for i in tokens]
+
+
+class CharTokenizer():
+    def __init__(
+        self,
+        symbol_value: Union[Path, str, Iterable[str]] = None,
+        space_symbol: str = "<space>",
+        remove_non_linguistic_symbols: bool = False,
+    ):
+        check_argument_types()
+
+        self.space_symbol = space_symbol
+        self.non_linguistic_symbols = self.load_symbols(symbol_value)
+        self.remove_non_linguistic_symbols = remove_non_linguistic_symbols
+
+    @staticmethod
+    def load_symbols(value: Union[Path, str, Iterable[str]] = None) -> Set:
+        if value is None:
+            return set()
+
+        if not isinstance(value, (Path, str)):
+            return set(value)
+
+        file_path = Path(value)
+        if not file_path.exists():
+            logging.warning("%s doesn't exist.", file_path)
+            return set()
+
+        with file_path.open("r", encoding="utf-8") as f:
+            return set(line.rstrip() for line in f)
+
+    def text2tokens(self, line: Union[str, list]) -> List[str]:
+        tokens = []
+        while len(line) != 0:
+            for w in self.non_linguistic_symbols:
+                if line.startswith(w):
+                    if not self.remove_non_linguistic_symbols:
+                        tokens.append(line[: len(w)])
+                    line = line[len(w):]
+                    break
+            else:
+                t = line[0]
+                if t == " ":
+                    t = "<space>"
+                tokens.append(t)
+                line = line[1:]
+        return tokens
+
+    def tokens2text(self, tokens: Iterable[str]) -> str:
+        tokens = [t if t != self.space_symbol else " " for t in tokens]
+        return "".join(tokens)
+
+    def __repr__(self):
+        return (
+            f"{self.__class__.__name__}("
+            f'space_symbol="{self.space_symbol}"'
+            f'non_linguistic_symbols="{self.non_linguistic_symbols}"'
+            f")"
+        )
+
+
+
+class Hypothesis(NamedTuple):
+    """Hypothesis data type."""
+
+    yseq: np.ndarray
+    score: Union[float, np.ndarray] = 0
+    scores: Dict[str, Union[float, np.ndarray]] = dict()
+    states: Dict[str, Any] = dict()
+
+    def asdict(self) -> dict:
+        """Convert data to JSON-friendly dict."""
+        return self._replace(
+            yseq=self.yseq.tolist(),
+            score=float(self.score),
+            scores={k: float(v) for k, v in self.scores.items()},
+        )._asdict()
+
+
+class TokenIDConverterError(Exception):
+    pass
+
+
+class ONNXRuntimeError(Exception):
+    pass
+
+
+class OrtInferSession():
+    def __init__(self, model_file, device_id=-1):
+        sess_opt = SessionOptions()
+        sess_opt.log_severity_level = 4
+        sess_opt.enable_cpu_mem_arena = False
+        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
+
+        cuda_ep = 'CUDAExecutionProvider'
+        cuda_provider_options = {
+            "device_id": device_id,
+            "arena_extend_strategy": "kNextPowerOfTwo",
+            "cudnn_conv_algo_search": "EXHAUSTIVE",
+            "do_copy_in_default_stream": "true",
+        }
+        cpu_ep = 'CPUExecutionProvider'
+        cpu_provider_options = {
+            "arena_extend_strategy": "kSameAsRequested",
+        }
+
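+        # prefer the CUDA provider when a GPU device id is requested and available; CPU is always kept as a fallback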
+        EP_list = []
+        if device_id != -1 and get_device() == 'GPU' \
+                and cuda_ep in get_available_providers():
+            EP_list = [(cuda_ep, cuda_provider_options)]
+        EP_list.append((cpu_ep, cpu_provider_options))
+
+        self._verify_model(model_file)
+        self.session = InferenceSession(model_file,
+                                        sess_options=sess_opt,
+                                        providers=EP_list)
+
+        if device_id != -1 and cuda_ep not in self.session.get_providers():
+            warnings.warn(f'{cuda_ep} is not available in the current environment, so inference will fall back to {cpu_ep}.\n'
+                          'Please ensure the installed onnxruntime-gpu version matches your CUDA and cuDNN versions; '
+                          'the compatibility matrix is listed on the official website: '
+                          'https://onnxruntime.ai/docs/execution-providers/CUDA-ExecutionProvider.html',
+                          RuntimeWarning)
+
+    def __call__(self,
+                 input_content: List[np.ndarray]) -> np.ndarray:
+        input_dict = dict(zip(self.get_input_names(), input_content))
+        try:
+            return self.session.run(None, input_dict)
+        except Exception as e:
+            raise ONNXRuntimeError('ONNXRuntime inference failed.') from e
+
+    def get_input_names(self, ):
+        return [v.name for v in self.session.get_inputs()]
+
+    def get_output_names(self,):
+        return [v.name for v in self.session.get_outputs()]
+
+    def get_character_list(self, key: str = 'character'):
+        return self.meta_dict[key].splitlines()
+
+    def have_key(self, key: str = 'character') -> bool:
+        self.meta_dict = self.session.get_modelmeta().custom_metadata_map
+        if key in self.meta_dict.keys():
+            return True
+        return False
+
+    @staticmethod
+    def _verify_model(model_path):
+        model_path = Path(model_path)
+        if not model_path.exists():
+            raise FileNotFoundError(f'{model_path} does not exist.')
+        if not model_path.is_file():
+            raise FileExistsError(f'{model_path} is not a file.')
+
+
+def read_yaml(yaml_path: Union[str, Path]) -> Dict:
+    if not Path(yaml_path).exists():
+        raise FileNotFoundError(f'The {yaml_path} does not exist.')
+
+    with open(str(yaml_path), 'rb') as f:
+        data = yaml.load(f, Loader=yaml.Loader)
+    return data
+
+
+@functools.lru_cache()
+def get_logger(name='rapid_paraformer'):
+    """Initialize and get a logger by name.
+    If the logger has not been initialized, this method will initialize the
+    logger by adding one or two handlers, otherwise the initialized logger will
+    be directly returned. During initialization, a StreamHandler will always be
+    added.
+    Args:
+        name (str): Logger name.
+    Returns:
+        logging.Logger: The expected logger.
+    """
+    logger = logging.getLogger(name)
+    if name in logger_initialized:
+        return logger
+
+    for logger_name in logger_initialized:
+        if name.startswith(logger_name):
+            return logger
+
+    formatter = logging.Formatter(
+        '[%(asctime)s] %(name)s %(levelname)s: %(message)s',
+        datefmt="%Y/%m/%d %H:%M:%S")
+
+    sh = logging.StreamHandler()
+    sh.setFormatter(formatter)
+    logger.addHandler(sh)
+    logger_initialized[name] = True
+    logger.propagate = False
+    return logger
diff --git a/funasr/runtime/python/torchscripts/__init__.py b/funasr/runtime/python/torchscripts/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/torchscripts/__init__.py
diff --git a/funasr/runtime/python/torchscripts/paraformer/__init__.py b/funasr/runtime/python/torchscripts/paraformer/__init__.py
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/funasr/runtime/python/torchscripts/paraformer/__init__.py
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index 02311fd..5be9089 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -71,7 +71,7 @@
 from funasr.utils.types import str2triple_str
 from funasr.utils.types import str_or_int
 from funasr.utils.types import str_or_none
-from funasr.utils.wav_utils import calc_shape, generate_data_list
+from funasr.utils.wav_utils import calc_shape, generate_data_list, filter_wav_text
 from funasr.utils.yaml_no_alias_safe_dump import yaml_no_alias_safe_dump
 
 try:
@@ -1153,6 +1153,14 @@
                 if args.batch_bins is not None:
                     args.batch_bins = args.batch_bins * args.ngpu
 
+        # filter out samples whose ids do not match between wav.scp and text
+        if (args.train_shape_file is None and args.dataset_type == "small") or (args.train_data_file is None and args.dataset_type == "large"):
+            if not args.simple_ddp or distributed_option.dist_rank == 0:
+                filter_wav_text(args.data_dir, args.train_set)
+                filter_wav_text(args.data_dir, args.dev_set)
+            if args.simple_ddp:
+                dist.barrier()
+
         if args.train_shape_file is None and args.dataset_type == "small":
             if not args.simple_ddp or distributed_option.dist_rank == 0:
                 calc_shape(args.data_dir, args.train_set, args.frontend_conf, args.speech_length_min, args.speech_length_max)
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index e62a748..23ac976 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -40,6 +40,7 @@
 from funasr.models.decoder.contextual_decoder import ContextualParaformerDecoder
 from funasr.models.e2e_asr import ESPnetASRModel
 from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerBert, BiCifParaformer, ContextualParaformer
+from funasr.models.e2e_asr_mfcca import MFCCA
 from funasr.models.e2e_uni_asr import UniASR
 from funasr.models.encoder.abs_encoder import AbsEncoder
 from funasr.models.encoder.conformer_encoder import ConformerEncoder
@@ -47,8 +48,10 @@
 from funasr.models.encoder.rnn_encoder import RNNEncoder
 from funasr.models.encoder.sanm_encoder import SANMEncoder, SANMEncoderChunkOpt
 from funasr.models.encoder.transformer_encoder import TransformerEncoder
+from funasr.models.encoder.mfcca_encoder import MFCCAEncoder
 from funasr.models.frontend.abs_frontend import AbsFrontend
 from funasr.models.frontend.default import DefaultFrontend
+from funasr.models.frontend.default import MultiChannelFrontend
 from funasr.models.frontend.fused import FusedFrontends
 from funasr.models.frontend.s3prl import S3prlFrontend
 from funasr.models.frontend.wav_frontend import WavFrontend
@@ -86,6 +89,7 @@
         s3prl=S3prlFrontend,
         fused=FusedFrontends,
         wav_frontend=WavFrontend,
+        multichannelfrontend=MultiChannelFrontend,
     ),
     type_check=AbsFrontend,
     default="default",
@@ -119,6 +123,7 @@
         paraformer_bert=ParaformerBert,
         bicif_paraformer=BiCifParaformer,
         contextual_paraformer=ContextualParaformer,
+        mfcca=MFCCA,
     ),
     type_check=AbsESPnetModel,
     default="asr",
@@ -142,6 +147,7 @@
         sanm=SANMEncoder,
         sanm_chunk_opt=SANMEncoderChunkOpt,
         data2vec_encoder=Data2VecEncoder,
+        mfcca_enc=MFCCAEncoder,
     ),
     type_check=AbsEncoder,
     default="rnn",
@@ -1106,3 +1112,135 @@
         var_dict_torch_update.update(var_dict_torch_update_local)
 
         return var_dict_torch_update
+
+
+
+class ASRTaskMFCCA(ASRTask):
+    # If you need more than one optimizers, change this value
+    num_optimizers: int = 1
+
+    # Add variable objects configurations
+    class_choices_list = [
+        # --frontend and --frontend_conf
+        frontend_choices,
+        # --specaug and --specaug_conf
+        specaug_choices,
+        # --normalize and --normalize_conf
+        normalize_choices,
+        # --model and --model_conf
+        model_choices,
+        # --preencoder and --preencoder_conf
+        preencoder_choices,
+        # --encoder and --encoder_conf
+        encoder_choices,
+        # --decoder and --decoder_conf
+        decoder_choices,
+    ]
+
+    # If you need to modify train() or eval() procedures, change Trainer class here
+    trainer = Trainer
+
+    @classmethod
+    def build_model(cls, args: argparse.Namespace):
+        assert check_argument_types()
+        if isinstance(args.token_list, str):
+            with open(args.token_list, encoding="utf-8") as f:
+                token_list = [line.rstrip() for line in f]
+
+            # Overwriting token_list to keep it as "portable".
+            args.token_list = list(token_list)
+        elif isinstance(args.token_list, (tuple, list)):
+            token_list = list(args.token_list)
+        else:
+            raise RuntimeError("token_list must be str or list")
+        vocab_size = len(token_list)
+        logging.info(f"Vocabulary size: {vocab_size}")
+
+        # 1. frontend
+        if args.input_size is None:
+            # Extract features in the model
+            frontend_class = frontend_choices.get_class(args.frontend)
+            if args.frontend == 'wav_frontend':
+                frontend = frontend_class(cmvn_file=args.cmvn_file, **args.frontend_conf)
+            else:
+                frontend = frontend_class(**args.frontend_conf)
+            input_size = frontend.output_size()
+        else:
+            # Give features from data-loader
+            args.frontend = None
+            args.frontend_conf = {}
+            frontend = None
+            input_size = args.input_size
+
+        # 2. Data augmentation for spectrogram
+        if args.specaug is not None:
+            specaug_class = specaug_choices.get_class(args.specaug)
+            specaug = specaug_class(**args.specaug_conf)
+        else:
+            specaug = None
+
+        # 3. Normalization layer
+        if args.normalize is not None:
+            normalize_class = normalize_choices.get_class(args.normalize)
+            normalize = normalize_class(stats_file=args.cmvn_file,**args.normalize_conf)
+        else:
+            normalize = None
+
+        # 4. Pre-encoder input block
+        # NOTE(kan-bayashi): Use getattr to keep the compatibility
+        if getattr(args, "preencoder", None) is not None:
+            preencoder_class = preencoder_choices.get_class(args.preencoder)
+            preencoder = preencoder_class(**args.preencoder_conf)
+            input_size = preencoder.output_size()
+        else:
+            preencoder = None
+
+        # 5. Encoder
+        encoder_class = encoder_choices.get_class(args.encoder)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+
+        # 7. Decoder
+        decoder_class = decoder_choices.get_class(args.decoder)
+        decoder = decoder_class(
+            vocab_size=vocab_size,
+            encoder_output_size=encoder.output_size(),
+            **args.decoder_conf,
+        )
+
+        # 8. CTC
+        ctc = CTC(
+            odim=vocab_size, encoder_output_size=encoder.output_size(), **args.ctc_conf
+        )
+
+
+        # 9. Choose the model class
+        try:
+            model_class = model_choices.get_class(args.model)
+        except AttributeError:
+            model_class = model_choices.get_class("asr")
+
+        rnnt_decoder = None
+
+        # 10. Build model
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            decoder=decoder,
+            ctc=ctc,
+            rnnt_decoder=rnnt_decoder,
+            token_list=token_list,
+            **args.model_conf,
+        )
+
+        # 11. Initialize
+        if args.init is not None:
+            initialize(model, args.init)
+
+        assert check_return_type(model)
+        return model
+
+
diff --git a/funasr/tasks/vad.py b/funasr/tasks/vad.py
index dfd07c4..e2a9123 100644
--- a/funasr/tasks/vad.py
+++ b/funasr/tasks/vad.py
@@ -235,7 +235,7 @@
             cls, args: argparse.Namespace, train: bool
     ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
         assert check_argument_types()
-        #if args.use_preprocessor:
+        # if args.use_preprocessor:
         #    retval = CommonPreprocessor(
         #        train=train,
         #        # NOTE(kamo): Check attribute existence for backward compatibility
@@ -254,7 +254,7 @@
         #        if hasattr(args, "rir_scp")
         #        else None,
         #    )
-        #else:
+        # else:
         #    retval = None
         retval = None
         assert check_return_type(retval)
@@ -291,7 +291,8 @@
             model_class = model_choices.get_class(args.model)
         except AttributeError:
             model_class = model_choices.get_class("e2evad")
-        model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf)
+        model = model_class(encoder=encoder, vad_post_args=args.vad_post_conf,
+                            streaming=args.encoder_conf.get('streaming', False))
 
         return model
 
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index 33d1255..f966aee 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -4,88 +4,6 @@
 import numpy as np
 from typing import Any, List, Tuple, Union
 
-def cut_interval(alphas: torch.Tensor, start: int, end: int, tail: bool):
-    if not tail:
-        if end == start + 1:
-            cut = (end + start) / 2.0
-        else:
-            alpha = alphas[start+1: end].tolist()
-            reverse_steps = 1
-            for reverse_alpha in alpha[::-1]:
-                if reverse_alpha > 0.35:
-                    reverse_steps += 1
-                else:
-                    break
-            cut = end - reverse_steps
-    else:
-        if end != len(alphas) - 1:
-            cut = end + 1
-        else:
-            cut = start + 1
-    return float(cut)
-
-def time_stamp_lfr6(alphas: torch.Tensor, speech_lengths: torch.Tensor, raw_text: List[str], begin: int = 0, end: int = None):
-    time_stamp_list = []
-    alphas = alphas[0]
-    text = copy.deepcopy(raw_text)
-    if end is None:
-        time = speech_lengths * 60 / 1000
-        sacle_rate = (time / speech_lengths[0]).tolist()
-    else:
-        time = (end - begin) / 1000
-        sacle_rate = (time / speech_lengths[0]).tolist()
-
-    predictor = (alphas > 0.5).int()
-    fire_places = torch.nonzero(predictor == 1).squeeze(1).tolist()
-    
-    cuts = []
-    npeak = int(predictor.sum())
-    nchar = len(raw_text)
-    if npeak - 1 == nchar:
-        fire_places = torch.where((alphas > 0.5) == 1)[0].tolist()
-        for i in range(len(fire_places)):
-            if fire_places[i] < len(alphas) - 1:
-                if 0.05 < alphas[fire_places[i]+1] < 0.5:
-                    fire_places[i] += 1
-    elif npeak < nchar:
-        lost_num = nchar - npeak
-        lost_fire = speech_lengths[0].tolist() - fire_places[-1]
-        interval_distance = lost_fire // (lost_num + 1)
-        for i in range(1, lost_num + 1):
-            fire_places.append(fire_places[-1] + interval_distance)
-    elif npeak - 1 > nchar:
-        redundance_num = npeak - 1 - nchar
-        for i in range(redundance_num):
-            fire_places.pop() 
-
-    cuts.append(0)
-    start_sil = True
-    if start_sil:
-        text.insert(0, '<sil>')
-
-    for i in range(len(fire_places)-1):
-        cuts.append(cut_interval(alphas, fire_places[i], fire_places[i+1], tail=(i==len(fire_places)-2)))
-
-    for i in range(2, len(fire_places)-2):
-        if fire_places[i-2] == fire_places[i-1] - 1 and fire_places[i-1] != fire_places[i] - 1:
-            cuts[i-1] += 1
-
-    if cuts[-1] != len(alphas) - 1:
-        text.append('<sil>')
-        cuts.append(speech_lengths[0].tolist())
-    cuts.insert(-1, (cuts[-1] + cuts[-2]) * 0.5)
-    sec_fire_places = np.array(cuts) * sacle_rate
-    for i in range(1, len(sec_fire_places) - 1):
-        start, end = sec_fire_places[i], sec_fire_places[i+1]
-        if i == len(sec_fire_places) - 2:
-            end = time
-        time_stamp_list.append([int(round(start, 2) * 1000) + begin, int(round(end, 2) * 1000) + begin])
-        text = text[1:]
-    if npeak - 1 == nchar or npeak > nchar:
-        return time_stamp_list[:-1]
-    else:
-        return time_stamp_list
-
 def time_stamp_lfr6_pl(us_alphas, us_cif_peak, char_list, begin_time=0.0, end_time=None):
     START_END_THRESHOLD = 5
     TIME_RATE = 10.0 * 6 / 1000 / 3  #  3 times upsampled
diff --git a/funasr/utils/wav_utils.py b/funasr/utils/wav_utils.py
index 8325ad3..590ccdb 100644
--- a/funasr/utils/wav_utils.py
+++ b/funasr/utils/wav_utils.py
@@ -287,3 +287,35 @@
             wav_path = os.path.join(split_dir, str(i + 1), "wav.scp")
             text_path = os.path.join(split_dir, str(i + 1), "text")
             f_data.write(wav_path + " " + text_path + "\n")
+
+def filter_wav_text(data_dir, dataset):
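+    # keep only utterances present in both wav.scp and text; the original files are backed up as *.bak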
+    wav_file = os.path.join(data_dir,dataset,"wav.scp")
+    text_file = os.path.join(data_dir, dataset, "text")
+    with open(wav_file) as f_wav, open(text_file) as f_text:
+        wav_lines = f_wav.readlines()
+        text_lines = f_text.readlines()
+    os.rename(wav_file, "{}.bak".format(wav_file))
+    os.rename(text_file, "{}.bak".format(text_file))
+    wav_dict = {}
+    for line in wav_lines:
+        parts = line.strip().split()
+        if len(parts) < 2:
+            continue
+        sample_name, wav_path = parts
+        wav_dict[sample_name] = wav_path
+    text_dict = {}
+    for line in text_lines:
+        parts = line.strip().split(" ", 1)
+        if len(parts) < 2:
+            continue
+        sample_name, txt = parts
+        text_dict[sample_name] = txt
+    filter_count = 0
+    with open(wav_file, "w") as f_wav, open(text_file, "w") as f_text:
+        for sample_name, wav_path in wav_dict.items():
+            if sample_name in text_dict.keys():
+                f_wav.write(sample_name + " " + wav_path  + "\n")
+                f_text.write(sample_name + " " + text_dict[sample_name] + "\n")
+            else:
+                filter_count += 1
+    print("{}/{} samples in {} are filtered because of the mismatch between wav.scp and text".format(len(wav_lines), filter_count, dataset))
\ No newline at end of file

--
Gitblit v1.9.1