From 5de9e75d587b752d8d1063cc7903c4571df99189 Mon Sep 17 00:00:00 2001
From: yhliang <68215459+yhliang-aslp@users.noreply.github.com>
Date: 星期四, 20 四月 2023 16:52:47 +0800
Subject: [PATCH] Merge pull request #389 from alibaba-damo-academy/main
---
docs/modescope_pipeline/vad_pipeline.md | 101 +++++
egs_modelscope/asr/TEMPLATE/infer.py | 25 +
funasr/bin/lm_inference_launch.py | 5
funasr/train/trainer.py | 3
docs/modescope_pipeline/lm_pipeline.md | 8
funasr/bin/asr_inference_paraformer.py | 8
docs/FQA.md | 22 +
docs/recipe/sd_recipe.md | 129 ++++++
funasr/bin/vad_inference.py | 3
funasr/bin/punctuation_infer_vadrealtime.py | 6
funasr/bin/tp_inference_launch.py | 4
funasr/bin/asr_inference_paraformer_vad_punc.py | 2
docs/modescope_pipeline/tp_pipeline.md | 6
funasr/bin/sond_inference.py | 2
funasr/bin/sv_inference_launch.py | 2
funasr/bin/asr_inference_mfcca.py | 2
funasr/bin/asr_inference_paraformer_vad.py | 2
funasr/bin/punc_inference_launch.py | 4
docs/modescope_pipeline/asr_pipeline.md | 184 +++++++++
funasr/bin/tp_inference.py | 5
egs_modelscope/asr/TEMPLATE/utils | 1
README.md | 4
docs/modescope_pipeline/sd_pipeline.md | 20 +
funasr/bin/asr_inference_uniasr.py | 2
funasr/bin/asr_inference.py | 2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md | 26 +
funasr/models/joint_net/__init__.py | 1
docs/modelscope_models.md | 2
funasr/bin/eend_ola_inference.py | 2
egs_modelscope/asr/TEMPLATE/infer.sh | 96 +++++
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh | 3
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py | 1
funasr/runtime/python/benchmark_onnx.md | 34 +
.gitignore | 6
docs/index.rst | 12
egs_modelscope/asr/TEMPLATE/infer_after_finetune.py | 48 ++
funasr/runtime/python/benchmark_libtorch.md | 35 +
egs_modelscope/asr/TEMPLATE/finetune.py | 36 +
.github/workflows/main.yml | 7
docs/benchmark/benchmark_libtorch.md | 1
funasr/bin/lm_inference.py | 7
funasr/bin/asr_inference_paraformer_streaming.py | 2
docs/modescope_pipeline/quick_start.md | 72 +++
docs/images/dingding.jpg | 0
funasr/bin/vad_inference_launch.py | 5
funasr/bin/diar_inference_launch.py | 2
/dev/null | 4
funasr/runtime/python/onnxruntime/README.md | 116 +++++
funasr/bin/vad_inference_online.py | 3
docs/benchmark/benchmark_onnx.md | 1
funasr/bin/sv_inference.py | 3
docs/modescope_pipeline/sv_pipeline.md | 6
docs/modescope_pipeline/punc_pipeline.md | 6
funasr/bin/asr_inference_launch.py | 5
54 files changed, 968 insertions(+), 126 deletions(-)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2497ac2..75651b6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,10 +18,6 @@
with:
docs-folder: "docs/"
pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- - uses: ammaraskar/sphinx-action@master
- with:
- docs-folder: "docs_cn/"
- pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- name: deploy copy
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh'
@@ -31,9 +27,6 @@
mkdir public/en
touch public/en/.nojekyll
cp -r docs/_build/html/* public/en/
- mkdir public/cn
- touch public/cn/.nojekyll
- cp -r docs_cn/_build/html/* public/cn/
mkdir public/m2met2
touch public/m2met2/.nojekyll
cp -r docs_m2met2/_build/html/* public/m2met2/
diff --git a/.gitignore b/.gitignore
index 13d2fff..33b8c39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,8 @@
*.pyc
.eggs
MaaS-lib
-.gitignore
\ No newline at end of file
+.gitignore
+.egg*
+dist
+build
+funasr.egg-info
\ No newline at end of file
diff --git a/README.md b/README.md
index 03156f3..29ddd4a 100644
--- a/README.md
+++ b/README.md
@@ -12,13 +12,13 @@
[**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
| [**Highlights**](#highlights)
| [**Installation**](#installation)
-| [**Docs_EN**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
+| [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
| [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
| [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/modelscope_models.md)
| [**Contact**](#contact)
-
+|
[**M2MET2.0 Guidence_CN**](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)
| [**M2MET2.0 Guidence_EN**](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)
diff --git a/docs/FQA.md b/docs/FQA.md
new file mode 100644
index 0000000..46c5aa3
--- /dev/null
+++ b/docs/FQA.md
@@ -0,0 +1,22 @@
+# FQA
+
+## How to use VAD model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+## How to use Punctuation model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/238)
+
+## How to use Paraformer model for streaming by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+## How to use vad, asr and punc model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/278)
+
+## How to combine vad, asr, punc and nnlm models inside modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/134)
+
+## How to combine timestamp prediction model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/246)
+
+## How to switch decoding mode between online and offline for UniASR model
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_libtorch.md b/docs/benchmark/benchmark_libtorch.md
new file mode 120000
index 0000000..f1cd73c
--- /dev/null
+++ b/docs/benchmark/benchmark_libtorch.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_libtorch.md
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_onnx.md b/docs/benchmark/benchmark_onnx.md
new file mode 120000
index 0000000..14e2fbe
--- /dev/null
+++ b/docs/benchmark/benchmark_onnx.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_onnx.md
\ No newline at end of file
diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index aea2b06..6ac3ab8 100644
--- a/docs/images/dingding.jpg
+++ b/docs/images/dingding.jpg
Binary files differ
diff --git a/docs/index.rst b/docs/index.rst
index e5b9ab8..14c9525 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,6 +47,7 @@
./modescope_pipeline/punc_pipeline.md
./modescope_pipeline/tp_pipeline.md
./modescope_pipeline/sv_pipeline.md
+ ./modescope_pipeline/sd_pipeline.md
./modescope_pipeline/lm_pipeline.md
.. toctree::
@@ -63,11 +64,22 @@
.. toctree::
:maxdepth: 1
+ :caption: Benchmark and Leadboard
+
+ ./benchmark/benchmark_onnx.md
+ ./benchmark/benchmark_libtorch.md
+
+.. toctree::
+ :maxdepth: 1
:caption: Papers
./papers.md
+.. toctree::
+ :maxdepth: 1
+ :caption: FQA
+ ./FQA.md
Indices and tables
diff --git a/docs/modelscope_models.md b/docs/modelscope_models.md
index b35d625..3538ae0 100644
--- a/docs/modelscope_models.md
+++ b/docs/modelscope_models.md
@@ -80,7 +80,7 @@
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours) | 17.5M | 3465 | Xvector, speaker verification, Chinese |
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) | 61M | 6135 | Xvector, speaker verification, English |
-### Speaker diarization Models
+### Speaker Diarization Models
| Model Name | Training Data | Parameters | Notes |
|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
diff --git a/docs/modescope_pipeline/asr_pipeline.md b/docs/modescope_pipeline/asr_pipeline.md
index 3dc0bd0..8b6b24d 100644
--- a/docs/modescope_pipeline/asr_pipeline.md
+++ b/docs/modescope_pipeline/asr_pipeline.md
@@ -1,20 +1,196 @@
# Speech Recognition
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the Paraformer and Paraformer-online models as examples to demonstrate the usage.
+
## Inference
### Quick start
+#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+)
-#### Inference with multi-threads on CPU
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+#### [Paraformer-online model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
-#### Inference with multi GPU
+param_dict = {"cache": dict(), "is_final": False}
+chunk_stride = 7680# 480ms
+# first chunk, 480ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 480ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+#### [UniASR model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+```python
+decoding_model = "fast" # "fast", "normal", "offline"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
+ param_dict={"decoding_model": decoding_model})
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+The decoding mode of `fast` and `normal`
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
+#### [RNN-T-online model]()
+TODO
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.auto_speech_recognition`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+ - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - <strong>model:</strong> # model name on ModelScope
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+ - <strong>output_dir:</strong> # result dir
+ - <strong>batch_size:</strong> # batchsize of inference
+ - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
+ - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
+ - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
+
## Finetune with pipeline
### Quick start
+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+```python
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ from funasr.utils.modelscope_param import modelscope_args
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ params.output_dir = "./checkpoint" # path to save the finetuned model
+ params.data_path = "speech_asr_aishell1_trainsets" # data path; either a dataset already uploaded to ModelScope or a local dataset
+ params.dataset_type = "small" # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
+ params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is counted in fbank feature frames; if dataset_type="large", in milliseconds
+ params.max_epoch = 50 # maximum number of training epochs
+ params.lr = 0.00005 # learning rate
+
+ modelscope_finetune(params)
+```
+
+```shell
+python finetune.py &> log.txt &
+```
### Finetune with your data
-## Inference with your finetuned model
+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+ - <strong>output_dir:</strong> # result dir
+ - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+ - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+ - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
+ - <strong>max_epoch:</strong> # number of training epoch
+ - <strong>lr:</strong> # learning rate
+- Then you can run the pipeline to finetune with:
+```shell
+python finetune.py
+```
+If you want finetune with multi-GPUs, you could:
+```shell
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
+```
+## Inference with your finetuned model
+- Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py)
+ - <strong>modelscope_model_name: </strong> # model name on ModelScope
+ - <strong>output_dir:</strong> # result dir
+ - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+ - <strong>batch_size:</strong> # batchsize of inference
+
+- Then you can run the pipeline to finetune with:
+```python
+ python infer_after_finetune.py
+```
diff --git a/docs/modescope_pipeline/lm_pipeline.md b/docs/modescope_pipeline/lm_pipeline.md
index cb81871..c4090ec 100644
--- a/docs/modescope_pipeline/lm_pipeline.md
+++ b/docs/modescope_pipeline/lm_pipeline.md
@@ -1,10 +1,10 @@
-# Speech Recognition
+# Language Models
## Inference with pipeline
### Quick start
-#### Inference with you data
-#### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with your data
+### Inference with multi-threads on CPU
+### Inference with multi GPU
## Finetune with pipeline
### Quick start
diff --git a/docs/modescope_pipeline/punc_pipeline.md b/docs/modescope_pipeline/punc_pipeline.md
index 67ee695..a0203d7 100644
--- a/docs/modescope_pipeline/punc_pipeline.md
+++ b/docs/modescope_pipeline/punc_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/quick_start.md b/docs/modescope_pipeline/quick_start.md
index ab46a7c..b1614f5 100644
--- a/docs/modescope_pipeline/quick_start.md
+++ b/docs/modescope_pipeline/quick_start.md
@@ -59,8 +59,7 @@
inference_pipeline = pipeline(
task=Tasks.speech_timestamp,
- model='damo/speech_timestamp_prediction-v1-16k-offline',
- output_dir='./tmp')
+ model='damo/speech_timestamp_prediction-v1-16k-offline',)
rec_result = inference_pipeline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
@@ -86,6 +85,71 @@
# speaker verification
rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav','https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
print(rec_result["scores"][0])
+```
+
+### Speaker diarization
+#### SOND
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+ mode="sond_demo",
+ num_workers=0,
+ task=Tasks.speaker_diarization,
+ diar_model_config="sond.yaml",
+ model='damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch',
+ sv_model="damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch",
+ sv_model_revision="master",
+)
+
+audio_list=[
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
+]
+
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
+```
+
+### FAQ
+#### How to switch device from GPU to CPU with pipeline
+
+The pipeline defaults to decoding with GPU (`ngpu=1`) when GPU is available. If you want to switch to CPU, you could set `ngpu=0`
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+ ngpu=0,
+)
+```
+
+#### How to infer from local model path
+Download model to local dir, by modelscope-sdk
+
+```python
+from modelscope.hub.snapshot_download import snapshot_download
+
+local_dir_root = "./models_from_modelscope"
+model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', cache_dir=local_dir_root)
+```
+
+Or download model to local dir, by git lfs
+```shell
+git lfs install
+# git clone https://www.modelscope.cn/<namespace>/<model-name>.git
+git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
+```
+
+Infer with local model path
+```python
+local_dir_root = "./models_from_modelscope/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=local_dir_root,
+)
```
## Finetune with pipeline
@@ -132,6 +196,10 @@
```shell
python finetune.py &> log.txt &
```
+
+### FAQ
+### Multi GPUs training and distributed training
+
If you want finetune with multi-GPUs, you could:
```shell
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
diff --git a/docs/modescope_pipeline/sd_pipeline.md b/docs/modescope_pipeline/sd_pipeline.md
new file mode 100644
index 0000000..1330fe6
--- /dev/null
+++ b/docs/modescope_pipeline/sd_pipeline.md
@@ -0,0 +1,20 @@
+# Speaker Diarization
+
+## Inference with pipeline
+
+### Quick start
+
+### Inference with your data
+
+### Inference with multi-threads on CPU
+
+### Inference with multi GPU
+
+## Finetune with pipeline
+
+### Quick start
+
+### Finetune with your data
+
+## Inference with your finetuned model
+
diff --git a/docs/modescope_pipeline/sv_pipeline.md b/docs/modescope_pipeline/sv_pipeline.md
index 6ce8c6a..c57db38 100644
--- a/docs/modescope_pipeline/sv_pipeline.md
+++ b/docs/modescope_pipeline/sv_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/tp_pipeline.md b/docs/modescope_pipeline/tp_pipeline.md
index fad55e3..9b1719b 100644
--- a/docs/modescope_pipeline/tp_pipeline.md
+++ b/docs/modescope_pipeline/tp_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
index 5dcbe59..9d9b77a 100644
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -1,14 +1,107 @@
# Voice Activity Detection
-## Inference with pipeline
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the FSMN-VAD model as an example to demonstrate the usage.
+
+## Inference
### Quick start
+#### [FSMN-VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+)
-#### Inference with multi-threads on CPU
+segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
+#### [FSMN-VAD-online model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
-#### Inference with multi GPU
+param_dict = {"in_cache": dict(), "is_final": False}
+chunk_stride = 1600# 100ms
+# first chunk, 100ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 100ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.voice_activity_detection`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+ - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - <strong>model:</strong> # model name on ModelScope
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+ - <strong>output_dir:</strong> # result dir
+ - <strong>batch_size:</strong> # batchsize of inference
+ - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
+ - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
+ - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
+
## Finetune with pipeline
diff --git a/docs/recipe/sd_recipe.md b/docs/recipe/sd_recipe.md
new file mode 100644
index 0000000..90eb4b3
--- /dev/null
+++ b/docs/recipe/sd_recipe.md
@@ -0,0 +1,129 @@
+# Speaker Diarization
+Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
+
+## Overall Introduction
+We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
+- `CUDA_VISIBLE_DEVICES`: visible gpu list
+- `gpu_num`: the number of GPUs used for training
+- `gpu_inference`: whether to use GPUs for decoding
+- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
+- `data_aishell`: the raw path of AISHELL-1 dataset
+- `feats_dir`: the path for saving processed data
+- `nj`: the number of jobs for data preparation
+- `speed_perturb`: the speed perturbation factors applied to the audio (e.g., "0.9 1.0 1.1")
+- `exp_dir`: the path for saving experimental results
+- `tag`: the suffix of experimental result directory
+
+## Stage 0: Data preparation
+This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
+* `wav.scp`
+```
+BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
+BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
+...
+```
+* `text`
+```
+BAC009S0002W0122 鑰� 瀵� 妤� 甯� 鎴� 浜� 鎶� 鍒� 浣� 鐢� 鏈� 澶� 鐨� 闄� 璐�
+BAC009S0002W0123 涔� 鎴� 涓� 鍦� 鏂� 鏀� 搴� 鐨� 鐪� 涓� 閽�
+BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
+...
+```
+These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
+
+## Stage 1: Feature Generation
+This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
+* `feats.scp`
+```
+...
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+...
+```
+Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
+* `speech_shape`
+```
+...
+BAC009S0002W0122_sp0.9 665,80
+...
+```
+* `text_shape`
+```
+...
+BAC009S0002W0122_sp0.9 15
+...
+```
+These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
+
+## Stage 2: Dictionary Preparation
+This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
+* `tokens.txt`
+```
+<blank>
+<s>
+</s>
+涓�
+涓�
+...
+榫�
+榫�
+<unk>
+```
+* `<blank>`: indicates the blank token for CTC
+* `<s>`: indicates the start-of-sentence token
+* `</s>`: indicates the end-of-sentence token
+* `<unk>`: indicates the out-of-vocabulary token
+
+## Stage 3: Training
+This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
+
+* DDP Training
+
+We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
+
+* DataLoader
+
+We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it.
+
+* Configuration
+
+The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
+
+* Training Steps
+
+We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
+
+* Tensorboard
+
+Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
+```
+tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
+```
+
+## Stage 4: Decoding
+This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model.
+
+* Mode Selection
+
+As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
+
+* Configuration
+
+We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0<ctc_weight<1.0` indicates hybrid CTC-attention decoding.
+
+* CPU/GPU Decoding
+
+We support CPU and GPU decoding in FunASR. For CPU decoding, you should set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, you should set `gpu_inference=True`. You should also set `gpuid_list` to indicate which GPUs are used for decoding and `njob` to indicate the number of decoding jobs on each GPU.
+
+* Performance
+
+We adopt `CER` to verify the performance. The results are in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`, namely `text.cer` and `text.cer.txt`. `text.cer` saves the comparison between the recognized text and the reference text while `text.cer.txt` saves the final `CER` result. The following is an example of `text.cer`:
+* `text.cer`
+```
+...
+BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
+ref: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
+res: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
+...
+```
+
diff --git a/docs_cn/Makefile b/docs_cn/Makefile
deleted file mode 100644
index d58379b..0000000
--- a/docs_cn/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-SPHINXPROJ = FunASR
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs_cn/build_task.md b/docs_cn/build_task.md
deleted file mode 100644
index c23b19f..0000000
--- a/docs_cn/build_task.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# 鎼缓鑷畾涔変换鍔�
-FunASR绫讳技ESPNet锛屼互`Task`涓洪�氱敤鎺ュ彛锛屼粠鑰屽疄鐜版ā鍨嬬殑璁粌鍜屾帹鐞嗐�傛瘡涓�涓猔Task`鏄竴涓被锛屽叾闇�瑕佺户鎵縛AbsTask`锛屽叾瀵瑰簲鐨勫叿浣撲唬鐮佽`funasr/tasks/abs_task.py`銆備笅闈㈢粰鍑哄叾鍖呭惈鐨勪富瑕佸嚱鏁板強鍔熻兘浠嬬粛锛�
-```python
-class AbsTask(ABC):
- @classmethod
- def add_task_arguments(cls, parser: argparse.ArgumentParser):
- pass
-
- @classmethod
- def build_preprocess_fn(cls, args, train):
- (...)
-
- @classmethod
- def build_collate_fn(cls, args: argparse.Namespace):
- (...)
-
- @classmethod
- def build_model(cls, args):
- (...)
-
- @classmethod
- def main(cls, args):
- (...)
-```
-- add_task_arguments锛氭坊鍔犵壒瀹歚Task`闇�瑕佺殑鍙傛暟
-- build_preprocess_fn锛氬畾涔夊浣曞鐞嗗鏍锋湰杩涜棰勫鐞�
-- build_collate_fn锛氬畾涔夊浣曞皢澶氫釜鏍锋湰缁勬垚涓�涓猔batch`
-- build_model锛氬畾涔夋ā鍨�
-- main锛氳缁冨叆鍙o紝閫氳繃`Task.main()`鏉ュ惎鍔ㄨ缁�
-
-涓嬮潰鎴戜滑灏嗕互璇煶璇嗗埆浠诲姟涓轰緥锛屼粙缁嶅浣曞畾涔変竴涓柊鐨刞Task`锛屽叿浣撲唬鐮佽`funasr/tasks/asr.py`涓殑`ASRTask`銆� 瀹氫箟鏂扮殑`Task`鐨勮繃绋嬶紝鍏跺疄灏辨槸鏍规嵁浠诲姟闇�姹傦紝閲嶅畾涔変笂杩板嚱鏁扮殑杩囩▼銆�
-- add_task_arguments
-```python
-@classmethod
-def add_task_arguments(cls, parser: argparse.ArgumentParser):
- group = parser.add_argument_group(description="Task related")
- group.add_argument(
- "--token_list",
- type=str_or_none,
- default=None,
- help="A text mapping int-id to token",
- )
- (...)
-```
-瀵逛簬璇煶璇嗗埆浠诲姟锛岄渶瑕佺殑鐗瑰畾鍙傛暟鍖呮嫭`token_list`绛夈�傛牴鎹笉鍚屼换鍔$殑鐗瑰畾闇�姹傦紝鐢ㄦ埛鍙互鍦ㄦ鍑芥暟涓畾涔夌浉搴旂殑鍙傛暟銆�
-
-- build_preprocess_fn
-```python
-@classmethod
-def build_preprocess_fn(cls, args, train):
- if args.use_preprocessor:
- retval = CommonPreprocessor(
- train=train,
- token_type=args.token_type,
- token_list=args.token_list,
- bpemodel=args.bpemodel,
- non_linguistic_symbols=args.non_linguistic_symbols,
- text_cleaner=args.cleaner,
- ...
- )
- else:
- retval = None
- return retval
-```
-璇ュ嚱鏁板畾涔変簡濡備綍瀵规牱鏈繘琛岄澶勭悊銆傚叿浣撳湴锛岃闊宠瘑鍒换鍔$殑杈撳叆鍖呮嫭闊抽鍜屾妱鏈�傚浜庨煶棰戯紝鍦ㄦ瀹炵幇浜�(鍙��)瀵归煶棰戝姞鍣0锛屽姞娣峰搷绛夊姛鑳斤紱瀵逛簬鎶勬湰锛屽湪姝ゅ疄鐜颁簡(鍙��)鏍规嵁bpe澶勭悊鎶勬湰锛屽皢鎶勬湰鏄犲皠鎴恅tokenid`绛夊姛鑳姐�傜敤鎴峰彲浠ヨ嚜宸遍�夋嫨闇�瑕佸鏍锋湰杩涜鐨勯澶勭悊鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonPreprocessor`銆�
-
-- build_collate_fn
-```python
-@classmethod
-def build_collate_fn(cls, args, train):
- return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
-```
-璇ュ嚱鏁板畾涔変簡濡備綍灏嗗涓牱鏈粍鎴愪竴涓猔batch`銆傚浜庤闊宠瘑鍒换鍔★紝鍦ㄦ瀹炵幇鐨勬槸灏嗕笉鍚岀殑闊抽鍜屾妱鏈紝閫氳繃`padding`鐨勬柟寮忔潵寰楀埌绛夐暱鐨勬暟鎹�傚叿浣撳湴锛屾垜浠粯璁ょ敤`0.0`鏉ヤ綔涓洪煶棰戠殑濉厖鍊硷紝鐢╜-1`浣滀负鎶勬湰鐨勯粯璁ゅ~鍏呭�笺�傜敤鎴峰彲浠ュ湪姝ゅ畾涔変笉鍚岀殑缁刞batch`鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonCollateFn`銆�
-
-- build_model
-```python
-@classmethod
-def build_model(cls, args, train):
- with open(args.token_list, encoding="utf-8") as f:
- token_list = [line.rstrip() for line in f]
- vocab_size = len(token_list)
- frontend = frontend_class(**args.frontend_conf)
- specaug = specaug_class(**args.specaug_conf)
- normalize = normalize_class(**args.normalize_conf)
- preencoder = preencoder_class(**args.preencoder_conf)
- encoder = encoder_class(input_size=input_size, **args.encoder_conf)
- postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
- decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size, **args.decoder_conf)
- ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
- model = model_class(
- vocab_size=vocab_size,
- frontend=frontend,
- specaug=specaug,
- normalize=normalize,
- preencoder=preencoder,
- encoder=encoder,
- postencoder=postencoder,
- decoder=decoder,
- ctc=ctc,
- token_list=token_list,
- **args.model_conf,
- )
- return model
-```
-璇ュ嚱鏁板畾涔変簡鍏蜂綋鐨勬ā鍨嬨�傚浜庝笉鍚岀殑璇煶璇嗗埆妯″瀷锛屽線寰�鍙互鍏辩敤鍚屼竴涓闊宠瘑鍒玚Task`锛岄澶栭渶瑕佸仛鐨勬槸鍦ㄦ鍑芥暟涓畾涔夌壒瀹氱殑妯″瀷銆備緥濡傦紝杩欓噷缁欏嚭鐨勬槸涓�涓爣鍑嗙殑encoder-decoder缁撴瀯鐨勮闊宠瘑鍒ā鍨嬨�傚叿浣撳湴锛屽厛瀹氫箟璇ユā鍨嬬殑鍚勪釜妯″潡锛屽寘鎷琫ncoder锛宒ecoder绛夛紝鐒跺悗鍦ㄥ皢杩欎簺妯″潡缁勫悎鍦ㄤ竴璧峰緱鍒颁竴涓畬鏁寸殑妯″瀷銆傚湪FunASR涓紝妯″瀷闇�瑕佺户鎵縛AbsESPnetModel`锛屽叾鍏蜂綋浠g爜瑙乣funasr/train/abs_espnet_model.py`锛屼富瑕侀渶瑕佸疄鐜扮殑鏄痐forward`鍑芥暟銆�
-
-涓嬮潰鎴戜滑灏嗕互`SANMEncoder`涓轰緥锛屼粙缁嶅浣曞湪瀹氫箟妯″瀷鐨勬椂鍊欙紝浣跨敤鑷畾涔夌殑`encoder`鏉ヤ綔涓烘ā鍨嬬殑缁勬垚閮ㄥ垎锛屽叾鍏蜂綋鐨勪唬鐮佽`funasr/models/encoder/sanm_encoder.py`銆傚浜庤嚜瀹氫箟鐨刞encoder`锛岄櫎浜嗛渶瑕佺户鎵块�氱敤鐨刞encoder`绫籤AbsEncoder`澶栵紝杩橀渶瑕佽嚜瀹氫箟`forward`鍑芥暟锛屽疄鐜癭encoder`鐨勫墠鍚戣绠椼�傚湪瀹氫箟瀹宍encoder`鍚庯紝杩橀渶瑕佸湪`Task`涓鍏惰繘琛屾敞鍐岋紝涓嬮潰缁欏嚭浜嗙浉搴旂殑浠g爜绀轰緥锛�
-```python
-encoder_choices = ClassChoices(
- "encoder",
- classes=dict(
- conformer=ConformerEncoder,
- transformer=TransformerEncoder,
- rnn=RNNEncoder,
- sanm=SANMEncoder,
- sanm_chunk_opt=SANMEncoderChunkOpt,
- data2vec_encoder=Data2VecEncoder,
- mfcca_enc=MFCCAEncoder,
- ),
- type_check=AbsEncoder,
- default="rnn",
-)
-```
-鍙互鐪嬪埌锛宍sanm=SANMEncoder`灏嗘柊瀹氫箟鐨刞SANMEncoder`浣滀负浜哷encoder`鐨勪竴绉嶅彲閫夐」锛屽綋鐢ㄦ埛鍦ㄩ厤缃枃浠朵腑鎸囧畾`encoder`涓篳sanm`鏃讹紝鍗充細鐩稿簲鍦板皢`SANMEncoder`浣滀负妯″瀷鐨刞encoder`妯″潡銆�
\ No newline at end of file
diff --git a/docs_cn/conf.py b/docs_cn/conf.py
deleted file mode 100644
index 0189991..0000000
--- a/docs_cn/conf.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = 'FunASR'
-copyright = '2022, Speech Lab, Alibaba Group'
-author = 'Speech Lab, Alibaba Grou'
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- "nbsphinx",
- "sphinx.ext.autodoc",
- 'sphinx.ext.napoleon',
- 'sphinx.ext.viewcode',
- "sphinx.ext.mathjax",
- "sphinx.ext.todo",
- # "sphinxarg.ext",
- "sphinx_markdown_tables",
- 'recommonmark',
- 'sphinx_rtd_theme',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-source_suffix = [".rst", ".md"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-
-html_theme = "sphinx_rtd_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs_cn/get_started.md b/docs_cn/get_started.md
deleted file mode 100644
index 9e1c236..0000000
--- a/docs_cn/get_started.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# 蹇�熷紑濮�
-鍦ㄦ鎴戜滑灏嗕互"浣跨敤AISHELL-1鏁版嵁闆嗭紝浠庨殢鏈哄垵濮嬪寲璁粌涓�涓猵araformer妯″瀷"涓轰緥锛屼粙缁嶅浣曚娇鐢‵unASR銆傛牴鎹繖涓緥瀛愶紝鐢ㄦ埛鍙互绫讳技鍦颁娇鐢ㄥ埆鐨勬暟鎹泦锛堝AISHELL-2鏁版嵁闆嗙瓑锛夎缁冨埆鐨勬ā鍨嬶紙濡俢onformer锛宼ransformer绛夛級銆�
-
-## 鏁翠綋浠嬬粛
-
-鎴戜滑鎻愪緵浜哷egs/aishell/paraformer/run.sh`鏉ュ疄鐜颁娇鐢ˋISHELL-1鏁版嵁闆嗚缁冧竴涓猵araformer妯″瀷銆傝鑴氭湰鍖呭惈5涓樁娈碉紝鍖呮嫭浠庢暟鎹鐞嗗埌璁粌瑙g爜绛夋暣涓祦绋嬶紝鍚屾椂鎻愪緵浜嗗崟/澶欸PU璁粌鍜孋PU/GPU瑙g爜銆傚湪璇︾粏浠嬬粛姣忎釜闃舵涔嬪墠锛屾垜浠厛瀵圭敤鎴烽渶瑕佹墜鍔ㄨ缃殑涓�浜涘弬鏁拌繘琛岃鏄庛��
-- `CUDA_VISIBLE_DEVICES`: 鍙敤鐨凣PU鍒楄〃
-- `gpu_num`: 鐢ㄤ簬璁粌鐨凣PU鏁伴噺
-- `gpu_inference`: 鏄惁浣跨敤GPU杩涜瑙g爜
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU. 瀵逛簬CPU瑙g爜锛岃〃绀鸿В鐮佷换鍔℃暟锛涘浜嶨PU瑙g爜
-- `data_aishell`: AISHELL-1鍘熷鏁版嵁鐨勮矾寰�
-- `feats_dir`: 缁忚繃澶勭悊寰楀埌鐨勭壒寰佺殑淇濆瓨璺緞
-- `nj`: 鏁版嵁澶勭悊鏃剁殑骞惰浠诲姟鏁�
-- `speed_perturb`: 鍙橀�熻缃�
-- `exp_dir`: 瀹為獙缁撴灉鐨勪繚瀛樿矾寰�
-- `tag`: 瀹為獙缁撴灉鐩綍鐨勫悗缂�鍚�
-
-## 闃舵 0锛� 鏁版嵁鍑嗗
-鏈樁娈电敤浜庡鐞嗗師濮嬬殑AISHELL-1鏁版嵁锛屽苟鐢熸垚鐩稿簲鐨刞wav.scp`鍜宍text`锛屼繚瀛樺湪`$feats_dir/data/xxx`鐩綍涓嬶紝杩欓噷鐨刞xxx`琛ㄧず`train`, `dev` 鎴� `test`锛堜笅鍚岋級銆� 杩欓噷鎴戜滑鍋囪鐢ㄦ埛宸茬粡涓嬭浇濂戒簡AISHELL-1鏁版嵁闆嗐�傚鏋滄病鏈夛紝鐢ㄦ埛鍙互鍦╗杩欓噷](https://www.openslr.org/33/) 涓嬭浇鏁版嵁锛屽苟灏哷$data_aishell`璁剧疆涓虹浉搴旂殑璺緞銆備笅闈㈢粰鍑虹敓鎴愮殑`wav.scp`鍜宍text`鐨勭ず渚嬶細
-鏈樁娈电敤浜庡鐞嗗師濮嬬殑AISHELL-1鏁版嵁锛屽苟鐢熸垚鐩稿簲鐨刞wav.scp`鍜宍text`锛屼繚瀛樺湪`$feats_dir/data/xxx`鐩綍涓嬶紝杩欓噷鐨刞xxx`琛ㄧず`train`, `dev` 鎴� `test`锛堜笅鍚岋級銆� 杩欓噷鎴戜滑鍋囪鐢ㄦ埛宸茬粡涓嬭浇濂戒簡AISHELL-1鏁版嵁闆嗐�傚鏋滄病鏈夛紝鐢ㄦ埛鍙互鍦╗杩欓噷](https://www.openslr.org/33/) 涓嬭浇鏁版嵁锛屽苟灏哷$data_aishell`璁剧疆涓虹浉搴旂殑璺緞銆備笅闈㈢粰鍑虹敓鎴愮殑`wav.scp`鍜宍text`鐨勭ず渚嬶細
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 鑰� 瀵� 妤� 甯� 鎴� 浜� 鎶� 鍒� 浣� 鐢� 鏈� 澶� 鐨� 闄� 璐�
-BAC009S0002W0123 涔� 鎴� 涓� 鍦� 鏂� 鏀� 搴� 鐨� 鐪� 涓� 閽�
-BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
-...
-```
-鍙互鐪嬪埌锛岃繖涓や釜鏂囦欢鍧囧寘鎷袱鍒楋紝绗竴鍒楁槸闊抽鐨刬d锛岀浜屽垪鍒嗗埆鏄煶棰戣矾寰勫拰闊抽瀵瑰簲鐨勬妱鏈��
-
-## 闃舵 1锛氱壒寰佹彁鍙�
-鏈樁娈靛皢浼氬熀浜庡師濮嬬殑闊抽`wav.scp`鎻愬彇FBank鐗瑰緛銆傚鏋滄寚瀹氫簡鍙傛暟`speed_perturb`锛屽垯浼氶澶栧闊抽杩涜鍙橀�熸潵瀹炵幇鏁版嵁澧炲己銆傜敤鎴峰彲浠ヨ缃甡nj`鍙傛暟鏉ユ帶鍒剁壒寰佹彁鍙栫殑骞惰浠诲姟鏁般�傚鐞嗗悗鐨勭壒寰佷繚瀛樺湪鐩綍`$feats_dir/dump/xxx/ark`涓嬶紝鐩稿簲鐨刞feats.scp`鏂囦欢璺緞涓篳$feats_dir/dump/xxx/feats.scp`銆備笅闈㈢粰鍑篳feats.scp`鐨勭ず渚嬶細
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-娉ㄦ剰锛岃鏂囦欢鐨勬牱鏈『搴忓凡缁忚繘琛屼簡闅忔満鎵撲贡銆傝鏂囦欢鍖呮嫭涓ゅ垪锛岀涓�鍒楁槸闊抽鐨刬d锛岀浜屽垪鏄搴旂殑kaldi-ark鏍煎紡鐨勭壒寰併�傚彟澶栵紝鍦ㄦ闃舵杩樹細鐢熸垚璁粌闇�瑕佺敤鍒扮殑`speech_shape`鍜宍text_shape`涓や釜鏂囦欢锛岃褰曚簡姣忎釜鏍锋湰鐨勭壒寰佺淮搴﹀拰鎶勬湰闀垮害銆備笅闈㈢粰鍑鸿繖涓や釜鏂囦欢鐨勭ず渚嬶細
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-鍙互鐪嬪埌锛岃繖涓や釜鏂囦欢鍧囧寘鎷袱鍒楋紝绗竴鍒楁槸闊抽鐨刬d锛岀浜屽垪鏄搴旂殑鐗瑰緛鐨勭淮搴﹀拰鎶勬湰鐨勯暱搴︺��
-
-## 闃舵 2锛氬瓧鍏稿噯澶�
-鏈樁娈电敤浜庣敓鎴愬瓧鍏革紝鐢ㄤ簬璁粌杩囩▼涓紝瀛楃鍒版暣鏁扮储寮曚箣闂寸殑鏄犲皠銆傜敓鎴愮殑瀛楀吀鏂囦欢鐨勮矾寰勪负`$feats_dir/data/zh_toekn_list/char/tokens.txt`銆備笅闈㈢粰鍑篳tokens.txt`鐨勭ず渚嬶細
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-涓�
-涓�
-...
-榫�
-榫�
-<unk>
-```
-* `<blank>`: 琛ㄧずCTC璁粌涓殑blank
-* `<s>`: 琛ㄧず鍙ュ瓙鐨勮捣濮嬬
-* `</s>`: 琛ㄧず鍙ュ瓙鐨勭粓姝㈢
-* `<unk>`: 琛ㄧず瀛楀吀澶栫殑瀛楃
-
-## 闃舵 3锛氳缁�
-鏈樁娈靛搴旀ā鍨嬬殑璁粌銆傚湪寮�濮嬭缁冧箣鍓嶏紝闇�瑕佹寚瀹氬疄楠岀粨鏋滀繚瀛樼洰褰昤exp_dir`锛岃缁冨彲鐢℅PU`CUDA_VISIBLE_DEVICES`鍜岃缁冪殑gpu鏁伴噺`gpu_num`銆傞粯璁ゆ儏鍐典笅锛屾渶濂界殑`$keep_nbest_models`妯″瀷缁撴灉浼氳骞冲潎浠庤�屾潵鑾峰彇鏇村ソ鐨勬�ц兘銆�
-
-* DDP Training
-
-鎴戜滑鎻愪緵浜嗗垎甯冨紡璁粌锛圖DP锛夊姛鑳斤紝鍏蜂綋鐨勭粏鑺傚彲浠ュ湪[杩欓噷](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) 鎵惧埌銆備负浜嗗紑鍚垎甯冨紡璁粌锛岄渶瑕佽缃甡gpu_num`澶т簬1銆備緥濡傦紝璁剧疆`CUDA_VISIBLE_DEVICES=0,1,5,6,7`锛宍gpu_num=3`锛屽垯缂栧彿涓�0锛�1鍜�5鐨凣PU浼氳鐢ㄤ簬璁粌銆�
-
-* DataLoader
-
-鎴戜滑鎻愪緵浜嗗熀浜嶽Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) 瀹炵幇鐨勫ぇ鏁版嵁DataLoader锛岀敤鎴峰彲浠ラ�氳繃璁剧疆`dataset_type=large`鏉ュ惎鐢ㄣ��
-
-* Configuration
-
-璁粌鐩稿叧鐨勫弬鏁帮紝鍖呮嫭妯″瀷锛屼紭鍖栧櫒锛屾暟鎹瓑锛屽潎鍙互閫氳繃`conf`鐩綍涓嬬殑config鏂囦欢鎸囧畾銆傚悓鏃讹紝鐢ㄦ埛涔熷彲浠ョ洿鎺ュ湪`run.sh`鑴氭湰涓寚瀹氱浉鍏冲弬鏁般�傝閬垮厤鍦╟onfig鏂囦欢鍜宍run.sh`鑴氭湰涓缃浉鍚岀殑鍙傛暟锛屼互鍏嶉�犳垚姝т箟銆�
-
-* Training Steps
-
-鎴戜滑鎻愪緵浜嗕袱绉嶆柟寮忔潵鎺у埗璁粌鐨勬�绘鏁帮紝瀵瑰簲鐨勫弬鏁板垎鍒负`max_epoch`鍜宍max_update`銆俙max_epoch`琛ㄧず璁粌鐨勬渶澶poch鏁帮紝`max_update`琛ㄧず璁粌鐨勬渶澶ц凯浠f鏁般�傚鏋滆繖涓や釜鍙傛暟鍚屾椂琚寚瀹氾紝鍒欎竴鏃﹁缁冩鏁板埌杈惧叾涓换鎰忎竴涓弬鏁帮紝璁粌缁撴潫銆�
-
-* Tensorboard
-
-鐢ㄦ埛鍙互閫氳繃tensorboard鏉ヨ瀵熻缁冭繃绋嬩腑鐨勬崯澶憋紝瀛︿範鐜囩瓑銆傚彲浠ラ�氳繃涓嬭堪鎸囧畾鏉ュ疄鐜帮細
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## 闃舵 4: 瑙g爜
-鏈樁娈电敤浜庤В鐮佸緱鍒拌瘑鍒粨鏋滐紝鍚屾椂璁$畻CER鏉ラ獙璇佽缁冨緱鍒扮殑妯″瀷鎬ц兘銆�
-
-* Mode Selection
-
-鐢变簬鎴戜滑鎻愪緵浜唒araformer锛寀niasr鍜宑onformer绛夋ā鍨嬶紝鍥犳鍦ㄨВ鐮佹椂锛岄渶瑕佹寚瀹氱浉搴旂殑瑙g爜妯″紡銆傚搴旂殑鍙傛暟涓篳mode`锛岀浉搴旂殑鍙�夎缃负`asr/paraformer/uniasr`绛夈��
-
-* Configuration
-
-鎴戜滑鎻愪緵浜哻tc瑙g爜, attention瑙g爜鍜宑tc-attention娣峰悎瑙g爜銆傝繖鍑犵瑙g爜鏂瑰紡鍙互閫氳繃`conf`涓嬬殑瑙g爜閰嶇疆鏂囦欢涓殑`ctc_weight`鍙傛暟鏉ユ寚瀹氥�傚叿浣撶殑锛宍ctc_weight=1.0`琛ㄧずCTC瑙g爜, `ctc_weight=0.0`琛ㄧずattention瑙g爜, `0.0<ctc_weight<1.0`琛ㄧずctc-attention娣峰悎瑙g爜銆�
-
-* CPU/GPU Decoding
-
-鎴戜滑鎻愪緵CPU/GPU瑙g爜銆傚浜嶤PU瑙g爜锛岀敤鎴烽渶瑕佽缃甡gpu_inference=False`锛屽悓鏃惰缃甡njob`鏉ユ寚瀹氬苟琛岃В鐮佷换鍔℃暟閲忋�傚浜嶨PU瑙g爜锛岀敤鎴烽渶瑕佽缃甡gpu_inference=True`锛岃缃甡gpuid_list`鏉ユ寚瀹氬摢浜汫PU鐢ㄤ簬瑙g爜锛岃缃甡njobs`鏉ユ寚瀹氭瘡寮燝PU涓婄殑骞惰瑙g爜浠诲姟鏁伴噺銆�
-
-* Performance
-
-鎴戜滑閲囩敤`CER`鏉ラ獙璇佹ā鍨嬬殑鎬ц兘銆傝В鐮佺粨鏋滀繚瀛樺湪`$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`锛屽叿浣撳寘鎷琡text.cer`鍜宍text.cer.txt`涓や釜鏂囦欢銆俙text.cer`涓殑鍐呭涓鸿瘑鍒粨鏋滃拰瀵瑰簲鎶勬湰涔嬮棿鐨勬瘮杈冿紝`text.cer.txt`璁板綍浜嗘渶缁堢殑`CER`銆備笅闈㈢粰鍑篳text.cer`鐨勭ず渚�:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
-res: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
-...
-```
-
diff --git a/docs_cn/images/DeepScience.png b/docs_cn/images/DeepScience.png
deleted file mode 100644
index 9f46165..0000000
--- a/docs_cn/images/DeepScience.png
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/dingding.jpg b/docs_cn/images/dingding.jpg
deleted file mode 100644
index 4cdad28..0000000
--- a/docs_cn/images/dingding.jpg
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/funasr_logo.jpg b/docs_cn/images/funasr_logo.jpg
deleted file mode 100644
index a47243e..0000000
--- a/docs_cn/images/funasr_logo.jpg
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/wechat.png b/docs_cn/images/wechat.png
deleted file mode 100644
index e7b7349..0000000
--- a/docs_cn/images/wechat.png
+++ /dev/null
Binary files differ
diff --git a/docs_cn/index.rst b/docs_cn/index.rst
deleted file mode 100644
index 4a898e9..0000000
--- a/docs_cn/index.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. Funasr documentation master file, created by
- sphinx-quickstart on Tues Dec 6 19:05:00 2022.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-FunASR: A Fundamental End-to-End Speech Recognition Toolkit
-============================================================
-.. image:: ./images/funasr_logo.jpg
-
-FunASR鑷村姏浜庡湪璇煶璇嗗埆鐨勫鏈爺绌跺拰宸ヤ笟搴旂敤涔嬮棿鏋勫缓璧蜂竴搴фˉ姊併�傞�氳繃鍦� `ModelScope <https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition>`_ 涓婂彂甯冨伐涓氱骇璇煶璇嗗埆妯″瀷浠ュ強鏀寔鐩稿叧鐨勮缁冨拰寰皟锛岀爺绌惰�呭拰寮�鍙戣�呬滑鍙互鏇存柟渚垮湴杩涜璇煶璇嗗埆妯″瀷鐨勭爺绌跺拰鐢熶骇锛屼績杩涜闊宠瘑鍒敓鎬佺殑鍙戝睍銆侫SR for Fun!
-
-.. toctree::
- :maxdepth: 1
- :caption: 鏁欑▼:
-
- ./installation.md
- ./papers.md
- ./get_started.md
- ./build_task.md
-
-.. toctree::
- :maxdepth: 1
- :caption: ModelScope:
-
- ./modelscope_models.md
- ./modelscope_usages.md
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs_cn/installation.md b/docs_cn/installation.md
deleted file mode 100755
index a31bc01..0000000
--- a/docs_cn/installation.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# 瀹夎
-FunASR鐨勫畨瑁呭崄鍒嗕究鎹凤紝涓嬮潰灏嗙粰鍑鸿缁嗙殑瀹夎姝ラ锛�
-
-- 瀹夎Conda骞跺垱寤鸿櫄鎷熺幆澧�
-``` sh
-wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-sh Miniconda3-latest-Linux-x86_64.sh
-source ~/.bashrc
-conda create -n funasr python=3.7
-conda activate funasr
-```
-
-- 瀹夎Pytorch (鐗堟湰 >= 1.7.0):
-
-```sh
-pip install torch torchaudio
-```
-
-鍏充簬鏇村鐨勭増鏈�, 璇峰弬鐓� [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
-
-- 瀹夎 ModelScope
-
-瀵逛簬鍥藉唴鐢ㄦ埛锛屽彲浠ラ�氳繃閰嶇疆涓嬭堪闀滃儚婧愭潵鍔犲揩涓嬭浇閫熷害
-```sh
-pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
-```
-
-瀹夎鎴栨洿鏂癕odelScope
-``` sh
-pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-```
-
-- 涓嬭浇FunASR浠撳簱锛屽苟瀹夎鍓╀綑鎵�闇�渚濊禆
-``` sh
-git clone https://github.com/alibaba/FunASR.git && cd FunASR
-pip install --editable ./
-```
\ No newline at end of file
diff --git a/docs_cn/make.bat b/docs_cn/make.bat
deleted file mode 100644
index 747ffb7..0000000
--- a/docs_cn/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs_cn/modelscope_models.md b/docs_cn/modelscope_models.md
deleted file mode 100644
index 8501c1f..0000000
--- a/docs_cn/modelscope_models.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# ModelScope涓婄殑棰勮缁冩ā鍨�
-
-## 妯″瀷璁稿彲璇�
-- Apache License 2.0
-
-## 妯″瀷搴�
-杩欓噷鎴戜滑鎻愪緵浜嗕竴浜涘熀浜庝笉鍚屾暟鎹泦璁粌寰楀埌鐨勫嚑绉嶉璁粌妯″瀷锛屾墍鏈夌殑棰勮缁冩ā鍨嬪拰鏇村缁嗚妭鍙互鍙傝 [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 銆�
-
-| Datasets | Hours | Model | Online/Offline | Language | Framework | Checkpoint |
-|:-----:|:-----:|:--------------:|:--------------:| :---: | :---: | --- |
-| Alibaba Speech Data | 60000 | Paraformer | Offline | CN | Pytorch |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online](http://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN | Tensorflow |[speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Online | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Offline | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| AISHELL-1 | 178 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell1-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary) |
-| AISHELL-2 | 1000 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell2-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell2-pytorch/summary) |
-| AISHELL-1 | 178 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
-| AISHELL-1 | 178 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md
deleted file mode 100644
index c91de76..0000000
--- a/docs_cn/modelscope_usages.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# ModelScope 浣跨敤璇存槑
-ModelScope鏄樋閲屽反宸存帹鍑虹殑寮�婧愭ā鍨嬪嵆鏈嶅姟鍏变韩骞冲彴锛屼负骞垮ぇ瀛︽湳鐣岀敤鎴峰拰宸ヤ笟鐣岀敤鎴锋彁渚涚伒娲汇�佷究鎹风殑妯″瀷搴旂敤鏀寔銆傚叿浣撶殑浣跨敤鏂规硶鍜屽紑婧愭ā鍨嬪彲浠ュ弬瑙乕ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 銆傚湪璇煶鏂瑰悜锛屾垜浠彁渚涗簡鑷洖褰�/闈炶嚜鍥炲綊璇煶璇嗗埆锛岃闊抽璁粌锛屾爣鐐归娴嬬瓑妯″瀷锛岀敤鎴峰彲浠ユ柟渚夸娇鐢ㄣ��
-
-## 鏁翠綋浠嬬粛
-鎴戜滑鍦╜egs_modelscope` 鐩綍涓嬫彁渚涗簡涓嶅悓妯″瀷鐨勪娇鐢ㄦ柟娉曪紝鏀寔鐩存帴鐢ㄦ垜浠彁渚涚殑妯″瀷杩涜鎺ㄧ悊锛屽悓鏃朵篃鏀寔灏嗘垜浠彁渚涚殑妯″瀷浣滀负棰勮缁冨ソ鐨勫垵濮嬫ā鍨嬭繘琛屽井璋冦�備笅闈紝鎴戜滑灏嗕互`egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch`鐩綍涓彁渚涚殑妯″瀷鏉ヨ繘琛屼粙缁嶏紝鍖呮嫭`infer.py`锛宍finetune.py`鍜宍infer_after_finetune.py`锛屽搴旂殑鍔熻兘濡備笅锛�
-- `infer.py`: 鍩轰簬鎴戜滑鎻愪緵鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
-- `finetune.py`: 灏嗘垜浠彁渚涚殑妯″瀷浣滀负鍒濆妯″瀷杩涜寰皟
-- `infer_after_finetune.py`: 鍩轰簬寰皟寰楀埌鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
-
-## 妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer.py`鏉ュ疄鐜版ā鍨嬫帹鐞嗐�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷锛屽鎸囧畾鐨勬暟鎹泦杩涜鎺ㄧ悊锛屽緱鍒扮浉搴旂殑璇嗗埆缁撴灉銆傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠梎CER`銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
-* `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)锛屽叿浣撴牸寮忓彲浠ュ弬瑙乕蹇�熷紑濮媇(./get_started.md)涓殑璇存槑銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
-* `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
-* `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
-* `ctc_weight`锛氶儴鍒嗘ā鍨嬪寘鍚獵TC妯″潡锛屽彲浠ヨ缃鍙傛暟鏉ユ寚瀹氭帹鐞嗘椂锛孋TC妯″潡鐨勬潈閲�
-
-闄や簡鐩存帴鍦╜infer.py`涓缃弬鏁板锛岀敤鎴蜂篃鍙互閫氳繃鎵嬪姩淇敼妯″瀷涓嬭浇鐩綍涓嬬殑`decoding.yaml`鏂囦欢涓殑鍙傛暟鏉ヤ慨鏀规帹鐞嗛厤缃��
-
-## 妯″瀷寰皟
-鎴戜滑鎻愪緵浜哷finetune.py`鏉ュ疄鐜版ā鍨嬪井璋冦�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷浣滀负鍒濆妯″瀷锛屽湪鎸囧畾鐨勬暟鎹泦涓婅繘琛屽井璋冿紝浠庤�屽湪鐗瑰緛棰嗗煙鍙栧緱鏇村ソ鐨勬�ц兘銆傚湪寰皟寮�濮嬪墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼寰皟閰嶇疆锛�
-* `data_path`锛氭暟鎹洰褰曘�傝鐩綍涓嬪簲璇ュ寘鎷瓨鏀捐缁冮泦鏁版嵁鐨刞train`鐩綍鍜屽瓨鏀鹃獙璇侀泦鏁版嵁鐨刞dev`鐩綍銆傛瘡涓洰褰曚腑闇�瑕佸寘鎷煶棰戝垪琛ㄦ枃浠禶wav.scp`鍜屾妱鏈枃浠禶text`
-* `output_dir`锛氬井璋冪粨鏋滀繚瀛樼洰褰�
-* `dataset_type`锛氬浜庡皬鏁版嵁闆嗭紝璁剧疆涓篳small`锛涘綋鏁版嵁閲忓ぇ浜�1000灏忔椂鏃讹紝璁剧疆涓篳large`
-* `batch_bins`锛歜atch size锛屽鏋渄ataset_type璁剧疆涓篳small`锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛涘鏋渄ataset_type璁剧疆涓篳large`锛宐atch_bins鍗曚綅涓烘绉�
-* `max_epoch`锛氭渶澶х殑璁粌杞暟
-
-浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
-* `accum_grad`锛氭搴︾疮绉�
-* `keep_nbest_models`锛氶�夋嫨鎬ц兘鏈�濂界殑`keep_nbest_models`涓ā鍨嬬殑鍙傛暟杩涜骞冲潎锛屽緱鍒版�ц兘鏇村ソ鐨勬ā鍨�
-* `optim`锛氳缃紭鍖栧櫒
-* `lr`锛氳缃涔犵巼
-* `scheduler`锛氳缃涔犵巼璋冩暣绛栫暐
-* `scheduler_conf`锛氬涔犵巼璋冩暣绛栫暐鐨勭浉鍏冲弬鏁�
-* `specaug`锛氳缃氨澧炲箍
-* `specaug_conf`锛氳氨澧炲箍鐨勭浉鍏冲弬鏁�
-
-闄や簡鐩存帴鍦╜finetune.py`涓缃弬鏁板锛岀敤鎴蜂篃鍙互閫氳繃鎵嬪姩淇敼妯″瀷涓嬭浇鐩綍涓嬬殑`finetune.yaml`鏂囦欢涓殑鍙傛暟鏉ヤ慨鏀瑰井璋冮厤缃��
-
-## 鍩轰簬寰皟鍚庣殑妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer_after_finetune.py`鏉ュ疄鐜板熀浜庣敤鎴疯嚜宸卞井璋冨緱鍒扮殑妯″瀷杩涜鎺ㄧ悊銆傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庡井璋冨悗鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞嗭紝寰楀埌鐩稿簲鐨勮瘑鍒粨鏋溿�傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠桟ER銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
-* `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
-* `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
-* `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
-* `ctc_weight`锛氶儴鍒嗘ā鍨嬪寘鍚獵TC妯″潡锛屽彲浠ヨ缃鍙傛暟鏉ユ寚瀹氭帹鐞嗘椂锛孋TC妯″潡鐨勬潈閲�
-* `decoding_model_name`锛氭寚瀹氱敤浜庢帹鐞嗙殑妯″瀷鍚�
-
-浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
-* `modelscope_model_name`锛氬井璋冩椂浣跨敤鐨勫垵濮嬫ā鍨嬪悕
-* `required_files`锛氫娇鐢╩odelscope鎺ュ彛杩涜鎺ㄧ悊鏃堕渶瑕佺敤鍒扮殑鏂囦欢
-
-## 娉ㄦ剰浜嬮」
-閮ㄥ垎妯″瀷鍙兘鍦ㄥ井璋冦�佹帹鐞嗘椂瀛樺湪涓�浜涚壒鏈夌殑鍙傛暟锛岃繖閮ㄥ垎鍙傛暟鍙互鍦ㄥ搴旂洰褰曠殑`README.md`鏂囦欢涓壘鍒板叿浣撶敤娉曘��
\ No newline at end of file
diff --git a/docs_cn/papers.md b/docs_cn/papers.md
deleted file mode 100644
index 34a8150..0000000
--- a/docs_cn/papers.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# 璁烘枃
-
-- [Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model](https://arxiv.org/abs/2010.14099), arXiv preprint arXiv:2010.14099, 2020.
-- [Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition](https://arxiv.org/abs/2206.08317), INTERSPEECH 2022.
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/finetune.py b/egs_modelscope/asr/TEMPLATE/finetune.py
new file mode 100644
index 0000000..1935258
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/finetune.py
@@ -0,0 +1,36 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params.data_path)
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", data_path="./data")
+ params.output_dir = "./checkpoint" # m妯″瀷淇濆瓨璺緞
+ params.data_path = "./example_data/" # 鏁版嵁璺緞
+ params.dataset_type = "small" # 灏忔暟鎹噺璁剧疆small锛岃嫢鏁版嵁閲忓ぇ浜�1000灏忔椂锛岃浣跨敤large
+ params.batch_bins = 2000 # batch size锛屽鏋渄ataset_type="small"锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛屽鏋渄ataset_type="large"锛宐atch_bins鍗曚綅涓烘绉掞紝
+ params.max_epoch = 50 # 鏈�澶ц缁冭疆鏁�
+ params.lr = 0.00005 # 璁剧疆瀛︿範鐜�
+
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/TEMPLATE/infer.py b/egs_modelscope/asr/TEMPLATE/infer.py
new file mode 100644
index 0000000..9f280d5
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=args.model,
+ output_dir=args.output_dir,
+ batch_size=args.batch_size,
+ )
+ inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+ parser.add_argument('--output_dir', type=str, default="./results/")
+ parser.add_argument('--batch_size', type=int, default=64)
+ parser.add_argument('--gpuid', type=str, default="0")
+ args = parser.parse_args()
+ modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/infer.sh b/egs_modelscope/asr/TEMPLATE/infer.sh
new file mode 100644
index 0000000..b8b011c
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true # whether to perform gpu decoding
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+ nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+ nj=$njob
+ batch_size=1
+ gpuid_list=""
+ for JOB in $(seq ${nj}); do
+ gpuid_list=$gpuid_list"-1,"
+ done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+ split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+ echo "Decoding ..."
+ gpuid_list_array=(${gpuid_list//,/ })
+ for JOB in $(seq ${nj}); do
+ {
+ id=$((JOB-1))
+ gpuid=${gpuid_list_array[$id]}
+ mkdir -p ${output_dir}/output.$JOB
+ python infer.py \
+ --model ${model} \
+ --audio_in ${output_dir}/split/wav.$JOB.scp \
+ --output_dir ${output_dir}/output.$JOB \
+ --batch_size ${batch_size} \
+ --gpuid ${gpuid}
+ }&
+ done
+ wait
+
+ mkdir -p ${output_dir}/1best_recog
+ for f in token score text; do
+ if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+ for i in $(seq "${nj}"); do
+ cat "${output_dir}/output.${i}/1best_recog/${f}"
+ done | sort -k1 >"${output_dir}/1best_recog/${f}"
+ fi
+ done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+ echo "Computing WER ..."
+ cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+ cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+ python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+ tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+ echo "SpeechIO TIOBE textnorm"
+ echo "$0 --> Normalizing REF text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${data_dir}/text \
+ ${output_dir}/1best_recog/ref.txt
+
+ echo "$0 --> Normalizing HYP text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${output_dir}/1best_recog/text.proc \
+ ${output_dir}/1best_recog/rec.txt
+ grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+ echo "$0 --> computing WER/CER and alignment ..."
+ ./utils/error_rate_zh \
+ --tokenizer char \
+ --ref ${output_dir}/1best_recog/ref.txt \
+ --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+ ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+ rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
new file mode 100644
index 0000000..2d311dd
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
@@ -0,0 +1,48 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
+
+from funasr.utils.compute_wer import compute_wer
+
+def modelscope_infer_after_finetune(params):
+ # prepare for decoding
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
+ decoding_path = os.path.join(params["output_dir"], "decode_results")
+ if os.path.exists(decoding_path):
+ shutil.rmtree(decoding_path)
+ os.mkdir(decoding_path)
+
+ # decoding
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=pretrained_model_path,
+ output_dir=decoding_path,
+ batch_size=params["batch_size"]
+ )
+ audio_in = os.path.join(params["data_dir"], "wav.scp")
+ inference_pipeline(audio_in=audio_in)
+
+ # computer CER if GT text is set
+ text_in = os.path.join(params["data_dir"], "text")
+ if os.path.exists(text_in):
+ text_proc_file = os.path.join(decoding_path, "1best_recog/text")
+ compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
+
+
+if __name__ == '__main__':
+ params = {}
+ params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data/test"
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/utils b/egs_modelscope/asr/TEMPLATE/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
index b3bfe8e..8abadd7 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -7,7 +7,6 @@
from funasr.utils.compute_wer import compute_wer
-import pdb;
def modelscope_infer_core(output_dir, split_dir, njob, idx):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
gpu_id = (int(idx) - 1) // njob
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 79cc3c3..c740f71 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -23,21 +23,37 @@
- Setting parameters in `infer.sh`
- <strong>model:</strong> # model name on ModelScope
- - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
- <strong>output_dir:</strong> # result dir
- <strong>batch_size:</strong> # batchsize of inference
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
-- Then you can run the pipeline to infer with:
-```python
- sh infer.sh
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
```
- Results
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
index 221479d..b8b011c 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -14,8 +14,9 @@
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+. utils/parse_options.sh || exit 1;
-if ${gpu_inference}; then
+if [ "${gpu_inference}" == "true" ]; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index f3b4d56..4722602 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -346,6 +346,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 2b6716e..e10ebf4 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 6f3dbb1..e832869 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -472,6 +472,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 8cbd419..5546c92 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -612,7 +612,9 @@
**kwargs,
):
assert check_argument_types()
-
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
@@ -629,7 +631,9 @@
export_mode = param_dict.get("export_mode", False)
else:
hotword_list_or_file = None
-
+
+ if kwargs.get("device", None) == "cpu":
+ ngpu = 0
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 944685f..821f694 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -536,6 +536,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad.py b/funasr/bin/asr_inference_paraformer_vad.py
index 1548f9f..977dc9b 100644
--- a/funasr/bin/asr_inference_paraformer_vad.py
+++ b/funasr/bin/asr_inference_paraformer_vad.py
@@ -157,6 +157,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 9dc0b79..197930f 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -484,6 +484,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 4aea720..35ecdc2 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -379,6 +379,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 83436e8..07974c0 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 01d3f29..87816dd 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -158,6 +158,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/lm_inference.py b/funasr/bin/lm_inference.py
index 15c56ca..76de6df 100644
--- a/funasr/bin/lm_inference.py
+++ b/funasr/bin/lm_inference.py
@@ -89,10 +89,9 @@
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/lm_inference_launch.py b/funasr/bin/lm_inference_launch.py
index d229cc6..dc6414f 100644
--- a/funasr/bin/lm_inference_launch.py
+++ b/funasr/bin/lm_inference_launch.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
+
import argparse
import logging
diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py
index 2c5a286..b1d9235 100755
--- a/funasr/bin/punc_inference_launch.py
+++ b/funasr/bin/punc_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punctuation_infer_vadrealtime.py
index 5157eeb..b2db1bf 100644
--- a/funasr/bin/punctuation_infer_vadrealtime.py
+++ b/funasr/bin/punctuation_infer_vadrealtime.py
@@ -203,10 +203,8 @@
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index 5a0a8e2..c55bc35 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -252,6 +252,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index 7e63bbd..76b1dfb 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -179,6 +179,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference_launch.py b/funasr/bin/sv_inference_launch.py
index 64a3cff..8806070 100755
--- a/funasr/bin/sv_inference_launch.py
+++ b/funasr/bin/sv_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py
index 6360b17..df029fd 100644
--- a/funasr/bin/tp_inference.py
+++ b/funasr/bin/tp_inference.py
@@ -54,7 +54,7 @@
assert check_argument_types()
# 1. Build ASR model
tp_model, tp_train_args = ASRTask.build_model_from_file(
- timestamp_infer_config, timestamp_model_file, device
+ timestamp_infer_config, timestamp_model_file, device=device
)
if 'cuda' in device:
tp_model = tp_model.cuda() # force model to cuda
@@ -179,6 +179,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py
index 55debac..6cdff05 100644
--- a/funasr/bin/tp_inference_launch.py
+++ b/funasr/bin/tp_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 08d65a4..aff0a44 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -192,6 +192,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 8fea8db..4a1f334 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
index 9ed0721..4d02620 100644
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -151,6 +151,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/models/joint_net/__init__.py b/funasr/models/joint_net/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/funasr/models/joint_net/__init__.py
@@ -0,0 +1 @@
+
diff --git a/funasr/runtime/python/benchmark_libtorch.md b/funasr/runtime/python/benchmark_libtorch.md
index 6c068fe..52927b1 100644
--- a/funasr/runtime/python/benchmark_libtorch.md
+++ b/funasr/runtime/python/benchmark_libtorch.md
@@ -1,27 +1,32 @@
-# Benchmark
+# CPU Benchmark (Libtorch)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
+set the model, data path and output_dir
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
-
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/benchmark_onnx.md b/funasr/runtime/python/benchmark_onnx.md
index 533798a..9f92094 100644
--- a/funasr/runtime/python/benchmark_onnx.md
+++ b/funasr/runtime/python/benchmark_onnx.md
@@ -1,26 +1,32 @@
-# Benchmark
+# CPU Benchmark (ONNX)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
+set the model, data path and output_dir
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index e85e08a..3f4e762 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -35,22 +35,114 @@
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```
-## Run the demo
-- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
+## Inference with runtime
+
+### Speech Recognition
+#### Paraformer
+ ```python
+ from funasr_onnx import Paraformer
+
+ model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ model = Paraformer(model_dir, batch_size=1)
+
+ wav_path = ['./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+ result = model(wav_path)
+ print(result)
+ ```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- Input: wav formt file, support formats: `str, np.ndarray, List[str]`
-- Output: `List[str]`: recognition result.
-- Example:
- ```python
- from funasr_onnx import Paraformer
+- Output: `List[str]`: recognition result
- model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- model = Paraformer(model_dir, batch_size=1)
+#### Paraformer-online
- wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+### Voice Activity Detection
+#### FSMN-VAD
+```python
+from funasr_onnx import Fsmn_vad
- result = model(wav_path)
- print(result)
- ```
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad(model_dir)
+
+result = model(wav_path)
+print(result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, support formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: voice activity detection result
+
+#### FSMN-VAD-online
+```python
+from funasr_onnx import Fsmn_vad_online
+import soundfile
+
+
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad_online(model_dir)
+
+
+##online vad
+speech, sample_rate = soundfile.read(wav_path)
+speech_length = speech.shape[0]
+#
+sample_offset = 0
+step = 1600
+param_dict = {'in_cache': []}
+for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+ if sample_offset + step >= speech_length - 1:
+ step = speech_length - sample_offset
+ is_final = True
+ else:
+ is_final = False
+ param_dict['is_final'] = is_final
+ segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
+ param_dict=param_dict)
+ if segments_result:
+ print(segments_result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, support formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: voice activity detection result
+
+### Punctuation Restoration
+#### CT-Transformer
+```python
+from funasr_onnx import CT_Transformer
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model = CT_Transformer(model_dir)
+
+text_in="璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭"
+result = model(text_in)
+print(result[0])
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: text string, support format: `str`
+- Output: `List[str]`: punctuation restoration result
+
+#### CT-Transformer-online
+```python
+from funasr_onnx import CT_Transformer_VadRealtime
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
+model = CT_Transformer_VadRealtime(model_dir)
+
+text_in = "璺ㄥ娌虫祦鏄吇鑲叉部宀竱浜烘皯鐨勭敓鍛戒箣婧愰暱鏈熶互鏉ヤ负甯姪涓嬫父鍦板尯闃茬伨鍑忕伨涓柟鎶�鏈汉鍛榺鍦ㄤ笂娓稿湴鍖烘瀬涓烘伓鍔g殑鑷劧鏉′欢涓嬪厠鏈嶅法澶у洶闅剧敋鑷冲啋鐫�鐢熷懡鍗遍櫓|鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦>闂涓婄殑鍏冲垏|鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒秥鍑℃槸|涓柟鑳藉仛鐨勬垜浠瑋閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑|浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛瑙勫垝鍜岃璇佸吋椤句笂涓嬫父鐨勫埄鐩�"
+
+vads = text_in.split("|")
+rec_result_all=""
+param_dict = {"cache": []}
+for vad in vads:
+ result = model(vad, param_dict=param_dict)
+ rec_result_all += result[0]
+
+print(rec_result_all)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: text string, support format: `str`
+- Output: `List[str]`: punctuation restoration result
## Performance benchmark
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 9574a0d..9c4af41 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -186,9 +186,6 @@
logging.warning("No keep_nbest_models is given. Change to [1]")
trainer_options.keep_nbest_models = [1]
keep_nbest_models = trainer_options.keep_nbest_models
-
- #assert batch_interval is set and >0
- assert trainer_options.batch_interval > 0
output_dir = Path(trainer_options.output_dir)
reporter = Reporter()
--
Gitblit v1.9.1