From 5de9e75d587b752d8d1063cc7903c4571df99189 Mon Sep 17 00:00:00 2001
From: yhliang <68215459+yhliang-aslp@users.noreply.github.com>
Date: 星期四, 20 四月 2023 16:52:47 +0800
Subject: [PATCH] Merge pull request #389 from alibaba-damo-academy/main
---
docs/modescope_pipeline/vad_pipeline.md | 101 +++++
egs_modelscope/asr/TEMPLATE/infer.py | 25 +
funasr/bin/lm_inference_launch.py | 5
funasr/train/trainer.py | 3
docs/modescope_pipeline/lm_pipeline.md | 8
funasr/bin/asr_inference_paraformer.py | 8
docs/FQA.md | 22 +
docs/recipe/sd_recipe.md | 129 ++++++
funasr/bin/vad_inference.py | 3
funasr/bin/punctuation_infer_vadrealtime.py | 6
funasr/bin/tp_inference_launch.py | 4
funasr/bin/asr_inference_paraformer_vad_punc.py | 2
docs/modescope_pipeline/tp_pipeline.md | 6
funasr/bin/sond_inference.py | 2
funasr/bin/sv_inference_launch.py | 2
funasr/bin/asr_inference_mfcca.py | 2
funasr/bin/asr_inference_paraformer_vad.py | 2
funasr/bin/punc_inference_launch.py | 4
docs/modescope_pipeline/asr_pipeline.md | 184 +++++++++
funasr/bin/tp_inference.py | 5
egs_modelscope/asr/TEMPLATE/utils | 1
README.md | 4
docs/modescope_pipeline/sd_pipeline.md | 20 +
funasr/bin/asr_inference_uniasr.py | 2
funasr/bin/asr_inference.py | 2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md | 26 +
funasr/models/joint_net/__init__.py | 1
docs/modelscope_models.md | 2
funasr/bin/eend_ola_inference.py | 2
egs_modelscope/asr/TEMPLATE/infer.sh | 96 +++++
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh | 3
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py | 1
funasr/runtime/python/benchmark_onnx.md | 34 +
.gitignore | 6
docs/index.rst | 12
egs_modelscope/asr/TEMPLATE/infer_after_finetune.py | 48 ++
funasr/runtime/python/benchmark_libtorch.md | 35 +
egs_modelscope/asr/TEMPLATE/finetune.py | 36 +
.github/workflows/main.yml | 7
docs/benchmark/benchmark_libtorch.md | 1
funasr/bin/lm_inference.py | 7
funasr/bin/asr_inference_paraformer_streaming.py | 2
docs/modescope_pipeline/quick_start.md | 72 +++
docs/images/dingding.jpg | 0
funasr/bin/vad_inference_launch.py | 5
funasr/bin/diar_inference_launch.py | 2
/dev/null | 4
funasr/runtime/python/onnxruntime/README.md | 116 +++++
funasr/bin/vad_inference_online.py | 3
docs/benchmark/benchmark_onnx.md | 1
funasr/bin/sv_inference.py | 3
docs/modescope_pipeline/sv_pipeline.md | 6
docs/modescope_pipeline/punc_pipeline.md | 6
funasr/bin/asr_inference_launch.py | 5
54 files changed, 968 insertions(+), 126 deletions(-)
diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml
index 2497ac2..75651b6 100644
--- a/.github/workflows/main.yml
+++ b/.github/workflows/main.yml
@@ -18,10 +18,6 @@
with:
docs-folder: "docs/"
pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- - uses: ammaraskar/sphinx-action@master
- with:
- docs-folder: "docs_cn/"
- pre-build-command: "pip install sphinx-markdown-tables nbsphinx jinja2 recommonmark sphinx_rtd_theme"
- name: deploy copy
if: github.ref == 'refs/heads/main' || github.ref == 'refs/heads/dev_wjm' || github.ref == 'refs/heads/dev_lyh'
@@ -31,9 +27,6 @@
mkdir public/en
touch public/en/.nojekyll
cp -r docs/_build/html/* public/en/
- mkdir public/cn
- touch public/cn/.nojekyll
- cp -r docs_cn/_build/html/* public/cn/
mkdir public/m2met2
touch public/m2met2/.nojekyll
cp -r docs_m2met2/_build/html/* public/m2met2/
diff --git a/.gitignore b/.gitignore
index 13d2fff..33b8c39 100644
--- a/.gitignore
+++ b/.gitignore
@@ -12,4 +12,8 @@
*.pyc
.eggs
MaaS-lib
-.gitignore
\ No newline at end of file
+.gitignore
+.egg*
+dist
+build
+funasr.egg-info
\ No newline at end of file
diff --git a/README.md b/README.md
index 03156f3..29ddd4a 100644
--- a/README.md
+++ b/README.md
@@ -12,13 +12,13 @@
[**News**](https://github.com/alibaba-damo-academy/FunASR#whats-new)
| [**Highlights**](#highlights)
| [**Installation**](#installation)
-| [**Docs_EN**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
+| [**Docs**](https://alibaba-damo-academy.github.io/FunASR/en/index.html)
| [**Tutorial**](https://github.com/alibaba-damo-academy/FunASR/wiki#funasr%E7%94%A8%E6%88%B7%E6%89%8B%E5%86%8C)
| [**Papers**](https://github.com/alibaba-damo-academy/FunASR#citations)
| [**Runtime**](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime)
| [**Model Zoo**](https://github.com/alibaba-damo-academy/FunASR/blob/main/docs/modelscope_models.md)
| [**Contact**](#contact)
-
+|
[**M2MET2.0 Guidence_CN**](https://alibaba-damo-academy.github.io/FunASR/m2met2_cn/index.html)
| [**M2MET2.0 Guidence_EN**](https://alibaba-damo-academy.github.io/FunASR/m2met2/index.html)
diff --git a/docs/FQA.md b/docs/FQA.md
new file mode 100644
index 0000000..46c5aa3
--- /dev/null
+++ b/docs/FQA.md
@@ -0,0 +1,22 @@
+# FQA
+
+## How to use VAD model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+## How to use Punctuation model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/238)
+
+## How to use Paraformer model for streaming by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+## How to use vad, asr and punc model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/278)
+
+## How to combine vad, asr, punc and nnlm models inside modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/134)
+
+## How to combine timestamp prediction model by modelscope pipeline
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/246)
+
+## How to switch decoding mode between online and offline for UniASR model
+Ref to [docs](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_libtorch.md b/docs/benchmark/benchmark_libtorch.md
new file mode 120000
index 0000000..f1cd73c
--- /dev/null
+++ b/docs/benchmark/benchmark_libtorch.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_libtorch.md
\ No newline at end of file
diff --git a/docs/benchmark/benchmark_onnx.md b/docs/benchmark/benchmark_onnx.md
new file mode 120000
index 0000000..14e2fbe
--- /dev/null
+++ b/docs/benchmark/benchmark_onnx.md
@@ -0,0 +1 @@
+../../funasr/runtime/python/benchmark_onnx.md
\ No newline at end of file
diff --git a/docs/images/dingding.jpg b/docs/images/dingding.jpg
index aea2b06..6ac3ab8 100644
--- a/docs/images/dingding.jpg
+++ b/docs/images/dingding.jpg
Binary files differ
diff --git a/docs/index.rst b/docs/index.rst
index e5b9ab8..14c9525 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -47,6 +47,7 @@
./modescope_pipeline/punc_pipeline.md
./modescope_pipeline/tp_pipeline.md
./modescope_pipeline/sv_pipeline.md
+ ./modescope_pipeline/sd_pipeline.md
./modescope_pipeline/lm_pipeline.md
.. toctree::
@@ -63,11 +64,22 @@
.. toctree::
:maxdepth: 1
+ :caption: Benchmark and Leadboard
+
+ ./benchmark/benchmark_onnx.md
+ ./benchmark/benchmark_libtorch.md
+
+.. toctree::
+ :maxdepth: 1
:caption: Papers
./papers.md
+.. toctree::
+ :maxdepth: 1
+ :caption: FQA
+ ./FQA.md
Indices and tables
diff --git a/docs/modelscope_models.md b/docs/modelscope_models.md
index b35d625..3538ae0 100644
--- a/docs/modelscope_models.md
+++ b/docs/modelscope_models.md
@@ -80,7 +80,7 @@
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary) | CNCeleb (1,200 hours) | 17.5M | 3465 | Xvector, speaker verification, Chinese |
| [Xvector](https://www.modelscope.cn/models/damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch/summary) | CallHome (60 hours) | 61M | 6135 | Xvector, speaker verification, English |
-### Speaker diarization Models
+### Speaker Diarization Models
| Model Name | Training Data | Parameters | Notes |
|:----------------------------------------------------------------------------------------------------------------:|:-------------------:|:----------:|:------|
diff --git a/docs/modescope_pipeline/asr_pipeline.md b/docs/modescope_pipeline/asr_pipeline.md
index 3dc0bd0..8b6b24d 100644
--- a/docs/modescope_pipeline/asr_pipeline.md
+++ b/docs/modescope_pipeline/asr_pipeline.md
@@ -1,20 +1,196 @@
# Speech Recognition
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the Paraformer and Paraformer-online models as examples to demonstrate the usage.
+
## Inference
### Quick start
+#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+)
-#### Inference with multi-threads on CPU
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+#### [Paraformer-online model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
-#### Inference with multi GPU
+param_dict = {"cache": dict(), "is_final": False}
+chunk_stride = 7680# 480ms
+# first chunk, 480ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 480ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+#### [UniASR model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+```python
+decoding_model = "fast" # "fast", "normal", "offline"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
+ param_dict={"decoding_model": decoding_model})
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+The decoding mode of `fast` and `normal`
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
+#### [RNN-T-online model]()
+TODO
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.auto_speech_recognition`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+ - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - <strong>model:</strong> # model name on ModelScope
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+ - <strong>output_dir:</strong> # result dir
+ - <strong>batch_size:</strong> # batchsize of inference
+ - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
+ - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
+ - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
+
## Finetune with pipeline
### Quick start
+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+```python
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ from funasr.utils.modelscope_param import modelscope_args
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ params.output_dir = "./checkpoint" # path to save the finetuned model
+ params.data_path = "speech_asr_aishell1_trainsets" # data path; either a dataset already uploaded to ModelScope or a local dataset
+ params.dataset_type = "small" # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
+ params.batch_bins = 2000 # batch size; if dataset_type="small", batch_bins is counted in fbank feature frames; if dataset_type="large", in milliseconds
+ params.max_epoch = 50 # maximum number of training epochs
+ params.lr = 0.00005 # learning rate
+
+ modelscope_finetune(params)
+```
+
+```shell
+python finetune.py &> log.txt &
+```
### Finetune with your data
-## Inference with your finetuned model
+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+ - <strong>output_dir:</strong> # result dir
+ - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+ - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+ - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
+ - <strong>max_epoch:</strong> # number of training epoch
+ - <strong>lr:</strong> # learning rate
+- Then you can run the pipeline to finetune with:
+```shell
+python finetune.py
+```
+If you want finetune with multi-GPUs, you could:
+```shell
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
+```
+## Inference with your finetuned model
+- Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py)
+ - <strong>modelscope_model_name: </strong> # model name on ModelScope
+ - <strong>output_dir:</strong> # result dir
+ - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+ - <strong>batch_size:</strong> # batchsize of inference
+
+- Then you can run the pipeline to finetune with:
+```python
+ python infer_after_finetune.py
+```
diff --git a/docs/modescope_pipeline/lm_pipeline.md b/docs/modescope_pipeline/lm_pipeline.md
index cb81871..c4090ec 100644
--- a/docs/modescope_pipeline/lm_pipeline.md
+++ b/docs/modescope_pipeline/lm_pipeline.md
@@ -1,10 +1,10 @@
-# Speech Recognition
+# Language Models
## Inference with pipeline
### Quick start
-#### Inference with you data
-#### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with your data
+### Inference with multi-threads on CPU
+### Inference with multi GPU
## Finetune with pipeline
### Quick start
diff --git a/docs/modescope_pipeline/punc_pipeline.md b/docs/modescope_pipeline/punc_pipeline.md
index 67ee695..a0203d7 100644
--- a/docs/modescope_pipeline/punc_pipeline.md
+++ b/docs/modescope_pipeline/punc_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/quick_start.md b/docs/modescope_pipeline/quick_start.md
index ab46a7c..b1614f5 100644
--- a/docs/modescope_pipeline/quick_start.md
+++ b/docs/modescope_pipeline/quick_start.md
@@ -59,8 +59,7 @@
inference_pipeline = pipeline(
task=Tasks.speech_timestamp,
- model='damo/speech_timestamp_prediction-v1-16k-offline',
- output_dir='./tmp')
+ model='damo/speech_timestamp_prediction-v1-16k-offline',)
rec_result = inference_pipeline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
@@ -86,6 +85,71 @@
# speaker verification
rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav','https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
print(rec_result["scores"][0])
+```
+
+### Speaker diarization
+#### SOND
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+ mode="sond_demo",
+ num_workers=0,
+ task=Tasks.speaker_diarization,
+ diar_model_config="sond.yaml",
+ model='damo/speech_diarization_sond-en-us-callhome-8k-n16k4-pytorch',
+ sv_model="damo/speech_xvector_sv-en-us-callhome-8k-spk6135-pytorch",
+ sv_model_revision="master",
+)
+
+audio_list=[
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
+ "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
+]
+
+results = inference_diar_pipline(audio_in=audio_list)
+print(results)
+```
+
+### FAQ
+#### How to switch device from GPU to CPU with pipeline
+
+The pipeline defaults to decoding with GPU (`ngpu=1`) when GPU is available. If you want to switch to CPU, you could set `ngpu=0`
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+ ngpu=0,
+)
+```
+
+#### How to infer from local model path
+Download model to local dir, by modelscope-sdk
+
+```python
+from modelscope.hub.snapshot_download import snapshot_download
+
+local_dir_root = "./models_from_modelscope"
+model_dir = snapshot_download('damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch', cache_dir=local_dir_root)
+```
+
+Or download model to local dir, by git lfs
+```shell
+git lfs install
+# git clone https://www.modelscope.cn/<namespace>/<model-name>.git
+git clone https://www.modelscope.cn/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch.git
+```
+
+Infer with local model path
+```python
+local_dir_root = "./models_from_modelscope/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=local_dir_root,
+)
```
## Finetune with pipeline
@@ -132,6 +196,10 @@
```shell
python finetune.py &> log.txt &
```
+
+### FAQ
+### Multi GPUs training and distributed training
+
If you want finetune with multi-GPUs, you could:
```shell
CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
diff --git a/docs/modescope_pipeline/sd_pipeline.md b/docs/modescope_pipeline/sd_pipeline.md
new file mode 100644
index 0000000..1330fe6
--- /dev/null
+++ b/docs/modescope_pipeline/sd_pipeline.md
@@ -0,0 +1,20 @@
+# Speaker Diarization
+
+## Inference with pipeline
+
+### Quick start
+
+### Inference with your data
+
+### Inference with multi-threads on CPU
+
+### Inference with multi GPU
+
+## Finetune with pipeline
+
+### Quick start
+
+### Finetune with your data
+
+## Inference with your finetuned model
+
diff --git a/docs/modescope_pipeline/sv_pipeline.md b/docs/modescope_pipeline/sv_pipeline.md
index 6ce8c6a..c57db38 100644
--- a/docs/modescope_pipeline/sv_pipeline.md
+++ b/docs/modescope_pipeline/sv_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/tp_pipeline.md b/docs/modescope_pipeline/tp_pipeline.md
index fad55e3..9b1719b 100644
--- a/docs/modescope_pipeline/tp_pipeline.md
+++ b/docs/modescope_pipeline/tp_pipeline.md
@@ -4,11 +4,11 @@
### Quick start
-#### Inference with you data
+### Inference with your data
-#### Inference with multi-threads on CPU
+### Inference with multi-threads on CPU
-#### Inference with multi GPU
+### Inference with multi GPU
## Finetune with pipeline
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
index 5dcbe59..9d9b77a 100644
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -1,14 +1,107 @@
# Voice Activity Detection
-## Inference with pipeline
+> **Note**:
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take the FSMN-VAD model as an example to demonstrate the usage.
+
+## Inference
### Quick start
+#### [FSMN-VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
-#### Inference with you data
+inference_pipeline = pipeline(
+ task=Tasks.voice_activity_detection,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+)
-#### Inference with multi-threads on CPU
+segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
+print(segments_result)
+```
+#### [FSMN-VAD-online model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary)
+```python
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+ )
+import soundfile
+speech, sample_rate = soundfile.read("example/asr_example.wav")
-#### Inference with multi GPU
+param_dict = {"in_cache": dict(), "is_final": False}
+chunk_stride = 1600# 100ms
+# first chunk, 100ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 100ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
+Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
+
+
+#### API-reference
+##### define pipeline
+- `task`: `Tasks.voice_activity_detection`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: 1 (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: 1 (Default), sets the number of threads used for intraop parallelism on CPU
+- `output_dir`: None (Default), the output path of results if set
+- `batch_size`: 1 (Default), batch size when decoding
+##### infer pipeline
+- `audio_in`: the input to decode, which could be:
+ - wav_path, `e.g.`: asr_example.wav,
+ - pcm_path, `e.g.`: asr_example.pcm,
+ - audio bytes stream, `e.g.`: bytes data from a microphone
+ - audio sample point, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
+ - wav.scp, kaldi style wav list (`wav_id \t wav_path`), `e.g.`:
+ ```cat wav.scp
+ asr_example1 ./audios/asr_example1.wav
+ asr_example2 ./audios/asr_example2.wav
+ ```
+ In this case of `wav.scp` input, `output_dir` must be set to save the output results
+- `audio_fs`: audio sampling rate, only set when audio_in is pcm audio
+- `output_dir`: None (Default), the output path of results if set
+
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+
+- Setting parameters in `infer.sh`
+ - <strong>model:</strong> # model name on ModelScope
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+ - <strong>output_dir:</strong> # result dir
+ - <strong>batch_size:</strong> # batchsize of inference
+ - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
+ - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
+ - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
+```
+
+- Results
+
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
+
## Finetune with pipeline
diff --git a/docs/recipe/sd_recipe.md b/docs/recipe/sd_recipe.md
new file mode 100644
index 0000000..90eb4b3
--- /dev/null
+++ b/docs/recipe/sd_recipe.md
@@ -0,0 +1,129 @@
+# Speaker Diarization
+Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
+
+## Overall Introduction
+We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
+- `CUDA_VISIBLE_DEVICES`: visible gpu list
+- `gpu_num`: the number of GPUs used for training
+- `gpu_inference`: whether to use GPUs for decoding
+- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
+- `data_aishell`: the raw path of AISHELL-1 dataset
+- `feats_dir`: the path for saving processed data
+- `nj`: the number of jobs for data preparation
+- `speed_perturb`: the speed perturbation factors applied to the audio (e.g., "0.9 1.0 1.1")
+- `exp_dir`: the path for saving experimental results
+- `tag`: the suffix of experimental result directory
+
+## Stage 0: Data preparation
+This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
+* `wav.scp`
+```
+BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
+BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
+...
+```
+* `text`
+```
+BAC009S0002W0122 鑰� 瀵� 妤� 甯� 鎴� 浜� 鎶� 鍒� 浣� 鐢� 鏈� 澶� 鐨� 闄� 璐�
+BAC009S0002W0123 涔� 鎴� 涓� 鍦� 鏂� 鏀� 搴� 鐨� 鐪� 涓� 閽�
+BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
+...
+```
+These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
+
+## Stage 1: Feature Generation
+This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
+* `feats.scp`
+```
+...
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+...
+```
+Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
+* `speech_shape`
+```
+...
+BAC009S0002W0122_sp0.9 665,80
+...
+```
+* `text_shape`
+```
+...
+BAC009S0002W0122_sp0.9 15
+...
+```
+These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
+
+## Stage 2: Dictionary Preparation
+This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
+* `tokens.txt`
+```
+<blank>
+<s>
+</s>
+涓�
+涓�
+...
+榫�
+榫�
+<unk>
+```
+* `<blank>`: indicates the blank token for CTC
+* `<s>`: indicates the start-of-sentence token
+* `</s>`: indicates the end-of-sentence token
+* `<unk>`: indicates the out-of-vocabulary token
+
+## Stage 3: Training
+This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
+
+* DDP Training
+
+We support the DistributedDataParallel (DDP) training and the detail can be found [here](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html). To enable DDP training, please set `gpu_num` greater than 1. For example, if you set `CUDA_VISIBLE_DEVICES=0,1,5,6,7` and `gpu_num=3`, then the gpus with ids 0, 1 and 5 will be used for training.
+
+* DataLoader
+
+We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it.
+
+* Configuration
+
+The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
+
+* Training Steps
+
+We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
+
+* Tensorboard
+
+Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
+```
+tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
+```
+
+## Stage 4: Decoding
+This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model.
+
+* Mode Selection
+
+As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
+
+* Configuration
+
+We support CTC decoding, attention decoding and hybrid CTC-attention decoding in FunASR, which can be specified by `ctc_weight` in a YAML file in `conf` directory. Specifically, `ctc_weight=1.0` indicates CTC decoding, `ctc_weight=0.0` indicates attention decoding, `0.0<ctc_weight<1.0` indicates hybrid CTC-attention decoding.
+
+* CPU/GPU Decoding
+
+We support CPU and GPU decoding in FunASR. For CPU decoding, you should set `gpu_inference=False` and set `njob` to specify the total number of CPU decoding jobs. For GPU decoding, you should set `gpu_inference=True`. You should also set `gpuid_list` to indicate which GPUs are used for decoding and `njob` to indicate the number of decoding jobs on each GPU.
+
+* Performance
+
+We adopt `CER` to verify the performance. The results are in `$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`, namely `text.cer` and `text.cer.txt`. `text.cer` saves the comparison between the recognized text and the reference text while `text.cer.txt` saves the final `CER` result. The following is an example of `text.cer`:
+* `text.cer`
+```
+...
+BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
+ref: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
+res: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
+...
+```
+
diff --git a/docs_cn/Makefile b/docs_cn/Makefile
deleted file mode 100644
index d58379b..0000000
--- a/docs_cn/Makefile
+++ /dev/null
@@ -1,21 +0,0 @@
-# Minimal makefile for Sphinx documentation
-#
-
-# You can set these variables from the command line, and also
-# from the environment for the first two.
-SPHINXOPTS =
-SPHINXBUILD = sphinx-build
-SPHINXPROJ = FunASR
-SOURCEDIR = .
-BUILDDIR = _build
-
-# Put it first so that "make" without argument is like "make help".
-help:
- @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
-
-.PHONY: help Makefile
-
-# Catch-all target: route all unknown targets to Sphinx using the new
-# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
-%: Makefile
- @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
\ No newline at end of file
diff --git a/docs_cn/build_task.md b/docs_cn/build_task.md
deleted file mode 100644
index c23b19f..0000000
--- a/docs_cn/build_task.md
+++ /dev/null
@@ -1,124 +0,0 @@
-# 鎼缓鑷畾涔変换鍔�
-FunASR绫讳技ESPNet锛屼互`Task`涓洪�氱敤鎺ュ彛锛屼粠鑰屽疄鐜版ā鍨嬬殑璁粌鍜屾帹鐞嗐�傛瘡涓�涓猔Task`鏄竴涓被锛屽叾闇�瑕佺户鎵縛AbsTask`锛屽叾瀵瑰簲鐨勫叿浣撲唬鐮佽`funasr/tasks/abs_task.py`銆備笅闈㈢粰鍑哄叾鍖呭惈鐨勪富瑕佸嚱鏁板強鍔熻兘浠嬬粛锛�
-```python
-class AbsTask(ABC):
- @classmethod
- def add_task_arguments(cls, parser: argparse.ArgumentParser):
- pass
-
- @classmethod
- def build_preprocess_fn(cls, args, train):
- (...)
-
- @classmethod
- def build_collate_fn(cls, args: argparse.Namespace):
- (...)
-
- @classmethod
- def build_model(cls, args):
- (...)
-
- @classmethod
- def main(cls, args):
- (...)
-```
-- add_task_arguments锛氭坊鍔犵壒瀹歚Task`闇�瑕佺殑鍙傛暟
-- build_preprocess_fn锛氬畾涔夊浣曞鐞嗗鏍锋湰杩涜棰勫鐞�
-- build_collate_fn锛氬畾涔夊浣曞皢澶氫釜鏍锋湰缁勬垚涓�涓猔batch`
-- build_model锛氬畾涔夋ā鍨�
-- main锛氳缁冨叆鍙o紝閫氳繃`Task.main()`鏉ュ惎鍔ㄨ缁�
-
-涓嬮潰鎴戜滑灏嗕互璇煶璇嗗埆浠诲姟涓轰緥锛屼粙缁嶅浣曞畾涔変竴涓柊鐨刞Task`锛屽叿浣撲唬鐮佽`funasr/tasks/asr.py`涓殑`ASRTask`銆� 瀹氫箟鏂扮殑`Task`鐨勮繃绋嬶紝鍏跺疄灏辨槸鏍规嵁浠诲姟闇�姹傦紝閲嶅畾涔変笂杩板嚱鏁扮殑杩囩▼銆�
-- add_task_arguments
-```python
-@classmethod
-def add_task_arguments(cls, parser: argparse.ArgumentParser):
- group = parser.add_argument_group(description="Task related")
- group.add_argument(
- "--token_list",
- type=str_or_none,
- default=None,
- help="A text mapping int-id to token",
- )
- (...)
-```
-瀵逛簬璇煶璇嗗埆浠诲姟锛岄渶瑕佺殑鐗瑰畾鍙傛暟鍖呮嫭`token_list`绛夈�傛牴鎹笉鍚屼换鍔$殑鐗瑰畾闇�姹傦紝鐢ㄦ埛鍙互鍦ㄦ鍑芥暟涓畾涔夌浉搴旂殑鍙傛暟銆�
-
-- build_preprocess_fn
-```python
-@classmethod
-def build_preprocess_fn(cls, args, train):
- if args.use_preprocessor:
- retval = CommonPreprocessor(
- train=train,
- token_type=args.token_type,
- token_list=args.token_list,
- bpemodel=args.bpemodel,
- non_linguistic_symbols=args.non_linguistic_symbols,
- text_cleaner=args.cleaner,
- ...
- )
- else:
- retval = None
- return retval
-```
-璇ュ嚱鏁板畾涔変簡濡備綍瀵规牱鏈繘琛岄澶勭悊銆傚叿浣撳湴锛岃闊宠瘑鍒换鍔$殑杈撳叆鍖呮嫭闊抽鍜屾妱鏈�傚浜庨煶棰戯紝鍦ㄦ瀹炵幇浜�(鍙��)瀵归煶棰戝姞鍣0锛屽姞娣峰搷绛夊姛鑳斤紱瀵逛簬鎶勬湰锛屽湪姝ゅ疄鐜颁簡(鍙��)鏍规嵁bpe澶勭悊鎶勬湰锛屽皢鎶勬湰鏄犲皠鎴恅tokenid`绛夊姛鑳姐�傜敤鎴峰彲浠ヨ嚜宸遍�夋嫨闇�瑕佸鏍锋湰杩涜鐨勯澶勭悊鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonPreprocessor`銆�
-
-- build_collate_fn
-```python
-@classmethod
-def build_collate_fn(cls, args, train):
- return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
-```
-璇ュ嚱鏁板畾涔変簡濡備綍灏嗗涓牱鏈粍鎴愪竴涓猔batch`銆傚浜庤闊宠瘑鍒换鍔★紝鍦ㄦ瀹炵幇鐨勬槸灏嗕笉鍚岀殑闊抽鍜屾妱鏈紝閫氳繃`padding`鐨勬柟寮忔潵寰楀埌绛夐暱鐨勬暟鎹�傚叿浣撳湴锛屾垜浠粯璁ょ敤`0.0`鏉ヤ綔涓洪煶棰戠殑濉厖鍊硷紝鐢╜-1`浣滀负鎶勬湰鐨勯粯璁ゅ~鍏呭�笺�傜敤鎴峰彲浠ュ湪姝ゅ畾涔変笉鍚岀殑缁刞batch`鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonCollateFn`銆�
-
-- build_model
-```python
-@classmethod
-def build_model(cls, args, train):
- with open(args.token_list, encoding="utf-8") as f:
- token_list = [line.rstrip() for line in f]
- vocab_size = len(token_list)
- frontend = frontend_class(**args.frontend_conf)
- specaug = specaug_class(**args.specaug_conf)
- normalize = normalize_class(**args.normalize_conf)
- preencoder = preencoder_class(**args.preencoder_conf)
- encoder = encoder_class(input_size=input_size, **args.encoder_conf)
- postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
- decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size, **args.decoder_conf)
- ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
- model = model_class(
- vocab_size=vocab_size,
- frontend=frontend,
- specaug=specaug,
- normalize=normalize,
- preencoder=preencoder,
- encoder=encoder,
- postencoder=postencoder,
- decoder=decoder,
- ctc=ctc,
- token_list=token_list,
- **args.model_conf,
- )
- return model
-```
-璇ュ嚱鏁板畾涔変簡鍏蜂綋鐨勬ā鍨嬨�傚浜庝笉鍚岀殑璇煶璇嗗埆妯″瀷锛屽線寰�鍙互鍏辩敤鍚屼竴涓闊宠瘑鍒玚Task`锛岄澶栭渶瑕佸仛鐨勬槸鍦ㄦ鍑芥暟涓畾涔夌壒瀹氱殑妯″瀷銆備緥濡傦紝杩欓噷缁欏嚭鐨勬槸涓�涓爣鍑嗙殑encoder-decoder缁撴瀯鐨勮闊宠瘑鍒ā鍨嬨�傚叿浣撳湴锛屽厛瀹氫箟璇ユā鍨嬬殑鍚勪釜妯″潡锛屽寘鎷琫ncoder锛宒ecoder绛夛紝鐒跺悗鍦ㄥ皢杩欎簺妯″潡缁勫悎鍦ㄤ竴璧峰緱鍒颁竴涓畬鏁寸殑妯″瀷銆傚湪FunASR涓紝妯″瀷闇�瑕佺户鎵縛AbsESPnetModel`锛屽叾鍏蜂綋浠g爜瑙乣funasr/train/abs_espnet_model.py`锛屼富瑕侀渶瑕佸疄鐜扮殑鏄痐forward`鍑芥暟銆�
-
-涓嬮潰鎴戜滑灏嗕互`SANMEncoder`涓轰緥锛屼粙缁嶅浣曞湪瀹氫箟妯″瀷鐨勬椂鍊欙紝浣跨敤鑷畾涔夌殑`encoder`鏉ヤ綔涓烘ā鍨嬬殑缁勬垚閮ㄥ垎锛屽叾鍏蜂綋鐨勪唬鐮佽`funasr/models/encoder/sanm_encoder.py`銆傚浜庤嚜瀹氫箟鐨刞encoder`锛岄櫎浜嗛渶瑕佺户鎵块�氱敤鐨刞encoder`绫籤AbsEncoder`澶栵紝杩橀渶瑕佽嚜瀹氫箟`forward`鍑芥暟锛屽疄鐜癭encoder`鐨勫墠鍚戣绠椼�傚湪瀹氫箟瀹宍encoder`鍚庯紝杩橀渶瑕佸湪`Task`涓鍏惰繘琛屾敞鍐岋紝涓嬮潰缁欏嚭浜嗙浉搴旂殑浠g爜绀轰緥锛�
-```python
-encoder_choices = ClassChoices(
- "encoder",
- classes=dict(
- conformer=ConformerEncoder,
- transformer=TransformerEncoder,
- rnn=RNNEncoder,
- sanm=SANMEncoder,
- sanm_chunk_opt=SANMEncoderChunkOpt,
- data2vec_encoder=Data2VecEncoder,
- mfcca_enc=MFCCAEncoder,
- ),
- type_check=AbsEncoder,
- default="rnn",
-)
-```
-鍙互鐪嬪埌锛宍sanm=SANMEncoder`灏嗘柊瀹氫箟鐨刞SANMEncoder`浣滀负浜哷encoder`鐨勪竴绉嶅彲閫夐」锛屽綋鐢ㄦ埛鍦ㄩ厤缃枃浠朵腑鎸囧畾`encoder`涓篳sanm`鏃讹紝鍗充細鐩稿簲鍦板皢`SANMEncoder`浣滀负妯″瀷鐨刞encoder`妯″潡銆�
\ No newline at end of file
diff --git a/docs_cn/conf.py b/docs_cn/conf.py
deleted file mode 100644
index 0189991..0000000
--- a/docs_cn/conf.py
+++ /dev/null
@@ -1,67 +0,0 @@
-# Configuration file for the Sphinx documentation builder.
-#
-# This file only contains a selection of the most common options. For a full
-# list see the documentation:
-# https://www.sphinx-doc.org/en/master/usage/configuration.html
-
-# -- Path setup --------------------------------------------------------------
-
-# If extensions (or modules to document with autodoc) are in another directory,
-# add these directories to sys.path here. If the directory is relative to the
-# documentation root, use os.path.abspath to make it absolute, like shown here.
-#
-# import os
-# import sys
-# sys.path.insert(0, os.path.abspath('.'))
-
-
-# -- Project information -----------------------------------------------------
-
-project = 'FunASR'
-copyright = '2022, Speech Lab, Alibaba Group'
-author = 'Speech Lab, Alibaba Grou'
-
-
-# -- General configuration ---------------------------------------------------
-
-# Add any Sphinx extension module names here, as strings. They can be
-# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
-# ones.
-extensions = [
- "nbsphinx",
- "sphinx.ext.autodoc",
- 'sphinx.ext.napoleon',
- 'sphinx.ext.viewcode',
- "sphinx.ext.mathjax",
- "sphinx.ext.todo",
- # "sphinxarg.ext",
- "sphinx_markdown_tables",
- 'recommonmark',
- 'sphinx_rtd_theme',
-]
-
-# Add any paths that contain templates here, relative to this directory.
-templates_path = ['_templates']
-
-source_suffix = [".rst", ".md"]
-
-# List of patterns, relative to source directory, that match files and
-# directories to ignore when looking for source files.
-# This pattern also affects html_static_path and html_extra_path.
-exclude_patterns = []
-
-# The name of the Pygments (syntax highlighting) style to use.
-pygments_style = "sphinx"
-
-# -- Options for HTML output -------------------------------------------------
-
-# The theme to use for HTML and HTML Help pages. See the documentation for
-# a list of builtin themes.
-#
-
-html_theme = "sphinx_rtd_theme"
-
-# Add any paths that contain custom static files (such as style sheets) here,
-# relative to this directory. They are copied after the builtin static files,
-# so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['_static']
\ No newline at end of file
diff --git a/docs_cn/get_started.md b/docs_cn/get_started.md
deleted file mode 100644
index 9e1c236..0000000
--- a/docs_cn/get_started.md
+++ /dev/null
@@ -1,131 +0,0 @@
-# 蹇�熷紑濮�
-鍦ㄦ鎴戜滑灏嗕互"浣跨敤AISHELL-1鏁版嵁闆嗭紝浠庨殢鏈哄垵濮嬪寲璁粌涓�涓猵araformer妯″瀷"涓轰緥锛屼粙缁嶅浣曚娇鐢‵unASR銆傛牴鎹繖涓緥瀛愶紝鐢ㄦ埛鍙互绫讳技鍦颁娇鐢ㄥ埆鐨勬暟鎹泦锛堝AISHELL-2鏁版嵁闆嗙瓑锛夎缁冨埆鐨勬ā鍨嬶紙濡俢onformer锛宼ransformer绛夛級銆�
-
-## 鏁翠綋浠嬬粛
-
-鎴戜滑鎻愪緵浜哷egs/aishell/paraformer/run.sh`鏉ュ疄鐜颁娇鐢ˋISHELL-1鏁版嵁闆嗚缁冧竴涓猵araformer妯″瀷銆傝鑴氭湰鍖呭惈5涓樁娈碉紝鍖呮嫭浠庢暟鎹鐞嗗埌璁粌瑙g爜绛夋暣涓祦绋嬶紝鍚屾椂鎻愪緵浜嗗崟/澶欸PU璁粌鍜孋PU/GPU瑙g爜銆傚湪璇︾粏浠嬬粛姣忎釜闃舵涔嬪墠锛屾垜浠厛瀵圭敤鎴烽渶瑕佹墜鍔ㄨ缃殑涓�浜涘弬鏁拌繘琛岃鏄庛��
-- `CUDA_VISIBLE_DEVICES`: 鍙敤鐨凣PU鍒楄〃
-- `gpu_num`: 鐢ㄤ簬璁粌鐨凣PU鏁伴噺
-- `gpu_inference`: 鏄惁浣跨敤GPU杩涜瑙g爜
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU. 瀵逛簬CPU瑙g爜锛岃〃绀鸿В鐮佷换鍔℃暟锛涘浜嶨PU瑙g爜
-- `data_aishell`: AISHELL-1鍘熷鏁版嵁鐨勮矾寰�
-- `feats_dir`: 缁忚繃澶勭悊寰楀埌鐨勭壒寰佺殑淇濆瓨璺緞
-- `nj`: 鏁版嵁澶勭悊鏃剁殑骞惰浠诲姟鏁�
-- `speed_perturb`: 鍙橀�熻缃�
-- `exp_dir`: 瀹為獙缁撴灉鐨勪繚瀛樿矾寰�
-- `tag`: 瀹為獙缁撴灉鐩綍鐨勫悗缂�鍚�
-
-## 闃舵 0锛� 鏁版嵁鍑嗗
-鏈樁娈电敤浜庡鐞嗗師濮嬬殑AISHELL-1鏁版嵁锛屽苟鐢熸垚鐩稿簲鐨刞wav.scp`鍜宍text`锛屼繚瀛樺湪`$feats_dir/data/xxx`鐩綍涓嬶紝杩欓噷鐨刞xxx`琛ㄧず`train`, `dev` 鎴� `test`锛堜笅鍚岋級銆� 杩欓噷鎴戜滑鍋囪鐢ㄦ埛宸茬粡涓嬭浇濂戒簡AISHELL-1鏁版嵁闆嗐�傚鏋滄病鏈夛紝鐢ㄦ埛鍙互鍦╗杩欓噷](https://www.openslr.org/33/) 涓嬭浇鏁版嵁锛屽苟灏哷$data_aishell`璁剧疆涓虹浉搴旂殑璺緞銆備笅闈㈢粰鍑虹敓鎴愮殑`wav.scp`鍜宍text`鐨勭ず渚嬶細
-鏈樁娈电敤浜庡鐞嗗師濮嬬殑AISHELL-1鏁版嵁锛屽苟鐢熸垚鐩稿簲鐨刞wav.scp`鍜宍text`锛屼繚瀛樺湪`$feats_dir/data/xxx`鐩綍涓嬶紝杩欓噷鐨刞xxx`琛ㄧず`train`, `dev` 鎴� `test`锛堜笅鍚岋級銆� 杩欓噷鎴戜滑鍋囪鐢ㄦ埛宸茬粡涓嬭浇濂戒簡AISHELL-1鏁版嵁闆嗐�傚鏋滄病鏈夛紝鐢ㄦ埛鍙互鍦╗杩欓噷](https://www.openslr.org/33/) 涓嬭浇鏁版嵁锛屽苟灏哷$data_aishell`璁剧疆涓虹浉搴旂殑璺緞銆備笅闈㈢粰鍑虹敓鎴愮殑`wav.scp`鍜宍text`鐨勭ず渚嬶細
-* `wav.scp`
-```
-BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
-BAC009S0002W0123 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0123.wav
-BAC009S0002W0124 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0124.wav
-...
-```
-* `text`
-```
-BAC009S0002W0122 鑰� 瀵� 妤� 甯� 鎴� 浜� 鎶� 鍒� 浣� 鐢� 鏈� 澶� 鐨� 闄� 璐�
-BAC009S0002W0123 涔� 鎴� 涓� 鍦� 鏂� 鏀� 搴� 鐨� 鐪� 涓� 閽�
-BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
-...
-```
-鍙互鐪嬪埌锛岃繖涓や釜鏂囦欢鍧囧寘鎷袱鍒楋紝绗竴鍒楁槸闊抽鐨刬d锛岀浜屽垪鍒嗗埆鏄煶棰戣矾寰勫拰闊抽瀵瑰簲鐨勬妱鏈��
-
-## 闃舵 1锛氱壒寰佹彁鍙�
-鏈樁娈靛皢浼氬熀浜庡師濮嬬殑闊抽`wav.scp`鎻愬彇FBank鐗瑰緛銆傚鏋滄寚瀹氫簡鍙傛暟`speed_perturb`锛屽垯浼氶澶栧闊抽杩涜鍙橀�熸潵瀹炵幇鏁版嵁澧炲己銆傜敤鎴峰彲浠ヨ缃甡nj`鍙傛暟鏉ユ帶鍒剁壒寰佹彁鍙栫殑骞惰浠诲姟鏁般�傚鐞嗗悗鐨勭壒寰佷繚瀛樺湪鐩綍`$feats_dir/dump/xxx/ark`涓嬶紝鐩稿簲鐨刞feats.scp`鏂囦欢璺緞涓篳$feats_dir/dump/xxx/feats.scp`銆備笅闈㈢粰鍑篳feats.scp`鐨勭ず渚嬶細
-* `feats.scp`
-```
-...
-BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
-...
-```
-娉ㄦ剰锛岃鏂囦欢鐨勬牱鏈『搴忓凡缁忚繘琛屼簡闅忔満鎵撲贡銆傝鏂囦欢鍖呮嫭涓ゅ垪锛岀涓�鍒楁槸闊抽鐨刬d锛岀浜屽垪鏄搴旂殑kaldi-ark鏍煎紡鐨勭壒寰併�傚彟澶栵紝鍦ㄦ闃舵杩樹細鐢熸垚璁粌闇�瑕佺敤鍒扮殑`speech_shape`鍜宍text_shape`涓や釜鏂囦欢锛岃褰曚簡姣忎釜鏍锋湰鐨勭壒寰佺淮搴﹀拰鎶勬湰闀垮害銆備笅闈㈢粰鍑鸿繖涓や釜鏂囦欢鐨勭ず渚嬶細
-* `speech_shape`
-```
-...
-BAC009S0002W0122_sp0.9 665,80
-...
-```
-* `text_shape`
-```
-...
-BAC009S0002W0122_sp0.9 15
-...
-```
-鍙互鐪嬪埌锛岃繖涓や釜鏂囦欢鍧囧寘鎷袱鍒楋紝绗竴鍒楁槸闊抽鐨刬d锛岀浜屽垪鏄搴旂殑鐗瑰緛鐨勭淮搴﹀拰鎶勬湰鐨勯暱搴︺��
-
-## 闃舵 2锛氬瓧鍏稿噯澶�
-鏈樁娈电敤浜庣敓鎴愬瓧鍏革紝鐢ㄤ簬璁粌杩囩▼涓紝瀛楃鍒版暣鏁扮储寮曚箣闂寸殑鏄犲皠銆傜敓鎴愮殑瀛楀吀鏂囦欢鐨勮矾寰勪负`$feats_dir/data/zh_toekn_list/char/tokens.txt`銆備笅闈㈢粰鍑篳tokens.txt`鐨勭ず渚嬶細
-* `tokens.txt`
-```
-<blank>
-<s>
-</s>
-涓�
-涓�
-...
-榫�
-榫�
-<unk>
-```
-* `<blank>`: 琛ㄧずCTC璁粌涓殑blank
-* `<s>`: 琛ㄧず鍙ュ瓙鐨勮捣濮嬬
-* `</s>`: 琛ㄧず鍙ュ瓙鐨勭粓姝㈢
-* `<unk>`: 琛ㄧず瀛楀吀澶栫殑瀛楃
-
-## 闃舵 3锛氳缁�
-鏈樁娈靛搴旀ā鍨嬬殑璁粌銆傚湪寮�濮嬭缁冧箣鍓嶏紝闇�瑕佹寚瀹氬疄楠岀粨鏋滀繚瀛樼洰褰昤exp_dir`锛岃缁冨彲鐢℅PU`CUDA_VISIBLE_DEVICES`鍜岃缁冪殑gpu鏁伴噺`gpu_num`銆傞粯璁ゆ儏鍐典笅锛屾渶濂界殑`$keep_nbest_models`妯″瀷缁撴灉浼氳骞冲潎浠庤�屾潵鑾峰彇鏇村ソ鐨勬�ц兘銆�
-
-* DDP Training
-
-鎴戜滑鎻愪緵浜嗗垎甯冨紡璁粌锛圖DP锛夊姛鑳斤紝鍏蜂綋鐨勭粏鑺傚彲浠ュ湪[杩欓噷](https://pytorch.org/tutorials/intermediate/ddp_tutorial.html) 鎵惧埌銆備负浜嗗紑鍚垎甯冨紡璁粌锛岄渶瑕佽缃甡gpu_num`澶т簬1銆備緥濡傦紝璁剧疆`CUDA_VISIBLE_DEVICES=0,1,5,6,7`锛宍gpu_num=3`锛屽垯缂栧彿涓�0锛�1鍜�5鐨凣PU浼氳鐢ㄤ簬璁粌銆�
-
-* DataLoader
-
-鎴戜滑鎻愪緵浜嗗熀浜嶽Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) 瀹炵幇鐨勫ぇ鏁版嵁DataLoader锛岀敤鎴峰彲浠ラ�氳繃璁剧疆`dataset_type=large`鏉ュ惎鐢ㄣ��
-
-* Configuration
-
-璁粌鐩稿叧鐨勫弬鏁帮紝鍖呮嫭妯″瀷锛屼紭鍖栧櫒锛屾暟鎹瓑锛屽潎鍙互閫氳繃`conf`鐩綍涓嬬殑config鏂囦欢鎸囧畾銆傚悓鏃讹紝鐢ㄦ埛涔熷彲浠ョ洿鎺ュ湪`run.sh`鑴氭湰涓寚瀹氱浉鍏冲弬鏁般�傝閬垮厤鍦╟onfig鏂囦欢鍜宍run.sh`鑴氭湰涓缃浉鍚岀殑鍙傛暟锛屼互鍏嶉�犳垚姝т箟銆�
-
-* Training Steps
-
-鎴戜滑鎻愪緵浜嗕袱绉嶆柟寮忔潵鎺у埗璁粌鐨勬�绘鏁帮紝瀵瑰簲鐨勫弬鏁板垎鍒负`max_epoch`鍜宍max_update`銆俙max_epoch`琛ㄧず璁粌鐨勬渶澶poch鏁帮紝`max_update`琛ㄧず璁粌鐨勬渶澶ц凯浠f鏁般�傚鏋滆繖涓や釜鍙傛暟鍚屾椂琚寚瀹氾紝鍒欎竴鏃﹁缁冩鏁板埌杈惧叾涓换鎰忎竴涓弬鏁帮紝璁粌缁撴潫銆�
-
-* Tensorboard
-
-鐢ㄦ埛鍙互閫氳繃tensorboard鏉ヨ瀵熻缁冭繃绋嬩腑鐨勬崯澶憋紝瀛︿範鐜囩瓑銆傚彲浠ラ�氳繃涓嬭堪鎸囧畾鏉ュ疄鐜帮細
-```
-tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
-```
-
-## 闃舵 4: 瑙g爜
-鏈樁娈电敤浜庤В鐮佸緱鍒拌瘑鍒粨鏋滐紝鍚屾椂璁$畻CER鏉ラ獙璇佽缁冨緱鍒扮殑妯″瀷鎬ц兘銆�
-
-* Mode Selection
-
-鐢变簬鎴戜滑鎻愪緵浜唒araformer锛寀niasr鍜宑onformer绛夋ā鍨嬶紝鍥犳鍦ㄨВ鐮佹椂锛岄渶瑕佹寚瀹氱浉搴旂殑瑙g爜妯″紡銆傚搴旂殑鍙傛暟涓篳mode`锛岀浉搴旂殑鍙�夎缃负`asr/paraformer/uniasr`绛夈��
-
-* Configuration
-
-鎴戜滑鎻愪緵浜哻tc瑙g爜, attention瑙g爜鍜宑tc-attention娣峰悎瑙g爜銆傝繖鍑犵瑙g爜鏂瑰紡鍙互閫氳繃`conf`涓嬬殑瑙g爜閰嶇疆鏂囦欢涓殑`ctc_weight`鍙傛暟鏉ユ寚瀹氥�傚叿浣撶殑锛宍ctc_weight=1.0`琛ㄧずCTC瑙g爜, `ctc_weight=0.0`琛ㄧずattention瑙g爜, `0.0<ctc_weight<1.0`琛ㄧずctc-attention娣峰悎瑙g爜銆�
-
-* CPU/GPU Decoding
-
-鎴戜滑鎻愪緵CPU/GPU瑙g爜銆傚浜嶤PU瑙g爜锛岀敤鎴烽渶瑕佽缃甡gpu_inference=False`锛屽悓鏃惰缃甡njob`鏉ユ寚瀹氬苟琛岃В鐮佷换鍔℃暟閲忋�傚浜嶨PU瑙g爜锛岀敤鎴烽渶瑕佽缃甡gpu_inference=True`锛岃缃甡gpuid_list`鏉ユ寚瀹氬摢浜汫PU鐢ㄤ簬瑙g爜锛岃缃甡njobs`鏉ユ寚瀹氭瘡寮燝PU涓婄殑骞惰瑙g爜浠诲姟鏁伴噺銆�
-
-* Performance
-
-鎴戜滑閲囩敤`CER`鏉ラ獙璇佹ā鍨嬬殑鎬ц兘銆傝В鐮佺粨鏋滀繚瀛樺湪`$exp_dir/exp/$model_dir/$decoding_yaml_name/$average_model_name/$dset`锛屽叿浣撳寘鎷琡text.cer`鍜宍text.cer.txt`涓や釜鏂囦欢銆俙text.cer`涓殑鍐呭涓鸿瘑鍒粨鏋滃拰瀵瑰簲鎶勬湰涔嬮棿鐨勬瘮杈冿紝`text.cer.txt`璁板綍浜嗘渶缁堢殑`CER`銆備笅闈㈢粰鍑篳text.cer`鐨勭ず渚�:
-* `text.cer`
-```
-...
-BAC009S0764W0213(nwords=11,cor=11,ins=0,del=0,sub=0) corr=100.00%,cer=0.00%
-ref: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
-res: 鏋� 寤� 鑹� 濂� 鐨� 鏃� 娓� 甯� 鍦� 鐜� 澧�
-...
-```
-
diff --git a/docs_cn/images/DeepScience.png b/docs_cn/images/DeepScience.png
deleted file mode 100644
index 9f46165..0000000
--- a/docs_cn/images/DeepScience.png
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/dingding.jpg b/docs_cn/images/dingding.jpg
deleted file mode 100644
index 4cdad28..0000000
--- a/docs_cn/images/dingding.jpg
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/funasr_logo.jpg b/docs_cn/images/funasr_logo.jpg
deleted file mode 100644
index a47243e..0000000
--- a/docs_cn/images/funasr_logo.jpg
+++ /dev/null
Binary files differ
diff --git a/docs_cn/images/wechat.png b/docs_cn/images/wechat.png
deleted file mode 100644
index e7b7349..0000000
--- a/docs_cn/images/wechat.png
+++ /dev/null
Binary files differ
diff --git a/docs_cn/index.rst b/docs_cn/index.rst
deleted file mode 100644
index 4a898e9..0000000
--- a/docs_cn/index.rst
+++ /dev/null
@@ -1,33 +0,0 @@
-.. Funasr documentation master file, created by
- sphinx-quickstart on Tues Dec 6 19:05:00 2022.
- You can adapt this file completely to your liking, but it should at least
- contain the root `toctree` directive.
-
-FunASR: A Fundamental End-to-End Speech Recognition Toolkit
-============================================================
-.. image:: ./images/funasr_logo.jpg
-
-FunASR鑷村姏浜庡湪璇煶璇嗗埆鐨勫鏈爺绌跺拰宸ヤ笟搴旂敤涔嬮棿鏋勫缓璧蜂竴搴фˉ姊併�傞�氳繃鍦� `ModelScope <https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition>`_ 涓婂彂甯冨伐涓氱骇璇煶璇嗗埆妯″瀷浠ュ強鏀寔鐩稿叧鐨勮缁冨拰寰皟锛岀爺绌惰�呭拰寮�鍙戣�呬滑鍙互鏇存柟渚垮湴杩涜璇煶璇嗗埆妯″瀷鐨勭爺绌跺拰鐢熶骇锛屼績杩涜闊宠瘑鍒敓鎬佺殑鍙戝睍銆侫SR for Fun!
-
-.. toctree::
- :maxdepth: 1
- :caption: 鏁欑▼:
-
- ./installation.md
- ./papers.md
- ./get_started.md
- ./build_task.md
-
-.. toctree::
- :maxdepth: 1
- :caption: ModelScope:
-
- ./modelscope_models.md
- ./modelscope_usages.md
-
-Indices and tables
-==================
-
-* :ref:`genindex`
-* :ref:`modindex`
-* :ref:`search`
diff --git a/docs_cn/installation.md b/docs_cn/installation.md
deleted file mode 100755
index a31bc01..0000000
--- a/docs_cn/installation.md
+++ /dev/null
@@ -1,37 +0,0 @@
-# 瀹夎
-FunASR鐨勫畨瑁呭崄鍒嗕究鎹凤紝涓嬮潰灏嗙粰鍑鸿缁嗙殑瀹夎姝ラ锛�
-
-- 瀹夎Conda骞跺垱寤鸿櫄鎷熺幆澧�
-``` sh
-wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
-sh Miniconda3-latest-Linux-x86_64.sh
-source ~/.bashrc
-conda create -n funasr python=3.7
-conda activate funasr
-```
-
-- 瀹夎Pytorch (鐗堟湰 >= 1.7.0):
-
-```sh
-pip install torch torchaudio
-```
-
-鍏充簬鏇村鐨勭増鏈�, 璇峰弬鐓� [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
-
-- 瀹夎 ModelScope
-
-瀵逛簬鍥藉唴鐢ㄦ埛锛屽彲浠ラ�氳繃閰嶇疆涓嬭堪闀滃儚婧愭潵鍔犲揩涓嬭浇閫熷害
-```sh
-pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
-```
-
-瀹夎鎴栨洿鏂癕odelScope
-``` sh
-pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
-```
-
-- 涓嬭浇FunASR浠撳簱锛屽苟瀹夎鍓╀綑鎵�闇�渚濊禆
-``` sh
-git clone https://github.com/alibaba/FunASR.git && cd FunASR
-pip install --editable ./
-```
\ No newline at end of file
diff --git a/docs_cn/make.bat b/docs_cn/make.bat
deleted file mode 100644
index 747ffb7..0000000
--- a/docs_cn/make.bat
+++ /dev/null
@@ -1,35 +0,0 @@
-@ECHO OFF
-
-pushd %~dp0
-
-REM Command file for Sphinx documentation
-
-if "%SPHINXBUILD%" == "" (
- set SPHINXBUILD=sphinx-build
-)
-set SOURCEDIR=source
-set BUILDDIR=build
-
-%SPHINXBUILD% >NUL 2>NUL
-if errorlevel 9009 (
- echo.
- echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
- echo.installed, then set the SPHINXBUILD environment variable to point
- echo.to the full path of the 'sphinx-build' executable. Alternatively you
- echo.may add the Sphinx directory to PATH.
- echo.
- echo.If you don't have Sphinx installed, grab it from
- echo.https://www.sphinx-doc.org/
- exit /b 1
-)
-
-if "%1" == "" goto help
-
-%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-goto end
-
-:help
-%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
-
-:end
-popd
diff --git a/docs_cn/modelscope_models.md b/docs_cn/modelscope_models.md
deleted file mode 100644
index 8501c1f..0000000
--- a/docs_cn/modelscope_models.md
+++ /dev/null
@@ -1,34 +0,0 @@
-# ModelScope涓婄殑棰勮缁冩ā鍨�
-
-## 妯″瀷璁稿彲璇�
-- Apache License 2.0
-
-## 妯″瀷搴�
-杩欓噷鎴戜滑鎻愪緵浜嗕竴浜涘熀浜庝笉鍚屾暟鎹泦璁粌寰楀埌鐨勫嚑绉嶉璁粌妯″瀷锛屾墍鏈夌殑棰勮缁冩ā鍨嬪拰鏇村缁嗚妭鍙互鍙傝 [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 銆�
-
-| Datasets | Hours | Model | Online/Offline | Language | Framework | Checkpoint |
-|:-----:|:-----:|:--------------:|:--------------:| :---: | :---: | --- |
-| Alibaba Speech Data | 60000 | Paraformer | Offline | CN | Pytorch |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 50000 | Paraformer | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online](http://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN | Tensorflow |[speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR-large_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Online | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 50000 | UniASR | Offline | CN&EN | Tensorflow |[speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-en-moe-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Online | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 20000 | UniASR | Offline | CN-Accent | Tensorflow |[speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-cn-dialect-16k-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab3444-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Tensorflow |[speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | Paraformer-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Tensorflow |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab8358-tensorflow1-offline/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Online | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary) |
-| Alibaba Speech Data | 30000 | UniASR-8K | Offline | CN | Pytorch |[speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/summary) |
-| AISHELL-1 | 178 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell1-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell1-pytorch/summary) |
-| AISHELL-2 | 1000 | Paraformer | Offline | CN | Pytorch | [speech_paraformer_asr_nat-aishell2-pytorch](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-aishell2-pytorch/summary) |
-| AISHELL-1 | 178 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | ParaformerBert | Offline | CN | Pytorch | [speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_paraformerbert_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
-| AISHELL-1 | 178 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/summary) |
-| AISHELL-2 | 1000 | Conformer | Offline | CN | Pytorch | [speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch](https://modelscope.cn/models/damo/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/summary) |
diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md
deleted file mode 100644
index c91de76..0000000
--- a/docs_cn/modelscope_usages.md
+++ /dev/null
@@ -1,52 +0,0 @@
-# ModelScope 浣跨敤璇存槑
-ModelScope鏄樋閲屽反宸存帹鍑虹殑寮�婧愭ā鍨嬪嵆鏈嶅姟鍏变韩骞冲彴锛屼负骞垮ぇ瀛︽湳鐣岀敤鎴峰拰宸ヤ笟鐣岀敤鎴锋彁渚涚伒娲汇�佷究鎹风殑妯″瀷搴旂敤鏀寔銆傚叿浣撶殑浣跨敤鏂规硶鍜屽紑婧愭ā鍨嬪彲浠ュ弬瑙乕ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 銆傚湪璇煶鏂瑰悜锛屾垜浠彁渚涗簡鑷洖褰�/闈炶嚜鍥炲綊璇煶璇嗗埆锛岃闊抽璁粌锛屾爣鐐归娴嬬瓑妯″瀷锛岀敤鎴峰彲浠ユ柟渚夸娇鐢ㄣ��
-
-## 鏁翠綋浠嬬粛
-鎴戜滑鍦╜egs_modelscope` 鐩綍涓嬫彁渚涗簡涓嶅悓妯″瀷鐨勪娇鐢ㄦ柟娉曪紝鏀寔鐩存帴鐢ㄦ垜浠彁渚涚殑妯″瀷杩涜鎺ㄧ悊锛屽悓鏃朵篃鏀寔灏嗘垜浠彁渚涚殑妯″瀷浣滀负棰勮缁冨ソ鐨勫垵濮嬫ā鍨嬭繘琛屽井璋冦�備笅闈紝鎴戜滑灏嗕互`egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch`鐩綍涓彁渚涚殑妯″瀷鏉ヨ繘琛屼粙缁嶏紝鍖呮嫭`infer.py`锛宍finetune.py`鍜宍infer_after_finetune.py`锛屽搴旂殑鍔熻兘濡備笅锛�
-- `infer.py`: 鍩轰簬鎴戜滑鎻愪緵鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
-- `finetune.py`: 灏嗘垜浠彁渚涚殑妯″瀷浣滀负鍒濆妯″瀷杩涜寰皟
-- `infer_after_finetune.py`: 鍩轰簬寰皟寰楀埌鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
-
-## 妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer.py`鏉ュ疄鐜版ā鍨嬫帹鐞嗐�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷锛屽鎸囧畾鐨勬暟鎹泦杩涜鎺ㄧ悊锛屽緱鍒扮浉搴旂殑璇嗗埆缁撴灉銆傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠梎CER`銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
-* `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)锛屽叿浣撴牸寮忓彲浠ュ弬瑙乕蹇�熷紑濮媇(./get_started.md)涓殑璇存槑銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
-* `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
-* `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
-* `ctc_weight`锛氶儴鍒嗘ā鍨嬪寘鍚獵TC妯″潡锛屽彲浠ヨ缃鍙傛暟鏉ユ寚瀹氭帹鐞嗘椂锛孋TC妯″潡鐨勬潈閲�
-
-闄や簡鐩存帴鍦╜infer.py`涓缃弬鏁板锛岀敤鎴蜂篃鍙互閫氳繃鎵嬪姩淇敼妯″瀷涓嬭浇鐩綍涓嬬殑`decoding.yaml`鏂囦欢涓殑鍙傛暟鏉ヤ慨鏀规帹鐞嗛厤缃��
-
-## 妯″瀷寰皟
-鎴戜滑鎻愪緵浜哷finetune.py`鏉ュ疄鐜版ā鍨嬪井璋冦�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷浣滀负鍒濆妯″瀷锛屽湪鎸囧畾鐨勬暟鎹泦涓婅繘琛屽井璋冿紝浠庤�屽湪鐗瑰緛棰嗗煙鍙栧緱鏇村ソ鐨勬�ц兘銆傚湪寰皟寮�濮嬪墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼寰皟閰嶇疆锛�
-* `data_path`锛氭暟鎹洰褰曘�傝鐩綍涓嬪簲璇ュ寘鎷瓨鏀捐缁冮泦鏁版嵁鐨刞train`鐩綍鍜屽瓨鏀鹃獙璇侀泦鏁版嵁鐨刞dev`鐩綍銆傛瘡涓洰褰曚腑闇�瑕佸寘鎷煶棰戝垪琛ㄦ枃浠禶wav.scp`鍜屾妱鏈枃浠禶text`
-* `output_dir`锛氬井璋冪粨鏋滀繚瀛樼洰褰�
-* `dataset_type`锛氬浜庡皬鏁版嵁闆嗭紝璁剧疆涓篳small`锛涘綋鏁版嵁閲忓ぇ浜�1000灏忔椂鏃讹紝璁剧疆涓篳large`
-* `batch_bins`锛歜atch size锛屽鏋渄ataset_type璁剧疆涓篳small`锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛涘鏋渄ataset_type璁剧疆涓篳large`锛宐atch_bins鍗曚綅涓烘绉�
-* `max_epoch`锛氭渶澶х殑璁粌杞暟
-
-浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
-* `accum_grad`锛氭搴︾疮绉�
-* `keep_nbest_models`锛氶�夋嫨鎬ц兘鏈�濂界殑`keep_nbest_models`涓ā鍨嬬殑鍙傛暟杩涜骞冲潎锛屽緱鍒版�ц兘鏇村ソ鐨勬ā鍨�
-* `optim`锛氳缃紭鍖栧櫒
-* `lr`锛氳缃涔犵巼
-* `scheduler`锛氳缃涔犵巼璋冩暣绛栫暐
-* `scheduler_conf`锛氬涔犵巼璋冩暣绛栫暐鐨勭浉鍏冲弬鏁�
-* `specaug`锛氳缃氨澧炲箍
-* `specaug_conf`锛氳氨澧炲箍鐨勭浉鍏冲弬鏁�
-
-闄や簡鐩存帴鍦╜finetune.py`涓缃弬鏁板锛岀敤鎴蜂篃鍙互閫氳繃鎵嬪姩淇敼妯″瀷涓嬭浇鐩綍涓嬬殑`finetune.yaml`鏂囦欢涓殑鍙傛暟鏉ヤ慨鏀瑰井璋冮厤缃��
-
-## 鍩轰簬寰皟鍚庣殑妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer_after_finetune.py`鏉ュ疄鐜板熀浜庣敤鎴疯嚜宸卞井璋冨緱鍒扮殑妯″瀷杩涜鎺ㄧ悊銆傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庡井璋冨悗鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞嗭紝寰楀埌鐩稿簲鐨勮瘑鍒粨鏋溿�傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠桟ER銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
-* `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
-* `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
-* `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
-* `ctc_weight`锛氶儴鍒嗘ā鍨嬪寘鍚獵TC妯″潡锛屽彲浠ヨ缃鍙傛暟鏉ユ寚瀹氭帹鐞嗘椂锛孋TC妯″潡鐨勬潈閲�
-* `decoding_model_name`锛氭寚瀹氱敤浜庢帹鐞嗙殑妯″瀷鍚�
-
-浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
-* `modelscope_model_name`锛氬井璋冩椂浣跨敤鐨勫垵濮嬫ā鍨嬪悕
-* `required_files`锛氫娇鐢╩odelscope鎺ュ彛杩涜鎺ㄧ悊鏃堕渶瑕佺敤鍒扮殑鏂囦欢
-
-## 娉ㄦ剰浜嬮」
-閮ㄥ垎妯″瀷鍙兘鍦ㄥ井璋冦�佹帹鐞嗘椂瀛樺湪涓�浜涚壒鏈夌殑鍙傛暟锛岃繖閮ㄥ垎鍙傛暟鍙互鍦ㄥ搴旂洰褰曠殑`README.md`鏂囦欢涓壘鍒板叿浣撶敤娉曘��
\ No newline at end of file
diff --git a/docs_cn/papers.md b/docs_cn/papers.md
deleted file mode 100644
index 34a8150..0000000
--- a/docs_cn/papers.md
+++ /dev/null
@@ -1,4 +0,0 @@
-# 璁烘枃
-
-- [Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model](https://arxiv.org/abs/2010.14099), arXiv preprint arXiv:2010.14099, 2020.
-- [Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition](https://arxiv.org/abs/2206.08317), INTERSPEECH 2022.
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/finetune.py b/egs_modelscope/asr/TEMPLATE/finetune.py
new file mode 100644
index 0000000..1935258
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/finetune.py
@@ -0,0 +1,36 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params.data_path)
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch", data_path="./data")
+ params.output_dir = "./checkpoint" # m妯″瀷淇濆瓨璺緞
+ params.data_path = "./example_data/" # 鏁版嵁璺緞
+ params.dataset_type = "small" # 灏忔暟鎹噺璁剧疆small锛岃嫢鏁版嵁閲忓ぇ浜�1000灏忔椂锛岃浣跨敤large
+ params.batch_bins = 2000 # batch size锛屽鏋渄ataset_type="small"锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛屽鏋渄ataset_type="large"锛宐atch_bins鍗曚綅涓烘绉掞紝
+ params.max_epoch = 50 # 鏈�澶ц缁冭疆鏁�
+ params.lr = 0.00005 # 璁剧疆瀛︿範鐜�
+
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/TEMPLATE/infer.py b/egs_modelscope/asr/TEMPLATE/infer.py
new file mode 100644
index 0000000..9f280d5
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=args.model,
+ output_dir=args.output_dir,
+ batch_size=args.batch_size,
+ )
+ inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+ parser.add_argument('--output_dir', type=str, default="./results/")
+ parser.add_argument('--batch_size', type=int, default=64)
+ parser.add_argument('--gpuid', type=str, default="0")
+ args = parser.parse_args()
+ modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/infer.sh b/egs_modelscope/asr/TEMPLATE/infer.sh
new file mode 100644
index 0000000..b8b011c
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer.sh
@@ -0,0 +1,96 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true # whether to perform gpu decoding
+gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
+njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+
+. utils/parse_options.sh || exit 1;
+
+if [ "${gpu_inference}" == "true" ]; then
+ nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+ nj=$njob
+ batch_size=1
+ gpuid_list=""
+ for JOB in $(seq ${nj}); do
+ gpuid_list=$gpuid_list"-1,"
+ done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+ split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+ echo "Decoding ..."
+ gpuid_list_array=(${gpuid_list//,/ })
+ for JOB in $(seq ${nj}); do
+ {
+ id=$((JOB-1))
+ gpuid=${gpuid_list_array[$id]}
+ mkdir -p ${output_dir}/output.$JOB
+ python infer.py \
+ --model ${model} \
+ --audio_in ${output_dir}/split/wav.$JOB.scp \
+ --output_dir ${output_dir}/output.$JOB \
+ --batch_size ${batch_size} \
+ --gpuid ${gpuid}
+ }&
+ done
+ wait
+
+ mkdir -p ${output_dir}/1best_recog
+ for f in token score text; do
+ if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+ for i in $(seq "${nj}"); do
+ cat "${output_dir}/output.${i}/1best_recog/${f}"
+ done | sort -k1 >"${output_dir}/1best_recog/${f}"
+ fi
+ done
+fi
+
+if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
+ echo "Computing WER ..."
+ cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
+ cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
+ python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
+ tail -n 3 ${output_dir}/1best_recog/text.cer
+fi
+
+if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
+ echo "SpeechIO TIOBE textnorm"
+ echo "$0 --> Normalizing REF text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${data_dir}/text \
+ ${output_dir}/1best_recog/ref.txt
+
+ echo "$0 --> Normalizing HYP text ..."
+ ./utils/textnorm_zh.py \
+ --has_key --to_upper \
+ ${output_dir}/1best_recog/text.proc \
+ ${output_dir}/1best_recog/rec.txt
+ grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
+
+ echo "$0 --> computing WER/CER and alignment ..."
+ ./utils/error_rate_zh \
+ --tokenizer char \
+ --ref ${output_dir}/1best_recog/ref.txt \
+ --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
+ ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
+ rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
+fi
+
diff --git a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
new file mode 100644
index 0000000..2d311dd
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
@@ -0,0 +1,48 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
+
+from funasr.utils.compute_wer import compute_wer
+
+def modelscope_infer_after_finetune(params):
+ # prepare for decoding
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
+ decoding_path = os.path.join(params["output_dir"], "decode_results")
+ if os.path.exists(decoding_path):
+ shutil.rmtree(decoding_path)
+ os.mkdir(decoding_path)
+
+ # decoding
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=pretrained_model_path,
+ output_dir=decoding_path,
+ batch_size=params["batch_size"]
+ )
+ audio_in = os.path.join(params["data_dir"], "wav.scp")
+ inference_pipeline(audio_in=audio_in)
+
+ # computer CER if GT text is set
+ text_in = os.path.join(params["data_dir"], "text")
+ if os.path.exists(text_in):
+ text_proc_file = os.path.join(decoding_path, "1best_recog/text")
+ compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
+
+
+if __name__ == '__main__':
+ params = {}
+ params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data/test"
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/utils b/egs_modelscope/asr/TEMPLATE/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/asr/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
index b3bfe8e..8abadd7 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer.py
@@ -7,7 +7,6 @@
from funasr.utils.compute_wer import compute_wer
-import pdb;
def modelscope_infer_core(output_dir, split_dir, njob, idx):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
gpu_id = (int(idx) - 1) // njob
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 79cc3c3..c740f71 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -23,21 +23,37 @@
- Setting parameters in `infer.sh`
- <strong>model:</strong> # model name on ModelScope
- - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
+ - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
- <strong>output_dir:</strong> # result dir
- <strong>batch_size:</strong> # batchsize of inference
- <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
- <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
- <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
-- Then you can run the pipeline to infer with:
-```python
- sh infer.sh
+- Decode with multi GPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --batch_size 64 \
+ --gpu_inference true \
+ --gpuid_list "0,1"
+```
+
+- Decode with multi-thread CPUs:
+```shell
+ bash infer.sh \
+ --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+ --data_dir "./data/test" \
+ --output_dir "./results" \
+ --gpu_inference false \
+ --njob 64
```
- Results
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
index 221479d..b8b011c 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -14,8 +14,9 @@
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+. utils/parse_options.sh || exit 1;
-if ${gpu_inference}; then
+if [ "${gpu_inference}" == "true" ]; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index f3b4d56..4722602 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -346,6 +346,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index 2b6716e..e10ebf4 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 6f3dbb1..e832869 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -472,6 +472,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 8cbd419..5546c92 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -612,7 +612,9 @@
**kwargs,
):
assert check_argument_types()
-
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
if ngpu > 1:
@@ -629,7 +631,9 @@
export_mode = param_dict.get("export_mode", False)
else:
hotword_list_or_file = None
-
+
+ if kwargs.get("device", None) == "cpu":
+ ngpu = 0
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
else:
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
index 944685f..821f694 100644
--- a/funasr/bin/asr_inference_paraformer_streaming.py
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -536,6 +536,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad.py b/funasr/bin/asr_inference_paraformer_vad.py
index 1548f9f..977dc9b 100644
--- a/funasr/bin/asr_inference_paraformer_vad.py
+++ b/funasr/bin/asr_inference_paraformer_vad.py
@@ -157,6 +157,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 9dc0b79..197930f 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -484,6 +484,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if word_lm_train_config is not None:
raise NotImplementedError("Word LM is not implemented")
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 4aea720..35ecdc2 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -379,6 +379,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 83436e8..07974c0 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index 01d3f29..87816dd 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -158,6 +158,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/lm_inference.py b/funasr/bin/lm_inference.py
index 15c56ca..76de6df 100644
--- a/funasr/bin/lm_inference.py
+++ b/funasr/bin/lm_inference.py
@@ -89,10 +89,9 @@
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/lm_inference_launch.py b/funasr/bin/lm_inference_launch.py
index d229cc6..dc6414f 100644
--- a/funasr/bin/lm_inference_launch.py
+++ b/funasr/bin/lm_inference_launch.py
@@ -1,9 +1,6 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
+
import argparse
import logging
diff --git a/funasr/bin/punc_inference_launch.py b/funasr/bin/punc_inference_launch.py
index 2c5a286..b1d9235 100755
--- a/funasr/bin/punc_inference_launch.py
+++ b/funasr/bin/punc_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/punctuation_infer_vadrealtime.py b/funasr/bin/punctuation_infer_vadrealtime.py
index 5157eeb..b2db1bf 100644
--- a/funasr/bin/punctuation_infer_vadrealtime.py
+++ b/funasr/bin/punctuation_infer_vadrealtime.py
@@ -203,10 +203,8 @@
**kwargs,
):
assert check_argument_types()
- logging.basicConfig(
- level=log_level,
- format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
- )
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if ngpu >= 1 and torch.cuda.is_available():
device = "cuda"
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index 5a0a8e2..c55bc35 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -252,6 +252,8 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index 7e63bbd..76b1dfb 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -179,6 +179,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/sv_inference_launch.py b/funasr/bin/sv_inference_launch.py
index 64a3cff..8806070 100755
--- a/funasr/bin/sv_inference_launch.py
+++ b/funasr/bin/sv_inference_launch.py
@@ -2,8 +2,6 @@
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/tp_inference.py b/funasr/bin/tp_inference.py
index 6360b17..df029fd 100644
--- a/funasr/bin/tp_inference.py
+++ b/funasr/bin/tp_inference.py
@@ -54,7 +54,7 @@
assert check_argument_types()
# 1. Build ASR model
tp_model, tp_train_args = ASRTask.build_model_from_file(
- timestamp_infer_config, timestamp_model_file, device
+ timestamp_infer_config, timestamp_model_file, device=device
)
if 'cuda' in device:
tp_model = tp_model.cuda() # force model to cuda
@@ -179,6 +179,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/tp_inference_launch.py b/funasr/bin/tp_inference_launch.py
index 55debac..6cdff05 100644
--- a/funasr/bin/tp_inference_launch.py
+++ b/funasr/bin/tp_inference_launch.py
@@ -1,9 +1,5 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference.py b/funasr/bin/vad_inference.py
index 08d65a4..aff0a44 100644
--- a/funasr/bin/vad_inference.py
+++ b/funasr/bin/vad_inference.py
@@ -192,6 +192,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/bin/vad_inference_launch.py b/funasr/bin/vad_inference_launch.py
index 8fea8db..4a1f334 100644
--- a/funasr/bin/vad_inference_launch.py
+++ b/funasr/bin/vad_inference_launch.py
@@ -1,9 +1,4 @@
#!/usr/bin/env python3
-# Copyright ESPnet (https://github.com/espnet/espnet). All Rights Reserved.
-# Apache 2.0 (http://www.apache.org/licenses/LICENSE-2.0)
-
-import torch
-torch.set_num_threads(1)
import argparse
import logging
diff --git a/funasr/bin/vad_inference_online.py b/funasr/bin/vad_inference_online.py
index 9ed0721..4d02620 100644
--- a/funasr/bin/vad_inference_online.py
+++ b/funasr/bin/vad_inference_online.py
@@ -151,6 +151,9 @@
**kwargs,
):
assert check_argument_types()
+ ncpu = kwargs.get("ncpu", 1)
+ torch.set_num_threads(ncpu)
+
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if ngpu > 1:
diff --git a/funasr/models/joint_net/__init__.py b/funasr/models/joint_net/__init__.py
new file mode 100644
index 0000000..8b13789
--- /dev/null
+++ b/funasr/models/joint_net/__init__.py
@@ -0,0 +1 @@
+
diff --git a/funasr/runtime/python/benchmark_libtorch.md b/funasr/runtime/python/benchmark_libtorch.md
index 6c068fe..52927b1 100644
--- a/funasr/runtime/python/benchmark_libtorch.md
+++ b/funasr/runtime/python/benchmark_libtorch.md
@@ -1,27 +1,32 @@
-# Benchmark
+# CPU Benchmark (Libtorch)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
+set the model, data path and output_dir
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
-
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/benchmark_onnx.md b/funasr/runtime/python/benchmark_onnx.md
index 533798a..9f92094 100644
--- a/funasr/runtime/python/benchmark_onnx.md
+++ b/funasr/runtime/python/benchmark_onnx.md
@@ -1,26 +1,32 @@
-# Benchmark
+# CPU Benchmark (ONNX)
+## Configuration
### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
### Tools
-- Install ModelScope and FunASR
+#### Install Requirements
+Install ModelScope and FunASR
+```shell
+pip install -U modelscope funasr
+# For the users in China, you could install with the command:
+#pip install -U funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
- ```shell
- pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
- git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
- pip install --editable ./
- cd funasr/runtime/python/utils
- pip install -r requirements.txt
- ```
+Install requirements
+```shell
+git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+cd funasr/runtime/python/utils
+pip install -r requirements.txt
+```
-- recipe
+#### Recipe
- set the model, data path and output_dir
+set the model, data path and output_dir
- ```shell
- nohup bash test_rtf.sh &> log.txt &
- ```
+```shell
+nohup bash test_rtf.sh &> log.txt &
+```
## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index e85e08a..3f4e762 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -35,22 +35,114 @@
# pip install -e ./ -i https://mirror.sjtu.edu.cn/pypi/web/simple
```
-## Run the demo
-- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`.
+## Inference with runtime
+
+### Speech Recognition
+#### Paraformer
+ ```python
+ from funasr_onnx import Paraformer
+
+ model_dir = "./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+ model = Paraformer(model_dir, batch_size=1)
+
+ wav_path = ['./export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+
+ result = model(wav_path)
+ print(result)
+ ```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
- Input: wav formt file, support formats: `str, np.ndarray, List[str]`
-- Output: `List[str]`: recognition result.
-- Example:
- ```python
- from funasr_onnx import Paraformer
+- Output: `List[str]`: recognition result
- model_dir = "/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- model = Paraformer(model_dir, batch_size=1)
+#### Paraformer-online
- wav_path = ['/nfs/zhifu.gzf/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/example/asr_example.wav']
+### Voice Activity Detection
+#### FSMN-VAD
+```python
+from funasr_onnx import Fsmn_vad
- result = model(wav_path)
- print(result)
- ```
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad(model_dir)
+
+result = model(wav_path)
+print(result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, support formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: voice activity detection result
+
+#### FSMN-VAD-online
+```python
+from funasr_onnx import Fsmn_vad_online
+import soundfile
+
+
+model_dir = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch"
+wav_path = "./export/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/example/vad_example.wav"
+model = Fsmn_vad_online(model_dir)
+
+
+##online vad
+speech, sample_rate = soundfile.read(wav_path)
+speech_length = speech.shape[0]
+#
+sample_offset = 0
+step = 1600
+param_dict = {'in_cache': []}
+for sample_offset in range(0, speech_length, min(step, speech_length - sample_offset)):
+ if sample_offset + step >= speech_length - 1:
+ step = speech_length - sample_offset
+ is_final = True
+ else:
+ is_final = False
+ param_dict['is_final'] = is_final
+ segments_result = model(audio_in=speech[sample_offset: sample_offset + step],
+ param_dict=param_dict)
+ if segments_result:
+ print(segments_result)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: wav format file, support formats: `str, np.ndarray, List[str]`
+- Output: `List[str]`: voice activity detection result
+
+### Punctuation Restoration
+#### CT-Transformer
+```python
+from funasr_onnx import CT_Transformer
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch"
+model = CT_Transformer(model_dir)
+
+text_in="璺ㄥ娌虫祦鏄吇鑲叉部宀镐汉姘戠殑鐢熷懡涔嬫簮闀挎湡浠ユ潵涓哄府鍔╀笅娓稿湴鍖洪槻鐏惧噺鐏句腑鏂规妧鏈汉鍛樺湪涓婃父鍦板尯鏋佷负鎭跺姡鐨勮嚜鐒舵潯浠朵笅鍏嬫湇宸ㄥぇ鍥伴毦鐢氳嚦鍐掔潃鐢熷懡鍗遍櫓鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦闂涓婄殑鍏冲垏鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒跺嚒鏄腑鏂硅兘鍋氱殑鎴戜滑閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛﹁鍒掑拰璁鸿瘉鍏奸【涓婁笅娓哥殑鍒╃泭"
+result = model(text_in)
+print(result[0])
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: text string, support format: `str`
+- Output: `List[str]`: punctuation restoration result
+
+#### CT-Transformer-online
+```python
+from funasr_onnx import CT_Transformer_VadRealtime
+
+model_dir = "./export/damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727"
+model = CT_Transformer_VadRealtime(model_dir)
+
+text_in = "璺ㄥ娌虫祦鏄吇鑲叉部宀竱浜烘皯鐨勭敓鍛戒箣婧愰暱鏈熶互鏉ヤ负甯姪涓嬫父鍦板尯闃茬伨鍑忕伨涓柟鎶�鏈汉鍛榺鍦ㄤ笂娓稿湴鍖烘瀬涓烘伓鍔g殑鑷劧鏉′欢涓嬪厠鏈嶅法澶у洶闅剧敋鑷冲啋鐫�鐢熷懡鍗遍櫓|鍚戝嵃鏂规彁渚涙睕鏈熸按鏂囪祫鏂欏鐞嗙揣鎬ヤ簨浠朵腑鏂归噸瑙嗗嵃鏂瑰湪璺ㄥ娌虫祦>闂涓婄殑鍏冲垏|鎰挎剰杩涗竴姝ュ畬鍠勫弻鏂硅仈鍚堝伐浣滄満鍒秥鍑℃槸|涓柟鑳藉仛鐨勬垜浠瑋閮戒細鍘诲仛鑰屼笖浼氬仛寰楁洿濂芥垜璇峰嵃搴︽湅鍙嬩滑鏀惧績涓浗鍦ㄤ笂娓哥殑|浠讳綍寮�鍙戝埄鐢ㄩ兘浼氱粡杩囩瀛瑙勫垝鍜岃璇佸吋椤句笂涓嬫父鐨勫埄鐩�"
+
+vads = text_in.split("|")
+rec_result_all=""
+param_dict = {"cache": []}
+for vad in vads:
+ result = model(vad, param_dict=param_dict)
+ rec_result_all += result[0]
+
+print(rec_result_all)
+```
+- Model_dir: the model path, which contains `model.onnx`, `config.yaml`, `am.mvn`
+- Input: text string, support format: `str`
+- Output: `List[str]`: punctuation restoration result
## Performance benchmark
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 9574a0d..9c4af41 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -186,9 +186,6 @@
logging.warning("No keep_nbest_models is given. Change to [1]")
trainer_options.keep_nbest_models = [1]
keep_nbest_models = trainer_options.keep_nbest_models
-
- #assert batch_interval is set and >0
- assert trainer_options.batch_interval > 0
output_dir = Path(trainer_options.output_dir)
reporter = Reporter()
--
Gitblit v1.9.1