From b27f0a4691e3798283e0841e027f422d5920d7cf Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 15 二月 2023 16:23:29 +0800
Subject: [PATCH] Merge pull request #112 from alibaba-damo-academy/dev_wjm

---
 docs_cn/index.rst            |    1 
 docs/get_started.md          |   43 +++---
 docs/installation.md         |   30 ++--
 docs_cn/get_started.md       |    3 
 docs/index.rst               |    1 
 docs_cn/build_task.md        |  105 +++++++++++++++++
 docs_cn/modelscope_usages.md |   16 +-
 docs/build_task.md           |  106 +++++++++++++++++
 docs/modelscope_usages.md    |   53 ++++++++
 9 files changed, 312 insertions(+), 46 deletions(-)

diff --git a/docs/build_task.md b/docs/build_task.md
new file mode 100644
index 0000000..a45c820
--- /dev/null
+++ b/docs/build_task.md
@@ -0,0 +1,106 @@
+# Build custom tasks
+FunASR is similar to ESPNet, which applies `Task` as the general interface to achieve the training and inference of models. Each `Task` is a class inherited from `AbsTask` and its corresponding code can be seen in `funasr/tasks/abs_task.py`. The main functions of `AbsTask` are shown as follows:
+```python
+class AbsTask(ABC):
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        pass
+    
+    @classmethod
+    def build_preprocess_fn(cls, args, train):
+        (...)
+    
+    @classmethod
+    def build_collate_fn(cls, args: argparse.Namespace):
+        (...)
+
+    @classmethod
+    def build_model(cls, args):
+        (...)
+    
+    @classmethod
+    def main(cls, args):
+        (...)
+```
+- add_task_arguments: add parameters required by a specified `Task`
+- build_preprocess_fn: define how to preprocess samples
+- build_collate_fn: define how to combine multiple samples into a `batch`
+- build_model: define the model
+- main: training interface, starting training through `Task.main()`
+
+Next, we take the speech recognition as an example to introduce how to define a new `Task`. For the corresponding code, please see `ASRTask` in `funasr/tasks/asr.py`. The procedure of defining a new `Task` is actually the procedure of redefining the above functions according to the requirements of the specified `Task`.
+
+- add_task_arguments
+```python
+@classmethod
+def add_task_arguments(cls, parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(description="Task related")
+    group.add_argument(
+        "--token_list",
+        type=str_or_none,
+        default=None,
+        help="A text mapping int-id to token",
+    )
+    (...)
+```
+For speech recognition tasks, specific parameters required include `token_list`, etc. According to the specific requirements of different tasks, users can define corresponding parameters in this function.
+
+- build_preprocess_fn
+```python
+@classmethod
+def build_preprocess_fn(cls, args, train):
+    if args.use_preprocessor:
+        retval = CommonPreprocessor(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    ...
+                )
+    else:
+        retval = None
+    return retval
+```
+This function defines how to preprocess samples. Specifically, the input of speech recognition tasks includes speech and text. For speech, functions such as (optional) adding noise and reverberation to the speech are supported. For text, functions such as (optional) processing text according to bpe and mapping text to `tokenid` are supported. Users can choose the preprocessing operation that needs to be performed on the sample. For the detail implementation, please refer to `CommonPreprocessor`.
+
+- build_collate_fn
+```python
+@classmethod
+def build_collate_fn(cls, args, train):
+    return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+```
+This function defines how to combine multiple samples into a `batch`. For speech recognition tasks, `padding` is employed to obtain equal-length data from different speech and text. Specifically, we set `0.0` as the default padding value for speech and `-1` as the default padding value for text. Users can define different `batch` operations here. For the detail implementation, please refer to `CommonCollateFn`.
+
+- build_model
+```python
+@classmethod
+def build_model(cls, args, train):
+    with open(args.token_list, encoding="utf-8") as f:
+        token_list = [line.rstrip() for line in f]
+        vocab_size = len(token_list)
+        frontend = frontend_class(**args.frontend_conf)
+        specaug = specaug_class(**args.specaug_conf)
+        normalize = normalize_class(**args.normalize_conf)
+        preencoder = preencoder_class(**args.preencoder_conf)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
+        decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size,  **args.decoder_conf)
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+    return model
+```
+This function defines the detail of the model. For different speech recognition models, the same speech recognition `Task` can usually be shared and the remaining thing needed to be done is to define a specific model in this function. For example, a speech recognition model with a standard encoder-decoder structure has been shown above. Specifically, it first defines each module of the model, including encoder, decoder, etc. and then combine these modules together to generate a complete model. In FunASR, the model needs to inherit `AbsESPnetModel` and the corresponding code can be seen in `funasr/train/abs_espnet_model.py`. The main function needed to be implemented is the `forward` function.
\ No newline at end of file
diff --git a/docs/get_started.md b/docs/get_started.md
index 9f01fdc..4a7d86e 100644
--- a/docs/get_started.md
+++ b/docs/get_started.md
@@ -1,21 +1,21 @@
 # Get Started
-This is an easy example which introduces how to train a paraformer model on AISHELL-1 data from scratch. According to this example, you can train other models (conformer, paraformer, etc.) on other datasets (AISHELL-1, AISHELL-2, etc.) similarly.
+Here we take "Training a paraformer model from scratch using the AISHELL-1 dataset" as an example to introduce how to use FunASR. According to this example, users can similarly employ other datasets (such as AISHELL-2 dataset, etc.) to train other models (such as conformer, transformer, etc.).
 
 ## Overall Introduction
-We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 data  . This recipe consists of five stages and support training on multiple GPUs and decoding by CPU or GPU. Before introduce each stage in detail, we first explain several variables which should be set by users.
+We provide a recipe `egs/aishell/paraformer/run.sh` for training a paraformer model on AISHELL-1 dataset. This recipe consists of five stages, supporting training on multiple GPUs and decoding by CPU or GPU. Before introducing each stage in detail, we first explain several parameters which should be set by users.
 - `CUDA_VISIBLE_DEVICES`: visible gpu list
 - `gpu_num`: the number of GPUs used for training
 - `gpu_inference`: whether to use GPUs for decoding
-- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU.
-- `feats_dir`: the path to save processed data
-- `exp_dir`: the path to save experimental results
-- `data_aishell`: the path of raw AISHELL-1 data
-- `tag`: the suffix of experimental result directory
+- `njob`: for CPU decoding, indicating the total number of CPU jobs; for GPU decoding, indicating the number of jobs on each GPU
+- `data_aishell`: the raw path of AISHELL-1 dataset
+- `feats_dir`: the path for saving processed data
 - `nj`: the number of jobs for data preparation
 - `speed_perturb`: the range of speech perturbed
+- `exp_dir`: the path for saving experimental results
+- `tag`: the suffix of experimental result directory
 
 ## Stage 0: Data preparation
-This stage processes raw AISHELL-1 data `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx` and `xxx` means `train/dev/test`. Here we assume you have already downloaded AISHELL-1 data. If not, you can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. Here we show examples for `wav.scp` and `text`, separately.
+This stage processes raw AISHELL-1 dataset `$data_aishell` and generates the corresponding `wav.scp` and `text` in `$feats_dir/data/xxx`. `xxx` means `train/dev/test`. Here we assume users have already downloaded AISHELL-1 dataset. If not, users can download data [here](https://www.openslr.org/33/) and set the path for `$data_aishell`. The examples of `wav.scp` and `text` are as follows:
 * `wav.scp`
 ```
 BAC009S0002W0122 /nfs/ASR_DATA/AISHELL-1/data_aishell/wav/train/S0002/BAC009S0002W0122.wav
@@ -30,17 +30,17 @@
 BAC009S0002W0124 鑷� 鍏� 鏈� 搴� 鍛� 鍜� 娴� 鐗� 甯� 鐜� 鍏� 瀹� 甯� 鍙� 娑� 闄� 璐� 鍚�
 ...
 ```
-We can see that these two files both have two columns while the first column is the wav-id and the second column is the corresponding wav-path/label tokens.
+These two files both have two columns, while the first column is wav ids and the second column is the corresponding wav paths/label tokens.
 
 ## Stage 1: Feature Generation
-This stage extracts FBank feature from raw wav `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. You can set `nj` to control the number of jobs for feature generation. The output features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
+This stage extracts FBank features from `wav.scp` and apply speed perturbation as data augmentation according to `speed_perturb`. Users can set `nj` to control the number of jobs for feature generation. The generated features are saved in `$feats_dir/dump/xxx/ark` and the corresponding `feats.scp` files are saved as `$feats_dir/dump/xxx/feats.scp`. An example of `feats.scp` can be seen as follows:
 * `feats.scp`
 ```
 ...
-BAC009S0002W0122_sp0.9 /nfs/haoneng.lhn/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
+BAC009S0002W0122_sp0.9 /nfs/funasr_data/aishell-1/dump/fbank/train/ark/feats.16.ark:592751055
 ...
 ```
-Note that samples in this file have already been shuffled. This file contains two columns. The first column is the wav-id while the second column is the kaldi-ark feature path. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
+Note that samples in this file have already been shuffled randomly. This file contains two columns. The first column is wav ids while the second column is kaldi-ark feature paths. Besides, `speech_shape` and `text_shape` are also generated in this stage, denoting the speech feature shape and text length of each sample. The examples are shown as follows:
 * `speech_shape`
 ```
 ...
@@ -53,10 +53,10 @@
 BAC009S0002W0122_sp0.9 15
 ...
 ```
-These two files have two columns. The first column is the wav-id and the second column is the corresponding speech feature shape and text length.
+These two files have two columns. The first column is wav ids and the second column is the corresponding speech feature shape and text length.
 
 ## Stage 2: Dictionary Preparation
-This stage prepares a dictionary, which is used as a mapping between label characters and integer indices during ASR training. The output dictionary file is saved as `$feats_dir/data/$lang_toekn_list/$token_type/tokens.txt`. Here we show an example of `tokens.txt` as follows:
+This stage processes the dictionary, which is used as a mapping between label characters and integer indices during ASR training. The processed dictionary file is saved as `$feats_dir/data/$lang_token_list/$token_type/tokens.txt`. An example of `tokens.txt` is as follows:
 * `tokens.txt`
 ```
 <blank>
@@ -75,7 +75,7 @@
 * `<unk>`: indicates the out-of-vocabulary token
 
 ## Stage 3: Training
-This stage achieves the training of the specified model. To start training, you should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
+This stage achieves the training of the specified model. To start training, users should manually set `exp_dir`, `CUDA_VISIBLE_DEVICES` and `gpu_num`, which have already been explained above. By default, the best `$keep_nbest_models` checkpoints on validation dataset will be averaged to generate a better model and adopted for decoding.
 
 * DDP Training
 
@@ -83,30 +83,29 @@
 
 * DataLoader
 
-[comment]: <> (We support two types of DataLoaders for small and large datasets, respectively. By default, the small DataLoader is used and you can set `dataset_type=large` to enable large DataLoader. For small DataLoader, )
-We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and you can set `dataset_type=large` to enable it. 
+We support an optional iterable-style DataLoader based on [Pytorch Iterable-style DataPipes](https://pytorch.org/data/beta/torchdata.datapipes.iter.html) for large dataset and users can set `dataset_type=large` to enable it. 
 
 * Configuration
 
-The parameters of the training, including model, optimization, dataset, etc., are specified by a YAML file in `conf` directory. Also, you can directly specify the parameters in `run.sh` recipe. Please avoid to specify the same parameters in both the YAML file and the recipe.
+The parameters of the training, including model, optimization, dataset, etc., can be set by a YAML file in `conf` directory. Also, users can directly set the parameters in `run.sh` recipe. Please avoid to set the same parameters in both the YAML file and the recipe.
 
 * Training Steps
 
-We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of the two parameters, the training will be stopped.
+We support two parameters to specify the training steps, namely `max_epoch` and `max_update`. `max_epoch` indicates the total training epochs while `max_update` indicates the total training steps. If these two parameters are specified at the same time, once the training reaches any one of these two parameters, the training will be stopped.
 
 * Tensorboard
 
-You can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
+Users can use tensorboard to observe the loss, learning rate, etc. Please run the following command:
 ```
 tensorboard --logdir ${exp_dir}/exp/${model_dir}/tensorboard/train
 ```
 
 ## Stage 4: Decoding
-This stage generates the recognition results with acoustic features as input and calculate the `CER` to verify the performance of the trained model. 
+This stage generates the recognition results and calculates the `CER` to verify the performance of the trained model. 
 
 * Mode Selection
 
-As we support conformer, paraformer and uniasr in FunASR and they have different inference interfaces, a `mode` param is specified as `asr/paraformer/uniase` according to the trained model.
+As we support paraformer, uniasr, conformer and other models in FunASR, a `mode` parameter should be specified as `asr/paraformer/uniasr` according to the trained model.
 
 * Configuration
 
diff --git a/docs/index.rst b/docs/index.rst
index ce44488..d29b500 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -16,6 +16,7 @@
    ./installation.md
    ./papers.md
    ./get_started.md
+   ./build_task.md
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs/installation.md b/docs/installation.md
index 61d06b5..fb26913 100755
--- a/docs/installation.md
+++ b/docs/installation.md
@@ -1,35 +1,35 @@
 # Installation
-FunASR is easy to install, which is mainly based on python packages.
+FunASR is easy to install. The detailed installation steps are as follows:
 
-- Clone the repo
-``` sh
-git clone https://github.com/alibaba/FunASR.git
-```
-
-- Install Conda
-``` sh
+- Install Conda and create virtual environment:
+```sh
 wget https://repo.continuum.io/miniconda/Miniconda3-latest-Linux-x86_64.sh
 sh Miniconda3-latest-Linux-x86_64.sh
+source ~/.bashrc
 conda create -n funasr python=3.7
 conda activate funasr
 ```
 
 - Install Pytorch (version >= 1.7.0):
-
-| cuda  | |
-|:-----:| --- |
-|  9.2  | conda install pytorch==1.7.0 torchvision==0.8.0 torchaudio==0.7.0 cudatoolkit=9.2 -c pytorch |
-| 10.2  | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=10.2 -c pytorch |
-| 11.1  | conda install pytorch==1.8.0 torchvision==0.9.0 torchaudio==0.8.0 cudatoolkit=11.1 -c pytorch |
+```sh
+pip install torch torchaudio
+```
 
 For more versions, please see [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
 
 - Install ModelScope
+
+For users in China, you can configure the following mirror source to speed up the downloading:
 ``` sh
+pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+Install or update ModelScope
+```sh
 pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 ```
 
-- Install other packages
+- Clone the repo and install other packages
 ``` sh
+git clone https://github.com/alibaba/FunASR.git && cd FunASR
 pip install --editable ./
 ```
\ No newline at end of file
diff --git a/docs/modelscope_usages.md b/docs/modelscope_usages.md
new file mode 100644
index 0000000..af8d6da
--- /dev/null
+++ b/docs/modelscope_usages.md
@@ -0,0 +1,53 @@
+# 蹇�熶娇鐢∕odelScope
+ModelScope is an open-source model-as-service platform supported by Alibaba, which provides flexible and convenient model applications for users in academia and industry. For specific usages and open source models, please refer to [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition). In the domain of speech, we provide autoregressive/non-autoregressive speech recognition, speech pre-training, punctuation prediction and other models, which are convenient for users.
+
+## Overall Introduction
+We provide the usages of different models under the `egs_modelscope` directory, which supports directly employing our provided models for inference, as well as finetuning the models we provided as pre-trained initial models. Next, we will introduce the model provided in the `egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch` directory, including `infer.py`, `finetune.py` and `infer_after_finetune.py`. The corresponding functions are as follows:
+- `infer.py`: perform inference on the specified dataset based on our provided model
+- `finetune.py`: employ our provided model as the initial model for finetuning
+- `infer_after_finetune.py`: perform inference on the specified dataset based on the finetuned model
+
+## Inference
+We provide `infer.py` to achieve the inference. Based on this file, users can perform inference on the specified dataset based on our provided model and obtain the corresponding recognition results. If the transcript is given, the `CER` will be calculated at the same time. Before performing inference, users can set the following parameters to modify the inference configuration:
+* `data_dir`: dataset directory. The directory should contain the wav list file `wav.scp` and the transcript file `text` (optional). For the format of these two files, please refer to the instructions in [Quick Start](./get_started.md). If the `text` file exists, the CER will be calculated accordingly, otherwise it will be skipped.
+* `output_dir`: the directory for saving the inference results
+* `batch_size`: batch size during the inference
+* `ctc_weight`: some models contain a CTC module, users can set this parameter to specify the weight of the CTC module during the inference
+
+In addition to directly setting parameters in `infer.py`, users can also manually set the parameters in the `decoding.yaml` file in the model download directory to modify the inference configuration.
+
+## Finetuning
+We provide `finetune.py` to achieve the finetuning. Based on this file, users can finetune on the specified dataset based on our provided model as the initial model to achieve better performance in the specified domain. Before finetuning, users can set the following parameters to modify the finetuning configuration:
+* `data_path`锛歞ataset directory銆俆his directory should contain the `train` directory for saving the training set and the `dev` directory for saving the validation set. Each directory needs to contain the wav list file `wav.scp` and the transcript file `text`
+* `output_dir`锛歵he directory for saving the finetuning results
+* `dataset_type`锛歠or small dataset锛宻et as `small`锛沠or dataset larger than 1000 hours锛宻et as `large`
+* `batch_bins`锛歜atch size锛宨f dataset_type is set as `small`锛宼he unit of batch_bins is the number of fbank feature frames; if dataset_type is set as `large`, the unit of batch_bins is milliseconds
+* `max_epoch`锛歵he maximum number of training epochs
+
+The following parameters can also be set. However, if there is no special requirement, users can ignore these parameters and use the default value we provided directly:
+* `accum_grad`锛歵he accumulation of the gradient
+* `keep_nbest_models`锛歴elect the `keep_nbest_models` models with the best performance and average the parameters 
+  of these models to get a better model
+* `optim`锛歴et the optimizer
+* `lr`锛歴et the learning rate
+* `scheduler`锛歴et learning rate adjustment strategy
+* `scheduler_conf`锛歴et the related parameters of the learning rate adjustment strategy
+* `specaug`锛歴et for the spectral augmentation
+* `specaug_conf`锛歴et related parameters of the spectral augmentation
+
+In addition to directly setting parameters in `finetune.py`, users can also manually set the parameters in the `finetune.yaml` file in the model download directory to modify the finetuning configuration.
+
+## Inference after Finetuning
+We provide `infer_after_finetune.py` to achieve the inference based on the model finetuned by users. Based on this file, users can perform inference on the specified dataset based on the finetuned model and obtain the corresponding recognition results. If the transcript is given, the `CER` will be calculated at the same time. Before performing inference, users can set the following parameters to modify the inference configuration:
+* `data_dir`: dataset directory. The directory should contain the wav list file `wav.scp` and the transcript file `text` (optional). If the `text` file exists, the CER will be calculated accordingly, otherwise it will be skipped.
+* `output_dir`: the directory for saving the inference results
+* `batch_size`: batch size during the inference
+* `ctc_weight`: some models contain a CTC module, users can set this parameter to specify the weight of the CTC module during the inference
+* `decoding_model_name`: set the name of the model used for the inference
+
+The following parameters can also be set. However, if there is no special requirement, users can ignore these parameters and use the default value we provided directly:
+* `modelscope_model_name`: the initial model name used when finetuning
+* `required_files`: files required for the inference when using the modelscope interface
+
+## Announcements
+Some models may have other unique parameters during the finetuning and inference. The specific usages of these parameters can be found in the `README.md` file in the corresponding directory.
\ No newline at end of file
diff --git a/docs_cn/build_task.md b/docs_cn/build_task.md
new file mode 100644
index 0000000..5d78993
--- /dev/null
+++ b/docs_cn/build_task.md
@@ -0,0 +1,105 @@
+# 鎼缓鑷畾涔変换鍔�
+FunASR绫讳技ESPNet锛屼互`Task`涓洪�氱敤鎺ュ彛锛屼粠鑰屽疄鐜版ā鍨嬬殑璁粌鍜屾帹鐞嗐�傛瘡涓�涓猔Task`鏄竴涓被锛屽叾闇�瑕佺户鎵縛AbsTask`锛屽叾瀵瑰簲鐨勫叿浣撲唬鐮佽`funasr/tasks/abs_task.py`銆備笅闈㈢粰鍑哄叾鍖呭惈鐨勪富瑕佸嚱鏁板強鍔熻兘浠嬬粛锛�
+```python
+class AbsTask(ABC):
+    @classmethod
+    def add_task_arguments(cls, parser: argparse.ArgumentParser):
+        pass
+    
+    @classmethod
+    def build_preprocess_fn(cls, args, train):
+        (...)
+    
+    @classmethod
+    def build_collate_fn(cls, args: argparse.Namespace):
+        (...)
+
+    @classmethod
+    def build_model(cls, args):
+        (...)
+    
+    @classmethod
+    def main(cls, args):
+        (...)
+```
+- add_task_arguments锛氭坊鍔犵壒瀹歚Task`闇�瑕佺殑鍙傛暟
+- build_preprocess_fn锛氬畾涔夊浣曞鐞嗗鏍锋湰杩涜棰勫鐞�
+- build_collate_fn锛氬畾涔夊浣曞皢澶氫釜鏍锋湰缁勬垚涓�涓猔batch`
+- build_model锛氬畾涔夋ā鍨�
+- main锛氳缁冨叆鍙o紝閫氳繃`Task.main()`鏉ュ惎鍔ㄨ缁�
+
+涓嬮潰鎴戜滑灏嗕互璇煶璇嗗埆浠诲姟涓轰緥锛屼粙缁嶅浣曞畾涔変竴涓柊鐨刞Task`锛屽叿浣撲唬鐮佽`funasr/tasks/asr.py`涓殑`ASRTask`銆� 瀹氫箟鏂扮殑`Task`鐨勮繃绋嬶紝鍏跺疄灏辨槸鏍规嵁浠诲姟闇�姹傦紝閲嶅畾涔変笂杩板嚱鏁扮殑杩囩▼銆�
+- add_task_arguments
+```python
+@classmethod
+def add_task_arguments(cls, parser: argparse.ArgumentParser):
+    group = parser.add_argument_group(description="Task related")
+    group.add_argument(
+        "--token_list",
+        type=str_or_none,
+        default=None,
+        help="A text mapping int-id to token",
+    )
+    (...)
+```
+瀵逛簬璇煶璇嗗埆浠诲姟锛岄渶瑕佺殑鐗瑰畾鍙傛暟鍖呮嫭`token_list`绛夈�傛牴鎹笉鍚屼换鍔$殑鐗瑰畾闇�姹傦紝鐢ㄦ埛鍙互鍦ㄦ鍑芥暟涓畾涔夌浉搴旂殑鍙傛暟銆�
+
+- build_preprocess_fn
+```python
+@classmethod
+def build_preprocess_fn(cls, args, train):
+    if args.use_preprocessor:
+        retval = CommonPreprocessor(
+                    train=train,
+                    token_type=args.token_type,
+                    token_list=args.token_list,
+                    bpemodel=args.bpemodel,
+                    non_linguistic_symbols=args.non_linguistic_symbols,
+                    text_cleaner=args.cleaner,
+                    ...
+                )
+    else:
+        retval = None
+    return retval
+```
+璇ュ嚱鏁板畾涔変簡濡備綍瀵规牱鏈繘琛岄澶勭悊銆傚叿浣撳湴锛岃闊宠瘑鍒换鍔$殑杈撳叆鍖呮嫭闊抽鍜屾妱鏈�傚浜庨煶棰戯紝鍦ㄦ瀹炵幇浜�(鍙��)瀵归煶棰戝姞鍣0锛屽姞娣峰搷绛夊姛鑳斤紱瀵逛簬鎶勬湰锛屽湪姝ゅ疄鐜颁簡(鍙��)鏍规嵁bpe澶勭悊鎶勬湰锛屽皢鎶勬湰鏄犲皠鎴恅tokenid`绛夊姛鑳姐�傜敤鎴峰彲浠ヨ嚜宸遍�夋嫨闇�瑕佸鏍锋湰杩涜鐨勯澶勭悊鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonPreprocessor`銆�
+
+- build_collate_fn
+```python
+@classmethod
+def build_collate_fn(cls, args, train):
+    return CommonCollateFn(float_pad_value=0.0, int_pad_value=-1)
+```
+璇ュ嚱鏁板畾涔変簡濡備綍灏嗗涓牱鏈粍鎴愪竴涓猔batch`銆傚浜庤闊宠瘑鍒换鍔★紝鍦ㄦ瀹炵幇鐨勬槸灏嗕笉鍚岀殑闊抽鍜屾妱鏈紝閫氳繃`padding`鐨勬柟寮忔潵寰楀埌绛夐暱鐨勬暟鎹�傚叿浣撳湴锛屾垜浠粯璁ょ敤`0.0`鏉ヤ綔涓洪煶棰戠殑濉厖鍊硷紝鐢╜-1`浣滀负鎶勬湰鐨勯粯璁ゅ~鍏呭�笺�傜敤鎴峰彲浠ュ湪姝ゅ畾涔変笉鍚岀殑缁刞batch`鎿嶄綔锛屽疄鐜版柟娉曞彲浠ュ弬鑰僠CommonCollateFn`銆�
+
+- build_model
+```python
+@classmethod
+def build_model(cls, args, train):
+    with open(args.token_list, encoding="utf-8") as f:
+        token_list = [line.rstrip() for line in f]
+        vocab_size = len(token_list)
+        frontend = frontend_class(**args.frontend_conf)
+        specaug = specaug_class(**args.specaug_conf)
+        normalize = normalize_class(**args.normalize_conf)
+        preencoder = preencoder_class(**args.preencoder_conf)
+        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+        postencoder = postencoder_class(input_size=encoder_output_size, **args.postencoder_conf)
+        decoder = decoder_class(vocab_size=vocab_size, encoder_output_size=encoder_output_size,  **args.decoder_conf)
+        ctc = CTC(odim=vocab_size, encoder_output_size=encoder_output_size, **args.ctc_conf)
+        model = model_class(
+            vocab_size=vocab_size,
+            frontend=frontend,
+            specaug=specaug,
+            normalize=normalize,
+            preencoder=preencoder,
+            encoder=encoder,
+            postencoder=postencoder,
+            decoder=decoder,
+            ctc=ctc,
+            token_list=token_list,
+            **args.model_conf,
+        )
+    return model
+```
+璇ュ嚱鏁板畾涔変簡鍏蜂綋鐨勬ā鍨嬨�傚浜庝笉鍚岀殑璇煶璇嗗埆妯″瀷锛屽線寰�鍙互鍏辩敤鍚屼竴涓闊宠瘑鍒玚Task`锛岄澶栭渶瑕佸仛鐨勬槸鍦ㄦ鍑芥暟涓畾涔夌壒瀹氱殑妯″瀷銆備緥濡傦紝杩欓噷缁欏嚭鐨勬槸涓�涓爣鍑嗙殑encoder-decoder缁撴瀯鐨勮闊宠瘑鍒ā鍨嬨�傚叿浣撳湴锛屽厛瀹氫箟璇ユā鍨嬬殑鍚勪釜妯″潡锛屽寘鎷琫ncoder锛宒ecoder绛夛紝鐒跺悗鍦ㄥ皢杩欎簺妯″潡缁勫悎鍦ㄤ竴璧峰緱鍒颁竴涓畬鏁寸殑妯″瀷銆傚湪FunASR涓紝妯″瀷闇�瑕佺户鎵縛AbsESPnetModel`锛屽叾鍏蜂綋浠g爜瑙乣funasr/train/abs_espnet_model.py`锛屼富瑕侀渶瑕佸疄鐜扮殑鏄痐forward`鍑芥暟銆�
diff --git a/docs_cn/get_started.md b/docs_cn/get_started.md
index 6e077e0..9e1c236 100644
--- a/docs_cn/get_started.md
+++ b/docs_cn/get_started.md
@@ -106,7 +106,8 @@
 鏈樁娈电敤浜庤В鐮佸緱鍒拌瘑鍒粨鏋滐紝鍚屾椂璁$畻CER鏉ラ獙璇佽缁冨緱鍒扮殑妯″瀷鎬ц兘銆�
 
 * Mode Selection
-鐢变簬鎴戜滑鎻愪緵浜唒araformer锛寀niasr鍜宑onformer绛夋ā鍨嬶紝鍥犳鍦ㄨВ鐮佹椂锛岄渶瑕佹寚瀹氱浉搴旂殑瑙g爜妯″紡銆傚搴旂殑鍙傛暟涓篳mode`锛岀浉搴旂殑鍙�夎缃负`asr/paraformer/uniase`绛夈��
+
+鐢变簬鎴戜滑鎻愪緵浜唒araformer锛寀niasr鍜宑onformer绛夋ā鍨嬶紝鍥犳鍦ㄨВ鐮佹椂锛岄渶瑕佹寚瀹氱浉搴旂殑瑙g爜妯″紡銆傚搴旂殑鍙傛暟涓篳mode`锛岀浉搴旂殑鍙�夎缃负`asr/paraformer/uniasr`绛夈��
 
 * Configuration
 
diff --git a/docs_cn/index.rst b/docs_cn/index.rst
index e0283ae..4a898e9 100644
--- a/docs_cn/index.rst
+++ b/docs_cn/index.rst
@@ -16,6 +16,7 @@
    ./installation.md
    ./papers.md
    ./get_started.md
+   ./build_task.md
 
 .. toctree::
    :maxdepth: 1
diff --git a/docs_cn/modelscope_usages.md b/docs_cn/modelscope_usages.md
index 6e1420a..c7c6de1 100644
--- a/docs_cn/modelscope_usages.md
+++ b/docs_cn/modelscope_usages.md
@@ -2,13 +2,13 @@
 ModelScope鏄樋閲屽反宸存帹鍑虹殑寮�婧愭ā鍨嬪嵆鏈嶅姟鍏变韩骞冲彴锛屼负骞垮ぇ瀛︽湳鐣岀敤鎴峰拰宸ヤ笟鐣岀敤鎴锋彁渚涚伒娲汇�佷究鎹风殑妯″瀷搴旂敤鏀寔銆傚叿浣撶殑浣跨敤鏂规硶鍜屽紑婧愭ā鍨嬪彲浠ュ弬瑙乕ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition) 銆傚湪璇煶鏂瑰悜锛屾垜浠彁渚涗簡鑷洖褰�/闈炶嚜鍥炲綊璇煶璇嗗埆锛岃闊抽璁粌锛屾爣鐐归娴嬬瓑妯″瀷锛岀敤鎴峰彲浠ユ柟渚夸娇鐢ㄣ��
 
 ## 鏁翠綋浠嬬粛
-鎴戜滑鍦╡gs_modelscope鐩綍涓嬫彁渚涗簡鐩稿叧妯″瀷鐨勪娇鐢紝鏀寔鐩存帴鐢ㄦ垜浠彁渚涚殑妯″瀷杩涜鎺ㄧ悊锛屽悓鏃朵篃鏀寔灏嗘垜浠彁渚涚殑妯″瀷浣滀负棰勮缁冨ソ鐨勬ā鍨嬩綔涓哄垵濮嬫ā鍨嬭繘琛屽井璋冦�備笅闈紝鎴戜滑灏嗕互egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch鐩綍涓彁渚涚殑妯″瀷鏉ヨ繘琛屼粙缁嶏紝鍖呮嫭`infer.py`锛宍finetune.py`鍜宍infer_after_finetune.py`锛屽搴旂殑鍔熻兘濡備笅锛�
+鎴戜滑鍦╜egs_modelscope` 鐩綍涓嬫彁渚涗簡涓嶅悓妯″瀷鐨勪娇鐢ㄦ柟娉曪紝鏀寔鐩存帴鐢ㄦ垜浠彁渚涚殑妯″瀷杩涜鎺ㄧ悊锛屽悓鏃朵篃鏀寔灏嗘垜浠彁渚涚殑妯″瀷浣滀负棰勮缁冨ソ鐨勫垵濮嬫ā鍨嬭繘琛屽井璋冦�備笅闈紝鎴戜滑灏嗕互`egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch`鐩綍涓彁渚涚殑妯″瀷鏉ヨ繘琛屼粙缁嶏紝鍖呮嫭`infer.py`锛宍finetune.py`鍜宍infer_after_finetune.py`锛屽搴旂殑鍔熻兘濡備笅锛�
 - `infer.py`: 鍩轰簬鎴戜滑鎻愪緵鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
 - `finetune.py`: 灏嗘垜浠彁渚涚殑妯″瀷浣滀负鍒濆妯″瀷杩涜寰皟
 - `infer_after_finetune.py`: 鍩轰簬寰皟寰楀埌鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞�
 
 ## 妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer.py`鏉ュ疄鐜版ā鍨嬫帹鐞嗐�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷锛屽鎸囧畾鐨勬暟鎹泦杩涜鎺ㄧ悊锛屽緱鍒扮浉搴旂殑璇嗗埆缁撴灉銆傚鏋滃悓鏃剁粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠桟ER銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
+鎴戜滑鎻愪緵浜哷infer.py`鏉ュ疄鐜版ā鍨嬫帹鐞嗐�傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庢垜浠彁渚涚殑妯″瀷锛屽鎸囧畾鐨勬暟鎹泦杩涜鎺ㄧ悊锛屽緱鍒扮浉搴旂殑璇嗗埆缁撴灉銆傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠梎CER`銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
 * `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)锛屽叿浣撴牸寮忓彲浠ュ弬瑙乕蹇�熷紑濮媇(./get_started.md)涓殑璇存槑銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
 * `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
 * `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
@@ -21,14 +21,14 @@
 * `data_path`锛氭暟鎹洰褰曘�傝鐩綍涓嬪簲璇ュ寘鎷瓨鏀捐缁冮泦鏁版嵁鐨刞train`鐩綍鍜屽瓨鏀鹃獙璇侀泦鏁版嵁鐨刞dev`鐩綍銆傛瘡涓洰褰曚腑闇�瑕佸寘鎷煶棰戝垪琛ㄦ枃浠禶wav.scp`鍜屾妱鏈枃浠禶text`
 * `output_dir`锛氬井璋冪粨鏋滀繚瀛樼洰褰�
 * `dataset_type`锛氬浜庡皬鏁版嵁闆嗭紝璁剧疆涓篳small`锛涘綋鏁版嵁閲忓ぇ浜�1000灏忔椂鏃讹紝璁剧疆涓篳large`
-* `batch_bins`锛歜atch size锛屽鏋渄ataset_type璁剧疆涓篳small`锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛涘鏋渄ataset_type=`large`锛宐atch_bins鍗曚綅涓烘绉�
+* `batch_bins`锛歜atch size锛屽鏋渄ataset_type璁剧疆涓篳small`锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛涘鏋渄ataset_type璁剧疆涓篳large`锛宐atch_bins鍗曚綅涓烘绉�
 * `max_epoch`锛氭渶澶х殑璁粌杞暟
 
 浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
 * `accum_grad`锛氭搴︾疮绉�
 * `keep_nbest_models`锛氶�夋嫨鎬ц兘鏈�濂界殑`keep_nbest_models`涓ā鍨嬬殑鍙傛暟杩涜骞冲潎锛屽緱鍒版�ц兘鏇村ソ鐨勬ā鍨�
-* `optim`锛氳缃井璋冩椂鐨勪紭鍖栧櫒
-* `lr`锛氳缃井璋冩椂鐨勫涔犵巼
+* `optim`锛氳缃紭鍖栧櫒
+* `lr`锛氳缃涔犵巼
 * `scheduler`锛氳缃涔犵巼璋冩暣绛栫暐
 * `scheduler_conf`锛氬涔犵巼璋冩暣绛栫暐鐨勭浉鍏冲弬鏁�
 * `specaug`锛氳缃氨澧炲箍
@@ -37,7 +37,7 @@
 闄や簡鐩存帴鍦╜finetune.py`涓缃弬鏁板锛岀敤鎴蜂篃鍙互閫氳繃鎵嬪姩淇敼妯″瀷涓嬭浇鐩綍涓嬬殑`finetune.yaml`鏂囦欢涓殑鍙傛暟鏉ヤ慨鏀瑰井璋冮厤缃��
 
 ## 鍩轰簬寰皟鍚庣殑妯″瀷鎺ㄧ悊
-鎴戜滑鎻愪緵浜哷infer_after_finetune.py`鏉ュ疄鐜板熀浜庣敤鎴疯嚜宸卞井璋冨緱鍒扮殑妯″瀷杩涜鎺ㄧ悊銆傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庡井璋冨悗鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞嗭紝寰楀埌鐩稿簲鐨勮瘑鍒粨鏋溿�傚鏋滃悓鏃剁粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠桟ER銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
+鎴戜滑鎻愪緵浜哷infer_after_finetune.py`鏉ュ疄鐜板熀浜庣敤鎴疯嚜宸卞井璋冨緱鍒扮殑妯″瀷杩涜鎺ㄧ悊銆傚熀浜庢鏂囦欢锛岀敤鎴峰彲浠ュ熀浜庡井璋冨悗鐨勬ā鍨嬶紝瀵规寚瀹氱殑鏁版嵁闆嗚繘琛屾帹鐞嗭紝寰楀埌鐩稿簲鐨勮瘑鍒粨鏋溿�傚鏋滅粰瀹氫簡鎶勬湰锛屽垯浼氬悓鏃惰绠桟ER銆傚湪寮�濮嬫帹鐞嗗墠锛岀敤鎴峰彲浠ユ寚瀹氬涓嬪弬鏁版潵淇敼鎺ㄧ悊閰嶇疆锛�
 * `data_dir`锛氭暟鎹泦鐩綍銆傜洰褰曚笅搴旇鍖呮嫭闊抽鍒楄〃鏂囦欢`wav.scp`鍜屾妱鏈枃浠禶text`(鍙��)銆傚鏋渀text`鏂囦欢瀛樺湪锛屽垯浼氱浉搴旂殑璁$畻CER锛屽惁鍒欎細璺宠繃銆�
 * `output_dir`锛氭帹鐞嗙粨鏋滀繚瀛樼洰褰�
 * `batch_size`锛氭帹鐞嗘椂鐨刡atch澶у皬
@@ -45,8 +45,8 @@
 * `decoding_model_name`锛氭寚瀹氱敤浜庢帹鐞嗙殑妯″瀷鍚�
 
 浠ヤ笅鍙傛暟涔熷彲浠ヨ繘琛岃缃�備絾鏄鏋滄病鏈夌壒鍒殑闇�姹傦紝鍙互蹇界暐锛岀洿鎺ヤ娇鐢ㄦ垜浠粰瀹氱殑榛樿鍊硷細
-* `modelscope_model_name`锛氬井璋冩椂浣跨敤鐨勫垵濮嬫ā鍨�
+* `modelscope_model_name`锛氬井璋冩椂浣跨敤鐨勫垵濮嬫ā鍨嬪悕
 * `required_files`锛氫娇鐢╩odelscope鎺ュ彛杩涜鎺ㄧ悊鏃堕渶瑕佺敤鍒扮殑鏂囦欢
 
 ## 娉ㄦ剰浜嬮」
-閮ㄥ垎妯″瀷鍙兘鍦ㄥ井璋冦�佹帹鐞嗘椂瀛樺湪涓�浜涚壒鏈夌殑鍙傛暟锛岃繖閮ㄥ垎鍙傛暟鍙互鍦ㄥ搴旂洰褰曠殑README.md鏂囦欢涓壘鍒板叿浣撶敤娉曘��
\ No newline at end of file
+閮ㄥ垎妯″瀷鍙兘鍦ㄥ井璋冦�佹帹鐞嗘椂瀛樺湪涓�浜涚壒鏈夌殑鍙傛暟锛岃繖閮ㄥ垎鍙傛暟鍙互鍦ㄥ搴旂洰褰曠殑`README.md`鏂囦欢涓壘鍒板叿浣撶敤娉曘��
\ No newline at end of file

--
Gitblit v1.9.1