From ad128cbe0c1af41363c0fad0b0a291f7bd847bf7 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 27 Apr 2023 19:10:35 +0800
Subject: [PATCH] docs

---
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py             |    1 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md            |    1 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py                                             |    1 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md                                                      |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py               |    1 
 egs_modelscope/vad/TEMPLATE/README.md                                                                             |    4 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py                                                      |    1 
 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py                                  |   11 +
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md                                            |    1 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py              |    0 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh                          |    1 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py                                              |   12 +
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md |  264 +++++++++++++++++++++++++++++++++----
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py                                                        |    0 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh                      |    1 
 egs_modelscope/tp/TEMPLATE/infer.sh                                                                               |    2 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh                                                      |    1 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py                                                |    0 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh               |    1 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py   |   15 ++
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md     |    1 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md        |    1 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py  |   30 +++-
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py             |    1 
 egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh                                             |    1 
 egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh      |    1 
 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh         |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py                          |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py                       |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh             |    1 
 egs_modelscope/tp/TEMPLATE/README.md                                                                              |    8 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py                                                       |    0 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py                                                       |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py                |    0 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh               |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md                     |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py                      |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md              |    1 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md                                                     |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py               |    1 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md            |    1 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh  |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py              |    2 
 egs_modelscope/tp/TEMPLATE/infer.py                                                                               |    0 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py                                                 |    0 
 egs_modelscope/punctuation/TEMPLATE/README.md                                                                     |    7 
 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils     |    1 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh             |    1 
 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh                                                       |    1 
 /dev/null                                                                                                         |   24 ---
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md                         |    1 
 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py                           |    0 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md              |    1 
 egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py                |    0 
 54 files changed, 334 insertions(+), 77 deletions(-)

diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
deleted file mode 100644
index c68a8cd..0000000
--- a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
rename to egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
rename to egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/asr/conformer/speech_conformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py
new file mode 100644
index 0000000..f6026d6
--- /dev/null
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/demo.py
@@ -0,0 +1,11 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
+    model_revision='v3.0.0'
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
\ No newline at end of file
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
deleted file mode 100755
index 333b66a..0000000
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ /dev/null
@@ -1,67 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-
-from funasr.utils.compute_wer import compute_wer
-
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
-    for file_name in params["required_files"]:
-        if file_name == "configuration.json":
-            with open(os.path.join(pretrained_model_path, file_name)) as f:
-                config_dict = json.load(f)
-                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
-            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
-                json.dump(config_dict, f, indent=4, separators=(',', ': '))
-        else:
-            shutil.copy(os.path.join(pretrained_model_path, file_name),
-                        os.path.join(params["output_dir"], file_name))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=params["output_dir"],
-        output_dir=decoding_path,
-        batch_size=1
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if text_in is not None:
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
-        text_proc_file2 = os.path.join(decoding_path, "1best_recog/token_nosep")
-        with open(text_proc_file, 'r') as hyp_reader:
-                with open(text_proc_file2, 'w') as hyp_writer:
-                    for line in hyp_reader:
-                        new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
-                        hyp_writer.write(new_context+'\n')
-        text_in2 = os.path.join(decoding_path, "1best_recog/ref_text_nosep")
-        with open(text_in, 'r') as ref_reader:
-            with open(text_in2, 'w') as ref_writer:
-                for line in ref_reader:
-                    new_context = line.strip().replace("src","").replace("  "," ").replace("  "," ").strip()
-                    ref_writer.write(new_context+'\n')
-
-
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.sp.cer"))
-        compute_wer(text_in2, text_proc_file2, os.path.join(decoding_path, "text.nosp.cer"))
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950"
-    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./example_data/validation"
-    params["decoding_model_name"] = "valid.acc.ave.pb"
-    modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
deleted file mode 100644
index 49c0aeb..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# ModelScope Model
-
-## How to infer using a pretrained Paraformer-large Model
-
-### Inference
-
-You can use the pretrain model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # Support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-    - <strong>batch_size:</strong> # Set batch size in inference.
-    - <strong>param_dict:</strong> # Set the hotword list in inference.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
deleted file mode 100644
index e60f6d9..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
+++ /dev/null
@@ -1,105 +0,0 @@
-#!/usr/bin/env bash
-
-set -e
-set -u
-set -o pipefail
-
-stage=1
-stop_stage=2
-model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404"
-data_dir="./data/test"
-output_dir="./results"
-batch_size=64
-gpu_inference=true    # whether to perform gpu decoding
-gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
-njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
-checkpoint_dir=
-checkpoint_name="valid.cer_ctc.ave.pb"
-hotword_txt=None
-
-. utils/parse_options.sh || exit 1;
-
-if ${gpu_inference} == "true"; then
-    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
-else
-    nj=$njob
-    batch_size=1
-    gpuid_list=""
-    for JOB in $(seq ${nj}); do
-        gpuid_list=$gpuid_list"-1,"
-    done
-fi
-
-mkdir -p $output_dir/split
-split_scps=""
-for JOB in $(seq ${nj}); do
-    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
-done
-perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
-
-if [ -n "${checkpoint_dir}" ]; then
-  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
-  model=${checkpoint_dir}/${model}
-fi
-
-if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
-    echo "Decoding ..."
-    gpuid_list_array=(${gpuid_list//,/ })
-    for JOB in $(seq ${nj}); do
-        {
-        id=$((JOB-1))
-        gpuid=${gpuid_list_array[$id]}
-        mkdir -p ${output_dir}/output.$JOB
-        python infer.py \
-            --model ${model} \
-            --audio_in ${output_dir}/split/wav.$JOB.scp \
-            --output_dir ${output_dir}/output.$JOB \
-            --batch_size ${batch_size} \
-            --gpuid ${gpuid} \
-            --hotword_txt ${hotword_txt}
-        }&
-    done
-    wait
-
-    mkdir -p ${output_dir}/1best_recog
-    for f in token score text; do
-        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
-          for i in $(seq "${nj}"); do
-              cat "${output_dir}/output.${i}/1best_recog/${f}"
-          done | sort -k1 >"${output_dir}/1best_recog/${f}"
-        fi
-    done
-fi
-
-if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
-    echo "Computing WER ..."
-    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
-    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
-    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
-    tail -n 3 ${output_dir}/1best_recog/text.cer
-fi
-
-if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
-    echo "SpeechIO TIOBE textnorm"
-    echo "$0 --> Normalizing REF text ..."
-    ./utils/textnorm_zh.py \
-        --has_key --to_upper \
-        ${data_dir}/text \
-        ${output_dir}/1best_recog/ref.txt
-
-    echo "$0 --> Normalizing HYP text ..."
-    ./utils/textnorm_zh.py \
-        --has_key --to_upper \
-        ${output_dir}/1best_recog/text.proc \
-        ${output_dir}/1best_recog/rec.txt
-    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt
-
-    echo "$0 --> computing WER/CER and alignment ..."
-    ./utils/error_rate_zh \
-        --tokenizer char \
-        --ref ${output_dir}/1best_recog/ref.txt \
-        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
-        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
-    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
-fi
-
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
deleted file mode 100644
index 18897b1..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/infer_aishell1_subtest_demo.py
+++ /dev/null
@@ -1,36 +0,0 @@
-import os
-import tempfile
-import codecs
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.msdatasets import MsDataset
-
-if __name__ == '__main__':
-    param_dict = dict()
-    param_dict['hotword'] = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/hotword.txt"
-
-    output_dir = "./output"
-    batch_size = 1
-
-    # dataset split ['test']
-    ds_dict = MsDataset.load(dataset_name='speech_asr_aishell1_hotwords_testsets', namespace='speech_asr')
-    work_dir = tempfile.TemporaryDirectory().name
-    if not os.path.exists(work_dir):
-        os.makedirs(work_dir)
-    wav_file_path = os.path.join(work_dir, "wav.scp")
-    
-    with codecs.open(wav_file_path, 'w') as fin: 
-        for line in ds_dict:
-            wav = line["Audio:FILE"]
-            idx = wav.split("/")[-1].split(".")[0]
-            fin.writelines(idx + " " + wav + "\n")
-    audio_in = wav_file_path         
-
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model="damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404",
-        output_dir=output_dir,
-        batch_size=batch_size,
-        param_dict=param_dict)
-
-    rec_result = inference_pipeline(audio_in=audio_in)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
deleted file mode 100644
index c740f71..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ /dev/null
@@ -1,76 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
-    - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
-    - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.sh`
-    - <strong>model:</strong> # model name on ModelScope
-    - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-    - <strong>output_dir:</strong> # result dir
-    - <strong>batch_size:</strong> # batchsize of inference
-    - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
-    - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
-    - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
-
-- Decode with multi GPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --batch_size 64 \
-    --gpu_inference true \
-    --gpuid_list "0,1"
-```
-
-- Decode with multi-thread CPUs:
-```shell
-    bash infer.sh \
-    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
-    --data_dir "./data/test" \
-    --output_dir "./results" \
-    --gpu_inference false \
-    --njob 64
-```
-
-- Results
-
-The decoding results can be found in `${output_dir}/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
-
-If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
-    - <strong>modelscope_model_name: </strong> # model name on ModelScope
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-    - <strong>batch_size:</strong> # batchsize of inference  
-
-- Then you can run the pipeline to finetune with:
-```python
-    python infer_after_finetune.py
-```
-
-- Results
-
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
deleted file mode 100644
index 2d311dd..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ /dev/null
@@ -1,48 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.hub.snapshot_download import snapshot_download
-
-from funasr.utils.compute_wer import compute_wer
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-
-    try:
-        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
-    except BaseException:
-        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
-    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=pretrained_model_path,
-        output_dir=decoding_path,
-        batch_size=params["batch_size"]
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
-
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
-    params["batch_size"] = 64
-    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
deleted file mode 100644
index c68a8cd..0000000
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
+++ /dev/null
@@ -1,30 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained Paraformer-large Model
-
-### Finetune
-
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
-```python
-    python finetune.py
-```
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
similarity index 95%
rename from egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
rename to egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
index 8a6c87b..4125a57 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/demo.py
@@ -8,7 +8,7 @@
         task=Tasks.auto_speech_recognition,
         model="damo/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch",
         output_dir=output_dir,
-        batch_size=32,
+        batch_size=1,
     )
     rec_result = inference_pipline(audio_in=audio_in)
     print(rec_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell1-vocab4234-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
rename to egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/demo.py
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-aishell2-vocab5212-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 94144ef..83c462d 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -1,46 +1,246 @@
-# ModelScope Model
+# Speech Recognition
 
-## How to finetune and infer using a pretrained Paraformer-large Model
+> **Note**: 
+> The ModelScope pipeline supports inference and finetuning for all the models in the [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope). Here we take typical models as examples to demonstrate the usage.
 
-### Finetune
+## Inference
 
-- Modify finetune training related parameters in `finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text.
-    - <strong>batch_bins:</strong> # batch size
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
-
-- Then you can run the pipeline to finetune with:
+### Quick start
+#### [Paraformer Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 ```python
-    python finetune.py
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+import soundfile
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+)
+speech, sample_rate = soundfile.read("example/asr_example.wav")
+
+param_dict = {"cache": dict(), "is_final": False}
+chunk_stride = 7680  # 480ms at 16 kHz (7680 / 16000 = 0.48 s)
+# first chunk, 480ms
+speech_chunk = speech[0:chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+# next chunk, 480ms
+speech_chunk = speech[chunk_stride:chunk_stride+chunk_stride]
+rec_result = inference_pipeline(audio_in=speech_chunk, param_dict=param_dict)
+print(rec_result)
+```
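+
+Building on the snippet above, a minimal sketch of a complete streaming loop over the whole file (an illustrative assumption, not the official demo: the last chunk sets `is_final=True` to flush the cache):
+```python
+param_dict = {"cache": dict(), "is_final": False}
+for start in range(0, len(speech), chunk_stride):
+    end = min(start + chunk_stride, len(speech))
+    # assumption: is_final=True on the last chunk flushes the remaining cache
+    param_dict["is_final"] = (end == len(speech))
+    rec_result = inference_pipeline(audio_in=speech[start:end], param_dict=param_dict)
+    print(rec_result)
+```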
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
+
+#### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+There are three decoding modes for the UniASR model (`fast`, `normal`, `offline`); for more model details, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+```python
+decoding_model = "fast" # "fast"銆�"normal"銆�"offline"
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825',
+    param_dict={"decoding_model": decoding_model})
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
+```
+The `fast` and `normal` decoding modes are simulated (fake) streaming and can be used to evaluate recognition accuracy.
+For the full demo code, please refer to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
+#### [RNN-T-online model]()
+To be added.
+
+#### [MFCCA Model](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+For more model details, please refer to [docs](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary)
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950',
+    model_revision='v3.0.0'
+)
+
+rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+print(rec_result)
 ```
 
-### Inference
+#### API-reference
+##### Define pipeline
+- `task`: `Tasks.auto_speech_recognition`
+- `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+- `ngpu`: `1` (Default), decode on GPU; if `ngpu=0`, decode on CPU
+- `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
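+
+For example, a minimal sketch of a CPU-only pipeline using the parameters above (the thread count is an arbitrary illustration):
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+    ngpu=0,  # decode on CPU
+    ncpu=4,  # number of intraop threads on CPU
+)
+```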
+##### Infer pipeline
+- `audio_in`: the input to decode, which could be:
+  - wav_path, `e.g.`: asr_example.wav
+  - pcm_path, `e.g.`: asr_example.pcm
+  - audio bytes stream, `e.g.`: bytes data from a microphone
+  - audio sample points, `e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, where the type is numpy.ndarray or torch.Tensor
+  - wav.scp, a kaldi-style wav list (`wav_id \t wav_path`), `e.g.`:
+  ```text
+  asr_example1  ./audios/asr_example1.wav
+  asr_example2  ./audios/asr_example2.wav
+  ```
+  When the input is `wav.scp`, `output_dir` must be set to save the output results (see the sketch after this list)
+- `audio_fs`: audio sampling rate, only needs to be set when `audio_in` is PCM audio
+- `output_dir`: `None` (Default), the output path of results if set
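+
+For example, a sketch of decoding a kaldi-style `wav.scp` (the paths here are placeholders; `output_dir` is required for this input type):
+```python
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.auto_speech_recognition,
+    model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+    output_dir='./results',  # required when audio_in is a wav.scp
+)
+# placeholder path; each line of wav.scp is "wav_id wav_path"
+inference_pipeline(audio_in='./data/test/wav.scp')
+```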
 
-Or you can use the finetuned model for inference directly.
+### Inference with multi-thread CPUs or multi GPUs
+FunASR also offers the recipe [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs or multi GPUs.
 
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
+- Setting parameters in `infer.sh`
+    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+    - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` also exists, CER will be computed
+    - `output_dir`: output dir of the recognition results
+    - `batch_size`: `64` (Default), batch size of inference on gpu
+    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+    - `checkpoint_dir`: only used for inference with finetuned models, the dir path of the finetuned models
+    - `checkpoint_name`: only used for inference with finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
+    - `decoding_mode`: `normal` (Default), decoding mode for the UniASR model (`fast`, `normal`, `offline`)
+    - `hotword_txt`: `None` (Default), hotword file for the contextual paraformer model (the hotword file name ends with `.txt`)
 
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
+- Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1"
 ```
-
-### Inference using local finetuned model
-
-- Modify inference related parameters in `infer_after_finetune.py`
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-
-- Then you can run the pipeline to finetune with:
-```python
-    python infer_after_finetune.py
+- Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64
 ```
 
 - Results
 
-The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
+
+If you decode the SpeechIO test sets, you can apply text normalization with `stage`=3; `DETAILS.txt` and `RESULTS.txt` record the results and CER after text normalization.
+
+
+## Finetune with pipeline
+
+### Quick start
+[finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+```python
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from modelscope.msdatasets.audio.asr_dataset import ASRDataset
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = ASRDataset.load(params.data_path, namespace='speech_asr')
+    kwargs = dict(
+        model=params.model,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    from funasr.utils.modelscope_param import modelscope_args
+    params = modelscope_args(model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    params.output_dir = "./checkpoint"                      # 妯″瀷淇濆瓨璺緞
+    params.data_path = "speech_asr_aishell1_trainsets"      # 鏁版嵁璺緞锛屽彲浠ヤ负modelscope涓凡涓婁紶鏁版嵁锛屼篃鍙互鏄湰鍦版暟鎹�
+    params.dataset_type = "small"                           # 灏忔暟鎹噺璁剧疆small锛岃嫢鏁版嵁閲忓ぇ浜�1000灏忔椂锛岃浣跨敤large
+    params.batch_bins = 2000                                # batch size锛屽鏋渄ataset_type="small"锛宐atch_bins鍗曚綅涓篺bank鐗瑰緛甯ф暟锛屽鏋渄ataset_type="large"锛宐atch_bins鍗曚綅涓烘绉掞紝
+    params.max_epoch = 50                                   # 鏈�澶ц缁冭疆鏁�
+    params.lr = 0.00005                                     # 璁剧疆瀛︿範鐜�
+    
+    modelscope_finetune(params)
+```
+
+```shell
+python finetune.py &> log.txt &
+```
+
+### Finetune with your data
+
+- Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
+    - `output_dir`: result dir
+    - `data_dir`: the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+    - `dataset_type`: for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+    - `batch_bins`: batch size. When `dataset_type` is `small`, `batch_bins` counts fbank feature frames; when `dataset_type` is `large`, `batch_bins` counts duration in ms
+    - `max_epoch`: number of training epoch
+    - `lr`: learning rate
+
+- Training data formats:
+```sh
+cat ./example_data/text
+BAC009S0002W0122 而 对 楼 市 成 交 抑 制 作 用 最 大 的 限 购
+BAC009S0002W0123 也 成 为 地 方 政 府 的 眼 中 钉
+english_example_1 hello world
+english_example_2 go swim 去 游 泳
+
+cat ./example_data/wav.scp
+BAC009S0002W0122 /mnt/data/wav/train/S0002/BAC009S0002W0122.wav
+BAC009S0002W0123 /mnt/data/wav/train/S0002/BAC009S0002W0123.wav
+english_example_1 /mnt/data/wav/train/S0002/english_example_1.wav
+english_example_2 /mnt/data/wav/train/S0002/english_example_2.wav
+```
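+
+As an optional sanity check (a minimal sketch assuming the example paths above), the utterance ids in `text` and `wav.scp` should match:
+```python
+def read_ids(path):
+    # collect the utterance id (first column) from each non-empty line
+    with open(path, encoding="utf-8") as f:
+        return {line.split()[0] for line in f if line.strip()}
+
+ids_text = read_ids("./example_data/text")
+ids_wav = read_ids("./example_data/wav.scp")
+assert ids_text == ids_wav, f"mismatched utterance ids: {ids_text ^ ids_wav}"
+```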
+
+- Then you can run the pipeline to finetune with:
+```shell
+python finetune.py
+```
+If you want to finetune with multiple GPUs, you can run:
+```shell
+CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
+```
+## Inference with your finetuned model
+
+- The parameters in [egs_modelscope/asr/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) are set in the same way as in [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/egs_modelscope/asr/TEMPLATE#inference-with-multi-thread-cpus-or-multi-gpus); `model` is the name of the ModelScope model that you finetuned.
+
+- Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1" \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
+```
+- Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64 \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
+```
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
new file mode 100644
index 0000000..c533e67
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -0,0 +1,15 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == '__main__':
+    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
+    output_dir = None
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
+        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
+    )
+    rec_result = inference_pipeline(audio_in=audio_in)
+    print(rec_result)
+
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index 4d98a65..5bc205c 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -1,16 +1,28 @@
+import os
+import shutil
+import argparse
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 
-if __name__ == '__main__':
-    audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
-    output_dir = None
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
     inference_pipeline = pipeline(
         task=Tasks.auto_speech_recognition,
-        model='damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
-        vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
-        punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
-        ngpu=1,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
     )
-    rec_result = inference_pipeline(audio_in=audio_in)
-    print(rec_result)
+    inference_pipeline(audio_in=args.audio_in)
 
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--decoding_mode', type=str, default="normal")
+    parser.add_argument('--hotword_txt', type=str, default=None)
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
similarity index 100%
rename from egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
rename to egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
deleted file mode 100644
index 473019c..0000000
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ /dev/null
@@ -1,47 +0,0 @@
-import json
-import os
-import shutil
-
-from modelscope.pipelines import pipeline
-from modelscope.utils.constant import Tasks
-from modelscope.hub.snapshot_download import snapshot_download
-
-from funasr.utils.compute_wer import compute_wer
-
-def modelscope_infer_after_finetune(params):
-    # prepare for decoding
-
-    try:
-        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
-    except BaseException:
-        raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
-    decoding_path = os.path.join(params["output_dir"], "decode_results")
-    if os.path.exists(decoding_path):
-        shutil.rmtree(decoding_path)
-    os.mkdir(decoding_path)
-
-    # decoding
-    inference_pipeline = pipeline(
-        task=Tasks.auto_speech_recognition,
-        model=pretrained_model_path,
-        output_dir=decoding_path,
-        batch_size=params["batch_size"]
-    )
-    audio_in = os.path.join(params["data_dir"], "wav.scp")
-    inference_pipeline(audio_in=audio_in)
-
-    # computer CER if GT text is set
-    text_in = os.path.join(params["data_dir"], "text")
-    if os.path.exists(text_in):
-        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
-        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
-
-
-if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
-    params["batch_size"] = 64
-    modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils
new file mode 120000
index 0000000..3d3dd06
--- /dev/null
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/utils
@@ -0,0 +1 @@
+../../asr/TEMPLATE/utils
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/TEMPLATE/README.md b/egs_modelscope/punctuation/TEMPLATE/README.md
index 5618973..7cbca05 100644
--- a/egs_modelscope/punctuation/TEMPLATE/README.md
+++ b/egs_modelscope/punctuation/TEMPLATE/README.md
@@ -1,5 +1,4 @@
 # Punctuation Restoration
-# Voice Activity Detection
 
 > **Note**: 
 > The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetune. Here we take the model of the punctuation model of CT-Transformer as example to demonstrate the usage.
@@ -69,7 +68,7 @@
 - `param_dict`: reserving the cache which is necessary in realtime mode. 
 
 ### Inference with multi-thread CPUs or multi GPUs
-FunASR also offer recipes [egs_modelscope/punc/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punc/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs. It is an offline recipe and only support offline model.
+FunASR also offers the recipe [egs_modelscope/punctuation/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/punctuation/TEMPLATE/infer.sh) for decoding with multi-threaded CPUs or multiple GPUs. It is an offline recipe and only supports offline models.
 
 - Setting parameters in `infer.sh`
     - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
@@ -87,7 +86,7 @@
     --model "damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
-    --batch_size 64 \
+    --batch_size 1 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
@@ -98,7 +97,7 @@
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 64
+    --njob 1
 ```
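As a quick check of the renamed recipe, a minimal Python sketch of direct punctuation inference, assuming the task constant and CT-Transformer model id recorded in this recipe's old per-model README; `text_in` also accepts a local text file or a url, and the sample sentence here is an invented example:

```python
# A minimal sketch assuming Tasks.punctuation and the model id from the old
# per-model README; the input sentence is an invented example.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.punctuation,
    model="damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch",
    output_dir=None,
)
rec_result = inference_pipeline(text_in="我们都是木头人不会讲话不会动")
print(rec_result)
```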
 
 
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py
similarity index 100%
rename from egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
rename to egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/demo.py
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
deleted file mode 100644
index b125d48..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
+++ /dev/null
@@ -1,19 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-task=Tasks.punctuation,
-    model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
-
-- Setting parameters in `modelscope_common_infer.sh`
-    - <strong>model:</strong> damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch  # pre-trained model, download from modelscope
-    - <strong>text_in:</strong> input path, text or url
-    - <strong>output_dir:</strong> the result dir
-- Then you can run the pipeline to infer with: 
-```sh
-    python ./infer.py
-```
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
new file mode 120000
index 0000000..92088a2
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/README.md
@@ -0,0 +1 @@
+../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
deleted file mode 100644
index 367be79..0000000
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/data/punc_example.txt
+++ /dev/null
@@ -1,3 +0,0 @@
-1	跨境河流是养育沿岸人民的生命之源长期以来为帮助下游地区防灾减灾中方技术人员在上游地区极为恶劣的自然条件下克服巨大困难甚至冒着生命危险向印方提供汛期水文资料处理紧急事件中方重视印方在跨境河流问题上的关切愿意进一步完善双方联合工作机制凡是中方能做的我们都会去做而且会做得更好我请印度朋友们放心中国在上游的任何开发利用都会经过科学规划和论证兼顾上下游的利益
-2	从存储上来说仅仅是全景图片它就会是图片的四倍的容量然后全景的视频会是普通视频八倍的这个存储的要求而三d的模型会是图片的十倍这都对我们今天运行在的云计算的平台存储的平台提出了更高的要求
-3	那今天的会就到这里吧 happy new year 明年见
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
similarity index 100%
rename from egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
rename to egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/demo.py
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
new file mode 120000
index 0000000..f05fbbb
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.py
@@ -0,0 +1 @@
+../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh
new file mode 120000
index 0000000..0b3b38b
--- /dev/null
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/infer.sh
@@ -0,0 +1 @@
+../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/tp/TEMPLATE/README.md b/egs_modelscope/tp/TEMPLATE/README.md
index 2678a7f..8d75581 100644
--- a/egs_modelscope/tp/TEMPLATE/README.md
+++ b/egs_modelscope/tp/TEMPLATE/README.md
@@ -59,11 +59,11 @@
     ```
 
 ### Inference with multi-thread CPUs or multi GPUs
-FunASR also offer recipes [egs_modelscope/vad/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+FunASR also offers the recipe [egs_modelscope/tp/TEMPLATE/infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/tp/TEMPLATE/infer.sh) for decoding with multi-threaded CPUs or multiple GPUs.
 
 - Setting parameters in `infer.sh`
     - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-    - `data_dir`: the dataset dir **must** include `wav.scp` and `text.scp`
+    - `data_dir`: the dataset dir, which **must** include `wav.scp` and `text.txt`
     - `output_dir`: output dir of the recognition results
     - `batch_size`: `64` (Default), batch size of inference on gpu
     - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
@@ -78,7 +78,7 @@
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
-    --batch_size 64 \
+    --batch_size 1 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
@@ -89,7 +89,7 @@
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 64
+    --njob 1
 ```
 
 ## Finetune with pipeline
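Since the recipe now reads `text.txt` instead of `text.scp`, a small sketch of the pairing it assumes between the two inputs under `data_dir`; the `load_kv` helper is invented for illustration, and both files use the Kaldi-style `utt-id value` line format:

```python
# Invented validation helper, not part of FunASR: checks that every utterance
# in wav.scp has a transcript in text.txt before infer.sh splits the jobs.
def load_kv(path):
    with open(path, encoding="utf-8") as f:
        return dict(line.strip().split(maxsplit=1) for line in f if line.strip())

wavs = load_kv("./data/test/wav.scp")    # utt-id -> wav path
texts = load_kv("./data/test/text.txt")  # utt-id -> transcript
missing = sorted(set(wavs) - set(texts))
if missing:
    raise ValueError(f"utterances without transcripts: {missing[:5]}")
```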
diff --git a/egs_modelscope/tp/TEMPLATE/infer.py b/egs_modelscope/tp/TEMPLATE/infer.py
deleted file mode 120000
index df5dff2..0000000
--- a/egs_modelscope/tp/TEMPLATE/infer.py
+++ /dev/null
@@ -1 +0,0 @@
-../speech_timestamp_prediction-v1-16k-offline/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py b/egs_modelscope/tp/TEMPLATE/infer.py
similarity index 100%
rename from egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
rename to egs_modelscope/tp/TEMPLATE/infer.py
diff --git a/egs_modelscope/tp/TEMPLATE/infer.sh b/egs_modelscope/tp/TEMPLATE/infer.sh
index 2a923bb..bae62e8 100644
--- a/egs_modelscope/tp/TEMPLATE/infer.sh
+++ b/egs_modelscope/tp/TEMPLATE/infer.sh
@@ -37,7 +37,7 @@
     split_texts="$split_texts $output_dir/split/text.$JOB.scp"
 done
 perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
-perl utils/split_scp.pl ${data_dir}/text.scp ${split_texts}
+perl utils/split_scp.pl ${data_dir}/text.txt ${split_texts}
 
 if [ -n "${checkpoint_dir}" ]; then
   python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
deleted file mode 100644
index 5488aaa..0000000
--- a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
+++ /dev/null
@@ -1,25 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>text_in:</strong> # support text, text url.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py
new file mode 100644
index 0000000..2e6f92f
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/demo.py
@@ -0,0 +1,12 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_pipeline = pipeline(
+    task=Tasks.speech_timestamp,
+    model='damo/speech_timestamp_prediction-v1-16k-offline',
+    output_dir=None)
+
+rec_result = inference_pipeline(
+    audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
+    text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',)
+print(rec_result)
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/tp/speech_timestamp_prediction-v1-16k-offline/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/README.md b/egs_modelscope/vad/TEMPLATE/README.md
index 6f746d5..0542331 100644
--- a/egs_modelscope/vad/TEMPLATE/README.md
+++ b/egs_modelscope/vad/TEMPLATE/README.md
@@ -86,7 +86,7 @@
     --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
-    --batch_size 64 \
+    --batch_size 1 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
@@ -97,7 +97,7 @@
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
-    --njob 64
+    --njob 1
 ```
 
 ## Finetune with pipeline
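For the FSMN-VAD recipes below, a minimal Python sketch of direct pipeline inference, assuming the VAD task constant and a model id matching the recipe directory name; per the old per-model README, `audio_in` also accepts a `wav.scp`, bytes, or a url:

```python
# A minimal sketch: the model id is assumed from the recipe directory name and
# the audio path is a hypothetical example.
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_pipeline = pipeline(
    task=Tasks.voice_activity_detection,
    model="damo/speech_fsmn_vad_zh-cn-16k-common",
    output_dir=None,
)
segments = inference_pipeline(audio_in="./data/test/example.wav")
print(segments)  # detected speech segments, typically [[start_ms, end_ms], ...]
```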
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
deleted file mode 100644
index 6d9cd30..0000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
similarity index 100%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo.py
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
similarity index 100%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/demo_online.py
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
deleted file mode 100644
index 6d9cd30..0000000
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
+++ /dev/null
@@ -1,24 +0,0 @@
-# ModelScope Model
-
-## How to finetune and infer using a pretrained ModelScope Model
-
-### Inference
-
-Or you can use the finetuned model for inference directly.
-
-- Setting parameters in `infer.py`
-    - <strong>audio_in:</strong> # support wav, url, bytes, and parsed audio format.
-    - <strong>output_dir:</strong> # If the input format is wav.scp, it needs to be set.
-
-- Then you can run the pipeline to infer with:
-```python
-    python infer.py
-```
-
-
-Modify inference related parameters in vad.yaml.
-
-- max_end_silence_time: The end-point silence duration  to judge the end of sentence, the parameter range is 500ms~6000ms, and the default value is 800ms
-- speech_noise_thres:  The balance of speech and silence scores, the parameter range is (-1,1)
-    - The value tends to -1, the greater probability of noise being judged as speech
-    - The value tends to 1, the greater probability of speech being judged as noise
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
new file mode 120000
index 0000000..bb55ab5
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/README.md
@@ -0,0 +1 @@
+../../TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
similarity index 100%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo.py
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
similarity index 100%
rename from egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
rename to egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/demo_online.py
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
new file mode 120000
index 0000000..128fc31
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.py
@@ -0,0 +1 @@
+../../TEMPLATE/infer.py
\ No newline at end of file
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
new file mode 120000
index 0000000..5e59f18
--- /dev/null
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer.sh
@@ -0,0 +1 @@
+../../TEMPLATE/infer.sh
\ No newline at end of file

--
Gitblit v1.9.1