From a96350ff4fe0b53d8544de101817881d6ff105b2 Mon Sep 17 00:00:00 2001
From: yhliang <68215459+yhliang-aslp@users.noreply.github.com>
Date: Fri, 21 Apr 2023 11:46:34 +0800
Subject: [PATCH] Merge pull request #392 from alibaba-damo-academy/main

---
 docs/modescope_pipeline/vad_pipeline.md             |    1 
 funasr/bin/punctuation_infer.py                     |    2 
 egs_modelscope/vad/TEMPLATE/infer.sh                |   71 +++++++
 funasr/export/README.md                             |   18 +
 funasr/runtime/python/grpc/Readme.md                |    5 
 funasr/runtime/python/websocket/README.md           |    5 
 docs/index.rst                                      |   17 +
 funasr/train/trainer.py                             |   10 
 egs_modelscope/vad/TEMPLATE/infer.py                |   25 ++
 docs/modescope_pipeline/asr_pipeline.md             |    1 
 egs/aishell/transformer/utils/prepare_checkpoint.py |   21 +-
 README.md                                           |   13 
 docs/docker.md                                      |   65 +++++++
 funasr/runtime/python/libtorch/README.md            |    9 
 funasr/runtime/python/onnxruntime/README.md         |    9 
 egs_modelscope/vad/TEMPLATE/utils                   |    1 
 egs_modelscope/vad/TEMPLATE/README.md               |   47 ++--
 egs/aishell/transformer/utils/process_opus.py       |   86 +++++++++
 egs_modelscope/asr/TEMPLATE/README.md               |   85 +++++---
 egs_modelscope/asr/TEMPLATE/infer.sh                |    9 
 docs/application.md                                 |    5 
 docs/modescope_pipeline/quick_start.md              |   25 +
 22 files changed, 423 insertions(+), 107 deletions(-)

diff --git a/README.md b/README.md
index 29ddd4a..b8e1b89 100644
--- a/README.md
+++ b/README.md
@@ -97,19 +97,18 @@
 ## Citations
 
 ``` bibtex
-@inproceedings{gao2020universal,
-  title={Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model},
-  author={Gao, Zhifu and Zhang, Shiliang and Lei, Ming and McLoughlin, Ian},
-  booktitle={arXiv preprint arXiv:2010.14099},
-  year={2020}
-}
-
 @inproceedings{gao2022paraformer,
   title={Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition},
   author={Gao, Zhifu and Zhang, Shiliang and McLoughlin, Ian and Yan, Zhijie},
   booktitle={INTERSPEECH},
   year={2022}
 }
+@inproceedings{gao2020universal,
+  title={Universal ASR: Unifying Streaming and Non-Streaming ASR Using a Single Encoder-Decoder Model},
+  author={Gao, Zhifu and Zhang, Shiliang and Lei, Ming and McLoughlin, Ian},
+  booktitle={arXiv preprint arXiv:2010.14099},
+  year={2020}
+}
 @inproceedings{Shi2023AchievingTP,
   title={Achieving Timestamp Prediction While Recognizing with Non-Autoregressive End-to-End ASR Model},
   author={Xian Shi and Yanni Chen and Shiliang Zhang and Zhijie Yan},
diff --git a/docs/application.md b/docs/application.md
new file mode 100644
index 0000000..1c19dd6
--- /dev/null
+++ b/docs/application.md
@@ -0,0 +1,5 @@
+## Audio Cut
+
+## Realtime Speech Recognition
+
+## Audio Chat
\ No newline at end of file
diff --git a/docs/docker.md b/docs/docker.md
new file mode 100644
index 0000000..77554d3
--- /dev/null
+++ b/docs/docker.md
@@ -0,0 +1,65 @@
+# Docker
+
+## Install Docker
+
+### Ubuntu
+```shell
+curl -fsSL https://test.docker.com -o test-docker.sh
+sudo sh test-docker.sh
+```
+### Debian
+```shell
+ curl -fsSL https://get.docker.com -o get-docker.sh
+ sudo sh get-docker.sh
+```
+
+### CentOS
+```shell
+curl -fsSL https://get.docker.com | bash -s docker --mirror Aliyun
+```
+
+### MacOS
+```shell
+brew install --cask --appdir=/Applications docker
+```
+
+### Windows
+Ref to [docs](https://docs.docker.com/desktop/install/windows-install/)
+
+## Start Docker
+```shell
+sudo systemctl start docker
+```
+## Download image
+
+### Image
+#### CPU
+`registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-py37-torch1.11.0-tf1.15.5-1.5.0`
+
+#### GPU
+
+`registry.cn-hangzhou.aliyuncs.com/modelscope-repo/modelscope:ubuntu20.04-cuda11.3.0-py37-torch1.11.0-tf1.15.5-1.5.0`
+
+### Pull Image
+```shell
+sudo docker pull <image-name>:<tag>
+```
+
+### Check Image 
+```shell
+sudo docker images
+```
+
+## Run Docker
+```shell
+sudo docker run -itd --name funasr <image-name>:<tag> bash
+sudo docker exec -it funasr bash
+```
+
+## Stop Docker
+```shell
+exit
+sudo docker ps
+sudo docker stop funasr
+```
+
diff --git a/docs/index.rst b/docs/index.rst
index 14c9525..2fcc6c6 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -14,15 +14,17 @@
    :caption: Installation
 
    ./installation.md
+   ./docker.md
 
 .. toctree::
    :maxdepth: 1
    :caption: Recipe
 
    ./recipe/asr_recipe.md
-   ./recipe/sv_recipe.md
    ./recipe/punc_recipe.md
    ./recipe/vad_recipe.md
+   ./recipe/sv_recipe.md
+   ./recipe/sd_recipe.md
 
 .. toctree::
    :maxdepth: 1
@@ -48,7 +50,12 @@
    ./modescope_pipeline/tp_pipeline.md
    ./modescope_pipeline/sv_pipeline.md
    ./modescope_pipeline/sd_pipeline.md
-   ./modescope_pipeline/lm_pipeline.md
+
+.. toctree::
+   :maxdepth: 1
+   :caption: Huggingface pipeline
+
+   Undo
 
 .. toctree::
    :maxdepth: 1
@@ -77,6 +84,12 @@
 
 .. toctree::
    :maxdepth: 1
+   :caption: Application
+
+   ./application.md
+
+.. toctree::
+   :maxdepth: 1
    :caption: FQA
 
    ./FQA.md
diff --git a/docs/modescope_pipeline/asr_pipeline.md b/docs/modescope_pipeline/asr_pipeline.md
new file mode 120000
index 0000000..465d5a2
--- /dev/null
+++ b/docs/modescope_pipeline/asr_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/asr/TEMPLATE/README.md
\ No newline at end of file
diff --git a/docs/modescope_pipeline/quick_start.md b/docs/modescope_pipeline/quick_start.md
index b1614f5..e6a85d9 100644
--- a/docs/modescope_pipeline/quick_start.md
+++ b/docs/modescope_pipeline/quick_start.md
@@ -1,9 +1,13 @@
 # Quick Start
 
+> **Note**: 
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take a typical model as an example to demonstrate the usage.
+
+
 ## Inference with pipeline
 
 ### Speech Recognition
-#### Paraformer model
+#### Paraformer Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -15,10 +19,11 @@
 
 rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
 print(rec_result)
+# {'text': '欢迎大家来体验达摩院推出的语音识别模型'}
 ```
 
 ### Voice Activity Detection
-#### FSMN-VAD
+#### FSMN-VAD Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -34,10 +39,11 @@
 
 segments_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav')
 print(segments_result)
+# {'text': [[70, 2340], [2620, 6200], [6480, 23670], [23950, 26250], [26780, 28990], [29950, 31430], [31750, 37600], [38210, 46900], [47310, 49630], [49910, 56460], [56740, 59540], [59820, 70450]]}
 ```
 
 ### Punctuation Restoration
-#### CT_Transformer
+#### CT_Transformer Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -49,10 +55,11 @@
 
 rec_result = inference_pipeline(text_in='我们都是木头人不会讲话不会动')
 print(rec_result)
+# {'text': '我们都是木头人，不会讲话，不会动。'}
 ```
 
 ### Timestamp Prediction
-#### TP-Aligner
+#### TP-Aligner Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -65,10 +72,11 @@
     audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_timestamps.wav',
     text_in='一 个 东 太 平 洋 国 家 为 什 么 跑 到 西 太 平 洋 来 了 呢',)
 print(rec_result)
+# {'text': '<sil> 0.000 0.380;一 0.380 0.560;个 0.560 0.800;东 0.800 0.980;太 0.980 1.140;平 1.140 1.260;洋 1.260 1.440;国 1.440 1.680;家 1.680 1.920;<sil> 1.920 2.040;为 2.040 2.200;什 2.200 2.320;么 2.320 2.500;跑 2.500 2.680;到 2.680 2.860;西 2.860 3.040;太 3.040 3.200;平 3.200 3.380;洋 3.380 3.500;来 3.500 3.640;了 3.640 3.800;呢 3.800 4.150;<sil> 4.150 4.440;', 'timestamp': [[380, 560], [560, 800], [800, 980], [980, 1140], [1140, 1260], [1260, 1440], [1440, 1680], [1680, 1920], [2040, 2200], [2200, 2320], [2320, 2500], [2500, 2680], [2680, 2860], [2860, 3040], [3040, 3200], [3200, 3380], [3380, 3500], [3500, 3640], [3640, 3800], [3800, 4150]]}
 ```
 
 ### Speaker Verification
-#### X-vector
+#### X-vector Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -85,10 +93,11 @@
 # speaker verification
 rec_result = inference_sv_pipline(audio_in=('https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_enroll.wav','https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/sv_example_same.wav'))
 print(rec_result["scores"][0])
+# 0.8540499500025098
 ```
 
-### Speaker diarization
-#### SOND
+### Speaker Diarization
+#### SOND Model
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -154,7 +163,7 @@
 
 ## Finetune with pipeline
 ### Speech Recognition
-#### Paraformer model
+#### Paraformer Model
 
 finetune.py
 ```python
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/docs/modescope_pipeline/vad_pipeline.md
new file mode 120000
index 0000000..30ea6fc
--- /dev/null
+++ b/docs/modescope_pipeline/vad_pipeline.md
@@ -0,0 +1 @@
+../../egs_modelscope/vad/TEMPLATE/README.md
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py b/egs/aishell/transformer/utils/prepare_checkpoint.py
similarity index 75%
rename from egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
rename to egs/aishell/transformer/utils/prepare_checkpoint.py
index 2d311dd..01763d4 100644
--- a/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py
+++ b/egs/aishell/transformer/utils/prepare_checkpoint.py
@@ -1,12 +1,9 @@
-import json
 import os
 import shutil
 
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
 from modelscope.hub.snapshot_download import snapshot_download
-
-from funasr.utils.compute_wer import compute_wer
 
 def modelscope_infer_after_finetune(params):
     # prepare for decoding
@@ -39,10 +36,14 @@
 
 
 if __name__ == '__main__':
-    params = {}
-    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
-    params["output_dir"] = "./checkpoint"
-    params["data_dir"] = "./data/test"
-    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
-    params["batch_size"] = 64
-    modelscope_infer_after_finetune(params)
\ No newline at end of file
+    import sys
+    
+    model = sys.argv[1]
+    checkpoint_dir = sys.argv[2]
+    checkpoint_name = sys.argv[3]
+    
+    try:
+        pretrained_model_path = snapshot_download(model, cache_dir=checkpoint_dir)
+    except BaseException:
+        raise BaseException("Please download the pretrained model from ModelScope first.")
+    shutil.copy(os.path.join(checkpoint_dir, checkpoint_name), os.path.join(pretrained_model_path, "model.pb"))
diff --git a/egs/aishell/transformer/utils/process_opus.py b/egs/aishell/transformer/utils/process_opus.py
new file mode 100755
index 0000000..1baa3ed
--- /dev/null
+++ b/egs/aishell/transformer/utils/process_opus.py
@@ -0,0 +1,86 @@
+# Copyright 2021  NPU, ASLP Group (Author: Qijie Shao)
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# process_opus.py: segmentation and downsampling of opus audio
+
+# usage: python3 process_opus.py wav.scp segments output_wav.scp
+
+from pydub import AudioSegment
+import sys
+import os
+
+
+def read_file(wav_scp, segments):
+    wav_scp_dict = {}
+    with open(wav_scp, "r", encoding="UTF-8") as fin:
+        for line_str in fin:
+            wav_id, path = line_str.strip().split()
+            wav_scp_dict[wav_id] = path
+
+    utt_list = []
+    seg_path_list = []
+    start_time_list = []
+    end_time_list = []
+    with open(segments, "r", encoding="UTF-8") as fin:
+        for line_str in fin:
+            arr = line_str.strip().split()
+            assert len(arr) == 4
+            utt_list.append(arr[0])
+            seg_path_list.append(wav_scp_dict[arr[1]])
+            start_time_list.append(float(arr[2]))
+            end_time_list.append(float(arr[3]))
+    return utt_list, seg_path_list, start_time_list, end_time_list
+
+
+# TODO(Qijie): Fix the process logic
+def output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list):
+    num_utts = len(utt_list)
+    with open(output_wav_scp, "w", encoding="UTF-8") as fout:
+        previous_wav_path = ""
+        for i in range(num_utts):
+            utt_id = utt_list[i]
+            current_wav_path = seg_path_list[i]
+            output_dir = (os.path.dirname(current_wav_path)).replace(
+                "audio", "audio_seg"
+            )
+            seg_wav_path = os.path.join(output_dir, utt_id + ".wav")
+
+            os.makedirs(output_dir, exist_ok=True)
+            if current_wav_path != previous_wav_path:
+                source_wav = AudioSegment.from_file(current_wav_path)
+            previous_wav_path = current_wav_path
+
+            start = int(start_time_list[i] * 1000)
+            end = int(end_time_list[i] * 1000)
+            target_audio = source_wav[start:end].set_frame_rate(16000).set_sample_width(2)
+            target_audio.export(seg_wav_path, format="wav")
+
+            fout.write("{} {}\n".format(utt_id, seg_wav_path))
+            if i % 200 == 0:
+                print("seg wav finished: {}%".format(int(i / num_utts * 100)))
+
+
+def main():
+    wav_scp = sys.argv[1]
+    segments = sys.argv[2]
+    output_wav_scp = sys.argv[3]
+
+    utt_list, seg_path_list, start_time_list, end_time_list = read_file(
+        wav_scp, segments
+    )
+    output(output_wav_scp, utt_list, seg_path_list, start_time_list, end_time_list)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/docs/modescope_pipeline/asr_pipeline.md b/egs_modelscope/asr/TEMPLATE/README.md
similarity index 68%
rename from docs/modescope_pipeline/asr_pipeline.md
rename to egs_modelscope/asr/TEMPLATE/README.md
index 8b6b24d..3daff1f 100644
--- a/docs/modescope_pipeline/asr_pipeline.md
+++ b/egs_modelscope/asr/TEMPLATE/README.md
@@ -1,12 +1,12 @@
 # Speech Recognition
 
 > **Note**: 
-> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) to inference and finetine. Here we take model of Paraformer and Paraformer-online as example to demonstrate the usage.
+> The modelscope pipeline supports all the models in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope) for inference and finetuning. Here we take a typical model as an example to demonstrate the usage.
 
 ## Inference
 
 ### Quick start
-#### [Paraformer model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+#### [Paraformer Model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
 ```python
 from modelscope.pipelines import pipeline
 from modelscope.utils.constant import Tasks
@@ -19,7 +19,7 @@
 rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
 print(rec_result)
 ```
-#### [Paraformer-online model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
+#### [Paraformer-online Model](https://www.modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/summary)
 ```python
 inference_pipeline = pipeline(
     task=Tasks.auto_speech_recognition,
@@ -41,7 +41,7 @@
 ```
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/241)
 
-#### [UniASR model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
+#### [UniASR Model](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
 There are three decoding mode for UniASR model(`fast`、`normal`、`offline`), for more model detailes, please refer to [docs](https://www.modelscope.cn/models/damo/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/summary)
 ```python
 decoding_model = "fast" # "fast"、"normal"、"offline"
@@ -53,27 +53,27 @@
 rec_result = inference_pipeline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
 print(rec_result)
 ```
-The decoding mode of `fast` and `normal`
+The decoding mode of `fast` and `normal` is fake streaming, which could be used for evaluating of recognition accuracy.
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/151)
 #### [RNN-T-online model]()
 Undo
 
 #### API-reference
-##### define pipeline
+##### Define pipeline
 - `task`: `Tasks.auto_speech_recognition`
 - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-- `ngpu`: 1 (Defalut), decoding on GPU. If ngpu=0, decoding on CPU
-- `ncpu`: 1 (Defalut), sets the number of threads used for intraop parallelism on CPU 
-- `output_dir`: None (Defalut), the output path of results if set
-- `batch_size`: 1 (Defalut), batch size when decoding
-##### infer pipeline
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+##### Infer pipeline
 - `audio_in`: the input to decode, which could be: 
   - wav_path, `e.g.`: asr_example.wav,
   - pcm_path, `e.g.`: asr_example.pcm, 
   - audio bytes stream, `e.g.`: bytes data from a microphone
   - audio sample point，`e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
   - wav.scp, kaldi style wav list (`wav_id \t wav_path``), `e.g.`: 
-  ```cat wav.scp
+  ```text
   asr_example1  ./audios/asr_example1.wav
   asr_example2  ./audios/asr_example2.wav
   ```
@@ -85,13 +85,15 @@
 FunASR also offer recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
 - Setting parameters in `infer.sh`
-    - <strong>model:</strong> # model name on ModelScope
-    - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-    - <strong>output_dir:</strong> # result dir
-    - <strong>batch_size:</strong> # batchsize of inference
-    - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
-    - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
-    - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+    - `data_dir`: the dataset dir needs to include `wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
+    - `output_dir`: output dir of the recognition results
+    - `batch_size`: `64` (Default), batch size of inference on gpu
+    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
 
 - Decode with multi GPUs:
 ```shell
@@ -167,12 +169,12 @@
 ### Finetune with your data
 
 - Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py)
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
-    - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
-    - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
-    - <strong>max_epoch:</strong> # number of training epoch
-    - <strong>lr:</strong> # learning rate
+    - `output_dir`: result dir
+    - `data_dir`: the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text`
+    - `dataset_type`: for dataset larger than 1000 hours, set as `large`, otherwise set as `small`
+    - `batch_bins`: batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms
+    - `max_epoch`: number of training epoch
+    - `lr`: learning rate
 
 - Then you can run the pipeline to finetune with:
 ```shell
@@ -183,14 +185,29 @@
 CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1
 ```
 ## Inference with your finetuned model
-- Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py)
-    - <strong>modelscope_model_name: </strong> # model name on ModelScope
-    - <strong>output_dir:</strong> # result dir
-    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
-    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
-    - <strong>batch_size:</strong> # batchsize of inference  
 
-- Then you can run the pipeline to finetune with:
-```python
-    python infer_after_finetune.py
+- Setting parameters in [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) is the same with [docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/egs_modelscope/asr/TEMPLATE#inference-with-multi-thread-cpus-or-multi-gpus) 
+
+- Decode with multi GPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --batch_size 64 \
+    --gpu_inference true \
+    --gpuid_list "0,1" \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
 ```
+- Decode with multi-thread CPUs:
+```shell
+    bash infer.sh \
+    --model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" \
+    --data_dir "./data/test" \
+    --output_dir "./results" \
+    --gpu_inference false \
+    --njob 64 \
+    --checkpoint_dir "./checkpoint" \
+    --checkpoint_name "valid.cer_ctc.ave.pb"
+```
\ No newline at end of file
diff --git a/egs_modelscope/asr/TEMPLATE/infer.sh b/egs_modelscope/asr/TEMPLATE/infer.sh
index b8b011c..134a8ad 100644
--- a/egs_modelscope/asr/TEMPLATE/infer.sh
+++ b/egs_modelscope/asr/TEMPLATE/infer.sh
@@ -12,7 +12,9 @@
 batch_size=64
 gpu_inference=true    # whether to perform gpu decoding
 gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
-njob=4    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
 
 . utils/parse_options.sh || exit 1;
 
@@ -34,6 +36,11 @@
 done
 perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
 
+if [ -n "${checkpoint_dir}" ]; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
 if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
     echo "Decoding ..."
     gpuid_list_array=(${gpuid_list//,/ })
diff --git a/docs/modescope_pipeline/vad_pipeline.md b/egs_modelscope/vad/TEMPLATE/README.md
similarity index 66%
rename from docs/modescope_pipeline/vad_pipeline.md
rename to egs_modelscope/vad/TEMPLATE/README.md
index 9d9b77a..df45b35 100644
--- a/docs/modescope_pipeline/vad_pipeline.md
+++ b/egs_modelscope/vad/TEMPLATE/README.md
@@ -42,22 +42,23 @@
 Full code of demo, please ref to [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/236)
 
 
+
 #### API-reference
-##### define pipeline
-- `task`: `Tasks.auto_speech_recognition`
+##### Define pipeline
+- `task`: `Tasks.voice_activity_detection`
 - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
-- `ngpu`: 1 (Defalut), decoding on GPU. If ngpu=0, decoding on CPU
-- `ncpu`: 1 (Defalut), sets the number of threads used for intraop parallelism on CPU 
-- `output_dir`: None (Defalut), the output path of results if set
-- `batch_size`: 1 (Defalut), batch size when decoding
-##### infer pipeline
+- `ngpu`: `1` (Default), decoding on GPU. If ngpu=0, decoding on CPU
+- `ncpu`: `1` (Default), sets the number of threads used for intraop parallelism on CPU 
+- `output_dir`: `None` (Default), the output path of results if set
+- `batch_size`: `1` (Default), batch size when decoding
+##### Infer pipeline
 - `audio_in`: the input to decode, which could be: 
   - wav_path, `e.g.`: asr_example.wav,
   - pcm_path, `e.g.`: asr_example.pcm, 
   - audio bytes stream, `e.g.`: bytes data from a microphone
   - audio sample point，`e.g.`: `audio, rate = soundfile.read("asr_example_zh.wav")`, the dtype is numpy.ndarray or torch.Tensor
   - wav.scp, kaldi style wav list (`wav_id \t wav_path``), `e.g.`: 
-  ```cat wav.scp
+  ```text
   asr_example1  ./audios/asr_example1.wav
   asr_example2  ./audios/asr_example2.wav
   ```
@@ -66,42 +67,38 @@
 - `output_dir`: None (Defalut), the output path of results if set
 
 ### Inference with multi-thread CPUs or multi GPUs
-FunASR also offer recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE//infer.sh) to decode with multi-thread CPUs, or multi GPUs.
+FunASR also offer recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/vad/TEMPLATE/infer.sh) to decode with multi-thread CPUs, or multi GPUs.
 
 - Setting parameters in `infer.sh`
-    - <strong>model:</strong> # model name on ModelScope
-    - <strong>data_dir:</strong> # the dataset dir needs to include `${data_dir}/wav.scp`. If `${data_dir}/text` is also exists, CER will be computed
-    - <strong>output_dir:</strong> # result dir
-    - <strong>batch_size:</strong> # batchsize of inference
-    - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding
-    - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1"
-    - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob`
+    - `model`: model name in [model zoo](https://alibaba-damo-academy.github.io/FunASR/en/modelscope_models.html#pretrained-models-on-modelscope), or model path in local disk
+    - `data_dir`: the dataset dir needs to include `wav.scp`
+    - `output_dir`: output dir of the recognition results
+    - `batch_size`: `64` (Default), batch size of inference on gpu
+    - `gpu_inference`: `true` (Default), whether to perform gpu decoding, set false for CPU inference
+    - `gpuid_list`: `0,1` (Default), which gpu_ids are used to infer
+    - `njob`: only used for CPU inference (`gpu_inference`=`false`), `64` (Default), the number of jobs for CPU decoding
+    - `checkpoint_dir`: only used for infer finetuned models, the path dir of finetuned models
+    - `checkpoint_name`: only used for infer finetuned models, `valid.cer_ctc.ave.pb` (Default), which checkpoint is used to infer
 
 - Decode with multi GPUs:
 ```shell
     bash infer.sh \
-    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
+    --batch_size 64 \
     --gpu_inference true \
     --gpuid_list "0,1"
 ```
 - Decode with multi-thread CPUs:
 ```shell
     bash infer.sh \
-    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
+    --model "damo/speech_fsmn_vad_zh-cn-16k-common-pytorch" \
     --data_dir "./data/test" \
     --output_dir "./results" \
     --gpu_inference false \
     --njob 64
 ```
-
-- Results
-
-The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set.
-
-If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization.
-
 
 ## Finetune with pipeline
 
diff --git a/egs_modelscope/vad/TEMPLATE/infer.py b/egs_modelscope/vad/TEMPLATE/infer.py
new file mode 100644
index 0000000..3d9ee55
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/infer.py
@@ -0,0 +1,25 @@
+import os
+import shutil
+import argparse
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+def modelscope_infer(args):
+    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
+    inference_pipeline = pipeline(
+        task=Tasks.voice_activity_detection,
+        model=args.model,
+        output_dir=args.output_dir,
+        batch_size=args.batch_size,
+    )
+    inference_pipeline(audio_in=args.audio_in)
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument('--model', type=str, default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch")
+    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
+    parser.add_argument('--output_dir', type=str, default="./results/")
+    parser.add_argument('--batch_size', type=int, default=64)
+    parser.add_argument('--gpuid', type=str, default="0")
+    args = parser.parse_args()
+    modelscope_infer(args)
\ No newline at end of file
diff --git a/egs_modelscope/vad/TEMPLATE/infer.sh b/egs_modelscope/vad/TEMPLATE/infer.sh
new file mode 100644
index 0000000..261b5e6
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/infer.sh
@@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+
+set -e
+set -u
+set -o pipefail
+
+stage=1
+stop_stage=2
+model="damo/speech_fsmn_vad_zh-cn-16k-common"
+data_dir="./data/test"
+output_dir="./results"
+batch_size=64
+gpu_inference=true    # whether to perform gpu decoding
+gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
+njob=64    # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob
+checkpoint_dir=
+checkpoint_name="valid.cer_ctc.ave.pb"
+
+. utils/parse_options.sh || exit 1;
+
+if ${gpu_inference} == "true"; then
+    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
+else
+    nj=$njob
+    batch_size=1
+    gpuid_list=""
+    for JOB in $(seq ${nj}); do
+        gpuid_list=$gpuid_list"-1,"
+    done
+fi
+
+mkdir -p $output_dir/split
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
+done
+perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}
+
+if ${checkpoint_dir}; then
+  python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
+  model=${checkpoint_dir}/${model}
+fi
+
+if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
+    echo "Decoding ..."
+    gpuid_list_array=(${gpuid_list//,/ })
+    for JOB in $(seq ${nj}); do
+        {
+        id=$((JOB-1))
+        gpuid=${gpuid_list_array[$id]}
+        mkdir -p ${output_dir}/output.$JOB
+        python infer.py \
+            --model ${model} \
+            --audio_in ${output_dir}/split/wav.$JOB.scp \
+            --output_dir ${output_dir}/output.$JOB \
+            --batch_size ${batch_size} \
+            --gpuid ${gpuid}
+        }&
+    done
+    wait
+
+    mkdir -p ${output_dir}/1best_recog
+    for f in token score text; do
+        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
+          for i in $(seq "${nj}"); do
+              cat "${output_dir}/output.${i}/1best_recog/${f}"
+          done | sort -k1 >"${output_dir}/1best_recog/${f}"
+        fi
+    done
+fi
+
diff --git a/egs_modelscope/vad/TEMPLATE/utils b/egs_modelscope/vad/TEMPLATE/utils
new file mode 120000
index 0000000..dc7d417
--- /dev/null
+++ b/egs_modelscope/vad/TEMPLATE/utils
@@ -0,0 +1 @@
+../../../egs/aishell/transformer/utils
\ No newline at end of file
diff --git a/funasr/bin/punctuation_infer.py b/funasr/bin/punctuation_infer.py
index dd28ef8..077814d 100644
--- a/funasr/bin/punctuation_infer.py
+++ b/funasr/bin/punctuation_infer.py
@@ -61,7 +61,6 @@
             text_name="text",
             non_linguistic_symbols=train_args.non_linguistic_symbols,
         )
-        print("start decoding!!!")
 
     @torch.no_grad()
     def __call__(self, text: Union[list, str], split_size=20):
@@ -223,7 +222,6 @@
             result, _ = text2punc(line)
             item = {'key': key, 'value': result}
             results.append(item)
-            print(results)
             return results
 
         for inference_text, _, _ in data_path_and_name_and_type:
diff --git a/funasr/export/README.md b/funasr/export/README.md
index 8f57673..d403121 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -1,14 +1,20 @@
 # Export models
 
 ## Environments
-    torch >= 1.11.0
-    modelscope >= 1.2.0
-    torch-quant >= 0.4.0 (required for exporting quantized torchscript format model)
-    # pip install torch-quant -i https://pypi.org/simple
-
-## Install modelscope and funasr
+### Install modelscope and funasr
 
 The installation is the same as [funasr](https://github.com/alibaba-damo-academy/FunASR/blob/main/README.md#installation)
+```shell
+# pip3 install torch torchaudio
+pip install -U modelscope funasr
+# For users in China, you can install with the following command:
+# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+```
+### Install the quantization tools
+```shell
+pip install torch-quant # Optional, for torchscript quantization
+pip install onnx onnxruntime # Optional, for onnx quantization
+```
 
 ## Export model
    `Tips`: torch>=1.11.0
diff --git a/funasr/runtime/python/grpc/Readme.md b/funasr/runtime/python/grpc/Readme.md
index 822cb52..895013a 100644
--- a/funasr/runtime/python/grpc/Readme.md
+++ b/funasr/runtime/python/grpc/Readme.md
@@ -9,9 +9,10 @@
 Install the modelscope and funasr
 
 ```shell
-pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+pip install -U modelscope funasr
+# For users in China, you can install with the following command:
+# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
 git clone https://github.com/alibaba/FunASR.git && cd FunASR
-pip install --editable ./
 ```
 
 Install the requirements
diff --git a/funasr/runtime/python/libtorch/README.md b/funasr/runtime/python/libtorch/README.md
index fd64cc6..4174656 100644
--- a/funasr/runtime/python/libtorch/README.md
+++ b/funasr/runtime/python/libtorch/README.md
@@ -4,9 +4,12 @@
 ### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)
 
 ```shell
-pip3 install torch torchaudio
-pip install -U modelscope
-pip install -U funasr
+# pip3 install torch torchaudio
+pip install -U modelscope funasr
+# For users in China, you can install with the following command:
+# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+pip install torch-quant # Optional, for torchscript quantization
+pip install onnx onnxruntime # Optional, for onnx quantization
 ```
 
 ### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index 3f4e762..1f7fcaa 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -4,9 +4,12 @@
 ### Install [modelscope and funasr](https://github.com/alibaba-damo-academy/FunASR#installation)
 
 ```shell
-pip3 install torch torchaudio
-pip install -U modelscope
-pip install -U funasr
+# pip3 install torch torchaudio
+pip install -U modelscope funasr
+# For users in China, you can install with the following command:
+# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
+pip install torch-quant # Optional, for torchscript quantization
+pip install onnx onnxruntime # Optional, for onnx quantization
 ```
 
 ### Export [onnx model](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)
diff --git a/funasr/runtime/python/websocket/README.md b/funasr/runtime/python/websocket/README.md
index 353cfa6..73f8aeb 100644
--- a/funasr/runtime/python/websocket/README.md
+++ b/funasr/runtime/python/websocket/README.md
@@ -8,9 +8,10 @@
 Install the modelscope and funasr
 
 ```shell
-pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+pip install -U modelscope funasr
+# For users in China, you can install with the following command:
+# pip install -U modelscope funasr -i https://mirror.sjtu.edu.cn/pypi/web/simple
 git clone https://github.com/alibaba/FunASR.git && cd FunASR
-pip install --editable ./
 ```
 
 Install the requirements for server
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 9c4af41..7c187e9 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -582,10 +582,16 @@
                 if num_batch_updates % batch_interval == 0:
                     if options.use_pai and options.oss_bucket is not None:
                         buffer = BytesIO()
-                        torch.save(model.state_dict(), buffer)
+                        if hasattr(model, "module"):
+                            torch.save(model.module.state_dict(), buffer)
+                        else:
+                            torch.save(model.state_dict(), buffer)
                         options.oss_bucket.put_object(os.path.join(output_dir, f"{num_batch_updates}step.pb"), buffer.getvalue())
                     else:
-                        torch.save(model.state_dict(), os.path.join(output_dir, f"{num_batch_updates}step.pb"))
+                        if hasattr(model, "module"):
+                            torch.save(model.module.state_dict(), os.path.join(output_dir, f"{num_batch_updates}step.pb"))
+                        else:
+                            torch.save(model.state_dict(), os.path.join(output_dir, f"{num_batch_updates}step.pb"))
 
             if distributed:
                 torch.distributed.all_reduce(iterator_stop, ReduceOp.SUM)

--
Gitblit v1.9.1