From 2f3d36d689d36d2b422899e0589443c3d07542b8 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 30 Jan 2023 17:51:07 +0800
Subject: [PATCH] Merge branch 'dev' of github.com:alibaba-damo-academy/FunASR into dev add

---
 docs/images/.DS_Store                                                                                                 |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md               |   30 ++++++
 funasr/bin/asr_inference_paraformer.py                                                                                |    9 +
 funasr/bin/asr_inference_uniasr.py                                                                                    |    7 +
 funasr/models/e2e_vad.py                                                                                              |    2 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py                |   88 +++++++++++++++++
 funasr/bin/asr_inference_paraformer_timestamp.py                                                                      |    9 +
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py |   53 ++++++++++
 funasr/bin/asr_inference_paraformer_vad_punc.py                                                                       |   42 +++++--
 docs/images/wechat.png                                                                                                |    0 
 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py             |   36 +++++++
 README.md                                                                                                             |   17 +--
 12 files changed, 260 insertions(+), 33 deletions(-)

diff --git a/README.md b/README.md
index 6bf1278..25fae21 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@
 - FunASR supplies an easy-to-use pipeline to finetune pretrained models from [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)
 - Compared to the [Espnet](https://github.com/espnet/espnet) framework, training on large-scale datasets in FunASR is much faster owing to the optimized dataloader.
 
-## Installation(Training and Developing)
+## Installation
 
 - Install Conda:
 ``` sh
@@ -40,16 +40,17 @@
 ```
 For more versions, please see [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
 
+- Install ModelScope:
 
-If you are in the area of China, you could set the source to speed the downloading.
+If you are in China, you can set the pip source to a domestic mirror to speed up the download.
 
 ``` sh
 pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
 ```
 
-- Install ModelScope:
+Install or upgrade ModelScope:
 ``` sh
-pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+pip install "modelscope[audio]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
 ```
 
 For more details about modelscope, please see [modelscope installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)
@@ -61,18 +62,14 @@
 pip install --editable ./
 ```
 
-## Pretrained Model Zoo
-
-We have trained many academic and industrial models, [model hub](docs/modelscope_models.md)
-
 ## Contact
 
 If you have any questions about FunASR, please contact us by
 
 - email: [funasr@list.alibaba-inc.com](mailto:funasr@list.alibaba-inc.com)
 
-- Dingding group:
-<div align="left"><img src="docs/images/dingding.jpg" width="250"/>!<img src="docs/images/wechat.png" width="222"/></div>
+- Dingding group and WeChat group:
+<div align="left"><img src="docs/images/dingding.jpg" width="250"/> <img src="docs/images/wechat.png" width="222"/></div>
 
 
 ## Acknowledge
diff --git a/docs/images/.DS_Store b/docs/images/.DS_Store
index 5ef0f4c..606ad25 100644
--- a/docs/images/.DS_Store
+++ b/docs/images/.DS_Store
Binary files differ
diff --git a/docs/images/wechat.png b/docs/images/wechat.png
index c71b1fb..e7b7349 100644
--- a/docs/images/wechat.png
+++ b/docs/images/wechat.png
Binary files differ
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md
new file mode 100644
index 0000000..1587d3d
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md
@@ -0,0 +1,30 @@
+# ModelScope Model
+
+## How to finetune and infer using a pretrained Paraformer-large Model
+
+### Finetune
+
+- Modify the finetuning-related parameters in `finetune.py`:
+    - <strong>output_dir:</strong> # result directory
+    - <strong>data_dir:</strong> # the dataset directory; it must contain train/wav.scp, train/text, validation/wav.scp, and validation/text (see the example below)
+    - <strong>batch_bins:</strong> # batch size
+    - <strong>max_epoch:</strong> # maximum number of training epochs
+    - <strong>lr:</strong> # learning rate
+
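+- As a minimal sketch, `wav.scp` and `text` are assumed to be Kaldi-style files with one utterance per line (the ids, paths, and transcripts below are illustrative only):
+```
+# train/wav.scp: <utterance_id> <wav path>
+id_0001 /data/train/wav/id_0001.wav
+id_0002 /data/train/wav/id_0002.wav
+
+# train/text: <utterance_id> <transcript>
+id_0001 今天天气怎么样
+id_0002 欢迎使用FunASR
+```
+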
+- Then you can run the pipeline to finetune with:
+```sh
+    python finetune.py
+```
+
+### Inference
+
+Alternatively, you can use the pretrained model for inference directly (to decode with a finetuned model, see `infer_after_finetune.py`).
+
+- Set the parameters in `infer.py`:
+    - <strong>data_dir:</strong> # the dataset directory (must contain wav.scp; if a text file is also present, CER will be computed)
+    - <strong>output_dir:</strong> # result directory
+
+- Then you can run the pipeline to infer with:
+```sh
+    python infer.py
+```
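+
+For multi-GPU decoding, `infer.py` splits `wav.scp` into `ngpu * njob` shards and decodes them in parallel, with `njob` jobs per GPU, e.g.:
+```python
+    params["ngpu"] = 2    # number of GPUs to use
+    params["njob"] = 4    # concurrent decoding jobs per GPU
+```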
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py
new file mode 100644
index 0000000..a2b3c29
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py
@@ -0,0 +1,36 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+    if not os.path.exists(params.output_dir):
+        os.makedirs(params.output_dir, exist_ok=True)
+    # dataset split ["train", "validation"]
+    ds_dict = MsDataset.load(params.data_path)
+    kwargs = dict(
+        model=params.model,
+        data_dir=ds_dict,
+        dataset_type=params.dataset_type,
+        work_dir=params.output_dir,
+        batch_bins=params.batch_bins,
+        max_epoch=params.max_epoch,
+        lr=params.lr)
+    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+    trainer.train()
+
+
+if __name__ == '__main__':
+    params = modelscope_args(model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1", data_path="./data")
+    params.output_dir = "./checkpoint"              # model save path
+    params.data_path = "./example_data/"            # data path
+    params.dataset_type = "small"                   # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
+    params.batch_bins = 2000                        # batch size; unit is fbank feature frames when dataset_type="small", milliseconds when dataset_type="large"
+    params.max_epoch = 50                           # maximum number of training epochs
+    params.lr = 0.00005                             # learning rate
+    
+    modelscope_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
new file mode 100644
index 0000000..f08b31f
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -0,0 +1,88 @@
+import os
+import shutil
+from multiprocessing import Pool
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+
+def modelscope_infer_core(output_dir, split_dir, njob, idx):
+    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
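+    # jobs are numbered from 1; each consecutive block of njob jobs is pinned to one GPU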
+    gpu_id = (int(idx) - 1) // njob
+    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
+        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
+    else:
+        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
+    inference_pipline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
+        output_dir=output_dir_job,
+        batch_size=64
+    )
+    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
+    inference_pipline(audio_in=audio_in)
+
+
+def modelscope_infer(params):
+    # prepare for multi-GPU decoding
+    ngpu = params["ngpu"]
+    njob = params["njob"]
+    output_dir = params["output_dir"]
+    if os.path.exists(output_dir):
+        shutil.rmtree(output_dir)
+    os.mkdir(output_dir)
+    split_dir = os.path.join(output_dir, "split")
+    os.mkdir(split_dir)
+    nj = ngpu * njob
+    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
+    with open(wav_scp_file) as f:
+        lines = f.readlines()
+        num_lines = len(lines)
+        num_job_lines = num_lines // nj
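+    # split the wav.scp lines into nj shards; the last shard absorbs any remainder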
+    start = 0
+    for i in range(nj):
+        end = start + num_job_lines
+        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
+        with open(file, "w") as f:
+            if i == nj - 1:
+                f.writelines(lines[start:])
+            else:
+                f.writelines(lines[start:end])
+        start = end
+
+    p = Pool(nj)
+    for i in range(nj):
+        p.apply_async(modelscope_infer_core,
+                      args=(output_dir, split_dir, njob, str(i + 1)))
+    p.close()
+    p.join()
+
+    # combine decoding results
+    best_recog_path = os.path.join(output_dir, "1best_recog")
+    os.mkdir(best_recog_path)
+    files = ["text", "token", "score"]
+    for file in files:
+        with open(os.path.join(best_recog_path, file), "w") as f:
+            for i in range(nj):
+                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
+                with open(job_file) as f_job:
+                    lines = f_job.readlines()
+                f.writelines(lines)
+
+    # If text exists, compute CER
+    text_in = os.path.join(params["data_dir"], "text")
+    if os.path.exists(text_in):
+        text_proc_file = os.path.join(best_recog_path, "token")
+        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
+
+
+if __name__ == "__main__":
+    params = {}
+    params["data_dir"] = "./data/test"
+    params["output_dir"] = "./results"
+    params["ngpu"] = 1
+    params["njob"] = 1
+    modelscope_infer(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
new file mode 100644
index 0000000..cd13d76
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -0,0 +1,53 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+
+def modelscope_infer_after_finetune(params):
+    # prepare for decoding
+    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
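+    # copy the assets required for decoding and point the configuration at the finetuned checkpoint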
+    for file_name in params["required_files"]:
+        if file_name == "configuration.json":
+            with open(os.path.join(pretrained_model_path, file_name)) as f:
+                config_dict = json.load(f)
+                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
+            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
+                json.dump(config_dict, f, indent=4, separators=(',', ': '))
+        else:
+            shutil.copy(os.path.join(pretrained_model_path, file_name),
+                        os.path.join(params["output_dir"], file_name))
+    decoding_path = os.path.join(params["output_dir"], "decode_results")
+    if os.path.exists(decoding_path):
+        shutil.rmtree(decoding_path)
+    os.mkdir(decoding_path)
+
+    # decoding
+    inference_pipeline = pipeline(
+        task=Tasks.auto_speech_recognition,
+        model=params["output_dir"],
+        output_dir=decoding_path,
+        batch_size=64
+    )
+    audio_in = os.path.join(params["data_dir"], "wav.scp")
+    inference_pipeline(audio_in=audio_in)
+
+    # compute CER if ground-truth text exists
+    text_in = os.path.join(params["data_dir"], "text")
+    if os.path.exists(text_in):
+        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
+
+
+if __name__ == '__main__':
+    params = {}
+    params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
+    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
+    params["output_dir"] = "./checkpoint"
+    params["data_dir"] = "./data/test"
+    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+    modelscope_infer_after_finetune(params)
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index c1f0864..01237f7 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -95,10 +95,13 @@
         logging.info("asr_train_args: {}".format(asr_train_args))
         asr_model.to(dtype=getattr(torch, dtype)).eval()
 
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
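+        # some models have no CTC branch; register the CTC scorer only when it exists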
+        if asr_model.ctc is not None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
         token_list = asr_model.token_list
         scorers.update(
-            ctc=ctc,
             length_bonus=LengthBonus(len(token_list)),
         )
 
@@ -166,7 +169,7 @@
         self.converter = converter
         self.tokenizer = tokenizer
         is_use_lm = lm_weight != 0.0 and lm_file is not None
-        if ctc_weight == 0.0 and not is_use_lm:
+        if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
             beam_search = None
         self.beam_search = beam_search
         logging.info(f"Beam_search: {self.beam_search}")
diff --git a/funasr/bin/asr_inference_paraformer_timestamp.py b/funasr/bin/asr_inference_paraformer_timestamp.py
index 3fb8764..9560cb0 100644
--- a/funasr/bin/asr_inference_paraformer_timestamp.py
+++ b/funasr/bin/asr_inference_paraformer_timestamp.py
@@ -98,10 +98,13 @@
         logging.info("asr_train_args: {}".format(asr_train_args))
         asr_model.to(dtype=getattr(torch, dtype)).eval()
 
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+        if asr_model.ctc is not None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
         token_list = asr_model.token_list
         scorers.update(
-            ctc=ctc,
             length_bonus=LengthBonus(len(token_list)),
         )
 
@@ -169,7 +172,7 @@
         self.converter = converter
         self.tokenizer = tokenizer
         is_use_lm = lm_weight != 0.0 and lm_file is not None
-        if ctc_weight == 0.0 and not is_use_lm:
+        if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
             beam_search = None
         self.beam_search = beam_search
         logging.info(f"Beam_search: {self.beam_search}")
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 7752ea9..7a539e4 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -3,6 +3,7 @@
 import logging
 import sys
 import time
+import json
 from pathlib import Path
 from typing import Optional
 from typing import Sequence
@@ -100,10 +101,13 @@
         # logging.info("asr_train_args: {}".format(asr_train_args))
         asr_model.to(dtype=getattr(torch, dtype)).eval()
 
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+        if asr_model.ctc is not None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
         token_list = asr_model.token_list
         scorers.update(
-            ctc=ctc,
             length_bonus=LengthBonus(len(token_list)),
         )
 
@@ -171,7 +175,7 @@
         self.converter = converter
         self.tokenizer = tokenizer
         is_use_lm = lm_weight != 0.0 and lm_file is not None
-        if ctc_weight == 0.0 and not is_use_lm:
+        if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
             beam_search = None
         self.beam_search = beam_search
         logging.info(f"Beam_search: {self.beam_search}")
@@ -562,6 +566,7 @@
         length_total = 0.0
         finish_count = 0
         file_count = 1
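+        # default low-frame-rate factor; overwritten below by each decoding result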
+        lfr_factor = 6
         # 7 .Start for-loop
         asr_result_list = []
         output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
@@ -597,7 +602,7 @@
                     results = speech2text(**batch)
                     if len(results) < 1:
                         hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 10, 6]] * nbest
+                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
                     time_end = time.time()
                     forward_time = time_end - time_beg
                     lfr_factor = results[0][-1]
@@ -615,7 +620,8 @@
                 
                 key = keys[0]
                 result = result_segments[0]
-                text, token, token_int, time_stamp = result
+                text, token, token_int = result[0], result[1], result[2]
+                time_stamp = None if len(result) < 4 else result[3]
                 
                 # Create a directory: outdir/{n}best_recog
                 if writer is not None:
@@ -630,15 +636,23 @@
                         text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                                    postprocessed_result[1], \
                                                                                    postprocessed_result[2]
-                        text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
-                        text_postprocessed_punc_time_stamp = "predictions: {}  time_stamp: {}".format(
-                            text_postprocessed_punc, time_stamp_postprocessed)
+                        if len(word_lists) > 0: 
+                            text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
+                            text_postprocessed_punc_time_stamp = json.dumps({"predictions": text_postprocessed_punc,
+                                                                             "time_stamp": time_stamp_postprocessed},
+                                                                            ensure_ascii=False)
+                        else:
+                            text_postprocessed_punc = ""
+                            punc_id_list = []
+                            text_postprocessed_punc_time_stamp = ""
+                            
                     else:
-                        text_postprocessed = postprocessed_result
-                        time_stamp_postprocessed = None
-                        word_lists = None
-                        text_postprocessed_punc_time_stamp = None
-                        punc_id_list = None
+                        text_postprocessed = ""
+                        time_stamp_postprocessed = ""
+                        word_lists = ""
+                        text_postprocessed_punc_time_stamp = ""
+                        punc_id_list = ""
+                        text_postprocessed_punc = ""
 
                     item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed,
                             'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list, 'token': token}
@@ -660,7 +674,7 @@
                                                                                          time_stamp_postprocessed))
         
         logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
-                     format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)))
+                     format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor + 1e-6)))
         return asr_result_list
     return _forward
 
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 515c0d4..d386ff1 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -96,11 +96,14 @@
         else:
             decoder = asr_model.decoder2
 
-        ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+        if asr_model.ctc is not None:
+            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+            scorers.update(
+                ctc=ctc
+            )
         token_list = asr_model.token_list
         scorers.update(
             decoder=decoder,
-            ctc=ctc,
             length_bonus=LengthBonus(len(token_list)),
         )
 
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 35b9260..98504d6 100755
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -272,7 +272,7 @@
         frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
         frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
         self.data_buf = self.waveform[0]  # points to self.waveform[0]
-        for offset in range(0, self.waveform.shape[1] - frame_sample_length, frame_shift_length):
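+        # '+ 1' keeps the final full frame, since range() excludes its stop value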
+        for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
             self.decibel.append(
                 10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
                                 0.000001))

--
Gitblit v1.9.1