From 2f3d36d689d36d2b422899e0589443c3d07542b8 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 30 Jan 2023 17:51:07 +0800
Subject: [PATCH] Merge branch 'dev' of github.com:alibaba-damo-academy/FunASR into dev add
---
docs/images/.DS_Store | 0
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md | 30 ++++++
funasr/bin/asr_inference_paraformer.py | 9 +
funasr/bin/asr_inference_uniasr.py | 7 +
funasr/models/e2e_vad.py | 2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py | 88 +++++++++++++++++
funasr/bin/asr_inference_paraformer_timestamp.py | 9 +
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py | 53 ++++++++++
funasr/bin/asr_inference_paraformer_vad_punc.py | 42 +++++--
docs/images/wechat.png | 0
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py | 36 +++++++
README.md | 17 +--
12 files changed, 260 insertions(+), 33 deletions(-)
diff --git a/README.md b/README.md
index 6bf1278..25fae21 100644
--- a/README.md
+++ b/README.md
@@ -23,7 +23,7 @@
- FunASR supplies an easy-to-use pipeline to finetune pretrained models from [ModelScope](https://www.modelscope.cn/models?page=1&tasks=auto-speech-recognition)
- Compared to the [Espnet](https://github.com/espnet/espnet) framework, training on large-scale datasets in FunASR is much faster owing to the optimized dataloader.
-## Installation(Training and Developing)
+## Installation
- Install Conda:
``` sh
@@ -40,16 +40,17 @@
```
For more versions, please see [https://pytorch.org/get-started/locally](https://pytorch.org/get-started/locally)
+- Install ModelScope:
-If you are in the area of China, you could set the source to speed the downloading.
+If you are in mainland China, you can set the pip source to speed up the download.
``` sh
pip config set global.index-url https://mirror.sjtu.edu.cn/pypi/web/simple
```
-- Install ModelScope:
+Install or upgrade ModelScope:
``` sh
-pip install "modelscope[audio]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+pip install "modelscope[audio]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
```
For more details about modelscope, please see [modelscope installation](https://modelscope.cn/docs/%E7%8E%AF%E5%A2%83%E5%AE%89%E8%A3%85)
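+
+To verify the installation, a minimal check (this assumes the `modelscope` package exposes `__version__`, as recent releases do):
+``` sh
+python -c "import modelscope; print(modelscope.__version__)"
+```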
@@ -61,18 +62,14 @@
pip install --editable ./
```
-## Pretrained Model Zoo
-
-We have trained many academic and industrial models, [model hub](docs/modelscope_models.md)
-
## Contact
If you have any questions about FunASR, please contact us by
- email: [funasr@list.alibaba-inc.com](mailto:funasr@list.alibaba-inc.com)
-- Dingding group:
-<div align="left"><img src="docs/images/dingding.jpg" width="250"/>!<img src="docs/images/wechat.png" width="222"/></div>
+- Dingding group and Wechat group:
+<div align="left"><img src="docs/images/dingding.jpg" width="250"/> <img src="docs/images/wechat.png" width="222"/></div>
## Acknowledge
diff --git a/docs/images/.DS_Store b/docs/images/.DS_Store
index 5ef0f4c..606ad25 100644
--- a/docs/images/.DS_Store
+++ b/docs/images/.DS_Store
Binary files differ
diff --git a/docs/images/wechat.png b/docs/images/wechat.png
index c71b1fb..e7b7349 100644
--- a/docs/images/wechat.png
+++ b/docs/images/wechat.png
Binary files differ
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md
new file mode 100644
index 0000000..1587d3d
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md
@@ -0,0 +1,30 @@
+# ModelScope Model
+
+## How to finetune and infer using a pretrained Paraformer-large Model
+
+### Finetune
+
+- Modify finetune training related parameters in `finetune.py`
+ - <strong>output_dir:</strong> # result dir
+ - <strong>data_dir:</strong> # the dataset dir; it must contain train/wav.scp, train/text, validation/wav.scp, and validation/text (see the layout sketch below)
+ - <strong>batch_bins:</strong> # batch size
+ - <strong>max_epoch:</strong> # number of training epochs
+ - <strong>lr:</strong> # learning rate
+
+- Then you can run the pipeline to finetune with:
+``` sh
+ python finetune.py
+```
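+
+For reference, a sketch of the expected `data_dir` layout (utterance IDs and paths are illustrative; each `wav.scp` line maps an utterance ID to a wav path, and each `text` line maps it to the transcript):
+``` sh
+train/wav.scp # <utt_id> <path/to/wav>
+train/text # <utt_id> <transcript>
+validation/wav.scp
+validation/text
+```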
+
+### Inference
+
+You can also run inference directly with the finetuned model.
+
+- Set the parameters in `infer.py`
+ - <strong>data_dir:</strong> # the dataset dir
+ - <strong>output_dir:</strong> # result dir
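+ - <strong>ngpu:</strong> # number of GPUs used for decoding
+ - <strong>njob:</strong> # number of decoding jobs per GPU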
+
+- Then you can run the pipeline to infer with:
+``` sh
+ python infer.py
+```
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py
new file mode 100644
index 0000000..a2b3c29
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py
@@ -0,0 +1,36 @@
+import os
+
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+
+from funasr.datasets.ms_dataset import MsDataset
+from funasr.utils.modelscope_param import modelscope_args
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params.output_dir):
+ os.makedirs(params.output_dir, exist_ok=True)
+ # dataset split ["train", "validation"]
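+ # data_path is expected to contain train/ and validation/ subsets, each with wav.scp and text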
+ ds_dict = MsDataset.load(params.data_path)
+ kwargs = dict(
+ model=params.model,
+ data_dir=ds_dict,
+ dataset_type=params.dataset_type,
+ work_dir=params.output_dir,
+ batch_bins=params.batch_bins,
+ max_epoch=params.max_epoch,
+ lr=params.lr)
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = modelscope_args(model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1", data_path="./data")
+ params.output_dir = "./checkpoint" # model save path
+ params.data_path = "./example_data/" # data path
+ params.dataset_type = "small" # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
+ params.batch_bins = 2000 # batch size; with dataset_type="small", batch_bins is in fbank feature frames; with dataset_type="large", it is in milliseconds
+ params.max_epoch = 50 # maximum number of training epochs
+ params.lr = 0.00005 # learning rate
+
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
new file mode 100644
index 0000000..f08b31f
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -0,0 +1,88 @@
+import os
+import shutil
+from multiprocessing import Pool
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+
+def modelscope_infer_core(output_dir, split_dir, njob, idx):
+ output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
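+ # map the 1-based job index to a GPU id: each GPU runs njob concurrent jobs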
+ gpu_id = (int(idx) - 1) // njob
+ if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
+ gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
+ else:
+ os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
+ output_dir=output_dir_job,
+ batch_size=64
+ )
+ audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
+ inference_pipline(audio_in=audio_in)
+
+
+def modelscope_infer(params):
+ # prepare for multi-GPU decoding
+ ngpu = params["ngpu"]
+ njob = params["njob"]
+ output_dir = params["output_dir"]
+ if os.path.exists(output_dir):
+ shutil.rmtree(output_dir)
+ os.mkdir(output_dir)
+ split_dir = os.path.join(output_dir, "split")
+ os.mkdir(split_dir)
+ nj = ngpu * njob
+ wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
+ with open(wav_scp_file) as f:
+ lines = f.readlines()
+ num_lines = len(lines)
+ num_job_lines = num_lines // nj
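+ # split wav.scp into nj shards; the last shard absorbs any remainder lines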
+ start = 0
+ for i in range(nj):
+ end = start + num_job_lines
+ file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
+ with open(file, "w") as f:
+ if i == nj - 1:
+ f.writelines(lines[start:])
+ else:
+ f.writelines(lines[start:end])
+ start = end
+
+ p = Pool(nj)
+ for i in range(nj):
+ p.apply_async(modelscope_infer_core,
+ args=(output_dir, split_dir, njob, str(i + 1)))
+ p.close()
+ p.join()
+
+ # combine decoding results
+ best_recog_path = os.path.join(output_dir, "1best_recog")
+ os.mkdir(best_recog_path)
+ files = ["text", "token", "score"]
+ for file in files:
+ with open(os.path.join(best_recog_path, file), "w") as f:
+ for i in range(nj):
+ job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
+ with open(job_file) as f_job:
+ lines = f_job.readlines()
+ f.writelines(lines)
+
+ # If text exists, compute CER
+ text_in = os.path.join(params["data_dir"], "text")
+ if os.path.exists(text_in):
+ text_proc_file = os.path.join(best_recog_path, "token")
+ compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
+
+
+if __name__ == "__main__":
+ params = {}
+ params["data_dir"] = "./data/test"
+ params["output_dir"] = "./results"
+ params["ngpu"] = 1
+ params["njob"] = 1
+ modelscope_infer(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
new file mode 100644
index 0000000..cd13d76
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -0,0 +1,53 @@
+import json
+import os
+import shutil
+
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from funasr.utils.compute_wer import compute_wer
+
+
+def modelscope_infer_after_finetune(params):
+ # prepare for decoding
+ pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
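+ # stage the required files from the cached pretrained model into output_dir; configuration.json is rewritten so am_model_name points at the finetuned checkpoint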
+ for file_name in params["required_files"]:
+ if file_name == "configuration.json":
+ with open(os.path.join(pretrained_model_path, file_name)) as f:
+ config_dict = json.load(f)
+ config_dict["model"]["am_model_name"] = params["decoding_model_name"]
+ with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
+ json.dump(config_dict, f, indent=4, separators=(',', ': '))
+ else:
+ shutil.copy(os.path.join(pretrained_model_path, file_name),
+ os.path.join(params["output_dir"], file_name))
+ decoding_path = os.path.join(params["output_dir"], "decode_results")
+ if os.path.exists(decoding_path):
+ shutil.rmtree(decoding_path)
+ os.mkdir(decoding_path)
+
+ # decoding
+ inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model=params["output_dir"],
+ output_dir=decoding_path,
+ batch_size=64
+ )
+ audio_in = os.path.join(params["data_dir"], "wav.scp")
+ inference_pipeline(audio_in=audio_in)
+
+ # compute CER if ground-truth text is available
+ text_in = os.path.join(params["data_dir"], "text")
+ if os.path.exists(text_in):
+ text_proc_file = os.path.join(decoding_path, "1best_recog/token")
+ compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
+
+
+if __name__ == '__main__':
+ params = {}
+ params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
+ params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data/test"
+ params["decoding_model_name"] = "valid.acc.ave_10best.pth"
+ modelscope_infer_after_finetune(params)
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index c1f0864..01237f7 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -95,10 +95,13 @@
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
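+ # the CTC prefix scorer is optional: the model may have been trained without a CTC branch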
+ if asr_model.ctc is not None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
@@ -166,7 +169,7 @@
self.converter = converter
self.tokenizer = tokenizer
is_use_lm = lm_weight != 0.0 and lm_file is not None
- if ctc_weight == 0.0 and not is_use_lm:
+ if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
diff --git a/funasr/bin/asr_inference_paraformer_timestamp.py b/funasr/bin/asr_inference_paraformer_timestamp.py
index 3fb8764..9560cb0 100644
--- a/funasr/bin/asr_inference_paraformer_timestamp.py
+++ b/funasr/bin/asr_inference_paraformer_timestamp.py
@@ -98,10 +98,13 @@
logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ if asr_model.ctc is not None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
@@ -169,7 +172,7 @@
self.converter = converter
self.tokenizer = tokenizer
is_use_lm = lm_weight != 0.0 and lm_file is not None
- if ctc_weight == 0.0 and not is_use_lm:
+ if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 7752ea9..7a539e4 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -3,6 +3,7 @@
import logging
import sys
import time
+import json
from pathlib import Path
from typing import Optional
from typing import Sequence
@@ -100,10 +101,13 @@
# logging.info("asr_train_args: {}".format(asr_train_args))
asr_model.to(dtype=getattr(torch, dtype)).eval()
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ if asr_model.ctc is not None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
@@ -171,7 +175,7 @@
self.converter = converter
self.tokenizer = tokenizer
is_use_lm = lm_weight != 0.0 and lm_file is not None
- if ctc_weight == 0.0 and not is_use_lm:
+ if (ctc_weight == 0.0 or asr_model.ctc is None) and not is_use_lm:
beam_search = None
self.beam_search = beam_search
logging.info(f"Beam_search: {self.beam_search}")
@@ -562,6 +566,7 @@
length_total = 0.0
finish_count = 0
file_count = 1
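+ # default LFR factor so the RTF summary below is defined even if no utterance is decoded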
+ lfr_factor = 6
# 7 .Start for-loop
asr_result_list = []
output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
@@ -597,7 +602,7 @@
results = speech2text(**batch)
if len(results) < 1:
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], 10, 6]] * nbest
+ results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
time_end = time.time()
forward_time = time_end - time_beg
lfr_factor = results[0][-1]
@@ -615,7 +620,8 @@
key = keys[0]
result = result_segments[0]
- text, token, token_int, time_stamp = result
+ text, token, token_int = result[0], result[1], result[2]
+ time_stamp = None if len(result) < 4 else result[3]
# Create a directory: outdir/{n}best_recog
if writer is not None:
@@ -630,15 +636,23 @@
text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
postprocessed_result[1], \
postprocessed_result[2]
- text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
- text_postprocessed_punc_time_stamp = "predictions: {} time_stamp: {}".format(
- text_postprocessed_punc, time_stamp_postprocessed)
+ if len(word_lists) > 0:
+ text_postprocessed_punc, punc_id_list = text2punc(word_lists, 20)
+ text_postprocessed_punc_time_stamp = json.dumps({"predictions": text_postprocessed_punc,
+ "time_stamp": time_stamp_postprocessed},
+ ensure_ascii=False)
+ else:
+ text_postprocessed_punc = ""
+ punc_id_list = []
+ text_postprocessed_punc_time_stamp = ""
+
else:
- text_postprocessed = postprocessed_result
- time_stamp_postprocessed = None
- word_lists = None
- text_postprocessed_punc_time_stamp = None
- punc_id_list = None
+ text_postprocessed = ""
+ time_stamp_postprocessed = ""
+ word_lists = ""
+ text_postprocessed_punc_time_stamp = ""
+ punc_id_list = ""
+ text_postprocessed_punc = ""
item = {'key': key, 'value': text_postprocessed_punc_time_stamp, 'text': text_postprocessed,
'time_stamp': time_stamp_postprocessed, 'punc': punc_id_list, 'token': token}
@@ -660,7 +674,7 @@
time_stamp_postprocessed))
logging.info("decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".
- format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor)))
+ format(length_total, forward_time_total, 100 * forward_time_total / (length_total * lfr_factor + 1e-6)))
return asr_result_list
return _forward
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 515c0d4..d386ff1 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -96,11 +96,14 @@
else:
decoder = asr_model.decoder2
- ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ if asr_model.ctc is not None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
token_list = asr_model.token_list
scorers.update(
decoder=decoder,
- ctc=ctc,
length_bonus=LengthBonus(len(token_list)),
)
diff --git a/funasr/models/e2e_vad.py b/funasr/models/e2e_vad.py
index 35b9260..98504d6 100755
--- a/funasr/models/e2e_vad.py
+++ b/funasr/models/e2e_vad.py
@@ -272,7 +272,7 @@
frame_sample_length = int(self.vad_opts.frame_length_ms * self.vad_opts.sample_rate / 1000)
frame_shift_length = int(self.vad_opts.frame_in_ms * self.vad_opts.sample_rate / 1000)
self.data_buf = self.waveform[0] # points to self.waveform[0]
- for offset in range(0, self.waveform.shape[1] - frame_sample_length, frame_shift_length):
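+ # "+ 1" keeps the final frame when the last full window ends exactly at the end of the waveform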
+ for offset in range(0, self.waveform.shape[1] - frame_sample_length + 1, frame_shift_length):
self.decibel.append(
10 * math.log10((self.waveform[0][offset: offset + frame_sample_length]).square().sum() + \
0.000001))
--
Gitblit v1.9.1