| egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 | |
| egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py | ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史 |
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/README.md
New file @@ -0,0 +1,30 @@ # ModelScope Model ## How to finetune and infer using a pretrained Paraformer-large Model ### Finetune - Modify finetune training related parameters in `finetune.py` - <strong>output_dir:</strong> # result dir - <strong>data_path:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text. - <strong>batch_bins:</strong> # batch size - <strong>max_epoch:</strong> # number of training epochs - <strong>lr:</strong> # learning rate - Then you can run the pipeline to finetune with: ```sh python finetune.py ``` ### Inference Or you can use the finetuned model for inference directly. - Set parameters in `infer.py` - <strong>data_dir:</strong> # the dataset dir - <strong>output_dir:</strong> # result dir - Then you can run the pipeline to infer with: ```sh python infer.py ``` egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/finetune.py
# New file @@ -0,0 +1,36 @@
import os

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer

from funasr.datasets.ms_dataset import MsDataset
from funasr.utils.modelscope_param import modelscope_args


def modelscope_finetune(params):
    """Finetune a ModelScope ASR model with the settings carried by *params*.

    Reads the following attributes from ``params``: ``output_dir``,
    ``data_path``, ``model``, ``dataset_type``, ``batch_bins``,
    ``max_epoch`` and ``lr``. The output directory is created when missing,
    the dataset is loaded from ``params.data_path`` and a
    ``Trainers.speech_asr_trainer`` trainer is built and run.

    Args:
        params: Parameter object as returned by ``modelscope_args``.
    """
    # exist_ok=True already covers the "directory is there" case, so no
    # separate (race-prone) existence check is needed.
    os.makedirs(params.output_dir, exist_ok=True)

    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1", data_path="./data")
    params.output_dir = "./checkpoint"  # directory where the model is saved
    params.data_path = "./example_data/"  # data directory
    # Use "small" for small datasets; use "large" when there are more than
    # 1000 hours of data.
    params.dataset_type = "small"
    # Batch size. With dataset_type="small" the unit of batch_bins is the
    # number of fbank feature frames; with dataset_type="large" the unit is
    # milliseconds.
    params.batch_bins = 2000
    params.max_epoch = 50  # maximum number of training epochs
    params.lr = 0.00005  # learning rate

    modelscope_finetune(params)
# egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
# New file @@ -0,0 +1,88 @@
import os
import shutil
from multiprocessing import Pool

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_core(output_dir, split_dir, njob, idx):
    """Decode one shard of the input list in the current process.

    Sets ``CUDA_VISIBLE_DEVICES`` for this process, builds the ASR pipeline
    and runs it on ``<split_dir>/wav.<idx>.scp`` with results written under
    ``<output_dir>/output.<idx>``.

    Args:
        output_dir: Root directory for all decoding results.
        split_dir: Directory holding the per-job ``wav.<idx>.scp`` files.
        njob: Number of jobs sharing one GPU.
        idx: 1-based job index (string or int).
    """
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))

    # Jobs are numbered from 1; every run of `njob` consecutive jobs maps to
    # the same GPU slot.
    gpu_id = (int(idx) - 1) // njob
    if "CUDA_VISIBLE_DEVICES" in os.environ:
        # Pick the gpu_id-th device out of the ones already made visible.
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
    else:
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)

    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
        output_dir=output_dir_job,
        batch_size=64
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipeline(audio_in=audio_in)


def modelscope_infer(params):
    """Run multi-process decoding over ``<data_dir>/wav.scp`` and merge results.

    Steps: recreate ``output_dir`` from scratch, split ``wav.scp`` into
    ``ngpu * njob`` shards, decode each shard in its own worker process,
    concatenate the per-job ``text``/``token``/``score`` files into
    ``<output_dir>/1best_recog`` and, when ``<data_dir>/text`` exists, call
    ``compute_wer`` on the merged ``token`` file.

    Args:
        params: Dict with keys ``ngpu``, ``njob``, ``output_dir`` and
            ``data_dir``.

    Raises:
        Exception: Any exception raised inside a worker process is re-raised
            here instead of being silently dropped.
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    output_dir = params["output_dir"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    split_dir = os.path.join(output_dir, "split")
    # makedirs creates output_dir (and any missing parents) along the way.
    os.makedirs(split_dir)

    nj = ngpu * njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f_scp:
        scp_lines = f_scp.readlines()
    num_job_lines = len(scp_lines) // nj
    for i in range(nj):
        start = i * num_job_lines
        # The last job also takes whatever is left after the even split.
        end = None if i == nj - 1 else start + num_job_lines
        shard_file = os.path.join(split_dir, "wav.{}.scp".format(i + 1))
        with open(shard_file, "w") as f_shard:
            f_shard.writelines(scp_lines[start:end])

    with Pool(nj) as pool:
        results = [
            pool.apply_async(modelscope_infer_core,
                             args=(output_dir, split_dir, njob, str(i + 1)))
            for i in range(nj)
        ]
        pool.close()
        pool.join()
        # apply_async stores a worker's exception in its result object;
        # get() re-raises it so a failed job is reported at its origin.
        for result in results:
            result.get()

    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    for name in ["text", "token", "score"]:
        with open(os.path.join(best_recog_path, name), "w") as f_out:
            for i in range(nj):
                job_file = os.path.join(
                    output_dir, "output.{}/1best_recog".format(i + 1), name)
                with open(job_file) as f_job:
                    f_out.writelines(f_job.readlines())

    # If text exists, compute CER
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))


if __name__ == "__main__":
    params = {}
    params["data_dir"] = "./data/test"
    params["output_dir"] = "./results"
    params["ngpu"] = 1
    params["njob"] = 1
    modelscope_infer(params)
# egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
# New file @@ -0,0 +1,53 @@
import json
import os
import shutil

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    """Decode ``<data_dir>/wav.scp`` with a model stored in ``output_dir``.

    The files listed in ``params["required_files"]`` are taken from the
    cached pretrained model under ``~/.cache/modelscope/hub/<model name>``
    and placed into ``output_dir``. ``configuration.json`` is rewritten so
    that ``model.am_model_name`` points at ``params["decoding_model_name"]``;
    every other file is copied unchanged. Decoding results go to
    ``<output_dir>/decode_results`` (recreated on every call). When
    ``<data_dir>/text`` exists, ``compute_wer`` is called on the decoded
    ``1best_recog/token`` file.

    Args:
        params: Dict with keys ``modelscope_model_name``, ``required_files``,
            ``output_dir``, ``data_dir`` and ``decoding_model_name``.
    """
    # prepare for decoding
    # expanduser("~") honours HOME when set and still resolves a home
    # directory when it is not, instead of raising KeyError.
    pretrained_model_path = os.path.join(
        os.path.expanduser("~"), ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        src = os.path.join(pretrained_model_path, file_name)
        dst = os.path.join(params["output_dir"], file_name)
        if file_name == "configuration.json":
            with open(src) as f_in:
                config_dict = json.load(f_in)
            config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(dst, "w") as f_out:
                json.dump(config_dict, f_out, indent=4, separators=(',', ': '))
        else:
            shutil.copy(src, dst)

    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        output_dir=decoding_path,
        batch_size=64
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # compute CER if the ground-truth text file is present
    text_in = os.path.join(params["data_dir"], "text")
    # os.path.join always returns a string, so the file's presence has to be
    # checked on disk rather than by comparing the path to None.
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))


if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)