| | |
| | | - `output_dir`: None (Default), the output path of results if set |
| | | |
| | | ### Inference with multi-thread CPUs or multi GPUs |
| | | FunASR also offer recipes [run.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh) to decode with multi-thread CPUs, or multi GPUs. |
| | | FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs or multiple GPUs. |
| | | |
| | | - Setting parameters in `infer.sh` |
| | | - <strong>model:</strong> # model name on ModelScope |
| | |
| | | ## Finetune with pipeline |
| | | |
| | | ### Quick start |
| | | [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/finetune.py) |
| | | [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py) |
| | | ```python |
| | | import os |
| | | from modelscope.metainfo import Trainers |
| | |
| | | |
| | | ### Finetune with your data |
| | | |
| | | - Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/finetune.py) |
| | | - Modify finetune training related parameters in [finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/finetune.py) |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text` |
| | | - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small` |
| | |
| | | CUDA_VISIBLE_DEVICES=1,2 python -m torch.distributed.launch --nproc_per_node 2 finetune.py > log.txt 2>&1 |
| | | ``` |
| | | ## Inference with your finetuned model |
| | | - Modify inference related parameters in `infer_after_finetune.py` |
| | | - Modify inference related parameters in [infer_after_finetune.py](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer_after_finetune.py) |
| | | - <strong>modelscope_model_name: </strong> # model name on ModelScope |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed |
| | |
| | | - `output_dir`: None (Default), the output path of results if set |
| | | |
| | | ### Inference with multi-thread CPUs or multi GPUs |
| | | FunASR also offer recipes [run.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.sh) to decode with multi-thread CPUs, or multi GPUs. |
| | | FunASR also offers recipes [infer.sh](https://github.com/alibaba-damo-academy/FunASR/blob/main/egs_modelscope/asr/TEMPLATE/infer.sh) to decode with multi-thread CPUs or multiple GPUs. |
| | | |
| | | - Setting parameters in `infer.sh` |
| | | - <strong>model:</strong> # model name on ModelScope |
| New file |
| | |
| | | import os |
| | | |
| | | from modelscope.metainfo import Trainers |
| | | from modelscope.trainers import build_trainer |
| | | |
| | | from funasr.datasets.ms_dataset import MsDataset |
| | | from funasr.utils.modelscope_param import modelscope_args |
| | | |
| | | |
def modelscope_finetune(params):
    """Fine-tune a ModelScope ASR model with the settings carried by ``params``.

    The attributes read from ``params`` are ``output_dir``, ``data_path``,
    ``model``, ``dataset_type``, ``batch_bins``, ``max_epoch`` and ``lr``.
    The dataset is loaded with ``MsDataset.load`` and handed to the
    ``speech_asr_trainer`` trainer, which is then run.
    """
    # exist_ok=True already tolerates an existing directory, so a separate
    # os.path.exists() pre-check is unnecessary (and would hide the case where
    # output_dir exists but is a regular file).
    os.makedirs(params.output_dir, exist_ok=True)

    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr,
    )
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()
| | | |
| | | |
if __name__ == '__main__':
    params = modelscope_args(
        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        data_path="./data",
    )
    # Directory where the fine-tuned model is saved.
    params.output_dir = "./checkpoint"
    # Directory holding the training data.
    params.data_path = "./example_data/"
    # Use "small" for small datasets; for more than 1000 hours of data use "large".
    params.dataset_type = "small"
    # Batch size: with dataset_type="small" the unit is the number of fbank
    # feature frames, with dataset_type="large" the unit is milliseconds.
    params.batch_bins = 2000
    # Maximum number of training epochs.
    params.max_epoch = 50
    # Learning rate.
    params.lr = 0.00005

    modelscope_finetune(params)
| New file |
| | |
| | | import os |
| | | import shutil |
| | | import argparse |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | |
def modelscope_infer(args):
    """Run the ModelScope ASR pipeline on ``args.audio_in``.

    Sets ``CUDA_VISIBLE_DEVICES`` from ``args.gpuid`` before the pipeline is
    built, then builds it from ``args.model``, ``args.output_dir`` and
    ``args.batch_size`` and calls it on the audio input.
    """
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    pipeline_options = dict(
        task=Tasks.auto_speech_recognition,
        model=args.model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
    )
    recognizer = pipeline(**pipeline_options)
    recognizer(audio_in=args.audio_in)
| | | |
if __name__ == "__main__":
    # (flag, type, default) for every command-line option of the script.
    cli_options = (
        ('--model', str, "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"),
        ('--audio_in', str, "./data/test/wav.scp"),
        ('--output_dir', str, "./results/"),
        ('--batch_size', int, 64),
        ('--gpuid', str, "0"),
    )
    parser = argparse.ArgumentParser()
    for flag, value_type, default_value in cli_options:
        parser.add_argument(flag, type=value_type, default=default_value)
    args = parser.parse_args()
    modelscope_infer(args)
| New file |
| | |
#!/usr/bin/env bash
# Decode ${data_dir}/wav.scp with infer.py using several parallel jobs
# (one per listed GPU, or ${njob} CPU jobs), merge the per-job results,
# and score them against ${data_dir}/text.

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true  # whether to perform gpu decoding
gpuid_list="0,1"    # set gpus, e.g., gpuid_list="0,1"
njob=4              # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob

# NOTE(review): presumably lets command-line options override the variables above — confirm.
. utils/parse_options.sh || exit 1;

# Compare the flag as a string instead of executing its value as a command.
if [ "${gpu_inference}" = "true" ]; then
    # one job per GPU id in the comma-separated list
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    # CPU decoding: every job is given gpuid -1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

# Split the input scp into one list per job.
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
# ${split_scps} is left unquoted on purpose so each path is a separate argument.
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for JOB in $(seq ${nj}); do
        {
        # JOB is 1-based, the gpu id array is 0-based
        id=$((JOB-1))
        gpuid=${gpuid_list_array[$id]}
        mkdir -p ${output_dir}/output.$JOB
        python infer.py \
            --model ${model} \
            --audio_in ${output_dir}/split/wav.$JOB.scp \
            --output_dir ${output_dir}/output.$JOB \
            --batch_size ${batch_size} \
            --gpuid ${gpuid}
        }&
    done
    wait

    # Merge the per-job outputs into one sorted file per result type.
    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

# This stage reads 1best_recog/text.proc, which is written by stage 2.
if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    # drop lines that end with a tab (i.e. nothing after the tab)
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
| | | |
| New file |
| | |
| | | import json |
| | | import os |
| | | import shutil |
| | | |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | from modelscope.hub.snapshot_download import snapshot_download |
| | | |
| | | from funasr.utils.compute_wer import compute_wer |
| | | |
def modelscope_infer_after_finetune(params):
    """Decode a test set with a fine-tuned checkpoint and score it if possible.

    ``params`` is a dict read with the keys ``modelscope_model_name``,
    ``output_dir``, ``decoding_model_name``, ``data_dir`` and ``batch_size``.

    The pretrained model is fetched with ``snapshot_download`` and its
    ``model.pb`` is overwritten with the fine-tuned checkpoint taken from
    ``output_dir``. ``<data_dir>/wav.scp`` is then decoded into
    ``<output_dir>/decode_results``; an existing directory of that name is
    removed first. If ``<data_dir>/text`` exists, ``compute_wer`` writes
    ``text.cer`` into the decode directory.

    Raises:
        RuntimeError: if the pretrained model cannot be obtained.
    """
    # prepare for decoding
    try:
        pretrained_model_path = snapshot_download(
            params["modelscope_model_name"], cache_dir=params["output_dir"])
    except Exception as err:
        # Catch Exception rather than BaseException so KeyboardInterrupt and
        # SystemExit propagate untouched, and chain the underlying cause.
        raise RuntimeError(
            "Please download pretrain model from ModelScope firstly.") from err

    # Replace the pretrained weights with the fine-tuned checkpoint.
    finetuned_model = os.path.join(params["output_dir"], params["decoding_model_name"])
    shutil.copy(finetuned_model, os.path.join(pretrained_model_path, "model.pb"))

    # Start from an empty decode directory.
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
    os.mkdir(decoding_path)

    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # compute CER if the ground-truth text is available
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/text")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
| | | |
| | | |
if __name__ == '__main__':
    # Settings consumed by modelscope_infer_after_finetune.
    params = {
        "modelscope_model_name": "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        "output_dir": "./checkpoint",
        "data_dir": "./data/test",
        "decoding_model_name": "valid.acc.ave_10best.pb",
        "batch_size": 64,
    }
    modelscope_infer_after_finetune(params)
| New file |
| | |
| | | ../../../egs/aishell/transformer/utils |
| | |
| | | |
| | | from funasr.utils.compute_wer import compute_wer |
| | | |
| | | import pdb; |
| | | def modelscope_infer_core(output_dir, split_dir, njob, idx): |
| | | output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) |
| | | gpu_id = (int(idx) - 1) // njob |