Merge pull request #574 from alibaba-damo-academy/dev_lhn3
update
New file

../../TEMPLATE/README.md

import os
import logging
import torch
import soundfile

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)

os.environ["MODELSCOPE_CACHE"] = "./"
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.6',
    mode="paraformer_fake_streaming"
)

model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
speech_length = speech.shape[0]

sample_offset = 0
chunk_size = [8, 8, 4]  # [5, 10, 5] -> 600ms per chunk, [8, 8, 4] -> 480ms per chunk
stride_size = chunk_size[1] * 960  # 8 * 960 = 7680 samples = 480 ms at 16 kHz
param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
final_result = ""

for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
    if sample_offset + stride_size >= speech_length - 1:
        stride_size = speech_length - sample_offset
        param_dict["is_final"] = True
    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                    param_dict=param_dict)
    if len(rec_result) != 0:
        final_result += rec_result['text']
        print(rec_result)
print(final_result.strip())

audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
rec_result = inference_pipeline(audio_in=audio_in)
print(rec_result)
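The chunk loop above can also be wrapped into a small reusable helper. The sketch below reuses only the calls already shown (soundfile.read and the pipeline's audio_in/param_dict interface); the helper name stream_file and its arguments are illustrative, not part of this PR.

import soundfile

def stream_file(wav_path, pipeline_fn, chunk_size=(8, 8, 4)):
    # Feed a 16 kHz mono wav to the streaming pipeline in chunks of
    # chunk_size[1] * 960 samples (8 * 960 = 7680 samples = 480 ms at 16 kHz).
    speech, _ = soundfile.read(wav_path)
    stride = chunk_size[1] * 960
    param_dict = {"cache": dict(), "is_final": False, "chunk_size": list(chunk_size)}
    text = ""
    for offset in range(0, speech.shape[0], stride):
        # Mark the last chunk so the pipeline flushes its cache.
        param_dict["is_final"] = offset + stride >= speech.shape[0]
        result = pipeline_fn(audio_in=speech[offset: offset + stride],
                             param_dict=param_dict)
        if len(result) != 0:
            text += result["text"]
    return text

# e.g.: print(stream_file(os.path.join(model_dir, "example/asr_example.wav"), inference_pipeline))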
New file

import os
import logging
import torch
import soundfile

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)

os.environ["MODELSCOPE_CACHE"] = "./"
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.6',
    mode="paraformer_streaming"
)

model_dir = os.path.join(os.environ["MODELSCOPE_CACHE"], "damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online")
speech, sample_rate = soundfile.read(os.path.join(model_dir, "example/asr_example.wav"))
speech_length = speech.shape[0]

sample_offset = 0
chunk_size = [8, 8, 4]  # [5, 10, 5] -> 600ms per chunk, [8, 8, 4] -> 480ms per chunk
stride_size = chunk_size[1] * 960
param_dict = {"cache": dict(), "is_final": False, "chunk_size": chunk_size}
final_result = ""

for sample_offset in range(0, speech_length, min(stride_size, speech_length - sample_offset)):
    if sample_offset + stride_size >= speech_length - 1:
        stride_size = speech_length - sample_offset
        param_dict["is_final"] = True
    rec_result = inference_pipeline(audio_in=speech[sample_offset: sample_offset + stride_size],
                                    param_dict=param_dict)
    if len(rec_result) != 0:
        final_result += rec_result['text']
        print(rec_result)
print(final_result)
New file

import os

from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer

from funasr.datasets.ms_dataset import MsDataset
from funasr.utils.modelscope_param import modelscope_args


def modelscope_finetune(params):
    if not os.path.exists(params.output_dir):
        os.makedirs(params.output_dir, exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params.data_path)
    kwargs = dict(
        model=params.model,
        model_revision='v1.0.6',
        data_dir=ds_dict,
        dataset_type=params.dataset_type,
        work_dir=params.output_dir,
        batch_bins=params.batch_bins,
        max_epoch=params.max_epoch,
        lr=params.lr)
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = modelscope_args(model="damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online", data_path="./data")
    params.output_dir = "./checkpoint"      # path where the finetuned model is saved
    params.data_path = "./example_data/"    # path to the training data
    params.dataset_type = "small"           # use "small" for small datasets; if the data exceeds 1000 hours, use "large"
    params.batch_bins = 1000                # batch size: fbank feature frames if dataset_type="small", milliseconds if dataset_type="large"
    params.max_epoch = 20                   # maximum number of training epochs
    params.lr = 0.00005                     # learning rate

    modelscope_finetune(params)
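The script above reads finetuning data from ./example_data/ with a ["train", "validation"] split. Below is a minimal sketch of preparing that layout; it assumes the Kaldi-style wav.scp / text pairs used by the decoding recipe later in this PR, and the exact subdirectory names and sample contents are assumptions for illustration, not spelled out by the PR.

import os

# Assumed layout: each split holds a wav.scp ("utt_id /path/to/wav") and a
# text ("utt_id transcript") file, mirroring the wav.scp/text files used by
# the decoding script in this recipe.
samples = {
    "train": [("utt1", "/path/to/utt1.wav", "今天天气怎么样")],
    "validation": [("utt2", "/path/to/utt2.wav", "明天会下雨吗")],
}
for split, items in samples.items():
    split_dir = os.path.join("./example_data", split)
    os.makedirs(split_dir, exist_ok=True)
    with open(os.path.join(split_dir, "wav.scp"), "w") as f_wav, \
         open(os.path.join(split_dir, "text"), "w") as f_text:
        for utt_id, wav_path, transcript in items:
            f_wav.write(f"{utt_id} {wav_path}\n")
            f_text.write(f"{utt_id} {transcript}\n")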
New file

import os
import shutil
import argparse

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks


def modelscope_infer(args):
    os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid)
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=args.model,
        output_dir=args.output_dir,
        batch_size=args.batch_size,
        model_revision=args.model_revision,
        mode=args.mode,
        param_dict={"decoding_model": args.decoding_mode, "hotword": args.hotword_txt}
    )
    inference_pipeline(audio_in=args.audio_in)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
    parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
    parser.add_argument('--output_dir', type=str, default="./results/")
    parser.add_argument('--decoding_mode', type=str, default="normal")
    # defaults match the revision and mode used elsewhere in this recipe
    parser.add_argument('--model_revision', type=str, default="v1.0.6")
    parser.add_argument('--mode', type=str, default="paraformer_fake_streaming")
    parser.add_argument('--hotword_txt', type=str, default=None)
    parser.add_argument('--batch_size', type=int, default=64)
    parser.add_argument('--gpuid', type=str, default="0")
    args = parser.parse_args()
    modelscope_infer(args)
New file

#!/usr/bin/env bash

set -e
set -u
set -o pipefail

stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online"
data_dir="./data/test"
output_dir="./results"
batch_size=32
gpu_inference=true    # whether to perform gpu decoding
gpuid_list="0,1"      # set gpus, e.g., gpuid_list="0,1"
njob=32               # the number of jobs for CPU decoding; if gpu_inference=false, CPU decoding is used, please set njob
checkpoint_dir=
checkpoint_name="valid.cer_ctc.ave.pb"

. utils/parse_options.sh || exit 1;

if [ "${gpu_inference}" == "true" ]; then
    nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
    nj=$njob
    batch_size=1
    gpuid_list=""
    for JOB in $(seq ${nj}); do
        gpuid_list=$gpuid_list"-1,"
    done
fi

mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ -n "${checkpoint_dir}" ]; then
    python utils/prepare_checkpoint.py ${model} ${checkpoint_dir} ${checkpoint_name}
    model=${checkpoint_dir}/${model}
fi

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ]; then
    echo "Decoding ..."
    gpuid_list_array=(${gpuid_list//,/ })
    for JOB in $(seq ${nj}); do
        {
            id=$((JOB-1))
            gpuid=${gpuid_list_array[$id]}
            mkdir -p ${output_dir}/output.$JOB
            python infer.py \
                --model ${model} \
                --audio_in ${output_dir}/split/wav.$JOB.scp \
                --output_dir ${output_dir}/output.$JOB \
                --batch_size ${batch_size} \
                --gpuid ${gpuid} \
                --mode "paraformer_fake_streaming"
        } &
    done
    wait

    mkdir -p ${output_dir}/1best_recog
    for f in token score text; do
        if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
            for i in $(seq "${nj}"); do
                cat "${output_dir}/output.${i}/1best_recog/${f}"
            done | sort -k1 > "${output_dir}/1best_recog/${f}"
        fi
    done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ]; then
    echo "Computing WER ..."
    cp ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    cp ${data_dir}/text ${output_dir}/1best_recog/text.ref
    python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ]; then
    echo "SpeechIO TIOBE textnorm"
    echo "$0 --> Normalizing REF text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${data_dir}/text \
        ${output_dir}/1best_recog/ref.txt

    echo "$0 --> Normalizing HYP text ..."
    ./utils/textnorm_zh.py \
        --has_key --to_upper \
        ${output_dir}/1best_recog/text.proc \
        ${output_dir}/1best_recog/rec.txt
    grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

    echo "$0 --> computing WER/CER and alignment ..."
    ./utils/error_rate_zh \
        --tokenizer char \
        --ref ${output_dir}/1best_recog/ref.txt \
        --hyp ${output_dir}/1best_recog/rec_non_empty.txt \
        ${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
    rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi

New file

../../TEMPLATE/utils/