Merge branch 'dev_gzf' of github.com:alibaba-damo-academy/FunASR into dev_gzf
12个文件已修改
13个文件已添加
10个文件已删除
| | |
| | | |
| | | Or you can use the finetuned model for inference directly. |
| | | |
| | | - Setting parameters in `infer.py` |
| | | - Setting parameters in `infer.sh` |
| | | - <strong>model:</strong> # model name on ModelScope |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>ngpu:</strong> # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding |
| | | - <strong>njob:</strong> # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob` |
| | | - <strong>batch_size:</strong> # batchsize of inference |
| | | - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding |
| | | - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1" |
| | | - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob` |
| | | |
| | | - Then you can run the pipeline to infer with: |
| | | ```python |
| | | python infer.py |
| | | sh infer.sh |
| | | ``` |
| | | |
| | | - Results |
| | | |
| | | The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. |
| | | |
| | | If you decode the SpeechIO test sets, you can use textnorm with `stage`=3, and `DETAILS.txt`, `RESULTS.txt` record the results and CER after text normalization. |
| | | |
| | | ### Inference using local finetuned model |
| | | |
| | | - Modify inference related parameters in `infer_after_finetune.py` |
| | |
| | | - Decode without CTC |
| | | - Decode without LM |
| | | |
| | | | testset | CER(%)| |
| | | |:---------:|:-----:| |
| | | | dev | 1.75 | |
| | | | test | 1.95 | |
| | | | CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/summary) | |
| | | |:---------:|:-------------:|:-------------:| |
| | | | dev | 1.75 |1.62 | |
| | | | test | 1.95 |1.78 | |
| | | |
| | | ## AISHELL-2 |
| | | - Decode config: |
| | | - Decode without CTC |
| | | - Decode without LM |
| | | |
| | | | testset | CER(%)| |
| | | |:------------:|:-----:| |
| | | | dev_ios | 2.80 | |
| | | | test_android | 3.13 | |
| | | | test_ios | 2.85 | |
| | | | test_mic | 3.06 | |
| | | | CER(%) | Pretrain model|[Finetune model](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/summary)| |
| | | |:------------:|:-------------:|:------------:| |
| | | | dev_ios | 2.80 |2.60 | |
| | | | test_android | 3.13 |2.84 | |
| | | | test_ios | 2.85 |2.82 | |
| | | | test_mic | 3.06 |2.88 | |
| | | |
| | | ## Wenetspeech |
| | | - Decode config: |
| | |
| | | import os |
| | | import shutil |
| | | from multiprocessing import Pool |
| | | |
| | | import argparse |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | |
| | | from funasr.utils.compute_wer import compute_wer |
| | | |
| | | |
| | | def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model): |
| | | output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) |
| | | if ngpu > 0: |
| | | use_gpu = 1 |
| | | gpu_id = int(idx) - 1 |
| | | else: |
| | | use_gpu = 0 |
| | | gpu_id = -1 |
| | | if "CUDA_VISIBLE_DEVICES" in os.environ.keys(): |
| | | gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",") |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id]) |
| | | else: |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) |
| | | inference_pipline = pipeline( |
| | | def modelscope_infer(args): |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid) |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model=model, |
| | | output_dir=output_dir_job, |
| | | batch_size=batch_size, |
| | | ngpu=use_gpu, |
| | | model=args.model, |
| | | output_dir=args.output_dir, |
| | | batch_size=args.batch_size, |
| | | ) |
| | | audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx)) |
| | | inference_pipline(audio_in=audio_in) |
| | | |
| | | |
def modelscope_infer(params):
    """Run multi-process (multi-GPU or multi-CPU) decoding over a wav.scp.

    Splits ``data_dir/wav.scp`` into ``nj`` shards, decodes each shard in a
    worker process via ``modelscope_infer_core``, merges the per-job outputs
    under ``output_dir/1best_recog`` and, when a reference ``text`` file
    exists, computes CER.

    Expected ``params`` keys: model, data_dir, output_dir, ngpu, njob,
    batch_size.
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    # start from a clean output dir: any previous results are removed
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    # one job per GPU when ngpu > 0, otherwise njob CPU jobs
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    # split wav.scp into nj contiguous shards; the last shard also takes
    # the remainder lines
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(file, "w") as f:
            if i == nj - 1:
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end

    # fan out one worker per shard; job indices are 1-based
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    files = ["text", "token", "score"]
    for file in files:
        with open(os.path.join(best_recog_path, file), "w") as f:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
                with open(job_file) as f_job:
                    lines = f_job.readlines()
                f.writelines(lines)

    # If text exists, compute CER
    # NOTE(review): scoring uses the "token" output, presumably because the
    # reference is tokenized the same way — confirm against compute_wer
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
| | | |
| | | inference_pipeline(audio_in=args.audio_in) |
| | | |
| | | if __name__ == "__main__": |
| | | params = {} |
| | | params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | params["data_dir"] = "./data/test" |
| | | params["output_dir"] = "./results" |
| | | params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding |
| | | params["njob"] = 1 # if ngpu = 0, will use cpu decoding |
| | | params["batch_size"] = 64 |
| | | modelscope_infer(params) |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test") |
| | | parser.add_argument('--output_dir', type=str, default="./results/") |
| | | parser.add_argument('--batch_size', type=int, default=64) |
| | | parser.add_argument('--gpuid', type=str, default="0") |
| | | args = parser.parse_args() |
| | | modelscope_infer(args) |
| New file |
| | |
#!/usr/bin/env bash
# Parallel ModelScope ASR decoding driver.
# Splits ${data_dir}/wav.scp into nj shards, decodes each shard with
# infer.py (one shard per GPU, or njob CPU jobs), merges the per-job
# outputs, then scores: stage 2 computes CER, stage 3 optionally applies
# SpeechIO textnorm before scoring.

set -e
set -u
set -o pipefail

# stages: 1 = decode, 2 = compute CER, 3 = SpeechIO textnorm scoring
stage=1
stop_stage=2
model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob


# nj = number of parallel jobs: one per GPU id in gpuid_list, or njob CPU
# jobs (batch_size forced to 1 and every job gets gpuid "-1")
if ${gpu_inference}; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
batch_size=1
gpuid_list=""
for JOB in $(seq ${nj}); do
gpuid_list=$gpuid_list"-1,"
done
fi

# split wav.scp into nj shard files under $output_dir/split
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
echo "Decoding ..."
gpuid_list_array=(${gpuid_list//,/ })
# launch one background infer.py per shard, then wait for all of them
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
mkdir -p ${output_dir}/output.$JOB
python infer.py \
--model ${model} \
--audio_in ${output_dir}/split/wav.$JOB.scp \
--output_dir ${output_dir}/output.$JOB \
--batch_size ${batch_size} \
--gpuid ${gpuid}
}&
done
wait

# merge per-job results, sorted by utterance key
mkdir -p ${output_dir}/1best_recog
for f in token score text; do
if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${nj}"); do
cat "${output_dir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${output_dir}/1best_recog/${f}"
fi
done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
# normalize hyp and ref text, then score; summary is the last 3 lines
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi

if [ $stage -le 3 ] && [ $stop_stage -ge 3 ];then
# optional SpeechIO-style scoring: textnorm both sides, drop empty hyp
# lines, then compute CER with alignment details
echo "SpeechIO TIOBE textnorm"
echo "$0 --> Normalizing REF text ..."
./utils/textnorm_zh.py \
--has_key --to_upper \
${data_dir}/text \
${output_dir}/1best_recog/ref.txt

echo "$0 --> Normalizing HYP text ..."
./utils/textnorm_zh.py \
--has_key --to_upper \
${output_dir}/1best_recog/text.proc \
${output_dir}/1best_recog/rec.txt
grep -v $'\t$' ${output_dir}/1best_recog/rec.txt > ${output_dir}/1best_recog/rec_non_empty.txt

echo "$0 --> computing WER/CER and alignment ..."
./utils/error_rate_zh \
--tokenizer char \
--ref ${output_dir}/1best_recog/ref.txt \
--hyp ${output_dir}/1best_recog/rec_non_empty.txt \
${output_dir}/1best_recog/DETAILS.txt | tee ${output_dir}/1best_recog/RESULTS.txt
rm -rf ${output_dir}/1best_recog/rec.txt ${output_dir}/1best_recog/rec_non_empty.txt
fi
| | | |
| New file |
| | |
| | | ../../../../egs/aishell/transformer/utils |
| | |
| | | |
| | | - Modify finetune training related parameters in `finetune.py` |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include files: train/wav.scp, train/text; validation/wav.scp, validation/text. |
| | | - <strong>batch_bins:</strong> # batch size |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include files: `train/wav.scp`, `train/text`; `validation/wav.scp`, `validation/text` |
| | | - <strong>dataset_type:</strong> # for dataset larger than 1000 hours, set as `large`, otherwise set as `small` |
| | | - <strong>batch_bins:</strong> # batch size. For dataset_type is `small`, `batch_bins` indicates the feature frames. For dataset_type is `large`, `batch_bins` indicates the duration in ms |
| | | - <strong>max_epoch:</strong> # number of training epoch |
| | | - <strong>lr:</strong> # learning rate |
| | | |
| | |
| | | |
| | | Or you can use the finetuned model for inference directly. |
| | | |
| | | - Setting parameters in `infer.py` |
| | | - <strong>data_dir:</strong> # the dataset dir |
| | | - Setting parameters in `infer.sh` |
| | | - <strong>model:</strong> # model name on ModelScope |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>batch_size:</strong> # batchsize of inference |
| | | - <strong>gpu_inference:</strong> # whether to perform gpu decoding, set false for cpu decoding |
| | | - <strong>gpuid_list:</strong> # set gpus, e.g., gpuid_list="0,1" |
| | | - <strong>njob:</strong> # the number of jobs for CPU decoding, if `gpu_inference`=false, use CPU decoding, please set `njob` |
| | | |
| | | - Then you can run the pipeline to infer with: |
| | | ```python |
| | | python infer.py |
| | | sh infer.sh |
| | | ``` |
| | | |
| | | - Results |
| | | |
| | | The decoding results can be found in `$output_dir/1best_recog/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. |
| | | |
| | | ### Inference using local finetuned model |
| | | |
| | | - Modify inference related parameters in `infer_after_finetune.py` |
| | | - <strong>modelscope_model_name: </strong> # model name on ModelScope |
| | | - <strong>output_dir:</strong> # result dir |
| | | - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, CER will be computed |
| | | - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb` |
| | | - <strong>batch_size:</strong> # batchsize of inference |
| | | |
| | | - Then you can run the pipeline to infer with: |
| | | ```python |
| | | python infer_after_finetune.py |
| | | ``` |
| | | |
| | | - Results |
| | | |
| | | The decoding results can be found in `$output_dir/decoding_results/text.cer`, which includes recognition results of each sample and the CER metric of the whole test set. |
| | |
| | | import os |
| | | import shutil |
| | | from multiprocessing import Pool |
| | | |
| | | import argparse |
| | | from modelscope.pipelines import pipeline |
| | | from modelscope.utils.constant import Tasks |
| | | |
| | | from funasr.utils.compute_wer import compute_wer |
| | | |
| | | |
| | | def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model): |
| | | output_dir_job = os.path.join(output_dir, "output.{}".format(idx)) |
| | | if ngpu > 0: |
| | | use_gpu = 1 |
| | | gpu_id = int(idx) - 1 |
| | | else: |
| | | use_gpu = 0 |
| | | gpu_id = -1 |
| | | if "CUDA_VISIBLE_DEVICES" in os.environ.keys(): |
| | | gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",") |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id]) |
| | | else: |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id) |
| | | inference_pipline = pipeline( |
| | | def modelscope_infer(args): |
| | | os.environ['CUDA_VISIBLE_DEVICES'] = str(args.gpuid) |
| | | inference_pipeline = pipeline( |
| | | task=Tasks.auto_speech_recognition, |
| | | model=model, |
| | | output_dir=output_dir_job, |
| | | batch_size=batch_size, |
| | | ngpu=use_gpu, |
| | | model=args.model, |
| | | output_dir=args.output_dir, |
| | | batch_size=args.batch_size, |
| | | ) |
| | | audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx)) |
| | | inference_pipline(audio_in=audio_in) |
| | | |
| | | |
def modelscope_infer(params):
    """Run multi-process (multi-GPU or multi-CPU) decoding over a wav.scp.

    Splits ``data_dir/wav.scp`` into ``nj`` shards, decodes each shard in a
    worker process via ``modelscope_infer_core``, merges the per-job outputs
    under ``output_dir/1best_recog`` and, when a reference ``text`` file
    exists, computes CER.

    Expected ``params`` keys: model, data_dir, output_dir, ngpu, njob,
    batch_size.
    """
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    # start from a clean output dir: any previous results are removed
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    # one job per GPU when ngpu > 0, otherwise njob CPU jobs
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    # split wav.scp into nj contiguous shards; the last shard also takes
    # the remainder lines
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
    num_lines = len(lines)
    num_job_lines = num_lines // nj
    start = 0
    for i in range(nj):
        end = start + num_job_lines
        file = os.path.join(split_dir, "wav.{}.scp".format(str(i + 1)))
        with open(file, "w") as f:
            if i == nj - 1:
                f.writelines(lines[start:])
            else:
                f.writelines(lines[start:end])
        start = end

    # fan out one worker per shard; job indices are 1-based
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

    # combine decoding results
    best_recog_path = os.path.join(output_dir, "1best_recog")
    os.mkdir(best_recog_path)
    files = ["text", "token", "score"]
    for file in files:
        with open(os.path.join(best_recog_path, file), "w") as f:
            for i in range(nj):
                job_file = os.path.join(output_dir, "output.{}/1best_recog".format(str(i + 1)), file)
                with open(job_file) as f_job:
                    lines = f_job.readlines()
                f.writelines(lines)

    # If text exists, compute CER
    # NOTE(review): scoring uses the "token" output, presumably because the
    # reference is tokenized the same way — confirm against compute_wer
    text_in = os.path.join(params["data_dir"], "text")
    if os.path.exists(text_in):
        text_proc_file = os.path.join(best_recog_path, "token")
        compute_wer(text_in, text_proc_file, os.path.join(best_recog_path, "text.cer"))
| | | |
| | | inference_pipeline(audio_in=args.audio_in) |
| | | |
| | | if __name__ == "__main__": |
| | | params = {} |
| | | params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1" |
| | | params["data_dir"] = "./data/test" |
| | | params["output_dir"] = "./results" |
| | | params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding |
| | | params["njob"] = 1 # if ngpu = 0, will use cpu decoding |
| | | params["batch_size"] = 64 |
| | | modelscope_infer(params) |
| | | parser = argparse.ArgumentParser() |
| | | parser.add_argument('--model', type=str, default="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1") |
| | | parser.add_argument('--audio_in', type=str, default="./data/test") |
| | | parser.add_argument('--output_dir', type=str, default="./results/") |
| | | parser.add_argument('--batch_size', type=int, default=64) |
| | | parser.add_argument('--gpuid', type=str, default="0") |
| | | args = parser.parse_args() |
| | | modelscope_infer(args) |
| New file |
| | |
#!/usr/bin/env bash
# Parallel ModelScope ASR decoding driver (8k Paraformer model).
# Splits ${data_dir}/wav.scp into nj shards, decodes each shard with
# infer.py (one shard per GPU, or njob CPU jobs), merges the per-job
# outputs, then computes CER in stage 2.

set -e
set -u
set -o pipefail

# stages: 1 = decode, 2 = compute CER
stage=1
stop_stage=2
model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
data_dir="./data/test"
output_dir="./results"
batch_size=64
gpu_inference=true # whether to perform gpu decoding
gpuid_list="0,1" # set gpus, e.g., gpuid_list="0,1"
njob=4 # the number of jobs for CPU decoding, if gpu_inference=false, use CPU decoding, please set njob


# nj = number of parallel jobs: one per GPU id in gpuid_list, or njob CPU
# jobs (batch_size forced to 1 and every job gets gpuid "-1")
if ${gpu_inference}; then
nj=$(echo $gpuid_list | awk -F "," '{print NF}')
else
nj=$njob
batch_size=1
gpuid_list=""
for JOB in $(seq ${nj}); do
gpuid_list=$gpuid_list"-1,"
done
fi

# split wav.scp into nj shard files under $output_dir/split
mkdir -p $output_dir/split
split_scps=""
for JOB in $(seq ${nj}); do
split_scps="$split_scps $output_dir/split/wav.$JOB.scp"
done
perl utils/split_scp.pl ${data_dir}/wav.scp ${split_scps}

if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then
echo "Decoding ..."
gpuid_list_array=(${gpuid_list//,/ })
# launch one background infer.py per shard, then wait for all of them
for JOB in $(seq ${nj}); do
{
id=$((JOB-1))
gpuid=${gpuid_list_array[$id]}
mkdir -p ${output_dir}/output.$JOB
python infer.py \
--model ${model} \
--audio_in ${output_dir}/split/wav.$JOB.scp \
--output_dir ${output_dir}/output.$JOB \
--batch_size ${batch_size} \
--gpuid ${gpuid}
}&
done
wait

# merge per-job results, sorted by utterance key
mkdir -p ${output_dir}/1best_recog
for f in token score text; do
if [ -f "${output_dir}/output.1/1best_recog/${f}" ]; then
for i in $(seq "${nj}"); do
cat "${output_dir}/output.${i}/1best_recog/${f}"
done | sort -k1 >"${output_dir}/1best_recog/${f}"
fi
done
fi

if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
echo "Computing WER ..."
# normalize hyp and ref text, then score; summary is the last 3 lines
python utils/proce_text.py ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
python utils/proce_text.py ${data_dir}/text ${output_dir}/1best_recog/text.ref
python utils/compute_wer.py ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
tail -n 3 ${output_dir}/1best_recog/text.cer
fi
| New file |
| | |
| | | ../../../../egs/aishell/transformer/utils |
| | |
| | | ): |
| | | results = [] |
| | | split_size = 10 |
| | | |
| | | cache_in = param_dict["cache"] |
| | | if raw_inputs != None: |
| | | line = raw_inputs.strip() |
| | | key = "demo" |
| | |
| | | item = {'key': key, 'value': ""} |
| | | results.append(item) |
| | | return results |
| | | result, _, cache = text2punc(line, cache) |
| | | item = {'key': key, 'value': result, 'cache': cache} |
| | | result, _, cache = text2punc(line, cache_in) |
| | | param_dict["cache"] = cache |
| | | item = {'key': key, 'value': result} |
| | | results.append(item) |
| | | return results |
| | | |
| | | for inference_text, _, _ in data_path_and_name_and_type: |
| | | with open(inference_text, "r", encoding="utf-8") as fin: |
| | | for line in fin: |
| | | line = line.strip() |
| | | segs = line.split("\t") |
| | | if len(segs) != 2: |
| | | continue |
| | | key = segs[0] |
| | | if len(segs[1]) == 0: |
| | | continue |
| | | result, _ = text2punc(segs[1]) |
| | | item = {'key': key, 'value': result} |
| | | results.append(item) |
| | | output_path = output_dir_v2 if output_dir_v2 is not None else output_dir |
| | | if output_path != None: |
| | | output_file_name = "infer.out" |
| | | Path(output_path).mkdir(parents=True, exist_ok=True) |
| | | output_file_path = (Path(output_path) / output_file_name).absolute() |
| | | with open(output_file_path, "w", encoding="utf-8") as fout: |
| | | for item_i in results: |
| | | key_out = item_i["key"] |
| | | value_out = item_i["value"] |
| | | fout.write(f"{key_out}\t{value_out}\n") |
| | | return results |
| | | |
| | | return _forward |
| | |
| | | from funasr.models.frontend.wav_frontend import WavFrontend |
| | | from funasr.bin.vad_inference import Speech2VadSegment |
| | | |
| | | header_colors = '\033[95m' |
| | | end_colors = '\033[0m' |
| | | |
| | | global_asr_language: str = 'zh-cn' |
| | | global_sample_rate: Union[int, Dict[Any, int]] = { |
| | | 'audio_fs': 16000, |
| | | 'model_fs': 16000 |
| | | } |
| | | |
| | | |
| | | class Speech2VadSegmentOnline(Speech2VadSegment): |
| | |
| | | # torch_version = float(".".join(torch.__version__.split(".")[:2])) |
| | | # assert torch_version > 1.9 |
| | | |
| | | class ASRModelExportParaformer: |
| | | class ModelExport: |
| | | def __init__( |
| | | self, |
| | | cache_dir: Union[Path, str] = None, |
| | |
| | | parser.add_argument('--calib_num', type=int, default=200, help='calib max num') |
| | | args = parser.parse_args() |
| | | |
| | | export_model = ASRModelExportParaformer( |
| | | export_model = ModelExport( |
| | | cache_dir=args.export_dir, |
| | | onnx=args.type == 'onnx', |
| | | quant=args.quantize, |
| | |
| | | 导出onnx模型,[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export),参考示例,从modelscope中模型导出: |
| | | |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | ## Building Guidance for Linux/Unix |
| | |
| | | |
| | | ## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) |
| | | |
| | | Number of Parameter: 220M |
| | | |
| | | Storage size: 880MB |
| | | |
| | | Storage size after int8-quant: 237MB |
| | | |
| | | CER: 1.95% |
| | | |
| | | CER after int8-quant: 1.95% |
| | | |
| | | ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni |
| | | |
| | | | concurrent-tasks | processing time(s) | RTF | Speedup Rate | |
| | |
| | | |
| | | ## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary) |
| | | |
| | | Number of Parameter: 68M |
| | | |
| | | Storage size: 275MB |
| | | |
| | | Storage size after int8-quant: 81MB |
| | | |
| | | CER: 3.73% |
| | | |
| | | CER after int8-quant: 3.78% |
| | | |
| | | ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni |
| | | |
| | | | concurrent-tasks | processing time(s) | RTF | Speedup Rate | |
| | |
| | | |
| | | Step 1-2) Optional, Prepare server onnxruntime environment (on server). |
| | | |
| | | Install [`rapid_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | Install [`onnx_paraformer`](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | |
| | | - Build the rapid_paraformer `whl` |
| | | - Build the onnx_paraformer `whl` |
| | | ``` |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/onnxruntime/rapid_paraformer |
| | | python setup.py bdist_wheel |
| | | python setup.py build |
| | | python setup.py install |
| | | ``` |
| | | |
| | | - Install the build `whl` |
| | | ``` |
| | | pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl |
| | | ``` |
| | | [//]: # () |
| | | [//]: # (- Install the build `whl`) |
| | | |
| | | [//]: # (```) |
| | | |
| | | [//]: # (pip install dist/rapid_paraformer-0.0.1-py3-none-any.whl) |
| | | |
| | | [//]: # (```) |
| | | |
| | | Export the model, more details ref to [export docs](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime). |
| | | ``` |
| | | python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true |
| | | ```shell |
| | | python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize True |
| | | ``` |
| | | |
| | | Step 2) Optional, generate protobuf file (run on server, the two generated pb files are both used for server and client). |
| New file |
| | |
| | | import os |
| | | import numpy as np |
| | | import sys |
| | | |
def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    """Score a hypothesis transcript file against a reference and write a report.

    Both inputs are Kaldi-style text files: one utterance per line,
    ``<utt-key> <token> <token> ...``. For every key present in both files,
    the per-utterance alignment counts (via ``compute_wer_by_line``) plus the
    ref/hyp token strings are written to ``cer_detail_file``, followed by
    overall %WER and %SER summary lines.

    :param ref_file: path of the reference text file
    :param hyp_file: path of the hypothesis text file
    :param cer_detail_file: output path for the detail/summary report
    """
    # corpus-level accumulators
    rst = {
        'Wrd': 0,              # total reference tokens scored
        'Corr': 0,             # correct tokens
        'Ins': 0,              # insertions
        'Del': 0,              # deletions
        'Sub': 0,              # substitutions
        'Snt': 0,              # sentences scored (key present in both files)
        'Err': 0.0,            # overall token error rate (%)
        'S.Err': 0.0,          # sentence error rate (%)
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    hyp_dict = {}
    ref_dict = {}
    # fix: use context managers so the readers are always closed
    with open(hyp_file, 'r') as hyp_reader:
        for line in hyp_reader:
            fields = line.strip().split()
            if not fields:
                # robustness: skip blank lines instead of raising IndexError
                continue
            hyp_dict[fields[0]] = fields[1:]
    with open(ref_file, 'r') as ref_reader:
        for line in ref_reader:
            fields = line.strip().split()
            if not fields:
                continue
            ref_dict[fields[0]] = fields[1:]

    # fix: the detail writer used to be opened without ever being closed,
    # leaking the handle and risking an unflushed report on error
    with open(cer_detail_file, 'w') as cer_detail_writer:
        for hyp_key in hyp_dict:
            if hyp_key in ref_dict:
                out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
                rst['Wrd'] += out_item['nwords']
                rst['Corr'] += out_item['cor']
                rst['wrong_words'] += out_item['wrong']
                rst['Ins'] += out_item['ins']
                rst['Del'] += out_item['del']
                rst['Sub'] += out_item['sub']
                rst['Snt'] += 1
                if out_item['wrong'] > 0:
                    rst['wrong_sentences'] += 1
                cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
                cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
                cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

        if rst['Wrd'] > 0:
            rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
        if rst['Snt'] > 0:
            rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

        cer_detail_writer.write('\n')
        cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words']) + " / " + str(rst['Wrd']) +
                                ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
        cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
        # NOTE(review): this counts hyp keys that were NOT found in the
        # reference; the wording "not present in hyp" looks inverted — confirm
        cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')
| | | |
| | | |
def compute_wer_by_line(hyp,
                        ref):
    """Align one hypothesis token list against one reference token list.

    Uses Levenshtein dynamic programming with a backtrace to classify each
    edit, comparing tokens case-insensitively.

    :param hyp: list of hypothesis tokens
    :param ref: list of reference tokens
    :return: dict with keys ``nwords`` (len of ref), ``cor``, ``ins``,
        ``del``, ``sub`` and ``wrong`` (total edit distance)
    """
    hyp = [token.lower() for token in hyp]
    ref = [token.lower() for token in ref]

    n_hyp = len(hyp)
    n_ref = len(ref)

    # dp[i][j]: edit distance between hyp[:i] and ref[:j]
    dp = np.zeros((n_hyp + 1, n_ref + 1), dtype=np.int16)
    # op codes per cell: 0 match, 1 substitution, 2 insertion, 3 deletion
    ops = np.zeros((n_hyp + 1, n_ref + 1), dtype=np.int8)

    # base rows: transforming to/from the empty sequence
    dp[:, 0] = np.arange(n_hyp + 1)
    dp[0, :] = np.arange(n_ref + 1)

    for i in range(1, n_hyp + 1):
        for j in range(1, n_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                dp[i][j] = dp[i - 1][j - 1]
            else:
                candidates = [dp[i - 1][j - 1] + 1,   # substitution
                              dp[i - 1][j] + 1,       # insertion
                              dp[i][j - 1] + 1]       # deletion
                best = min(candidates)
                dp[i][j] = best
                ops[i][j] = candidates.index(best) + 1

    stats = {
        'nwords': n_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }
    aligned_pairs = []          # (ref_idx, hyp_idx) of matched tokens
    i, j = n_hyp, n_ref
    # walk the op matrix back from the bottom-right corner
    while i >= 0 or j >= 0:
        ci = max(0, i)
        cj = max(0, j)
        op = ops[ci][cj]

        if op == 0:             # match
            if i - 1 >= 0 and j - 1 >= 0:
                aligned_pairs.append((j - 1, i - 1))
                stats['cor'] += 1
            i -= 1
            j -= 1
        elif op == 2:           # insertion (extra hyp token)
            i -= 1
            stats['ins'] += 1
        elif op == 3:           # deletion (missing ref token)
            j -= 1
            stats['del'] += 1
        elif op == 1:           # substitution
            i -= 1
            j -= 1
            stats['sub'] += 1

        # one side exhausted: remaining tokens on the other side are edits
        if i < 0 and j >= 0:
            stats['del'] += 1
        elif j < 0 and i >= 0:
            stats['ins'] += 1

    aligned_pairs.reverse()
    stats['wrong'] = dp[n_hyp][n_ref]

    return stats
| | | |
def print_cer_detail(rst):
    """Format one utterance's alignment counts as a one-line summary string."""
    counts = "(nwords={},cor={},ins={},del={},sub={})".format(
        rst['nwords'], rst['cor'], rst['ins'], rst['del'], rst['sub'])
    corr_rate = '{:.2%}'.format(rst['cor'] / rst['nwords'])
    cer = '{:.2%}'.format(rst['wrong'] / rst['nwords'])
    return counts + " corr:" + corr_rate + ",cer:" + cer
| | | |
if __name__ == '__main__':
    # Exactly three positional arguments are required: reference file,
    # hypothesis file, and the output path for the per-utterance CER report.
    if len(sys.argv) != 4:
        print("usage : python compute-wer.py test.ref test.hyp test.wer")
        sys.exit(0)

    _, ref_file, hyp_file, cer_detail_file = sys.argv
    compute_wer(ref_file, hyp_file, cer_detail_file)
| New file |
| | |
import os
import time
import sys
import librosa
from funasr.utils.types import str2bool

import argparse

# Command-line interface: exported-model location, runtime backend and options.
parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='amp fallback number')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
parser.add_argument('--output_dir', type=str, default=None, help='amp fallback number')
args = parser.parse_args()


# BUGFIX: import only the runner for the requested backend.  The original
# unconditionally imported the libtorch Paraformer first, which fails when
# torch is not installed even though the onnx backend was selected.
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
else:
    from funasr.runtime.python.libtorch.torch_paraformer import Paraformer

model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

# Read the kaldi-style scp list ("<name> <path>" per line); close the handle
# promptly via a context manager (the original leaked it).
with open(args.wav_file, 'r') as wav_file_f:
    wav_files = wav_file_f.readlines()

output_dir = args.output_dir
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Use the platform's native line endings in the result files.
if os.name == 'nt':  # Windows
    newline = '\r\n'
else:  # Linux Mac
    newline = '\n'

# Context managers guarantee the output files are closed even on error.
with open(os.path.join(output_dir, "text"), "w", newline=newline) as text_f, \
        open(os.path.join(output_dir, "token"), "w", newline=newline) as token_f:
    for wav_path_i in wav_files:
        wav_name, wav_path = wav_path_i.strip().split()
        result = model(wav_path)
        # result[0]['preds'] holds (text, tokens) for the decoded utterance
        # -- presumably; confirm against the runtime's return format.
        text_f.write("{} {}\n".format(wav_name, result[0]['preds'][0]))
        text_f.flush()
        token_f.write("{} {}\n".format(wav_name, result[0]['preds'][1]))
        token_f.flush()
| | | |
| New file |
| | |
| | | |
# Tools used by the decoding pipeline.
split_scps_tool=split_scp.pl
inference_tool=infer.py
proce_text_tool=proce_text.py
compute_wer_tool=compute_wer.py

nj=32          # number of parallel CPU decoding jobs (one core each)
stage=0        # first stage to run
stop_stage=2   # last stage to run

# Test-set inputs and the directory holding the exported model.
scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
label_text="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/text"
export_root="/nfs/zhifu.gzf/export"


# Model / backend configuration.
# BUGFIX: removed the leftover "#:<<!" ... "!" block-comment scaffolding.
# Since the here-doc opener was itself commented out, the closing "!" line
# executed as a stray negated empty pipeline on every run.
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
fallback_op_num_torch=20
tag=${model_name}/${backend}_quantize_${quantize}_${fallback_op_num_torch}

output_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${output_dir}
echo ${output_dir}


# Stage 0: export the ModelScope model to the requested backend format.
if [ $stage -le 0 ] && [ $stop_stage -ge 0 ];then

    python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp} --fallback-num ${fallback_op_num_torch}

fi


# Stage 1: split the scp into nj shards, decode them in parallel with each
# job pinned to its own CPU core, then merge and sort the per-job outputs.
if [ $stage -le 1 ] && [ $stop_stage -ge 1 ];then

    model_dir=${export_root}/${model_name}
    split_scps=""
    for JOB in $(seq ${nj}); do
        split_scps="$split_scps $output_dir/wav.$JOB.scp"
    done

    perl ${split_scps_tool} $scp ${split_scps}


    for JOB in $(seq ${nj}); do
    {
        core_id=`expr $JOB - 1`
        taskset -c ${core_id} python ${inference_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${output_dir}/wav.$JOB.scp --quantize ${quantize} --output_dir ${output_dir}/${JOB} &> ${output_dir}/log.$JOB.txt
    }&

    done
    wait

    # Concatenate per-job results into one sorted file per output kind.
    mkdir -p ${output_dir}/1best_recog
    for f in token text; do
        if [ -f "${output_dir}/1/${f}" ]; then
            for JOB in $(seq "${nj}"); do
                cat "${output_dir}/${JOB}/${f}"
            done | sort -k1 >"${output_dir}/1best_recog/${f}"
        fi
    done

fi

# Stage 2: normalize hypothesis and reference texts, then compute CER.
if [ $stage -le 2 ] && [ $stop_stage -ge 2 ];then
    echo "Computing WER ..."
    python ${proce_text_tool} ${output_dir}/1best_recog/text ${output_dir}/1best_recog/text.proc
    python ${proce_text_tool} ${label_text} ${output_dir}/1best_recog/text.ref
    python ${compute_wer_tool} ${output_dir}/1best_recog/text.ref ${output_dir}/1best_recog/text.proc ${output_dir}/1best_recog/text.cer
    tail -n 3 ${output_dir}/1best_recog/text.cer
fi
| | | |
| New file |
| | |
| | | |
import sys
import re


def normalize_text(text):
    """Normalize one transcript for character-level CER scoring.

    Strips sentence markers (``</s>``, ``<s>``), BPE continuation marks
    (``@@``, ``@``), ``<unk>`` tokens and all spaces, then lowercases.
    The removal order matches the original script.
    """
    # All patterns are fixed strings, so str.replace is clearer (and
    # regex-metacharacter-safe) compared to the original re.sub calls.
    for token in ("</s>", "<s>", "@@", "@", "<unk>", " "):
        text = text.replace(token, "")
    return text.lower()


def process_file(in_path, out_path):
    """Rewrite ``in_path`` ("<utt-id> <text>" per line) to ``out_path``
    with the text normalized and split into space-separated characters."""
    with open(in_path, "r", encoding="utf-8") as f:
        lines = f.readlines()

    with open(out_path, "w", encoding="utf-8") as f:
        for line in lines:
            outs = line.strip().split(" ", 1)
            if len(outs) == 2:
                idx, text = outs
                text = normalize_text(text)
            else:
                # Line has an id but no transcript: keep a single-space payload.
                idx = outs[0]
                text = " "

            # One space between every character, for character-level scoring.
            f.write("{} {}\n".format(idx, " ".join(text)))


if __name__ == "__main__":
    # BUGFIX: the original ran everything at import time; guard the entry
    # point so the module is importable (CLI behavior is unchanged).
    process_file(sys.argv[1], sys.argv[2])
| New file |
| | |
import pyaudio
# import websocket  # note: unlike the server side, this would be the websocket-client library
import time
import websockets
import asyncio
from queue import Queue
# import threading
import argparse

# Command-line options for the streaming microphone client.
parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="localhost",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="grpc server port")  # NOTE(review): this is the websocket server port; help text looks copied from a grpc client -- confirm
parser.add_argument("--chunk_size",
                    type=int,
                    default=300,
                    help="ms")  # audio chunk length sent per message, in milliseconds

args = parser.parse_args()

# Queue of raw microphone chunks waiting to be sent to the server.
voices = Queue()
async def ws_client():
    """Open a persistent websocket connection to the ASR server.

    Stores the connection object in the module-global ``ws`` so that
    ``send()`` can reuse it.
    """
    global ws  # connection object shared with send()
    server_uri = "ws://{}:{}".format(args.host, args.port)
    ws = await websockets.connect(server_uri, subprotocols=["binary"])
    # Allow large inbound messages (up to 20 MB) from the server.
    ws.max_size = 1024 * 1024 * 20
    print("connected ws server")
| | | |
async def send(data):
    """Send one payload over the global websocket, swallowing send errors
    so a single failure does not stop the capture loop."""
    global ws  # connection established by ws_client()
    try:
        await ws.send(data)
    except Exception as exc:
        print('Exception occurred:', exc)
| | | |
| | | |
| | | |
asyncio.get_event_loop().run_until_complete(ws_client())  # establish the websocket connection before streaming starts
| | | |
| | | |
# Other functions can send data by calling send(data), for example:
async def test():
    """Capture microphone audio forever and enqueue fixed-size chunks.

    Chunk length is ``args.chunk_size`` milliseconds of 16 kHz mono
    16-bit audio; each chunk is pushed onto the global ``voices`` queue.
    """
    global voices
    sample_format = pyaudio.paInt16
    channels = 1
    rate = 16000
    # Frames per chunk, derived from the chunk length in milliseconds.
    frames_per_chunk = int(rate / 1000 * args.chunk_size)

    audio = pyaudio.PyAudio()
    stream = audio.open(format=sample_format,
                        channels=channels,
                        rate=rate,
                        input=True,
                        frames_per_buffer=frames_per_chunk)

    while True:
        chunk = stream.read(frames_per_chunk)
        voices.put(chunk)
        # Yield to the event loop so the sender coroutine gets to run.
        await asyncio.sleep(0.01)
| | | |
| | | |
| | | |
| | | |
| | | |
async def ws_send():
    """Drain the microphone queue and push each chunk to the server."""
    global voices
    print("started to sending data!")
    while True:
        # Flush everything currently queued before sleeping again.
        while not voices.empty():
            chunk = voices.get()
            voices.task_done()
            await send(chunk)
            await asyncio.sleep(0.01)
        await asyncio.sleep(0.01)
| | | |
async def main():
    """Run microphone capture and websocket sending concurrently."""
    capture_task = asyncio.create_task(test())
    sender_task = asyncio.create_task(ws_send())

    await asyncio.gather(capture_task, sender_task)

asyncio.run(main())
| New file |
| | |
import asyncio
import websockets
import time
from queue import Queue
import threading
import argparse

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging

# Silence modelscope's logger so only recognition output reaches stdout.
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)


# Command-line options for the websocket ASR server.
parser = argparse.ArgumentParser()
parser.add_argument("--host",
                    type=str,
                    default="0.0.0.0",
                    required=False,
                    help="host ip, localhost, 0.0.0.0")
parser.add_argument("--port",
                    type=int,
                    default=10095,
                    required=False,
                    help="grpc server port")  # NOTE(review): actually the websocket port; help text looks copied from a grpc server -- confirm
parser.add_argument("--asr_model",
                    type=str,
                    default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                    help="model from modelscope")
parser.add_argument("--vad_model",
                    type=str,
                    default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                    help="model from modelscope")

parser.add_argument("--punc_model",
                    type=str,
                    default="damo/punc_ct-transformer_zh-cn-common-vad_realtime-vocab272727",
                    help="model from modelscope")
parser.add_argument("--ngpu",
                    type=int,
                    default=1,
                    help="0 for cpu, 1 for gpu")

args = parser.parse_args()

print("model loading")
# voices: raw audio chunks received from clients.
# speek: complete VAD-segmented utterances waiting for recognition.
voices = Queue()
speek = Queue()

# vad: streaming voice-activity-detection pipeline (stateful via param_dict_vad).
inference_pipeline_vad = pipeline(
    task=Tasks.voice_activity_detection,
    model=args.vad_model,
    model_revision=None,
    output_dir=None,
    batch_size=1,
    mode='online',
    ngpu=args.ngpu,
)
param_dict_vad = {'in_cache': dict(), "is_final": False}

# asr: offline recognition pipeline applied to each complete utterance.
param_dict_asr = {}
# param_dict["hotword"] = "小五 小五月"  # hotwords, separated by spaces
inference_pipeline_asr = pipeline(
    task=Tasks.auto_speech_recognition,
    model=args.asr_model,
    param_dict=param_dict_asr,
    ngpu=args.ngpu,
)

# punc: punctuation-restoration pipeline; 'cache' carries context across calls.
param_dict_punc = {'cache': list()}
inference_pipeline_punc = pipeline(
    task=Tasks.punctuation,
    model=args.punc_model,
    model_revision=None,
    ngpu=args.ngpu,
)

print("model loaded")
| | | |
| | | |
| | | |
async def ws_serve(websocket, path):
    """Accept one client connection and queue every received audio message
    on the global ``voices`` queue for the VAD thread."""
    global voices
    try:
        async for packet in websocket:
            voices.put(packet)
    except websockets.exceptions.ConnectionClosedError as err:
        print('Connection closed with exception:', err)
    except Exception as err:
        print('Exception occurred:', err)

# ping_interval=None: clients may stream for long periods without pings.
start_server = websockets.serve(ws_serve, args.host, args.port, subprotocols=["binary"], ping_interval=None)
| | | |
| | | |
def vad(data):
    """Run streaming voice-activity detection on one audio chunk.

    Args:
        data: one raw audio chunk as received from the client.

    Returns:
        (speech_start, speech_end): booleans telling whether a speech
        segment starts / ends inside this chunk.  They come from the
        [start, end] pair in the pipeline's "text" output, where -1 means
        "no boundary in this chunk" (presumably milliseconds -- confirm
        against the VAD pipeline docs).
    """
    # BUGFIX: dropped ``global vad_pipline, param_dict_vad`` -- "vad_pipline"
    # is a typo'd name that is never defined anywhere, and the function only
    # reads module-level objects, so no global statement is needed.
    segments_result = inference_pipeline_vad(audio_in=data, param_dict=param_dict_vad)
    speech_start = False
    speech_end = False

    # No segment info, or more than one boundary pair per chunk: report nothing.
    if len(segments_result) == 0 or len(segments_result["text"]) > 1:
        return speech_start, speech_end
    if segments_result["text"][0][0] != -1:
        speech_start = True
    if segments_result["text"][0][1] != -1:
        speech_end = True
    return speech_start, speech_end
| | | |
def asr():
    """Recognition worker loop: runs forever in a background thread.

    Pops complete utterances off the global ``speek`` queue, runs offline
    ASR on each, restores punctuation, and prints the result.
    """
    # BUGFIX: dropped ``global inference_pipeline2`` -- a leftover name that
    # is never defined; the pipelines used below are plain module globals
    # that are only read, so no global statement is required.
    global speek
    while True:
        while not speek.empty():
            audio_in = speek.get()
            speek.task_done()
            rec_result = inference_pipeline_asr(audio_in=audio_in)
            rec_result_punc = inference_pipeline_punc(text_in=rec_result['text'], param_dict=param_dict_punc)
            print(rec_result_punc)
            time.sleep(0.1)
        time.sleep(0.1)
| | | |
| | | |
def main():  # segmentation loop (original comment said "inference")
    """Pull raw audio chunks off ``voices``, run streaming VAD on each one,
    and enqueue complete speech segments on ``speek`` for the ASR thread.
    Runs forever in a background thread."""
    frames = []  # frames of the speech segment currently being collected
    buffer = []  # cache of the most recent chunks (at most two)
    # silence_count = 0  # number of consecutive silent chunks
    # speech_detected = False  # whether speech has been detected
    RECORD_NUM = 0  # chunks collected since the current segment started
    global voices
    global speek
    speech_start, speech_end = False, False
    while True:
        while not voices.empty():

            data = voices.get()
            # print("queued chunks", voices.qsize())
            voices.task_done()
            buffer.append(data)
            if len(buffer) > 2:
                buffer.pop(0)  # if the cache exceeds two chunks, drop the oldest

            if speech_start:
                frames.append(data)
                RECORD_NUM += 1
            speech_start_i, speech_end_i = vad(data)
            # print(speech_start_i, speech_end_i)
            if speech_start_i:
                speech_start = speech_start_i
                # if not speech_detected:
                # print("speech detected...")
                # speech_detected = True  # mark speech as detected
                frames = []
                frames.extend(buffer)  # seed with the two cached chunks so the segment onset is not clipped
                # silence_count = 0  # reset the silence counter
            if speech_end_i or RECORD_NUM > 300:
                # silence_count += 1  # increase the silence counter
                # speech_end = speech_end_i
                speech_start = False
                # if RECORD_NUM > 300:  # the 300-chunk cap can be tuned as needed
                # print("speech ended or maximum duration exceeded...")
                audio_in = b"".join(frames)
                # asrt = threading.Thread(target=asr,args=(audio_in,))
                # asrt.start()
                speek.put(audio_in)
                # rec_result = inference_pipeline2(audio_in=audio_in)  # run it through the ASR model
                frames = []  # clear all collected frames
                buffer = []  # clear the chunk cache (at most two)
                # silence_count = 0  # reset the consecutive-silence count
                # speech_detected = False  # reset the speech-detected flag
                RECORD_NUM = 0
            time.sleep(0.01)
        time.sleep(0.01)
| | | |
| | | |
| | | |
# Launch the segmentation and recognition workers in background threads.
s = threading.Thread(target=main)
s.start()
# NOTE(review): ``s`` is rebound to the second thread, so the first Thread
# object is no longer reachable by name; both threads keep running anyway.
s = threading.Thread(target=asr)
s.start()

# Serve websocket connections on the main thread forever.
asyncio.get_event_loop().run_until_complete(start_server)
asyncio.get_event_loop().run_forever()
| New file |
| | |
| | | # Using funasr with websocket |
We can send streaming audio data to the server in real time with the websocket client (e.g., every 300 ms) and receive the transcribed text once the speaker stops.
The audio data is streamed to the server, while the ASR inference itself runs in offline mode.
| | | |
| | | # Steps |
| | | |
| | | ## For the Server |
| | | |
| | | Install the modelscope and funasr |
| | | |
| | | ```shell |
| | | pip install "modelscope[audio_asr]" -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | pip install --editable ./ |
| | | ``` |
| | | |
| | | Install the requirements for server |
| | | |
| | | ```shell |
| | | cd funasr/runtime/python/websocket |
| | | pip install -r requirements_server.txt |
| | | ``` |
| | | |
| | | Start server |
| | | |
| | | ```shell |
| | | python ASR_server.py --host "0.0.0.0" --port 10095 --asr_model "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch" |
| | | ``` |
| | | |
| | | ## For the client |
| | | |
| | | Install the requirements for client |
| | | ```shell |
| | | git clone https://github.com/alibaba/FunASR.git && cd FunASR |
| | | cd funasr/runtime/python/websocket |
| | | pip install -r requirements_client.txt |
| | | ``` |
| | | |
| | | Start client |
| | | |
| | | ```shell |
| | | python ASR_client.py --host "127.0.0.1" --port 10095 --chunk_size 300 |
| | | ``` |
| | | |
| | | ## Acknowledge |
| | | 1. We acknowledge [cgisky1980](https://github.com/cgisky1980/FunASR) for contributing the websocket service. |
| New file |
| | |
| | | websockets |
| | | pyaudio |
| | |
| | | # logging.basicConfig() is invoked in main_worker() instead of main() |
| | | # because it can be invoked only once in a process. |
| | | # FIXME(kamo): Should we use logging.getLogger()? |
| | | # BUGFIX: Remove previous handlers and reset log level |
| | | for handler in logging.root.handlers[:]: |
| | | logging.root.removeHandler(handler) |
| | | logging.basicConfig( |
| | | level=args.log_level, |
| | | format=f"[{os.uname()[1].split('.')[0]}]" |
| | | f" %(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s", |
| | | ) |
| | | else: |
| | | # BUGFIX: Remove previous handlers and reset log level |
| | | for handler in logging.root.handlers[:]: |
| | | logging.root.removeHandler(handler) |
| | | # Suppress logging if RANK != 0 |
| | | logging.basicConfig( |
| | | level="ERROR", |