python/FunASR-XL.git

parent: 8a100b73 | 补丁 | 提交 | ignore whitespace

aky15

2023-03-21 4dc3a1b011e1e72eb737417b8e0e0bec7a7e3a6e

resolve conflict

106个文件已修改

22个文件已添加

	README.md	30 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/conformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/data2vec_paraformer_finetune/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/data2vec_transformer_finetune/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/paraformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/paraformerbert/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell/transformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell2/conformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell2/paraformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell2/paraformerbert/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell2/transformer/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/aishell2/transformerLM/run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/alimeeting/diarization/sond/infer_alimeeting_test.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/alimeeting/diarization/sond/run.sh	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/alimeeting/diarization/sond/unit_test.py	8 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/callhome/diarization/sond/unit_test.py	8 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs/mars/sd/local_run.sh	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py	29 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py	29 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py	29 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py	57 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py	31 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py	29 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py	35 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py	13 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py	35 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py	13 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py	35 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py	13 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py	34 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py	7 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_launch.py	3 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_mfcca.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_paraformer.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_paraformer_streaming.py	907 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_paraformer_vad_punc.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_uniasr.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/asr_inference_uniasr_vad.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/diar_inference_launch.py	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/eend_ola_inference.py	26 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/sond_inference.py	32 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/bin/sv_inference.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/iterable_dataset.py	21 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/large_datasets/utils/tokenize.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/datasets/preprocessor.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/export/README.md	33 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/export/export_model.py	146 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/export/models/modules/encoder_layer.py	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/export/models/modules/multihead_att.py	30 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/main_funcs/average_nbest_models.py	18 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/main_funcs/pack_funcs.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/decoder/sanm_decoder.py	101 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e2e_asr_paraformer.py	74 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e2e_diar_eend_ola.py	35 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/e2e_diar_sond.py	26 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/encoder/sanm_encoder.py	42 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/frontend/wav_frontend.py	77 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/models/predictor/cif.py	57 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/modules/attention.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/modules/eend_ola/encoder.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/modules/eend_ola/encoder_decoder_attractor.py	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/modules/embedding.py	11 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/CMakeLists.txt	83 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/Readme.md	57 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/common.cmake	125 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/paraformer_server.cc	195 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/paraformer_server.h	56 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/grpc/rebuild.sh	12 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/readme.md	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/Audio.cpp	5 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/src/librapidasrapi.cpp	17 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/onnxruntime/tester/tester.cpp	57 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/benchmark_libtorch.md	45 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/benchmark_onnx.md	89 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/grpc/grpc_main_server.py	14 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/grpc/grpc_server.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/libtorch/README.md	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/libtorch/setup.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py	28 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py	157 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py	15 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/README.md	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py	29 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py	7 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/onnxruntime/setup.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/utils/requirements.txt	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/utils/split_scp.pl	246 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/utils/test_rtf.py	55 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/runtime/python/utils/test_rtf.sh	71 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tasks/abs_task.py	8 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tasks/asr.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tasks/diar.py	87 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/tasks/sv.py	2 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/torch_utils/load_pretrained_model.py	10 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/train/trainer.py	36 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/asr_utils.py	52 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/postprocess_utils.py	7 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	funasr/utils/timestamp_tools.py	186 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	setup.py	6 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	tests/test_asr_inference_pipeline.py	4 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史
	tests/test_sv_inference_pipeline.py	1 ●●●●● 补丁 \| 查看 \| 原始文档 \| blame \| 历史

 README.md

@@ -15,36 +15,10 @@
| [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
| [**Contact**](#contact)


## What's new: 

### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
- We support a new feature, export paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. The local finetuned models are also supported.
- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python), you could deploy the runtime without modelscope or funasr, for the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, the rtf of onnxruntime is 3x speedup(0.110->0.038) on cpu, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc), you could build the ASR service with grpc, by deploying the modelscope pipeline or onnxruntime.
- We release a new model [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords.
- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), the prediction accuracy of timestamp is much improved, and achieving accumulated average shift (aas) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
- We release several new UniASR model: 
[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary), 
[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary), 
[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary), 
[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is inited for paraformer model and then finetune on AISHEL-1.
- We release a new feature, the `VAD`, `ASR` and `PUNC` models could be integrated freely, which could be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), or the local finetine models. The [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhance the recall and precision, fix the badcases of missing punctuation marks.
- Various new types of audio input types are now supported by modelscope inference pipeline, including: mp3、flac、ogg、opus...
### 2023.1.16, funasr-0.1.6， modelscope-1.2.0
- We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrate the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
 [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp together. The model could take in several hours long inputs.
- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version.
- We improve the pipeline of modelscope to speedup the inference, by integrating the process of build model into build pipeline.
- Various new types of audio input types are now supported by modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
For the release notes, please ref to [news](https://github.com/alibaba-damo-academy/FunASR/releases)

## Highlights
- Many types of typical models are supported, e.g., [Tranformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).

 egs/aishell/conformer/run.sh

@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell/data2vec_paraformer_finetune/run.sh

@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell/data2vec_transformer_finetune/run.sh

@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.cer_ctc.ave_10best.pth
inference_asr_model=valid.cer_ctc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell/paraformer/run.sh

@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell/paraformerbert/run.sh

@@ -56,7 +56,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell/transformer/run.sh

@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs/aishell2/conformer/run.sh

@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

 egs/aishell2/paraformer/run.sh

@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

 egs/aishell2/paraformerbert/run.sh

@@ -58,7 +58,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

 egs/aishell2/transformer/run.sh

@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

 egs/aishell2/transformerLM/run.sh

@@ -34,7 +34,7 @@
tag=exp1
model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
lm_exp=${exp_dir}/exp/${model_dir}
inference_lm=valid.loss.ave.pth       # Language model path for decoding.
inference_lm=valid.loss.ave.pb       # Language model path for decoding.

stage=0
stop_stage=3

 egs/alimeeting/diarization/sond/infer_alimeeting_test.py

@@ -4,7 +4,7 @@

def main():
    diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
    data_path_and_name_and_type = [
        ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),

 egs/alimeeting/diarization/sond/run.sh

@@ -17,9 +17,9 @@
  echo "Downloading Pre-trained model..."
  git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
  git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
  cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
  cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
  cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
  echo "Done."
@@ -30,7 +30,7 @@

if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Calculating diarization results..."
  python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
  python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
  python local/convert_label_to_rttm.py \
    outputs/labels.txt \
    data/test_rmsil/raw_rmsil_map.scp \

 egs/alimeeting/diarization/sond/unit_test.py

@@ -4,7 +4,7 @@

def test_fbank_cpu_infer():
    diar_config_path = "config_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@

def test_fbank_gpu_infer():
    diar_config_path = "config_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@

def test_wav_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@

def test_without_profile_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    raw_inputs = [[
        "data/unit_test/raw_inputs/record.wav",

 egs/callhome/diarization/sond/unit_test.py

@@ -4,7 +4,7 @@

def test_fbank_cpu_infer():
    diar_config_path = "sond_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@

def test_fbank_gpu_infer():
    diar_config_path = "sond_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@

def test_wav_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@

def test_without_profile_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    raw_inputs = [[
        "data/unit_test/raw_inputs/record.wav",

 egs/mars/sd/local_run.sh

@@ -49,7 +49,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb

# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default

 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py

@@ -48,5 +48,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py

@@ -48,5 +48,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py

@@ -63,5 +63,5 @@
    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./example_data/validation"
    params["decoding_model_name"] = "valid.acc.ave.pth"
    params["decoding_model_name"] = "valid.acc.ave.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py

@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer


def modelscope_infer_core(output_dir, split_dir, njob, idx):
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    gpu_id = (int(idx) - 1) // njob
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
        model=model,
        output_dir=output_dir_job,
        batch_size=64
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    nj = ngpu * njob
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
@@ -56,7 +67,7 @@
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1)))
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

@@ -81,8 +92,10 @@

if __name__ == "__main__":
    params = {}
    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
    params["data_dir"] = "./data/test"
    params["output_dir"] = "./results"
    params["ngpu"] = 1
    params["njob"] = 1
    modelscope_infer(params)
    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
    params["batch_size"] = 64
    modelscope_infer(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py

@@ -4,23 +4,18 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))

    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=64
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py

@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer


def modelscope_infer_core(output_dir, split_dir, njob, idx):
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    gpu_id = (int(idx) - 1) // njob
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
        model=model,
        output_dir=output_dir_job,
        batch_size=64
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    nj = ngpu * njob
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
@@ -56,7 +67,7 @@
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1)))
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

@@ -81,8 +92,10 @@

if __name__ == "__main__":
    params = {}
    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
    params["data_dir"] = "./data/test"
    params["output_dir"] = "./results"
    params["ngpu"] = 1
    params["njob"] = 1
    modelscope_infer(params)
    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
    params["batch_size"] = 64
    modelscope_infer(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py

@@ -4,23 +4,18 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))

    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=64
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md

@@ -22,10 +22,12 @@
Or you can use the finetuned model for inference directly.

- Setting parameters in `infer.py`
    - <strong>model:</strong> # model name on ModelScope
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>output_dir:</strong> # result dir
    - <strong>ngpu:</strong> # the number of GPUs for decoding
    - <strong>njob:</strong> # the number of jobs for each GPU
    - <strong>ngpu:</strong> # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding
    - <strong>njob:</strong> # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob`
    - <strong>batch_size:</strong> # batchsize of inference

- Then you can run the pipeline to infer with:
```python
@@ -39,9 +41,11 @@
### Inference using local finetuned model

- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>modelscope_model_name: </strong> # model name on ModelScope
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
    - <strong>batch_size:</strong> # batchsize of inference  

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py

@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer


def modelscope_infer_core(output_dir, split_dir, njob, idx):
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    gpu_id = (int(idx) - 1) // njob
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
        model=model,
        output_dir=output_dir_job,
        batch_size=64
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    nj = ngpu * njob
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
@@ -56,7 +67,7 @@
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1)))
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

@@ -81,8 +92,10 @@

if __name__ == "__main__":
    params = {}
    params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    params["data_dir"] = "./data/test"
    params["output_dir"] = "./results"
    params["ngpu"] = 1
    params["njob"] = 1
    modelscope_infer(params)
    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
    params["batch_size"] = 64
    modelscope_infer(params)

 egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py

@@ -4,23 +4,18 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))

    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=64
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py

New file
@@ -0,0 +1,57 @@
import torch
import torchaudio
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)

inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
    model_revision='v1.0.2')

waveform, sample_rate = torchaudio.load("waihu.wav")
speech_length = waveform.shape[1]
speech = waveform[0]

cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
cache_de = {"decode_fsmn": None}
cache = {"encoder": cache_en, "decoder": cache_de}
param_dict = {}
param_dict["cache"] = cache

first_chunk = True
speech_buffer = speech
speech_cache = []
final_result = ""

while len(speech_buffer) >= 960:
    if first_chunk:
        if len(speech_buffer) >= 14400:
            rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict)
            speech_buffer = speech_buffer[4800:]
        else:
            cache_en["stride"] = len(speech_buffer) // 960
            cache_en["pad_right"] = 0
            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
            speech_buffer = []
        cache_en["start_idx"] = -5
        first_chunk = False
    else:
        cache_en["start_idx"] += 10
        if len(speech_buffer) >= 4800:
            cache_en["pad_left"] = 5
            rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict)
            speech_buffer = speech_buffer[9600:]
        else:
            cache_en["stride"] = len(speech_buffer) // 960 
            cache_en["pad_right"] = 0
            rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
            speech_buffer = []
    if len(rec_result) !=0 and rec_result['text'] != "sil":
        final_result += rec_result['text']
    print(rec_result)
print(final_result)

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py

@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer


def modelscope_infer_core(output_dir, split_dir, njob, idx):
def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
    output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
    gpu_id = (int(idx) - 1) // njob
    if ngpu > 0:
        use_gpu = 1
        gpu_id = int(idx) - 1
    else:
        use_gpu = 0
        gpu_id = -1
    if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
        gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
        os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
        model=model,
        output_dir=output_dir_job,
        batch_size=64
        batch_size=batch_size,
        ngpu=use_gpu,
    )
    audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
    inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
    # prepare for multi-GPU decoding
    ngpu = params["ngpu"]
    njob = params["njob"]
    batch_size = params["batch_size"]
    output_dir = params["output_dir"]
    model = params["model"]
    if os.path.exists(output_dir):
        shutil.rmtree(output_dir)
    os.mkdir(output_dir)
    split_dir = os.path.join(output_dir, "split")
    os.mkdir(split_dir)
    nj = ngpu * njob
    if ngpu > 0:
        nj = ngpu
    elif ngpu == 0:
        nj = njob
    wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
    with open(wav_scp_file) as f:
        lines = f.readlines()
@@ -56,7 +67,7 @@
    p = Pool(nj)
    for i in range(nj):
        p.apply_async(modelscope_infer_core,
                      args=(output_dir, split_dir, njob, str(i + 1)))
                      args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
    p.close()
    p.join()

@@ -81,8 +92,10 @@

if __name__ == "__main__":
    params = {}
    params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
    params["data_dir"] = "./data/test"
    params["output_dir"] = "./results"
    params["ngpu"] = 1
    params["njob"] = 1
    modelscope_infer(params)
    params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
    params["njob"] = 1 # if ngpu = 0, will use cpu decoding
    params["batch_size"] = 64
    modelscope_infer(params)

 egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py

@@ -4,23 +4,18 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))

    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")
    shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=64
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py

@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py

@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py

New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset


def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py

New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py

@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py

New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset


def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py

New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py

New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset


def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()


if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py

New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md

@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py

@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md

@@ -41,7 +41,8 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave
      .pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py

@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)

 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md

@@ -34,7 +34,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

- Then you can run the pipeline to finetune with:
```python

 egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py

@@ -4,27 +4,17 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.hub.snapshot_download import snapshot_download

from funasr.utils.compute_wer import compute_wer


def modelscope_infer_after_finetune(params):
    # prepare for decoding
    if not os.path.exists(os.path.join(params["output_dir"], "punc")):
        os.makedirs(os.path.join(params["output_dir"], "punc"))
    if not os.path.exists(os.path.join(params["output_dir"], "vad")):
        os.makedirs(os.path.join(params["output_dir"], "vad"))
    pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
    for file_name in params["required_files"]:
        if file_name == "configuration.json":
            with open(os.path.join(pretrained_model_path, file_name)) as f:
                config_dict = json.load(f)
                config_dict["model"]["am_model_name"] = params["decoding_model_name"]
            with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
                json.dump(config_dict, f, indent=4, separators=(',', ': '))
        else:
            shutil.copy(os.path.join(pretrained_model_path, file_name),
                        os.path.join(params["output_dir"], file_name))

    try:
        pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
    except BaseException:
        raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
    decoding_path = os.path.join(params["output_dir"], "decode_results")
    if os.path.exists(decoding_path):
        shutil.rmtree(decoding_path)
@@ -33,16 +23,16 @@
    # decoding
    inference_pipeline = pipeline(
        task=Tasks.auto_speech_recognition,
        model=params["output_dir"],
        model=pretrained_model_path,
        output_dir=decoding_path,
        batch_size=64
        batch_size=params["batch_size"]
    )
    audio_in = os.path.join(params["data_dir"], "wav.scp")
    inference_pipeline(audio_in=audio_in)

    # computer CER if GT text is set
    text_in = os.path.join(params["data_dir"], "text")
    if text_in is not None:
    if os.path.exists(text_in):
        text_proc_file = os.path.join(decoding_path, "1best_recog/token")
        compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

@@ -50,8 +40,8 @@
if __name__ == '__main__':
    params = {}
    params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    modelscope_infer_after_finetune(params)
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    params["batch_size"] = 64
    modelscope_infer_after_finetune(params)

 egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py

@@ -4,6 +4,11 @@

from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)


inference_pipeline = pipeline(
    task=Tasks.punctuation,

 egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py

New file
@@ -0,0 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

inference_diar_pipline = pipeline(
    task=Tasks.speaker_diarization,
    model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
    model_revision="v1.0.0",
)
results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
print(results)

 egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py

@@ -14,13 +14,12 @@
)

# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
audio_list = [[
audio_list = [
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
]]
]

results = inference_diar_pipline(audio_in=audio_list)
for rst in results:
    print(rst["value"])
print(results)

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py

@@ -1,7 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)
import soundfile


if __name__ == '__main__':
    output_dir = None

 egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py

@@ -1,7 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
import logging
logger = get_logger(log_level=logging.CRITICAL)
logger.setLevel(logging.CRITICAL)
import soundfile


if __name__ == '__main__':
    output_dir = None

 funasr/bin/asr_inference.py

@@ -52,7 +52,7 @@

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

 funasr/bin/asr_inference_launch.py

@@ -256,6 +256,9 @@
    elif mode == "paraformer":
        from funasr.bin.asr_inference_paraformer import inference_modelscope
        return inference_modelscope(**kwargs)
    elif mode == "paraformer_streaming":
        from funasr.bin.asr_inference_paraformer_streaming import inference_modelscope
        return inference_modelscope(**kwargs)
    elif mode == "paraformer_vad":
        from funasr.bin.asr_inference_paraformer_vad import inference_modelscope
        return inference_modelscope(**kwargs)

 funasr/bin/asr_inference_mfcca.py

@@ -41,8 +41,6 @@
from funasr.utils import asr_utils, wav_utils, postprocess_utils
import pdb

header_colors = '\033[95m'
end_colors = '\033[0m'

global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
@@ -55,7 +53,7 @@

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

 funasr/bin/asr_inference_paraformer.py

@@ -50,7 +50,7 @@

    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]

 funasr/bin/asr_inference_paraformer_streaming.py

New file
@@ -0,0 +1,907 @@
#!/usr/bin/env python3
import argparse
import logging
import sys
import time
import copy
import os
import codecs
import tempfile
import requests
from pathlib import Path
from typing import Optional
from typing import Sequence
from typing import Tuple
from typing import Union
from typing import Dict
from typing import Any
from typing import List

import numpy as np
import torch
from typeguard import check_argument_types

from funasr.fileio.datadir_writer import DatadirWriter
from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
from funasr.modules.beam_search.beam_search import Hypothesis
from funasr.modules.scorers.ctc import CTCPrefixScorer
from funasr.modules.scorers.length_bonus import LengthBonus
from funasr.modules.subsampling import TooShortUttError
from funasr.tasks.asr import ASRTaskParaformer as ASRTask
from funasr.tasks.lm import LMTask
from funasr.text.build_tokenizer import build_tokenizer
from funasr.text.token_id_converter import TokenIDConverter
from funasr.torch_utils.device_funcs import to_device
from funasr.torch_utils.set_all_random_seed import set_all_random_seed
from funasr.utils import config_argparse
from funasr.utils.cli_utils import get_commandline_args
from funasr.utils.types import str2bool
from funasr.utils.types import str2triple_str
from funasr.utils.types import str_or_none
from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export

class Speech2Text:
    """Speech2Text class

    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]

    """

    def __init__(
            self,
            asr_train_config: Union[Path, str] = None,
            asr_model_file: Union[Path, str] = None,
            cmvn_file: Union[Path, str] = None,
            lm_train_config: Union[Path, str] = None,
            lm_file: Union[Path, str] = None,
            token_type: str = None,
            bpemodel: str = None,
            device: str = "cpu",
            maxlenratio: float = 0.0,
            minlenratio: float = 0.0,
            dtype: str = "float32",
            beam_size: int = 20,
            ctc_weight: float = 0.5,
            lm_weight: float = 1.0,
            ngram_weight: float = 0.9,
            penalty: float = 0.0,
            nbest: int = 1,
            frontend_conf: dict = None,
            hotword_list_or_file: str = None,
            **kwargs,
    ):
        assert check_argument_types()

        # 1. Build ASR model
        scorers = {}
        asr_model, asr_train_args = ASRTask.build_model_from_file(
            asr_train_config, asr_model_file, cmvn_file, device
        )
        frontend = None
        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)

        logging.info("asr_model: {}".format(asr_model))
        logging.info("asr_train_args: {}".format(asr_train_args))
        asr_model.to(dtype=getattr(torch, dtype)).eval()

        if asr_model.ctc != None:
            ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
            scorers.update(
                ctc=ctc
            )
        token_list = asr_model.token_list
        scorers.update(
            length_bonus=LengthBonus(len(token_list)),
        )

        # 2. Build Language model
        if lm_train_config is not None:
            lm, lm_train_args = LMTask.build_model_from_file(
                lm_train_config, lm_file, device
            )
            scorers["lm"] = lm.lm

        # 3. Build ngram model
        # ngram is not supported now
        ngram = None
        scorers["ngram"] = ngram

        # 4. Build BeamSearch object
        # transducer is not supported now
        beam_search_transducer = None

        weights = dict(
            decoder=1.0 - ctc_weight,
            ctc=ctc_weight,
            lm=lm_weight,
            ngram=ngram_weight,
            length_bonus=penalty,
        )
        beam_search = BeamSearch(
            beam_size=beam_size,
            weights=weights,
            scorers=scorers,
            sos=asr_model.sos,
            eos=asr_model.eos,
            vocab_size=len(token_list),
            token_list=token_list,
            pre_beam_score_key=None if ctc_weight == 1.0 else "full",
        )

        beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
        for scorer in scorers.values():
            if isinstance(scorer, torch.nn.Module):
                scorer.to(device=device, dtype=getattr(torch, dtype)).eval()

        logging.info(f"Decoding device={device}, dtype={dtype}")

        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
        if token_type is None:
            token_type = asr_train_args.token_type
        if bpemodel is None:
            bpemodel = asr_train_args.bpemodel

        if token_type is None:
            tokenizer = None
        elif token_type == "bpe":
            if bpemodel is not None:
                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
            else:
                tokenizer = None
        else:
            tokenizer = build_tokenizer(token_type=token_type)
        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")

        self.asr_model = asr_model
        self.asr_train_args = asr_train_args
        self.converter = converter
        self.tokenizer = tokenizer

        # 6. [Optional] Build hotword list from str, local file or url

        is_use_lm = lm_weight != 0.0 and lm_file is not None
        if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
            beam_search = None
        self.beam_search = beam_search
        logging.info(f"Beam_search: {self.beam_search}")
        self.beam_search_transducer = beam_search_transducer
        self.maxlenratio = maxlenratio
        self.minlenratio = minlenratio
        self.device = device
        self.dtype = dtype
        self.nbest = nbest
        self.frontend = frontend
        self.encoder_downsampling_factor = 1
        if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
            self.encoder_downsampling_factor = 4

    @torch.no_grad()
    def __call__(
            self, cache: dict, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
            begin_time: int = 0, end_time: int = None,
    ):
        """Inference

        Args:
                speech: Input speech data
        Returns:
                text, token, token_int, hyp

        """
        assert check_argument_types()

        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)
            feats_len = feats_len.int()
            self.asr_model.frontend = None
        else:
            feats = speech
            feats_len = speech_lengths
        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
        batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}

        # a. To device
        batch = to_device(batch, device=self.device)

        # b. Forward Encoder
        enc, enc_len = self.asr_model.encode_chunk(**batch)
        if isinstance(enc, tuple):
            enc = enc[0]
        # assert len(enc) == 1, len(enc)
        enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor

        predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                        predictor_outs[2], predictor_outs[3]
        pre_token_length = pre_token_length.floor().long()
        if torch.max(pre_token_length) < 1:
            return []
        decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
        decoder_out = decoder_outs

        results = []
        b, n, d = decoder_out.size()
        for i in range(b):
            x = enc[i, :enc_len[i], :]
            am_scores = decoder_out[i, :pre_token_length[i], :]
            if self.beam_search is not None:
                nbest_hyps = self.beam_search(
                    x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
                )

                nbest_hyps = nbest_hyps[: self.nbest]
            else:
                yseq = am_scores.argmax(dim=-1)
                score = am_scores.max(dim=-1)[0]
                score = torch.sum(score, dim=-1)
                # pad with mask tokens to ensure compatibility with sos/eos tokens
                yseq = torch.tensor(
                    [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
                )
                nbest_hyps = [Hypothesis(yseq=yseq, score=score)]

            for hyp in nbest_hyps:
                assert isinstance(hyp, (Hypothesis)), type(hyp)

                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))

                # Change integer-ids to tokens
                token = self.converter.ids2tokens(token_int)

                if self.tokenizer is not None:
                    text = self.tokenizer.tokens2text(token)
                else:
                    text = None

                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))

        # assert check_return_type(results)
        return results


class Speech2TextExport:
    """Speech2TextExport class

    """

    def __init__(
            self,
            asr_train_config: Union[Path, str] = None,
            asr_model_file: Union[Path, str] = None,
            cmvn_file: Union[Path, str] = None,
            lm_train_config: Union[Path, str] = None,
            lm_file: Union[Path, str] = None,
            token_type: str = None,
            bpemodel: str = None,
            device: str = "cpu",
            maxlenratio: float = 0.0,
            minlenratio: float = 0.0,
            dtype: str = "float32",
            beam_size: int = 20,
            ctc_weight: float = 0.5,
            lm_weight: float = 1.0,
            ngram_weight: float = 0.9,
            penalty: float = 0.0,
            nbest: int = 1,
            frontend_conf: dict = None,
            hotword_list_or_file: str = None,
            **kwargs,
    ):

        # 1. Build ASR model
        asr_model, asr_train_args = ASRTask.build_model_from_file(
            asr_train_config, asr_model_file, cmvn_file, device
        )
        frontend = None
        if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
            frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)

        logging.info("asr_model: {}".format(asr_model))
        logging.info("asr_train_args: {}".format(asr_train_args))
        asr_model.to(dtype=getattr(torch, dtype)).eval()

        token_list = asr_model.token_list

        logging.info(f"Decoding device={device}, dtype={dtype}")

        # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
        if token_type is None:
            token_type = asr_train_args.token_type
        if bpemodel is None:
            bpemodel = asr_train_args.bpemodel

        if token_type is None:
            tokenizer = None
        elif token_type == "bpe":
            if bpemodel is not None:
                tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
            else:
                tokenizer = None
        else:
            tokenizer = build_tokenizer(token_type=token_type)
        converter = TokenIDConverter(token_list=token_list)
        logging.info(f"Text tokenizer: {tokenizer}")

        # self.asr_model = asr_model
        self.asr_train_args = asr_train_args
        self.converter = converter
        self.tokenizer = tokenizer

        self.device = device
        self.dtype = dtype
        self.nbest = nbest
        self.frontend = frontend

        model = Paraformer_export(asr_model, onnx=False)
        self.asr_model = model

    @torch.no_grad()
    def __call__(
            self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
    ):
        """Inference

        Args:
                speech: Input speech data
        Returns:
                text, token, token_int, hyp

        """
        assert check_argument_types()

        # Input as audio signal
        if isinstance(speech, np.ndarray):
            speech = torch.tensor(speech)

        if self.frontend is not None:
            feats, feats_len = self.frontend.forward(speech, speech_lengths)
            feats = to_device(feats, device=self.device)
            feats_len = feats_len.int()
            self.asr_model.frontend = None
        else:
            feats = speech
            feats_len = speech_lengths

        enc_len_batch_total = feats_len.sum()
        lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
        batch = {"speech": feats, "speech_lengths": feats_len}

        # a. To device
        batch = to_device(batch, device=self.device)

        decoder_outs = self.asr_model(**batch)
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]

        results = []
        b, n, d = decoder_out.size()
        for i in range(b):
            am_scores = decoder_out[i, :ys_pad_lens[i], :]

            yseq = am_scores.argmax(dim=-1)
            score = am_scores.max(dim=-1)[0]
            score = torch.sum(score, dim=-1)
            # pad with mask tokens to ensure compatibility with sos/eos tokens
            yseq = torch.tensor(
                yseq.tolist(), device=yseq.device
            )
            nbest_hyps = [Hypothesis(yseq=yseq, score=score)]

            for hyp in nbest_hyps:
                assert isinstance(hyp, (Hypothesis)), type(hyp)

                # remove sos/eos and get results
                last_pos = -1
                if isinstance(hyp.yseq, list):
                    token_int = hyp.yseq[1:last_pos]
                else:
                    token_int = hyp.yseq[1:last_pos].tolist()

                # remove blank symbol id, which is assumed to be 0
                token_int = list(filter(lambda x: x != 0 and x != 2, token_int))

                # Change integer-ids to tokens
                token = self.converter.ids2tokens(token_int)

                if self.tokenizer is not None:
                    text = self.tokenizer.tokens2text(token)
                else:
                    text = None

                results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))

        return results


def inference(
        maxlenratio: float,
        minlenratio: float,
        batch_size: int,
        beam_size: int,
        ngpu: int,
        ctc_weight: float,
        lm_weight: float,
        penalty: float,
        log_level: Union[int, str],
        data_path_and_name_and_type,
        asr_train_config: Optional[str],
        asr_model_file: Optional[str],
        cmvn_file: Optional[str] = None,
        raw_inputs: Union[np.ndarray, torch.Tensor] = None,
        lm_train_config: Optional[str] = None,
        lm_file: Optional[str] = None,
        token_type: Optional[str] = None,
        key_file: Optional[str] = None,
        word_lm_train_config: Optional[str] = None,
        bpemodel: Optional[str] = None,
        allow_variable_data_keys: bool = False,
        streaming: bool = False,
        output_dir: Optional[str] = None,
        dtype: str = "float32",
        seed: int = 0,
        ngram_weight: float = 0.9,
        nbest: int = 1,
        num_workers: int = 1,

        **kwargs,
):
    inference_pipeline = inference_modelscope(
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        batch_size=batch_size,
        beam_size=beam_size,
        ngpu=ngpu,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        penalty=penalty,
        log_level=log_level,
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        cmvn_file=cmvn_file,
        raw_inputs=raw_inputs,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        key_file=key_file,
        word_lm_train_config=word_lm_train_config,
        bpemodel=bpemodel,
        allow_variable_data_keys=allow_variable_data_keys,
        streaming=streaming,
        output_dir=output_dir,
        dtype=dtype,
        seed=seed,
        ngram_weight=ngram_weight,
        nbest=nbest,
        num_workers=num_workers,

        **kwargs,
    )
    return inference_pipeline(data_path_and_name_and_type, raw_inputs)


def inference_modelscope(
        maxlenratio: float,
        minlenratio: float,
        batch_size: int,
        beam_size: int,
        ngpu: int,
        ctc_weight: float,
        lm_weight: float,
        penalty: float,
        log_level: Union[int, str],
        # data_path_and_name_and_type,
        asr_train_config: Optional[str],
        asr_model_file: Optional[str],
        cmvn_file: Optional[str] = None,
        lm_train_config: Optional[str] = None,
        lm_file: Optional[str] = None,
        token_type: Optional[str] = None,
        key_file: Optional[str] = None,
        word_lm_train_config: Optional[str] = None,
        bpemodel: Optional[str] = None,
        allow_variable_data_keys: bool = False,
        dtype: str = "float32",
        seed: int = 0,
        ngram_weight: float = 0.9,
        nbest: int = 1,
        num_workers: int = 1,
        output_dir: Optional[str] = None,
        param_dict: dict = None,
        **kwargs,
):
    assert check_argument_types()

    if word_lm_train_config is not None:
        raise NotImplementedError("Word LM is not implemented")
    if ngpu > 1:
        raise NotImplementedError("only single GPU decoding is supported")

    logging.basicConfig(
        level=log_level,
        format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
    )

    export_mode = False
    if param_dict is not None:
        hotword_list_or_file = param_dict.get('hotword')
        export_mode = param_dict.get("export_mode", False)
    else:
        hotword_list_or_file = None

    if ngpu >= 1 and torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"
        batch_size = 1

    # 1. Set random-seed
    set_all_random_seed(seed)

    # 2. Build speech2text
    speech2text_kwargs = dict(
        asr_train_config=asr_train_config,
        asr_model_file=asr_model_file,
        cmvn_file=cmvn_file,
        lm_train_config=lm_train_config,
        lm_file=lm_file,
        token_type=token_type,
        bpemodel=bpemodel,
        device=device,
        maxlenratio=maxlenratio,
        minlenratio=minlenratio,
        dtype=dtype,
        beam_size=beam_size,
        ctc_weight=ctc_weight,
        lm_weight=lm_weight,
        ngram_weight=ngram_weight,
        penalty=penalty,
        nbest=nbest,
        hotword_list_or_file=hotword_list_or_file,
    )
    if export_mode:
        speech2text = Speech2TextExport(**speech2text_kwargs)
    else:
        speech2text = Speech2Text(**speech2text_kwargs)

    def _forward(
            data_path_and_name_and_type,
            raw_inputs: Union[np.ndarray, torch.Tensor] = None,
            output_dir_v2: Optional[str] = None,
            fs: dict = None,
            param_dict: dict = None,
            **kwargs,
    ):

        hotword_list_or_file = None
        if param_dict is not None:
            hotword_list_or_file = param_dict.get('hotword')
        if 'hotword' in kwargs:
            hotword_list_or_file = kwargs['hotword']
        if hotword_list_or_file is not None or 'hotword' in kwargs:
            speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)

        # 3. Build data-iterator
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, torch.Tensor):
                raw_inputs = raw_inputs.numpy()
            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
        loader = ASRTask.build_streaming_iterator(
            data_path_and_name_and_type,
            dtype=dtype,
            fs=fs,
            batch_size=batch_size,
            key_file=key_file,
            num_workers=num_workers,
            preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
            collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
            allow_variable_data_keys=allow_variable_data_keys,
            inference=True,
        )

        if param_dict is not None:
            use_timestamp = param_dict.get('use_timestamp', True)
        else:
            use_timestamp = True

        forward_time_total = 0.0
        length_total = 0.0
        finish_count = 0
        file_count = 1
        cache = None
        # 7 .Start for-loop
        # FIXME(kamo): The output format should be discussed about
        asr_result_list = []
        output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
        if output_path is not None:
            writer = DatadirWriter(output_path)
        else:
            writer = None
        if param_dict is not None and "cache" in param_dict:
            cache = param_dict["cache"]
        for keys, batch in loader:
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
            assert len(keys) == _bs, f"{len(keys)} != {_bs}"
            # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
            logging.info("decoding, utt_id: {}".format(keys))
            # N-best list of (text, token, token_int, hyp_object)

            time_beg = time.time()
            results = speech2text(cache=cache, **batch)
            if len(results) < 1:
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
            time_end = time.time()
            forward_time = time_end - time_beg
            lfr_factor = results[0][-1]
            length = results[0][-2]
            forward_time_total += forward_time
            length_total += length
            rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time,
                                                                                               100 * forward_time / (
                                                                                                           length * lfr_factor))
            logging.info(rtf_cur)

            for batch_id in range(_bs):
                result = [results[batch_id][:-2]]

                key = keys[batch_id]
                for n, result in zip(range(1, nbest + 1), result):
                    text, token, token_int, hyp = result[0], result[1], result[2], result[3]
                    time_stamp = None if len(result) < 5 else result[4]
                    # Create a directory: outdir/{n}best_recog
                    if writer is not None:
                        ibest_writer = writer[f"{n}best_recog"]

                        # Write the result to each file
                        ibest_writer["token"][key] = " ".join(token)
                        # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
                        ibest_writer["score"][key] = str(hyp.score)
                        ibest_writer["rtf"][key] = rtf_cur

                    if text is not None:
                        if use_timestamp and time_stamp is not None:
                            postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
                        else:
                            postprocessed_result = postprocess_utils.sentence_postprocess(token)
                        time_stamp_postprocessed = ""
                        if len(postprocessed_result) == 3:
                            text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
                                                                                       postprocessed_result[1], \
                                                                                       postprocessed_result[2]
                        else:
                            text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
                        item = {'key': key, 'value': text_postprocessed}
                        if time_stamp_postprocessed != "":
                            item['time_stamp'] = time_stamp_postprocessed
                        asr_result_list.append(item)
                        finish_count += 1
                        # asr_utils.print_progress(finish_count / file_count)
                        if writer is not None:
                            ibest_writer["text"][key] = text_postprocessed

                    logging.info("decoding, utt: {}, predictions: {}".format(key, text))
        rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total,
                                                                                                           forward_time_total,
                                                                                                           100 * forward_time_total / (
                                                                                                                       length_total * lfr_factor))
        logging.info(rtf_avg)
        if writer is not None:
            ibest_writer["rtf"]["rtf_avf"] = rtf_avg
        return asr_result_list

    return _forward


def get_parser():
    parser = config_argparse.ArgumentParser(
        description="ASR Decoding",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )

    # Note(kamo): Use '_' instead of '-' as separator.
    # '-' is confusing if written in yaml.
    parser.add_argument(
        "--log_level",
        type=lambda x: x.upper(),
        default="INFO",
        choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
        help="The verbose level of logging",
    )

    parser.add_argument("--output_dir", type=str, required=True)
    parser.add_argument(
        "--ngpu",
        type=int,
        default=0,
        help="The number of gpus. 0 indicates CPU mode",
    )
    parser.add_argument("--seed", type=int, default=0, help="Random seed")
    parser.add_argument(
        "--dtype",
        default="float32",
        choices=["float16", "float32", "float64"],
        help="Data type",
    )
    parser.add_argument(
        "--num_workers",
        type=int,
        default=1,
        help="The number of workers used for DataLoader",
    )
    parser.add_argument(
        "--hotword",
        type=str_or_none,
        default=None,
        help="hotword file path or hotwords seperated by space"
    )
    group = parser.add_argument_group("Input data related")
    group.add_argument(
        "--data_path_and_name_and_type",
        type=str2triple_str,
        required=False,
        action="append",
    )
    group.add_argument("--key_file", type=str_or_none)
    group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)

    group = parser.add_argument_group("The model configuration related")
    group.add_argument(
        "--asr_train_config",
        type=str,
        help="ASR training configuration",
    )
    group.add_argument(
        "--asr_model_file",
        type=str,
        help="ASR model parameter file",
    )
    group.add_argument(
        "--cmvn_file",
        type=str,
        help="Global cmvn file",
    )
    group.add_argument(
        "--lm_train_config",
        type=str,
        help="LM training configuration",
    )
    group.add_argument(
        "--lm_file",
        type=str,
        help="LM parameter file",
    )
    group.add_argument(
        "--word_lm_train_config",
        type=str,
        help="Word LM training configuration",
    )
    group.add_argument(
        "--word_lm_file",
        type=str,
        help="Word LM parameter file",
    )
    group.add_argument(
        "--ngram_file",
        type=str,
        help="N-gram parameter file",
    )
    group.add_argument(
        "--model_tag",
        type=str,
        help="Pretrained model tag. If specify this option, *_train_config and "
             "*_file will be overwritten",
    )

    group = parser.add_argument_group("Beam-search related")
    group.add_argument(
        "--batch_size",
        type=int,
        default=1,
        help="The batch size for inference",
    )
    group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
    group.add_argument("--beam_size", type=int, default=20, help="Beam size")
    group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
    group.add_argument(
        "--maxlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain max output length. "
             "If maxlenratio=0.0 (default), it uses a end-detect "
             "function "
             "to automatically find maximum hypothesis lengths."
             "If maxlenratio<0.0, its absolute value is interpreted"
             "as a constant max output length",
    )
    group.add_argument(
        "--minlenratio",
        type=float,
        default=0.0,
        help="Input length ratio to obtain min output length",
    )
    group.add_argument(
        "--ctc_weight",
        type=float,
        default=0.5,
        help="CTC weight in joint decoding",
    )
    group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
    group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
    group.add_argument("--streaming", type=str2bool, default=False)

    group.add_argument(
        "--frontend_conf",
        default=None,
        help="",
    )
    group.add_argument("--raw_inputs", type=list, default=None)
    # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])

    group = parser.add_argument_group("Text converter related")
    group.add_argument(
        "--token_type",
        type=str_or_none,
        default=None,
        choices=["char", "bpe", None],
        help="The token type for ASR model. "
             "If not given, refers from the training args",
    )
    group.add_argument(
        "--bpemodel",
        type=str_or_none,
        default=None,
        help="The model path of sentencepiece. "
             "If not given, refers from the training args",
    )

    return parser


def main(cmd=None):
    print(get_commandline_args(), file=sys.stderr)
    parser = get_parser()
    args = parser.parse_args(cmd)
    param_dict = {'hotword': args.hotword}
    kwargs = vars(args)
    kwargs.pop("config", None)
    kwargs['param_dict'] = param_dict
    inference(**kwargs)


if __name__ == "__main__":
    main()

    # from modelscope.pipelines import pipeline
    # from modelscope.utils.constant import Tasks
    #
    # inference_16k_pipline = pipeline(
    #     task=Tasks.auto_speech_recognition,
    #     model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
    #
    # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
    # print(rec_result)


 funasr/bin/asr_inference_paraformer_vad_punc.py

@@ -58,7 +58,7 @@

    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]

 funasr/bin/asr_inference_uniasr.py

@@ -46,7 +46,7 @@

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

 funasr/bin/asr_inference_uniasr_vad.py

@@ -46,7 +46,7 @@

    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]

 funasr/bin/diar_inference_launch.py

@@ -133,7 +133,7 @@
        param_dict = {
            "extract_profile": True,
            "sv_train_config": "sv.yaml",
            "sv_model_file": "sv.pth",
            "sv_model_file": "sv.pb",
        }
        if "param_dict" in kwargs and kwargs["param_dict"] is not None:
            for key in param_dict:
@@ -142,6 +142,9 @@
        else:
            kwargs["param_dict"] = param_dict
        return inference_modelscope(mode=mode, **kwargs)
    elif mode == "eend-ola":
        from funasr.bin.eend_ola_inference import inference_modelscope
        return inference_modelscope(mode=mode, **kwargs)
    else:
        logging.info("Unknown decoding mode: {}".format(mode))
        return None

 funasr/bin/eend_ola_inference.py

@@ -16,6 +16,7 @@

import numpy as np
import torch
from scipy.signal import medfilt
from typeguard import check_argument_types

from funasr.models.frontend.wav_frontend import WavFrontendMel23
@@ -34,7 +35,7 @@
    Examples:
        >>> import soundfile
        >>> import numpy as np
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
        >>> profile = np.load("profiles.npy")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2diar(audio, profile)
@@ -146,7 +147,7 @@
        output_dir: Optional[str] = None,
        batch_size: int = 1,
        dtype: str = "float32",
        ngpu: int = 0,
        ngpu: int = 1,
        num_workers: int = 0,
        log_level: Union[int, str] = "INFO",
        key_file: Optional[str] = None,
@@ -179,7 +180,6 @@
        diar_model_file=diar_model_file,
        device=device,
        dtype=dtype,
        streaming=streaming,
    )
    logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
    speech2diar = Speech2Diarization.from_pretrained(
@@ -209,7 +209,7 @@
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, torch.Tensor):
                raw_inputs = raw_inputs.numpy()
            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
            data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
        loader = EENDOLADiarTask.build_streaming_iterator(
            data_path_and_name_and_type,
            dtype=dtype,
@@ -236,9 +236,23 @@
            # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}

            results = speech2diar(**batch)

            # post process
            a = results[0][0].cpu().numpy()
            a = medfilt(a, (11, 1))
            rst = []
            for spkid, frames in enumerate(a.T):
                frames = np.pad(frames, (1, 1), 'constant')
                changes, = np.where(np.diff(frames, axis=0) != 0)
                fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
                for s, e in zip(changes[::2], changes[1::2]):
                    st = s / 10.
                    dur = (e - s) / 10.
                    rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))

            # Only supporting batch_size==1
            key, value = keys[0], output_results_str(results, keys[0])
            item = {"key": key, "value": value}
            value = "\n".join(rst)
            item = {"key": keys[0], "value": value}
            result_list.append(item)
            if output_path is not None:
                output_writer.write(value)

 funasr/bin/sond_inference.py

@@ -42,7 +42,7 @@
    Examples:
        >>> import soundfile
        >>> import numpy as np
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
        >>> profile = np.load("profiles.npy")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2diar(audio, profile)
@@ -54,7 +54,7 @@
            self,
            diar_train_config: Union[Path, str] = None,
            diar_model_file: Union[Path, str] = None,
            device: str = "cpu",
            device: Union[str, torch.device] = "cpu",
            batch_size: int = 1,
            dtype: str = "float32",
            streaming: bool = False,
@@ -114,9 +114,19 @@
            # little-endian order: lower bit first
            return (np.array(list(b)[::-1]) == '1').astype(dtype)

        return np.row_stack([int2vec(int(x), vec_dim) for x in seq])
        # process oov
        seq = np.array([int(x) for x in seq])
        new_seq = []
        for i, x in enumerate(seq):
            if x < 2 ** vec_dim:
                new_seq.append(x)
            else:
                idx_list = np.where(seq < 2 ** vec_dim)[0]
                idx = np.abs(idx_list - i).argmin()
                new_seq.append(seq[idx_list[idx]])
        return np.row_stack([int2vec(x, vec_dim) for x in new_seq])

    def post_processing(self, raw_logits: torch.Tensor, spk_num: int):
    def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
        logits_idx = raw_logits.argmax(-1)  # B, T, vocab_size -> B, T
        # upsampling outputs to match inputs
        ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
@@ -127,8 +137,14 @@
        ).squeeze(1).long()
        logits_idx = logits_idx[0].tolist()
        pse_labels = [self.token_list[x] for x in logits_idx]
        if output_format == "pse_labels":
            return pse_labels, None

        multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num]  # remove padding speakers
        multi_labels = self.smooth_multi_labels(multi_labels)
        if output_format == "binary_labels":
            return multi_labels, None

        spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
        spk_turns = self.calc_spk_turns(multi_labels, spk_list)
        results = OrderedDict()
@@ -149,6 +165,7 @@
            self,
            speech: Union[torch.Tensor, np.ndarray],
            profile: Union[torch.Tensor, np.ndarray],
            output_format: str = "speaker_turn"
    ):
        """Inference

@@ -178,7 +195,7 @@
        batch = to_device(batch, device=self.device)

        logits = self.diar_model.prediction_forward(**batch)
        results, pse_labels = self.post_processing(logits, profile.shape[1])
        results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)

        return results, pse_labels

@@ -367,7 +384,7 @@
            pse_label_writer = open("{}/labels.txt".format(output_path), "w")
        logging.info("Start to diarize...")
        result_list = []
        for keys, batch in loader:
        for idx, (keys, batch) in enumerate(loader):
            assert isinstance(batch, dict), type(batch)
            assert all(isinstance(s, str) for s in keys), keys
            _bs = len(next(iter(batch.values())))
@@ -385,6 +402,9 @@
                pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
                pse_label_writer.flush()

            if idx % 100 == 0:
                logging.info("Processing {:5d}: {}".format(idx, key))

        if output_path is not None:
            output_writer.close()
            pse_label_writer.close()

 funasr/bin/sv_inference.py

@@ -36,7 +36,7 @@

    Examples:
        >>> import soundfile
        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2xvector(audio)
        [(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@
        log_level: Union[int, str] = "INFO",
        key_file: Optional[str] = None,
        sv_train_config: Optional[str] = "sv.yaml",
        sv_model_file: Optional[str] =  "sv.pth",
        sv_model_file: Optional[str] =  "sv.pb",
        model_tag: Optional[str] = None,
        allow_variable_data_keys: bool = True,
        streaming: bool = False,

 funasr/datasets/iterable_dataset.py

@@ -8,6 +8,7 @@
from typing import Iterator
from typing import Tuple
from typing import Union
from typing import List

import kaldiio
import numpy as np
@@ -129,7 +130,7 @@
        non_iterable_list = []
        self.path_name_type_list = []

        if not isinstance(path_name_type_list[0], Tuple):
        if not isinstance(path_name_type_list[0], (Tuple, List)):
            path = path_name_type_list[0]
            name = path_name_type_list[1]
            _type = path_name_type_list[2]
@@ -227,13 +228,9 @@
                name = self.path_name_type_list[i][1]
                _type = self.path_name_type_list[i][2]
                if _type == "sound":
                    audio_type = os.path.basename(value).split(".")[-1].lower()
                    if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
                        raise NotImplementedError(
                            f'Not supported audio type: {audio_type}')
                    if audio_type == "pcm":
                        _type = "pcm"

                   audio_type = os.path.basename(value).lower()
                   if audio_type.rfind(".pcm") >= 0:
                       _type = "pcm"
                func = DATA_TYPES[_type]
                array = func(value)
                if self.fs is not None and (name == "speech" or name == "ref_speech"):
@@ -335,11 +332,8 @@
                # 2.a. Load data streamingly
                for value, (path, name, _type) in zip(values, self.path_name_type_list):
                    if _type == "sound":
                        audio_type = os.path.basename(value).split(".")[-1].lower()
                        if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
                            raise NotImplementedError(
                                f'Not supported audio type: {audio_type}')
                        if audio_type == "pcm":
                        audio_type = os.path.basename(value).lower()
                        if audio_type.rfind(".pcm") >= 0:
                            _type = "pcm"
                    func = DATA_TYPES[_type]
                    # Load entry
@@ -391,3 +385,4 @@

        if count == 0:
            raise RuntimeError("No iteration")


 funasr/datasets/large_datasets/utils/tokenize.py

@@ -18,15 +18,11 @@

def seg_tokenize(txt, seg_dict):
    out_txt = ""
    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
    for word in txt:
        if pattern.match(word):
            if word in seg_dict:
                out_txt += seg_dict[word] + " "
            else:
                out_txt += "<unk>" + " "
        if word in seg_dict:
            out_txt += seg_dict[word] + " "
        else:
            continue
            out_txt += "<unk>" + " "
    return out_txt.strip().split()

def tokenize(data,

 funasr/datasets/preprocessor.py

@@ -47,15 +47,11 @@

def seg_tokenize(txt, seg_dict):
    out_txt = ""
    pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
    for word in txt:
        if pattern.match(word):
            if word in seg_dict:
                out_txt += seg_dict[word] + " "
            else:
                out_txt += "<unk>" + " "
        if word in seg_dict:
            out_txt += seg_dict[word] + " "
        else:
            continue
            out_txt += "<unk>" + " "
    return out_txt.strip().split()

def seg_tokenize_wo_pattern(txt, seg_dict):

 funasr/export/README.md

@@ -2,6 +2,8 @@
## Environments
    torch >= 1.11.0
    modelscope >= 1.2.0
    torch-quant >= 0.4.0 (required for exporting quantized torchscript format model)
    # pip install torch-quant -i https://pypi.org/simple

## Install modelscope and funasr

@@ -11,31 +13,46 @@
   `Tips`: torch>=1.11.0

   ```shell
   python -m funasr.export.export_model [model_name] [export_dir] [onnx]
   python -m funasr.export.export_model \
       --model-name [model_name] \
       --export-dir [export_dir] \
       --type [onnx, torch] \
       --quantize [true, false] \
       --fallback-num [fallback_num]
   ```
   `model_name`: the model is to export. It could be the models from modelscope, or local finetuned model(named: model.pb). 
   `export_dir`: the dir where the onnx is export.
    `onnx`: `true`, export onnx format model; `false`, export torchscripts format model.
   `model-name`: the model is to export. It could be the models from modelscope, or local finetuned model(named: model.pb).

   `export-dir`: the dir where the onnx is export.

   `type`: `onnx` or `torch`, export onnx format model or torchscript format model.

   `quantize`: `true`, export quantized model at the same time; `false`, export fp32 model only.

   `fallback-num`: specify the number of fallback layers to perform automatic mixed precision quantization.


## For example
### Export onnx format model
Export model from modelscope
```shell
python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
```
Export model from local path, the model'name must be `model.pb`.
```shell
python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
```

### Export torchscripts format model
Export model from modelscope
```shell
python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
```

Export model from local path, the model'name must be `model.pb`.
```shell
python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
```

## Acknowledge
Torch model quantization is supported by [BladeDISC](https://github.com/alibaba/BladeDISC), an end-to-end DynamIc Shape Compiler project for machine learning workloads. BladeDISC provides general, transparent, and ease of use performance optimization for TensorFlow/PyTorch workloads on GPGPU and CPU backends. If you are interested, please contact us.


 funasr/export/export_model.py

@@ -10,12 +10,20 @@
from funasr.export.models import get_model
import numpy as np
import random

from funasr.utils.types import str2bool
# torch_version = float(".".join(torch.__version__.split(".")[:2]))
# assert torch_version > 1.9

class ASRModelExportParaformer:
    def __init__(self, cache_dir: Union[Path, str] = None, onnx: bool = True):
    def __init__(
        self,
        cache_dir: Union[Path, str] = None,
        onnx: bool = True,
        quant: bool = True,
        fallback_num: int = 0,
        audio_in: str = None,
        calib_num: int = 200,
    ):
        assert check_argument_types()
        self.set_all_random_seed(0)
        if cache_dir is None:
@@ -28,6 +36,11 @@
        )
        print("output dir: {}".format(self.cache_dir))
        self.onnx = onnx
        self.quant = quant
        self.fallback_num = fallback_num
        self.frontend = None
        self.audio_in = audio_in
        self.calib_num = calib_num
        

    def _export(
@@ -56,6 +69,43 @@
        print("output dir: {}".format(export_dir))


    def _torch_quantize(self, model):
        def _run_calibration_data(m):
            # using dummy inputs for a example
            if self.audio_in is not None:
                feats, feats_len = self.load_feats(self.audio_in)
                for i, (feat, len) in enumerate(zip(feats, feats_len)):
                    with torch.no_grad():
                        m(feat, len)
            else:
                dummy_input = model.get_dummy_inputs()
                m(*dummy_input)
            

        from torch_quant.module import ModuleFilter
        from torch_quant.quantizer import Backend, Quantizer
        from funasr.export.models.modules.decoder_layer import DecoderLayerSANM
        from funasr.export.models.modules.encoder_layer import EncoderLayerSANM
        module_filter = ModuleFilter(include_classes=[EncoderLayerSANM, DecoderLayerSANM])
        module_filter.exclude_op_types = [torch.nn.Conv1d]
        quantizer = Quantizer(
            module_filter=module_filter,
            backend=Backend.FBGEMM,
        )
        model.eval()
        calib_model = quantizer.calib(model)
        _run_calibration_data(calib_model)
        if self.fallback_num > 0:
            # perform automatic mixed precision quantization
            amp_model = quantizer.amp(model)
            _run_calibration_data(amp_model)
            quantizer.fallback(amp_model, num=self.fallback_num)
            print('Fallback layers:')
            print('\n'.join(quantizer.module_filter.exclude_names))
        quant_model = quantizer.quantize(model)
        return quant_model


    def _export_torchscripts(self, model, verbose, path, enc_size=None):
        if enc_size:
            dummy_input = model.get_dummy_inputs(enc_size)
@@ -66,10 +116,49 @@
        model_script = torch.jit.trace(model, dummy_input)
        model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))

        if self.quant:
            quant_model = self._torch_quantize(model)
            model_script = torch.jit.trace(quant_model, dummy_input)
            model_script.save(os.path.join(path, f'{model.model_name}_quant.torchscripts'))


    def set_all_random_seed(self, seed: int):
        random.seed(seed)
        np.random.seed(seed)
        torch.random.manual_seed(seed)

    def parse_audio_in(self, audio_in):
        
        wav_list, name_list = [], []
        if audio_in.endswith(".scp"):
            f = open(audio_in, 'r')
            lines = f.readlines()[:self.calib_num]
            for line in lines:
                name, path = line.strip().split()
                name_list.append(name)
                wav_list.append(path)
        else:
            wav_list = [audio_in,]
            name_list = ["test",]
        return wav_list, name_list
    
    def load_feats(self, audio_in: str = None):
        import torchaudio

        wav_list, name_list = self.parse_audio_in(audio_in)
        feats = []
        feats_len = []
        for line in wav_list:
            path = line.strip()
            waveform, sampling_rate = torchaudio.load(path)
            if sampling_rate != self.frontend.fs:
                waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
                                                          new_freq=self.frontend.fs)(waveform)
            fbank, fbank_len = self.frontend(waveform, [waveform.size(1)])
            feats.append(fbank)
            feats_len.append(fbank_len)
        return feats, feats_len
    
    def export(self,
               tag_name: str = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
               mode: str = 'paraformer',
@@ -96,6 +185,7 @@
        model, asr_train_args = ASRTask.build_model_from_file(
            asr_train_config, asr_model_file, cmvn_file, 'cpu'
        )
        self.frontend = model.frontend
        self._export(model, tag_name)
            

@@ -107,11 +197,12 @@

        # model_script = torch.jit.script(model)
        model_script = model #torch.jit.trace(model)
        model_path = os.path.join(path, f'{model.model_name}.onnx')

        torch.onnx.export(
            model_script,
            dummy_input,
            os.path.join(path, f'{model.model_name}.onnx'),
            model_path,
            verbose=verbose,
            opset_version=14,
            input_names=model.get_input_names(),
@@ -119,17 +210,42 @@
            dynamic_axes=model.get_dynamic_axes()
        )

        if self.quant:
            from onnxruntime.quantization import QuantType, quantize_dynamic
            import onnx
            quant_model_path = os.path.join(path, f'{model.model_name}_quant.onnx')
            onnx_model = onnx.load(model_path)
            nodes = [n.name for n in onnx_model.graph.node]
            nodes_to_exclude = [m for m in nodes if 'output' in m]
            quantize_dynamic(
                model_input=model_path,
                model_output=quant_model_path,
                op_types_to_quantize=['MatMul'],
                per_channel=True,
                reduce_range=False,
                weight_type=QuantType.QUInt8,
                nodes_to_exclude=nodes_to_exclude,
            )


if __name__ == '__main__':
    import sys
    
    model_path = sys.argv[1]
    output_dir = sys.argv[2]
    onnx = sys.argv[3]
    onnx = onnx.lower()
    onnx = onnx == 'true'
    # model_path = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
    # output_dir = "../export"
    export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=onnx)
    export_model.export(model_path)
    # export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
    import argparse
    parser = argparse.ArgumentParser()
    parser.add_argument('--model-name', type=str, required=True)
    parser.add_argument('--export-dir', type=str, required=True)
    parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
    parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
    parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
    parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
    parser.add_argument('--calib_num', type=int, default=200, help='calib max num')
    args = parser.parse_args()

    export_model = ASRModelExportParaformer(
        cache_dir=args.export_dir,
        onnx=args.type == 'onnx',
        quant=args.quantize,
        fallback_num=args.fallback_num,
        audio_in=args.audio_in,
        calib_num=args.calib_num,
    )
    export_model.export(args.model_name)

 funasr/export/models/modules/encoder_layer.py

@@ -16,6 +16,7 @@
        self.feed_forward = model.feed_forward
        self.norm1 = model.norm1
        self.norm2 = model.norm2
        self.in_size = model.in_size
        self.size = model.size

    def forward(self, x, mask):
@@ -23,13 +24,12 @@
        residual = x
        x = self.norm1(x)
        x = self.self_attn(x, mask)
        if x.size(2) == residual.size(2):
        if self.in_size == self.size:
            x = x + residual
        residual = x
        x = self.norm2(x)
        x = self.feed_forward(x)
        if x.size(2) == residual.size(2):
            x = x + residual
        x = x + residual

        return x, mask


 funasr/export/models/modules/multihead_att.py

@@ -64,6 +64,23 @@
        return self.linear_out(context_layer)  # (batch, time1, d_model)


def preprocess_for_attn(x, mask, cache, pad_fn):
    x = x * mask
    x = x.transpose(1, 2)
    if cache is None:
        x = pad_fn(x)
    else:
        x = torch.cat((cache[:, :, 1:], x), dim=2)
        cache = x
    return x, cache


torch_version = float(".".join(torch.__version__.split(".")[:2]))
if torch_version >= 1.8:
    import torch.fx
    torch.fx.wrap('preprocess_for_attn')


class MultiHeadedAttentionSANMDecoder(nn.Module):
    def __init__(self, model):
        super().__init__()
@@ -73,16 +90,7 @@
        self.attn = None

    def forward(self, inputs, mask, cache=None):
        # b, t, d = inputs.size()
        # mask = torch.reshape(mask, (b, -1, 1))
        inputs = inputs * mask

        x = inputs.transpose(1, 2)
        if cache is None:
            x = self.pad_fn(x)
        else:
            x = torch.cat((cache[:, :, 1:], x), dim=2)
            cache = x
        x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)

@@ -232,4 +240,4 @@
        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
        context_layer = context_layer.view(new_context_layer_shape)
        return self.linear_out(context_layer)  # (batch, time1, d_model)
        
        

 funasr/main_funcs/average_nbest_models.py

@@ -66,13 +66,13 @@
            elif n == 1:
                # The averaged model is same as the best model
                e, _ = epoch_and_values[0]
                op = output_dir / f"{e}epoch.pth"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
                op = output_dir / f"{e}epoch.pb"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
            else:
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
                logging.info(
                    f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                )
@@ -83,12 +83,12 @@
                    if e not in _loaded:
                        if oss_bucket is None:
                            _loaded[e] = torch.load(
                                output_dir / f"{e}epoch.pth",
                                output_dir / f"{e}epoch.pb",
                                map_location="cpu",
                            )
                        else:
                            buffer = BytesIO(
                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
                            _loaded[e] = torch.load(buffer)
                    states = _loaded[e]

@@ -115,13 +115,13 @@
                else:
                    buffer = BytesIO()
                    torch.save(avg, buffer)
                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
                                          buffer.getvalue())

        # 3. *.*.ave.pth is a symlink to the max ave model
        # 3. *.*.ave.pb is a symlink to the max ave model
        if oss_bucket is None:
            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
            if sym_op.is_symlink() or sym_op.exists():
                sym_op.unlink()
            sym_op.symlink_to(op.name)

 funasr/main_funcs/pack_funcs.py

@@ -191,12 +191,12 @@

    Examples:
        tarfile:
           model.pth
           model.pb
           some1.file
           some2.file

        >>> unpack("tarfile", "out")
        {'asr_model_file': 'out/model.pth'}
        {'asr_model_file': 'out/model.pb'}
    """
    input_archive = Path(input_archive)
    outpath = Path(outpath)

 funasr/models/decoder/sanm_decoder.py

@@ -94,6 +94,47 @@
        if self.self_attn:
            if self.normalize_before:
                tgt = self.norm2(tgt)
            x, _ = self.self_attn(tgt, tgt_mask)
            x = residual + self.dropout(x)

        if self.src_attn is not None:
            residual = x
            if self.normalize_before:
                x = self.norm3(x)

            x = residual + self.dropout(self.src_attn(x, memory, memory_mask))


        return x, tgt_mask, memory, memory_mask, cache

    def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
        """Compute decoded features.

        Args:
            tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
            tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
            memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
            memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
            cache (List[torch.Tensor]): List of cached tensors.
                Each tensor shape should be (#batch, maxlen_out - 1, size).

        Returns:
            torch.Tensor: Output tensor(#batch, maxlen_out, size).
            torch.Tensor: Mask for output tensor (#batch, maxlen_out).
            torch.Tensor: Encoded memory (#batch, maxlen_in, size).
            torch.Tensor: Encoded memory mask (#batch, maxlen_in).

        """
        # tgt = self.dropout(tgt)
        residual = tgt
        if self.normalize_before:
            tgt = self.norm1(tgt)
        tgt = self.feed_forward(tgt)

        x = tgt
        if self.self_attn:
            if self.normalize_before:
                tgt = self.norm2(tgt)
            if self.training:
                cache = None
            x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
@@ -108,7 +149,6 @@


        return x, tgt_mask, memory, memory_mask, cache


class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
    """
@@ -947,6 +987,65 @@
        )
        return logp.squeeze(0), state

    def forward_chunk(
        self,
        memory: torch.Tensor,
        tgt: torch.Tensor,
        cache: dict = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward decoder.

        Args:
            hs_pad: encoded memory, float32  (batch, maxlen_in, feat)
            hlens: (batch)
            ys_in_pad:
                input token ids, int64 (batch, maxlen_out)
                if input_layer == "embed"
                input tensor (batch, maxlen_out, #mels) in the other cases
            ys_in_lens: (batch)
        Returns:
            (tuple): tuple containing:

            x: decoded token score before softmax (batch, maxlen_out, token)
                if use_output_layer is True,
            olens: (batch, )
        """
        x = tgt
        if cache["decode_fsmn"] is None:
            cache_layer_num = len(self.decoders)
            if self.decoders2 is not None:
                cache_layer_num += len(self.decoders2)
            new_cache = [None] * cache_layer_num
        else:
            new_cache = cache["decode_fsmn"]
        for i in range(self.att_layer_num):
            decoder = self.decoders[i]
            x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                x, None, memory, None, cache=new_cache[i]
            )
            new_cache[i] = c_ret

        if self.num_blocks - self.att_layer_num > 1:
            for i in range(self.num_blocks - self.att_layer_num):
                j = i + self.att_layer_num
                decoder = self.decoders2[i]
                x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
                    x, None, memory, None, cache=new_cache[j]
                )
                new_cache[j] = c_ret

        for decoder in self.decoders3:

            x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
                x, None, memory, None, cache=None
            )
        if self.normalize_before:
            x = self.after_norm(x)
        if self.output_layer is not None:
            x = self.output_layer(x)
        cache["decode_fsmn"] = new_cache
        return x

    def forward_one_step(
        self,
        tgt: torch.Tensor,

 funasr/models/e2e_asr_paraformer.py

@@ -325,12 +325,76 @@

        return encoder_out, encoder_out_lens

    def encode_chunk(
            self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Frontend + Encoder. Note that this method is used by asr_inference.py

        Args:
                speech: (Batch, Length, ...)
                speech_lengths: (Batch, )
        """
        with autocast(False):
            # 1. Extract feats
            feats, feats_lengths = self._extract_feats(speech, speech_lengths)

            # 2. Data augmentation
            if self.specaug is not None and self.training:
                feats, feats_lengths = self.specaug(feats, feats_lengths)

            # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
            if self.normalize is not None:
                feats, feats_lengths = self.normalize(feats, feats_lengths)

        # Pre-encoder, e.g. used for raw input data
        if self.preencoder is not None:
            feats, feats_lengths = self.preencoder(feats, feats_lengths)

        # 4. Forward encoder
        # feats: (Batch, Length, Dim)
        # -> encoder_out: (Batch, Length2, Dim2)
        if self.encoder.interctc_use_conditioning:
            encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(
                feats, feats_lengths, cache=cache["encoder"], ctc=self.ctc
            )
        else:
            encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(feats, feats_lengths, cache=cache["encoder"])
        intermediate_outs = None
        if isinstance(encoder_out, tuple):
            intermediate_outs = encoder_out[1]
            encoder_out = encoder_out[0]

        # Post-encoder, e.g. NLU
        if self.postencoder is not None:
            encoder_out, encoder_out_lens = self.postencoder(
                encoder_out, encoder_out_lens
            )

        assert encoder_out.size(0) == speech.size(0), (
            encoder_out.size(),
            speech.size(0),
        )
        assert encoder_out.size(1) <= encoder_out_lens.max(), (
            encoder_out.size(),
            encoder_out_lens.max(),
        )

        if intermediate_outs is not None:
            return (encoder_out, intermediate_outs), encoder_out_lens

        return encoder_out, encoder_out_lens

    def calc_predictor(self, encoder_out, encoder_out_lens):

        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None, encoder_out_mask,
                                                                                  ignore_id=self.ignore_id)
        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

    def calc_predictor_chunk(self, encoder_out, cache=None):

        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor.forward_chunk(encoder_out, cache["encoder"])
        return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
@@ -341,6 +405,14 @@
        decoder_out = decoder_outs[0]
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out, ys_pad_lens

    def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
        decoder_outs = self.decoder.forward_chunk(
            encoder_out, sematic_embeds, cache["decoder"]
        )
        decoder_out = decoder_outs
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        return decoder_out

    def _extract_feats(
            self, speech: torch.Tensor, speech_lengths: torch.Tensor
@@ -1459,4 +1531,4 @@
                    "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
                                                                                  var_dict_tf[name_tf].shape))

        return var_dict_torch_update
        return var_dict_torch_update

 funasr/models/e2e_diar_eend_ola.py

@@ -52,15 +52,15 @@

        super().__init__()
        self.frontend = frontend
        self.encoder = encoder
        self.encoder_decoder_attractor = encoder_decoder_attractor
        self.enc = encoder
        self.eda = encoder_decoder_attractor
        self.attractor_loss_weight = attractor_loss_weight
        self.max_n_speaker = max_n_speaker
        if mapping_dict is None:
            mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
            self.mapping_dict = mapping_dict
        # PostNet
        self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
        self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
        self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)

    def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@
        pad_shape = xs.shape
        xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
        xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
        emb = self.encoder(xs, xs_mask)
        emb = self.enc(xs, xs_mask)
        emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
        emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
        return emb
@@ -76,8 +76,8 @@
    def forward_post_net(self, logits, ilens):
        maxlen = torch.max(ilens).to(torch.int).item()
        logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
        outputs, (_, _) = self.PostNet(logits)
        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
        outputs, (_, _) = self.postnet(logits)
        outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
        outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
        outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@
        text = text[:, : text_lengths.max()]

        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
        intermediate_outs = None
        if isinstance(encoder_out, tuple):
            intermediate_outs = encoder_out[1]
@@ -190,18 +190,16 @@
                            shuffle: bool = True,
                            threshold: float = 0.5,
                            **kwargs):
        if self.frontend is not None:
            speech = self.frontend(speech)
        speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
        emb = self.forward_encoder(speech, speech_lengths)
        if shuffle:
            orders = [np.arange(e.shape[0]) for e in emb]
            for order in orders:
                np.random.shuffle(order)
            attractors, probs = self.encoder_decoder_attractor.estimate(
            attractors, probs = self.eda.estimate(
                [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
        else:
            attractors, probs = self.encoder_decoder_attractor.estimate(emb)
            attractors, probs = self.eda.estimate(emb)
        attractors_active = []
        for p, att, e in zip(probs, attractors, emb):
            if n_speakers and n_speakers >= 0:
@@ -233,10 +231,23 @@
                pred[i] = pred[i - 1]
            else:
                pred[i] = 0
        pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
        pred = [self.inv_mapping_func(i) for i in pred]
        decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
        decisions = torch.from_numpy(
            np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
            torch.float32)
        decisions = decisions[:, :n_speaker]
        return decisions

    def inv_mapping_func(self, label):

        if not isinstance(label, int):
            label = int(label)
        if label in self.mapping_dict['label2dec'].keys():
            num = self.mapping_dict['label2dec'][label]
        else:
            num = -1
        return num

    def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass

 funasr/models/e2e_diar_sond.py

@@ -59,7 +59,8 @@
        normalize_speech_speaker: bool = False,
        ignore_id: int = -1,
        speaker_discrimination_loss_weight: float = 1.0,
        inter_score_loss_weight: float = 0.0
        inter_score_loss_weight: float = 0.0,
        inputs_type: str = "raw",
    ):
        assert check_argument_types()

@@ -86,14 +87,12 @@
        )
        self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
        self.pse_embedding = self.generate_pse_embedding()
        # self.register_buffer("pse_embedding", pse_embedding)
        self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :]).float()
        # self.register_buffer("power_weight", power_weight)
        self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :]).int()
        # self.register_buffer("int_token_arr", int_token_arr)
        self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
        self.inter_score_loss_weight = inter_score_loss_weight
        self.forward_steps = 0
        self.inputs_type = inputs_type

    def generate_pse_embedding(self):
        embedding = np.zeros((len(self.token_list), self.max_spk_num), dtype=np.float)
@@ -125,9 +124,14 @@
            binary_labels: (Batch, frames, max_spk_num)
            binary_labels_lengths: (Batch,)
        """
        assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
        assert speech.shape[0] <= binary_labels.shape[0], (speech.shape, binary_labels.shape)
        batch_size = speech.shape[0]
        self.forward_steps = self.forward_steps + 1
        if self.pse_embedding.device != speech.device:
            self.pse_embedding = self.pse_embedding.to(speech.device)
            self.power_weight = self.power_weight.to(speech.device)
            self.int_token_arr = self.int_token_arr.to(speech.device)

        # 1. Network forward
        pred, inter_outputs = self.prediction_forward(
            speech, speech_lengths,
@@ -149,9 +153,13 @@
        # the sequence length of 'pred' might be slightly less than the
        # length of 'spk_labels'. Here we force them to be equal.
        length_diff_tolerance = 2
        length_diff = pse_labels.shape[1] - pred.shape[1]
        if 0 < length_diff <= length_diff_tolerance:
            pse_labels = pse_labels[:, 0: pred.shape[1]]
        length_diff = abs(pse_labels.shape[1] - pred.shape[1])
        if length_diff <= length_diff_tolerance:
            min_len = min(pred.shape[1], pse_labels.shape[1])
            pse_labels = pse_labels[:, :min_len]
            pred = pred[:, :min_len]
            cd_score = cd_score[:, :min_len]
            ci_score = ci_score[:, :min_len]

        loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
        loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
@@ -299,7 +307,7 @@
            speech: torch.Tensor,
            speech_lengths: torch.Tensor,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        if self.encoder is not None:
        if self.encoder is not None and self.inputs_type == "raw":
            speech, speech_lengths = self.encode(speech, speech_lengths)
            speech_mask = ~make_pad_mask(speech_lengths, maxlen=speech.shape[1])
            speech_mask = speech_mask.to(speech.device).unsqueeze(-1).float()

 funasr/models/encoder/sanm_encoder.py

@@ -347,6 +347,48 @@
            return (xs_pad, intermediate_outs), olens, None
        return xs_pad, olens, None

    def forward_chunk(self,
                      xs_pad: torch.Tensor,
                      ilens: torch.Tensor,
                      cache: dict = None,
                      ctc: CTC = None,
                      ):
        xs_pad *= self.output_size() ** 0.5
        if self.embed is None:
            xs_pad = xs_pad
        else:
            xs_pad = self.embed.forward_chunk(xs_pad, cache)

        encoder_outs = self.encoders0(xs_pad, None, None, None, None)
        xs_pad, masks = encoder_outs[0], encoder_outs[1]
        intermediate_outs = []
        if len(self.interctc_layer_idx) == 0:
            encoder_outs = self.encoders(xs_pad, None, None, None, None)
            xs_pad, masks = encoder_outs[0], encoder_outs[1]
        else:
            for layer_idx, encoder_layer in enumerate(self.encoders):
                encoder_outs = encoder_layer(xs_pad, None, None, None, None)
                xs_pad, masks = encoder_outs[0], encoder_outs[1]
                if layer_idx + 1 in self.interctc_layer_idx:
                    encoder_out = xs_pad

                    # intermediate outputs are also normalized
                    if self.normalize_before:
                        encoder_out = self.after_norm(encoder_out)

                    intermediate_outs.append((layer_idx + 1, encoder_out))

                    if self.interctc_use_conditioning:
                        ctc_out = ctc.softmax(encoder_out)
                        xs_pad = xs_pad + self.conditioning_layer(ctc_out)

        if self.normalize_before:
            xs_pad = self.after_norm(xs_pad)

        if len(intermediate_outs) > 0:
            return (xs_pad, intermediate_outs), None, None
        return xs_pad, ilens, None

    def gen_tf2torch_map_dict(self):
        tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
        tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf

 funasr/models/frontend/wav_frontend.py

@@ -1,14 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed from espnet/espnet.
from abc import ABC
from typing import Tuple

import numpy as np
import torch
import torchaudio.compliance.kaldi as kaldi
from funasr.models.frontend.abs_frontend import AbsFrontend
from typeguard import check_argument_types
from torch.nn.utils.rnn import pad_sequence
from typeguard import check_argument_types

import funasr.models.frontend.eend_ola_feature as eend_ola_feature
from funasr.models.frontend.abs_frontend import AbsFrontend


def load_cmvn(cmvn_file):
@@ -275,7 +276,8 @@
    # inputs tensor has catted the cache tensor
    # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
    #               is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
        torch.Tensor, torch.Tensor, int]:
        """
        Apply lfr with data
        """
@@ -376,7 +378,8 @@
            if self.lfr_m != 1 or self.lfr_n != 1:
                # update self.lfr_splice_cache in self.apply_lfr
                # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
                                                                                     is_final)
            if self.cmvn_file is not None:
                mat = self.apply_cmvn(mat, self.cmvn)
            feat_length = mat.size(0)
@@ -398,9 +401,10 @@
        assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
        waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths)  # input shape: B T D
        if feats.shape[0]:
            #if self.reserve_waveforms is None and self.lfr_m > 1:
            # if self.reserve_waveforms is None and self.lfr_m > 1:
            #    self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
                (self.reserve_waveforms, waveforms), dim=1)
            if not self.lfr_splice_cache:  # 初始化splice_cache
                for i in range(batch_size):
                    self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -409,7 +413,8 @@
                lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache)  # B T D
                feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
                feats_lengths += lfr_splice_cache_tensor[0].shape[0]
                frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                frame_from_waveforms = int(
                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
                if self.lfr_m == 1:
@@ -423,14 +428,15 @@
                    self.waveforms = self.waveforms[:, :sample_length]
            else:
                # update self.reserve_waveforms and self.lfr_splice_cache
                self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
                self.reserve_waveforms = self.waveforms[:,
                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                for i in range(batch_size):
                    self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
                return torch.empty(0), feats_lengths
        else:
            if is_final:
                self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
                feats = torch.stack(self.lfr_splice_cache) 
                feats = torch.stack(self.lfr_splice_cache)
                feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
                feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
        if is_final:
@@ -444,3 +450,54 @@
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []


class WavFrontendMel23(AbsFrontend):
    """Conventional frontend structure for ASR.
    """

    def __init__(
            self,
            fs: int = 16000,
            frame_length: int = 25,
            frame_shift: int = 10,
            lfr_m: int = 1,
            lfr_n: int = 1,
    ):
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.n_mels = 23

    def output_size(self) -> int:
        return self.n_mels * (2 * self.lfr_m + 1)

    def forward(
            self,
            input: torch.Tensor,
            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        for i in range(batch_size):
            waveform_length = input_lengths[i]
            waveform = input[i][:waveform_length]
            waveform = waveform.numpy()
            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
            mat = eend_ola_feature.transform(mat)
            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
            mat = mat[::self.lfr_n]
            mat = torch.from_numpy(mat)
            feat_length = mat.size(0)
            feats.append(mat)
            feats_lens.append(feat_length)

        feats_lens = torch.as_tensor(feats_lens)
        feats_pad = pad_sequence(feats,
                                 batch_first=True,
                                 padding_value=0.0)
        return feats_pad, feats_lens

 funasr/models/predictor/cif.py

@@ -199,6 +199,63 @@


        return acoustic_embeds, token_num, alphas, cif_peak



    def forward_chunk(self, hidden, cache=None):

        h = hidden

        context = h.transpose(1, 2)

        queries = self.pad(context)

        output = torch.relu(self.cif_conv1d(queries))

        output = output.transpose(1, 2)

        output = self.cif_output(output)

        alphas = torch.sigmoid(output)

        alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)



        alphas = alphas.squeeze(-1)

        mask_chunk_predictor = None

        if cache is not None:

            mask_chunk_predictor = None

            mask_chunk_predictor = torch.zeros_like(alphas)

            mask_chunk_predictor[:, cache["pad_left"]:cache["stride"] + cache["pad_left"]] = 1.0

       
        if mask_chunk_predictor is not None:

            alphas = alphas * mask_chunk_predictor

      
        if cache is not None:

            if cache["cif_hidden"] is not None:

                hidden = torch.cat((cache["cif_hidden"], hidden), 1)

            if cache["cif_alphas"] is not None:

                alphas = torch.cat((cache["cif_alphas"], alphas), -1)



        token_num = alphas.sum(-1)

        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)

        len_time = alphas.size(-1)

        last_fire_place = len_time - 1

        last_fire_remainds = 0.0

        pre_alphas_length = 0

 
        mask_chunk_peak_predictor = None

        if cache is not None:

            mask_chunk_peak_predictor = None

            mask_chunk_peak_predictor = torch.zeros_like(cif_peak)

            if cache["cif_alphas"] is not None:

                pre_alphas_length = cache["cif_alphas"].size(-1)

                mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0

            mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0

            


        if mask_chunk_peak_predictor is not None:

            cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)

        
        for i in range(len_time):

            if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:

                last_fire_place = len_time - 1 - i

                last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold

                break

        last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)

        cache["cif_hidden"] = hidden[:, last_fire_place:, :]

        cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)

        token_num_int = token_num.floor().type(torch.int32).item()

        return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak



    def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):

        b, t, d = hidden.size()

        tail_threshold = self.tail_threshold


 funasr/modules/attention.py

@@ -347,15 +347,17 @@
            mask = torch.reshape(mask, (b, -1, 1))
            if mask_shfit_chunk is not None:
                mask = mask * mask_shfit_chunk
            inputs = inputs * mask

        inputs = inputs * mask
        x = inputs.transpose(1, 2)
        x = self.pad_fn(x)
        x = self.fsmn_block(x)
        x = x.transpose(1, 2)
        x += inputs
        x = self.dropout(x)
        return x * mask
        if mask is not None:
            x = x * mask
        return x

    def forward_qkv(self, x):
        """Transform query, key and value.
@@ -505,7 +507,7 @@
            # print("in fsmn, cache is None, x", x.size())

            x = self.pad_fn(x)
            if not self.training and t <= 1:
            if not self.training:
                cache = x
        else:
            # print("in fsmn, cache is not None, x", x.size())
@@ -513,7 +515,7 @@
            # if t < self.kernel_size:
            #     x = self.pad_fn(x)
            x = torch.cat((cache[:, :, 1:], x), dim=2)
            x = x[:, :, -self.kernel_size:]
            x = x[:, :, -(self.kernel_size+t-1):]
            # print("in fsmn, cache is not None, x_cat", x.size())
            cache = x
        x = self.fsmn_block(x)

 funasr/modules/eend_ola/encoder.py

@@ -87,7 +87,7 @@
                 n_layers: int,
                 n_units: int,
                 e_units: int = 2048,
                 h: int = 8,
                 h: int = 4,
                 dropout_rate: float = 0.1,
                 use_pos_emb: bool = False):
        super(EENDOLATransformerEncoder, self).__init__()

 funasr/modules/eend_ola/encoder_decoder_attractor.py

@@ -16,12 +16,12 @@
        self.n_units = n_units

    def forward_core(self, xs, zeros):
        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
        xs = [self.enc0_dropout(x) for x in xs]
        xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
        xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
        _, (hx, cx) = self.encoder(xs)
        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
        max_zlen = torch.max(zlens).to(torch.int).item()
        zeros = [self.enc0_dropout(z) for z in zeros]
        zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
@@ -47,4 +47,4 @@
        zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
        attractors = self.forward_core(xs, zeros)
        probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
        return attractors, probs
        return attractors, probs

 funasr/modules/embedding.py

@@ -405,4 +405,13 @@
        positions = torch.arange(1, timesteps+1)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)

        return x + position_encoding
        return x + position_encoding

    def forward_chunk(self, x, cache=None):
        start_idx = 0
        batch_size, timesteps, input_dim = x.size()
        if cache is not None:
            start_idx = cache["start_idx"]
        positions = torch.arange(1, timesteps+start_idx+1)[None, :]
        position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
        return x + position_encoding[:, start_idx: start_idx + timesteps]

 funasr/runtime/grpc/CMakeLists.txt

New file
@@ -0,0 +1,83 @@
# Copyright 2018 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# cmake build file for C++ paraformer example.
# Assumes protobuf and gRPC have been installed using cmake.
# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
# that automatically builds all the dependencies before building paraformer.

cmake_minimum_required(VERSION 3.10)

project(ASR C CXX)

include(common.cmake)

# Proto file
get_filename_component(rg_proto "../python/grpc/proto/paraformer.proto" ABSOLUTE)
get_filename_component(rg_proto_path "${rg_proto}" PATH)

# Generated sources
set(rg_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.cc")
set(rg_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.h")
set(rg_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.cc")
set(rg_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.h")
add_custom_command(
      OUTPUT "${rg_proto_srcs}" "${rg_proto_hdrs}" "${rg_grpc_srcs}" "${rg_grpc_hdrs}"
      COMMAND ${_PROTOBUF_PROTOC}
      ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
        --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
        -I "${rg_proto_path}"
        --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
        "${rg_proto}"
      DEPENDS "${rg_proto}")


# Include generated *.pb.h files
include_directories("${CMAKE_CURRENT_BINARY_DIR}")

include_directories(../onnxruntime/include/)
link_directories(../onnxruntime/build/src/)
link_directories(../onnxruntime/build/third_party/webrtc/)

link_directories(${ONNXRUNTIME_DIR}/lib)
add_subdirectory("../onnxruntime/src" onnx_src)

# rg_grpc_proto
add_library(rg_grpc_proto
  ${rg_grpc_srcs}
  ${rg_grpc_hdrs}
  ${rg_proto_srcs}
  ${rg_proto_hdrs})



target_link_libraries(rg_grpc_proto
  ${_REFLECTION}
  ${_GRPC_GRPCPP}
  ${_PROTOBUF_LIBPROTOBUF})

# Targets paraformer_(server)
foreach(_target
  paraformer_server)
  add_executable(${_target}
    "${_target}.cc")
  target_link_libraries(${_target}
    rg_grpc_proto
    rapidasr
    webrtcvad
    ${EXTRA_LIBS}
    ${_REFLECTION}
    ${_GRPC_GRPCPP}
    ${_PROTOBUF_LIBPROTOBUF})
endforeach()

 funasr/runtime/grpc/Readme.md

New file
@@ -0,0 +1,57 @@
## paraformer grpc onnx server in c++


#### Step 1. Build ../onnxruntime as it's document
```
#put onnx-lib & onnx-asr-model & vocab.txt into /path/to/asrmodel(eg: /data/asrmodel)
ls /data/asrmodel/
onnxruntime-linux-x64-1.14.0  speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch

file /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/vocab.txt
UTF-8 Unicode text
```

#### Step 2. Compile and install grpc v1.52.0 in case of grpc bugs
```
export GRPC_INSTALL_DIR=/data/soft/grpc
export PKG_CONFIG_PATH=$GRPC_INSTALL_DIR/lib/pkgconfig

git clone -b v1.52.0 --depth=1  https://github.com/grpc/grpc.git
cd grpc
git submodule update --init --recursive

mkdir -p cmake/build
pushd cmake/build
cmake -DgRPC_INSTALL=ON \
      -DgRPC_BUILD_TESTS=OFF \
      -DCMAKE_INSTALL_PREFIX=$GRPC_INSTALL_DIR \
      ../..
make
make install
popd

echo "export GRPC_INSTALL_DIR=/data/soft/grpc" >> ~/.bashrc
echo "export PKG_CONFIG_PATH=\$GRPC_INSTALL_DIR/lib/pkgconfig" >> ~/.bashrc
echo "export PATH=\$GRPC_INSTALL_DIR/bin/:\$PKG_CONFIG_PATH:\$PATH" >> ~/.bashrc
source ~/.bashrc
```

#### Step 3. Compile and start grpc onnx paraformer server
```
# set -DONNXRUNTIME_DIR=/path/to/asrmodel/onnxruntime-linux-x64-1.14.0
./rebuild.sh
```

#### Step 4. Start grpc paraformer server
```
Usage: ./cmake/build/paraformer_server port thread_num /path/to/model_file
./cmake/build/paraformer_server 10108 4 /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
```



#### Step 5. Start grpc python paraformer client  on PC with MIC
```
cd ../python/grpc
python grpc_main_client_mic.py  --host $server_ip --port 10108
```

 funasr/runtime/grpc/common.cmake

New file
@@ -0,0 +1,125 @@
# Copyright 2018 gRPC authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
# cmake build file for C++ route_guide example.
# Assumes protobuf and gRPC have been installed using cmake.
# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
# that automatically builds all the dependencies before building route_guide.

cmake_minimum_required(VERSION 3.5.1)

if (NOT DEFINED CMAKE_CXX_STANDARD)
  set (CMAKE_CXX_STANDARD 14)
endif()

if(MSVC)
  add_definitions(-D_WIN32_WINNT=0x600)
endif()

find_package(Threads REQUIRED)

if(GRPC_AS_SUBMODULE)
  # One way to build a projects that uses gRPC is to just include the
  # entire gRPC project tree via "add_subdirectory".
  # This approach is very simple to use, but the are some potential
  # disadvantages:
  # * it includes gRPC's CMakeLists.txt directly into your build script
  #   without and that can make gRPC's internal setting interfere with your
  #   own build.
  # * depending on what's installed on your system, the contents of submodules
  #   in gRPC's third_party/* might need to be available (and there might be
  #   additional prerequisites required to build them). Consider using
  #   the gRPC_*_PROVIDER options to fine-tune the expected behavior.
  #
  # A more robust approach to add dependency on gRPC is using
  # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt).

  # Include the gRPC's cmake build (normally grpc source code would live
  # in a git submodule called "third_party/grpc", but this example lives in
  # the same repository as gRPC sources, so we just look a few directories up)
  add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL)
  message(STATUS "Using gRPC via add_subdirectory.")

  # After using add_subdirectory, we can now use the grpc targets directly from
  # this build.
  set(_PROTOBUF_LIBPROTOBUF libprotobuf)
  set(_REFLECTION grpc++_reflection)
  if(CMAKE_CROSSCOMPILING)
    find_program(_PROTOBUF_PROTOC protoc)
  else()
    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
  endif()
  set(_GRPC_GRPCPP grpc++)
  if(CMAKE_CROSSCOMPILING)
    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
  else()
    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
  endif()
elseif(GRPC_FETCHCONTENT)
  # Another way is to use CMake's FetchContent module to clone gRPC at
  # configure time. This makes gRPC's source code available to your project,
  # similar to a git submodule.
  message(STATUS "Using gRPC via add_subdirectory (FetchContent).")
  include(FetchContent)
  FetchContent_Declare(
    grpc
    GIT_REPOSITORY https://github.com/grpc/grpc.git
    # when using gRPC, you will actually set this to an existing tag, such as
    # v1.25.0, v1.26.0 etc..
    # For the purpose of testing, we override the tag used to the commit
    # that's currently under test.
    GIT_TAG        vGRPC_TAG_VERSION_OF_YOUR_CHOICE)
  FetchContent_MakeAvailable(grpc)

  # Since FetchContent uses add_subdirectory under the hood, we can use
  # the grpc targets directly from this build.
  set(_PROTOBUF_LIBPROTOBUF libprotobuf)
  set(_REFLECTION grpc++_reflection)
  set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>)
  set(_GRPC_GRPCPP grpc++)
  if(CMAKE_CROSSCOMPILING)
    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
  else()
    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
  endif()
else()
  # This branch assumes that gRPC and all its dependencies are already installed
  # on this system, so they can be located by find_package().

  # Find Protobuf installation
  # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
  set(protobuf_MODULE_COMPATIBLE TRUE)
  find_package(Protobuf CONFIG REQUIRED)
  message(STATUS "Using protobuf ${Protobuf_VERSION}")

  set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
  set(_REFLECTION gRPC::grpc++_reflection)
  if(CMAKE_CROSSCOMPILING)
    find_program(_PROTOBUF_PROTOC protoc)
  else()
    set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
  endif()

  # Find gRPC installation
  # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
  find_package(gRPC CONFIG REQUIRED)
  message(STATUS "Using gRPC ${gRPC_VERSION}")

  set(_GRPC_GRPCPP gRPC::grpc++)
  if(CMAKE_CROSSCOMPILING)
    find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
  else()
    set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
  endif()
endif()

 funasr/runtime/grpc/paraformer_server.cc

New file
@@ -0,0 +1,195 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <sstream>
#include <memory>
#include <string>

#include <grpc/grpc.h>
#include <grpcpp/server.h>
#include <grpcpp/server_builder.h>
#include <grpcpp/server_context.h>
#include <grpcpp/security/server_credentials.h>

#include "paraformer.grpc.pb.h"
#include "paraformer_server.h"


using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerContext;
using grpc::ServerReader;
using grpc::ServerReaderWriter;
using grpc::ServerWriter;
using grpc::Status;


using paraformer::Request;
using paraformer::Response;
using paraformer::ASR;

ASRServicer::ASRServicer(const char* model_path, int thread_num) {
    AsrHanlde=RapidAsrInit(model_path, thread_num);
    std::cout << "ASRServicer init" << std::endl;
    init_flag = 0;
}

void ASRServicer::clear_states(const std::string& user) {
    clear_buffers(user);
    clear_transcriptions(user);
}

void ASRServicer::clear_buffers(const std::string& user) {
    if (client_buffers.count(user)) {
        client_buffers.erase(user);
    }
}

void ASRServicer::clear_transcriptions(const std::string& user) {
    if (client_transcription.count(user)) {
        client_transcription.erase(user);
    }
}

void ASRServicer::disconnect(const std::string& user) {
    clear_states(user);
    std::cout << "Disconnecting user: " << user << std::endl;
}

grpc::Status ASRServicer::Recognize(
    grpc::ServerContext* context,
    grpc::ServerReaderWriter<Response, Request>* stream) {

    Request req;
    while (stream->Read(&req)) {
        if (req.isend()) {
            std::cout << "asr end" << std::endl;
            disconnect(req.user());
            Response res;
            res.set_sentence(
                R"({"success": true, "detail": "asr end"})"
            );
            res.set_user(req.user());
            res.set_action("terminate");
            res.set_language(req.language());
            stream->Write(res);
        } else if (req.speaking()) {
            if (req.audio_data().size() > 0) {
                auto& buf = client_buffers[req.user()];
                buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
            }
            Response res;
            res.set_sentence(
                R"({"success": true, "detail": "speaking"})"
            );
            res.set_user(req.user());
            res.set_action("speaking");
            res.set_language(req.language());
            stream->Write(res);
        } else if (!req.speaking()) {
            if (client_buffers.count(req.user()) == 0) {
                Response res;
                res.set_sentence(
                    R"({"success": true, "detail": "waiting_for_voice"})"
                );
                res.set_user(req.user());
                res.set_action("waiting");
                res.set_language(req.language());
                stream->Write(res);
            }else {
                auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
                std::string tmp_data = this->client_buffers[req.user()];
                this->clear_states(req.user());
                
                Response res;
                res.set_sentence(
                    R"({"success": true, "detail": "decoding data: " + std::to_string(tmp_data.length()) + " bytes"})"
                );
        int data_len_int = tmp_data.length();
                std::string data_len = std::to_string(data_len_int);
                std::stringstream ss;
                ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes")"  << R"("})";
                std::string result = ss.str();
                res.set_sentence(result);
                res.set_user(req.user());
                res.set_action("decoding");
                res.set_language(req.language());
                stream->Write(res);
                if (tmp_data.length() < 800) { //min input_len for asr model
                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
                    std::string delay_str = std::to_string(end_time - begin_time);
                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", error: data_is_not_long_enough" << std::endl;
                    Response res;
                    std::stringstream ss;
                    std::string asr_result = "";
                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
                    std::string result = ss.str();
                    res.set_sentence(result);
                    res.set_user(req.user());
                    res.set_action("finish");
                    res.set_language(req.language());
                    
                    
                    
                    stream->Write(res);
                }
                else {
                    RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);   
                    std::string asr_result = ((RPASR_RECOG_RESULT*)Result)->msg;

                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
                    std::string delay_str = std::to_string(end_time - begin_time);
                    
                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
                    Response res;
                    std::stringstream ss;
                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
                    std::string result = ss.str();
                    res.set_sentence(result);
                    res.set_user(req.user());
                    res.set_action("finish");
                    res.set_language(req.language());
                    
                    
                    stream->Write(res);
                }
            }
        }else {
            Response res;
            res.set_sentence(
                R"({"success": false, "detail": "error, no condition matched! Unknown reason."})"
            );
            res.set_user(req.user());
            res.set_action("terminate");
            res.set_language(req.language());
            stream->Write(res);
        }
    }    
    return Status::OK;
}


void RunServer(const std::string& port, int thread_num, const char* model_path) {
    std::string server_address;
    server_address = "0.0.0.0:" + port;
    ASRServicer service(model_path, thread_num);

    ServerBuilder builder;
    builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
    builder.RegisterService(&service);
    std::unique_ptr<Server> server(builder.BuildAndStart());
    std::cout << "Server listening on " << server_address << std::endl;
    server->Wait();
}

int main(int argc, char* argv[]) {
    if (argc < 3)
    {
        printf("Usage: %s port thread_num /path/to/model_file\n", argv[0]);
        exit(-1);
    }

    RunServer(argv[1], atoi(argv[2]), argv[3]);
    return 0;
}

 funasr/runtime/grpc/paraformer_server.h

New file
@@ -0,0 +1,56 @@
#include <algorithm>
#include <chrono>
#include <cmath>
#include <iostream>
#include <memory>
#include <string>

#include <grpc/grpc.h>
#include <grpcpp/server.h>
#include <grpcpp/server_builder.h>
#include <grpcpp/server_context.h>
#include <grpcpp/security/server_credentials.h>

#include <unordered_map>
#include <chrono>

#include "paraformer.grpc.pb.h"
#include "librapidasrapi.h"


using grpc::Server;
using grpc::ServerBuilder;
using grpc::ServerContext;
using grpc::ServerReader;
using grpc::ServerReaderWriter;
using grpc::ServerWriter;
using grpc::Status;


using paraformer::Request;
using paraformer::Response;
using paraformer::ASR;

typedef struct
{
    std::string msg;
    float  snippet_time;
}RPASR_RECOG_RESULT;


class ASRServicer final : public ASR::Service {
  private:
    int init_flag;
    std::unordered_map<std::string, std::string> client_buffers;
    std::unordered_map<std::string, std::string> client_transcription;

  public:
    ASRServicer(const char* model_path, int thread_num);
    void clear_states(const std::string& user);
    void clear_buffers(const std::string& user);
    void clear_transcriptions(const std::string& user);
    void disconnect(const std::string& user);
    grpc::Status Recognize(grpc::ServerContext* context, grpc::ServerReaderWriter<Response, Request>* stream);
    RPASR_HANDLE AsrHanlde;
	
};

 funasr/runtime/grpc/rebuild.sh

New file
@@ -0,0 +1,12 @@
#!/bin/bash

rm cmake -rf
mkdir -p cmake/build

cd cmake/build

cmake  -DCMAKE_BUILD_TYPE=release ../.. -DONNXRUNTIME_DIR=/data/asrmodel/onnxruntime-linux-x64-1.14.0
make


echo "Build cmake/build/paraformer_server successfully!"

 funasr/runtime/onnxruntime/readme.md

@@ -41,8 +41,8 @@
```
导出onnx模型，[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)，参考示例，从modelscope中模型导出：

```
python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
```shell
python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
```

## Building Guidance for Linux/Unix

 funasr/runtime/onnxruntime/src/Audio.cpp

@@ -237,7 +237,7 @@

    size_t nOffset = 0;

#define WAV_HEADER_SIZE 44


    speech_len = nBufLen / 2;
    speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@
            speech_data[i] = (float)speech_buff[i] / scale;
        }


        AudioFrame* frame = new AudioFrame(speech_len);
        frame_queue.push(frame);
        return true;

    }

 funasr/runtime/onnxruntime/src/librapidasrapi.cpp

@@ -26,8 +26,9 @@
            return nullptr;

        Audio audio(1);
        audio.loadwav(szBuf,nLen);
        audio.split();
        if (!audio.loadwav(szBuf, nLen))
            return nullptr;
        //audio.split();

        float* buff;
        int len;
@@ -58,8 +59,9 @@
            return nullptr;

        Audio audio(1);
        audio.loadpcmwav(szBuf, nLen);
        audio.split();
        if (!audio.loadpcmwav(szBuf, nLen))
            return nullptr;
        //audio.split();

        float* buff;
        int len;
@@ -91,8 +93,9 @@
            return nullptr;

        Audio audio(1);
        audio.loadpcmwav(szFileName);
        audio.split();
        if (!audio.loadpcmwav(szFileName))
            return nullptr;
        //audio.split();

        float* buff;
        int len;
@@ -125,7 +128,7 @@
        Audio audio(1);
        if(!audio.loadwav(szWavfile))
            return nullptr;
        audio.split();
        //audio.split();

        float* buff;
        int len;

 funasr/runtime/onnxruntime/tester/tester.cpp

@@ -8,7 +8,7 @@
#include "librapidasrapi.h"

#include <iostream>

#include <fstream>
using namespace std;

int main(int argc, char *argv[])
@@ -40,10 +40,13 @@


    gettimeofday(&start, NULL);

    RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    gettimeofday(&end, NULL);
    float snippet_time = 0.0f;


     RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);

    gettimeofday(&end, NULL);
   
    if (Result)
    {
        string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@
    }
    else
    {
        cout <<("no return data!");
        cout <<"no return data!";
    }
  
    printf("Audio length %lfs.\n", (double)snippet_time);
 
 
    //char* buff = nullptr;
    //int len = 0;
    //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
    //if (ifs.is_open())
    //{
    //    ifs.seekg(0, std::ios::end);
    //    len = ifs.tellg();
    //    ifs.seekg(0, std::ios::beg);

    //    buff = new char[len];

    //    ifs.read(buff, len);


    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);

    //    RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    //    gettimeofday(&end, NULL);
    //   
    //    if (Result)
    //    {
    //        string msg = RapidAsrGetResult(Result, 0);
    //        setbuf(stdout, NULL);
    //        cout << "Result: \"";
    //        cout << msg << endl;
    //        cout << "\"." << endl;
    //        snippet_time = RapidAsrGetRetSnippetTime(Result);
    //        RapidAsrFreeResult(Result);
    //    }
    //    else
    //    {
    //        cout <<"no return data!";
    //    }
  
    //   
    //delete[]buff;
    //}

 
    printf("Audio length %lfs.\n", (double)snippet_time);
    seconds = (end.tv_sec - start.tv_sec);
    long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
    printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);

 funasr/runtime/python/benchmark_libtorch.md

New file
@@ -0,0 +1,45 @@
# Benchmark 

### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.

### Tools
- Install ModelScope and FunASR

    ```shell
    pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
    git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
    pip install --editable ./
    cd funasr/runtime/python/utils
    pip install -r requirements.txt
    ```

- recipe

    set the model, data path and output_dir

    ```shell
    nohup bash test_rtf.sh &> log.txt &
    ```



## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) 


### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz   16core-32processor    with avx512_vnni

| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
|:----------------:|:------------------:|:------:|:------------:|
| 1 (torch fp32)   |        3522        | 0.0976 |     10.3     |
|  1 (torch int8)  |        1746        | 0.0484 |     20.7     |
| 32 (torch fp32)  |        236         | 0.0066 |    152.7     |
| 32 (torch int8)  |        114         | 0.0032 |    317.4     |
| 64 (torch fp32)  |        235         | 0.0065 |    153.7     |
| 64 (torch int8)  |        113         | 0.0031 |    319.2     |


[//]: # (### Intel&#40;R&#41; Xeon&#40;R&#41; Platinum 8163 CPU @ 2.50GHz    32core-64processor   without avx512_vnni)


## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)

 funasr/runtime/python/benchmark_onnx.md

New file
@@ -0,0 +1,89 @@
# Benchmark 

### Data set:
Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.

### Tools
- Install ModelScope and FunASR

    ```shell
    pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
    git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
    pip install --editable ./
    cd funasr/runtime/python/utils
    pip install -r requirements.txt
    ```

- recipe

    set the model, data path and output_dir

    ```shell
    nohup bash test_rtf.sh &> log.txt &
    ```


## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) 

 ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni

| concurrent-tasks | processing time(s) |   RTF   | Speedup Rate |
|:----------------:|:------------------:|:-------:|:------------:|
|  1 (onnx fp32)   |        2806        | 0.0777  |     12.9     |
|  1 (onnx int8)   |        1611        | 0.0446  |     22.4     |
|  8 (onnx fp32)   |        538         | 0.0149  |     67.1     |
|  8 (onnx int8)   |        210         | 0.0058  |    172.4     |
|  16 (onnx fp32)  |        288         | 0.0080  |    125.2     |
|  16 (onnx int8)  |        117         | 0.0032  |    309.9     |
|  32 (onnx fp32)  |        167         | 0.0046  |    216.5     |
|  32 (onnx int8)  |         86         | 0.0024  |    420.0     |
|  64 (onnx fp32)  |        158         | 0.0044  |    228.1     |
|  64 (onnx int8)  |         82         | 0.0023  |    442.8     |
|  96 (onnx fp32)  |        151         | 0.0042  |    238.0     |
|  96 (onnx int8)  |         80         | 0.0022  |    452.0     |


### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz   16core-32processor    with avx512_vnni

| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
|:----------------:|:------------------:|:------:|:------------:|
|  1 (onnx fp32)   |        2613        | 0.0724 |     13.8     |
|  1 (onnx int8)   |        1321        | 0.0366 |     22.4     |
|  32 (onnx fp32)  |        170         | 0.0047 |    212.7     |
|  32 (onnx int8)  |        89          | 0.0025 |    407.0     |
|  64 (onnx fp32)  |        166         | 0.0046 |    217.1     |
|  64 (onnx int8)  |         87         | 0.0024 |    414.7     |


### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz    32core-64processor   without avx512_vnni


| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
|:----------------:|:------------------:|:------:|:------------:|
|  1 (onnx fp32)   |        2959        | 0.0820 |     12.2     |
|  1 (onnx int8)   |        2814        | 0.0778 |     12.8     |
|  16 (onnx fp32)  |        373         | 0.0103 |     96.9     |
|  16 (onnx int8)  |        331         | 0.0091 |    109.0     |
|  32 (onnx fp32)  |        211         | 0.0058 |    171.4     |
|  32 (onnx int8)  |        181         | 0.0050 |    200.0     |
|  64 (onnx fp32)  |        153         | 0.0042 |    235.9     |
|  64 (onnx int8)  |        103         | 0.0029 |    349.9     |
|  96 (onnx fp32)  |        146         | 0.0041 |    247.0     |
|  96 (onnx int8)  |        108         | 0.0030 |    334.1     |

## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)

 ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz   16core-32processor    with avx512_vnni

| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
|:----------------:|:------------------:|:------:|:------------:|
|  1 (onnx fp32)   |        1173        | 0.0325 |     30.8     |
|  1 (onnx int8)   |        976         | 0.0270 |     37.0     |
|  16 (onnx fp32)  |         91         | 0.0025 |    395.2     |
|  16 (onnx int8)  |         78         | 0.0022 |    463.0     |
|  32 (onnx fp32)  |         60         | 0.0017 |    598.8     |
|  32 (onnx int8)  |         40         | 0.0011 |    892.9     |
|  64 (onnx fp32)  |         55         | 0.0015 |    653.6     |
|  64 (onnx int8)  |         31         | 0.0009 |    1162.8    |
|  96 (onnx fp32)  |         57         | 0.0016 |    632.9     |
|  96 (onnx int8)  |         33         | 0.0009 |    1098.9    |

 funasr/runtime/python/grpc/grpc_main_server.py

@@ -10,7 +10,7 @@
                        # interceptors=(AuthInterceptor('Bearer mysecrettoken'),)
                           )
      paraformer_pb2_grpc.add_ASRServicer_to_server(
          ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir), server)
          ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir, vad_model=args.vad_model, punc_model=args.punc_model), server)
      port = "[::]:" + str(args.port)
      server.add_insecure_port(port)
      server.start()
@@ -34,7 +34,16 @@
                        type=str,
                        default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                        help="model from modelscope")
                        
    parser.add_argument("--vad_model",
                        type=str,
                        default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
                        help="model from modelscope")
    
    parser.add_argument("--punc_model",
                        type=str,
                        default="",
                        help="model from modelscope")
    
    parser.add_argument("--sample_rate",
                        type=int,
                        default=16000,
@@ -50,6 +59,7 @@
                        type=str,
                        default="/nfs/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
                        help="onnx model dir")
    
                        



 funasr/runtime/python/grpc/grpc_server.py

@@ -8,7 +8,7 @@


class ASRServicer(paraformer_pb2_grpc.ASRServicer):
    def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir):
    def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir, vad_model='', punc_model=''):
        print("ASRServicer init")
        self.backend = backend
        self.init_flag = 0
@@ -21,7 +21,7 @@
                from modelscope.utils.constant import Tasks
            except ImportError:
                raise ImportError(f"Please install modelscope")
            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model)
            self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model)
        elif self.backend == "onnxruntime":
            try:
                from rapid_paraformer.paraformer_onnx import Paraformer

 funasr/runtime/python/libtorch/README.md

@@ -19,11 +19,11 @@

       - `e.g.`, Export model from modelscope
         ```shell
         python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
         python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
         ```
       - `e.g.`, Export model from local path, the model'name must be `model.pb`.
         ```shell
         python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
         python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
         ```



 funasr/runtime/python/libtorch/setup.py

@@ -28,7 +28,7 @@
    install_requires=["librosa", "onnxruntime>=1.7.0",
                      "scipy", "numpy>=1.19.3",
                      "typeguard", "kaldi-native-fbank",
                      "PyYAML>=5.1.2"],
                      "PyYAML>=5.1.2", "torch-quant >= 0.4.0"],
    packages=find_packages(include=["torch_paraformer*"]),
    keywords=[
        'funasr,paraformer'

 funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py

@@ -24,12 +24,16 @@
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 pred_bias: int = 1,
                 quantize: bool = False,
                 intra_op_num_threads: int = 1,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.torchscripts')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.torchscripts')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)
@@ -58,26 +62,28 @@
                am_scores, valid_token_lens = outputs[0], outputs[1]
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_cif_peak = outputs[2], outputs[3]
                    us_alphas, us_peaks = outputs[2], outputs[3]
                else:
                    us_alphas, us_cif_peak = None, None
                    us_alphas, us_peaks = None, None
            except:
                #logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = ['']
            else:
                am_scores, valid_token_lens = am_scores.detach().cpu().numpy(), valid_token_lens.detach().cpu().numpy()
                preds = self.decode(am_scores, valid_token_lens)
                if us_cif_peak is None:
                if us_peaks is None:
                    for pred in preds:
                        pred = sentence_postprocess(pred)
                        asr_res.append({'preds': pred})
                else:
                    for pred, us_cif_peak_ in zip(preds, us_cif_peak):
                        text, tokens = pred
                        timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                        # logging.warning(timestamp)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
                        asr_res.append({'preds': text, 'timestamp': timestamp})
                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -178,6 +184,6 @@
        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num-self.pred_bias]
        texts = sentence_postprocess(token)
        return texts
        # texts = sentence_postprocess(token)
        return token


 funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py

New file
@@ -0,0 +1,157 @@
import os
import numpy as np
import sys

def compute_wer(ref_file,
                hyp_file,
                cer_detail_file):
    rst = {
        'Wrd': 0,
        'Corr': 0,
        'Ins': 0,
        'Del': 0,
        'Sub': 0,
        'Snt': 0,
        'Err': 0.0,
        'S.Err': 0.0,
        'wrong_words': 0,
        'wrong_sentences': 0
    }

    hyp_dict = {}
    ref_dict = {}
    with open(hyp_file, 'r') as hyp_reader:
        for line in hyp_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            hyp_dict[key] = value
    with open(ref_file, 'r') as ref_reader:
        for line in ref_reader:
            key = line.strip().split()[0]
            value = line.strip().split()[1:]
            ref_dict[key] = value

    cer_detail_writer = open(cer_detail_file, 'w')
    for hyp_key in hyp_dict:
        if hyp_key in ref_dict:
           out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
           rst['Wrd'] += out_item['nwords']
           rst['Corr'] += out_item['cor']
           rst['wrong_words'] += out_item['wrong']
           rst['Ins'] += out_item['ins']
           rst['Del'] += out_item['del']
           rst['Sub'] += out_item['sub']
           rst['Snt'] += 1
           if out_item['wrong'] > 0:
               rst['wrong_sentences'] += 1
           cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
           cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
           cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')

    if rst['Wrd'] > 0:
        rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
    if rst['Snt'] > 0:
        rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)

    cer_detail_writer.write('\n')
    cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words'])+ " / " + str(rst['Wrd']) +
                            ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
    cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
    cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')

     
def compute_wer_by_line(hyp,
                        ref):
    hyp = list(map(lambda x: x.lower(), hyp))
    ref = list(map(lambda x: x.lower(), ref))

    len_hyp = len(hyp)
    len_ref = len(ref)

    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int16)

    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)

    for i in range(len_hyp + 1):
        cost_matrix[i][0] = i
    for j in range(len_ref + 1):
        cost_matrix[0][j] = j

    for i in range(1, len_hyp + 1):
        for j in range(1, len_ref + 1):
            if hyp[i - 1] == ref[j - 1]:
                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
            else:
                substitution = cost_matrix[i - 1][j - 1] + 1
                insertion = cost_matrix[i - 1][j] + 1
                deletion = cost_matrix[i][j - 1] + 1

                compare_val = [substitution, insertion, deletion]

                min_val = min(compare_val)
                operation_idx = compare_val.index(min_val) + 1
                cost_matrix[i][j] = min_val
                ops_matrix[i][j] = operation_idx

    match_idx = []
    i = len_hyp
    j = len_ref
    rst = {
        'nwords': len_ref,
        'cor': 0,
        'wrong': 0,
        'ins': 0,
        'del': 0,
        'sub': 0
    }
    while i >= 0 or j >= 0:
        i_idx = max(0, i)
        j_idx = max(0, j)

        if ops_matrix[i_idx][j_idx] == 0:  # correct
            if i - 1 >= 0 and j - 1 >= 0:
                match_idx.append((j - 1, i - 1))
                rst['cor'] += 1

            i -= 1
            j -= 1

        elif ops_matrix[i_idx][j_idx] == 2:  # insert
            i -= 1
            rst['ins'] += 1

        elif ops_matrix[i_idx][j_idx] == 3:  # delete
            j -= 1
            rst['del'] += 1

        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
            i -= 1
            j -= 1
            rst['sub'] += 1

        if i < 0 and j >= 0:
            rst['del'] += 1
        elif j < 0 and i >= 0:
            rst['ins'] += 1

    match_idx.reverse()
    wrong_cnt = cost_matrix[len_hyp][len_ref]
    rst['wrong'] = wrong_cnt

    return rst

def print_cer_detail(rst):
    return ("(" + "nwords=" + str(rst['nwords']) + ",cor=" + str(rst['cor'])
            + ",ins=" + str(rst['ins']) + ",del=" + str(rst['del']) + ",sub="
            + str(rst['sub']) + ") corr:" + '{:.2%}'.format(rst['cor']/rst['nwords'])
            + ",cer:" + '{:.2%}'.format(rst['wrong']/rst['nwords']))

if __name__ == '__main__':
    if len(sys.argv) != 4:
        print("usage : python compute-wer.py test.ref test.hyp test.wer")
        sys.exit(0)

    ref_file = sys.argv[1]
    hyp_file = sys.argv[2]
    cer_detail_file = sys.argv[3]
    compute_wer(ref_file, hyp_file, cer_detail_file)

 funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py

@@ -1,11 +1,11 @@
import numpy as np


def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0):
def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
    if not len(char_list):
        return []
    START_END_THRESHOLD = 5
    MAX_TOKEN_DURATION = 14
    MAX_TOKEN_DURATION = 30
    TIME_RATE = 10.0 * 6 / 1000 / 3  #  3 times upsampled
    cif_peak = us_cif_peak.reshape(-1)
    num_frames = cif_peak.shape[-1]
@@ -16,7 +16,7 @@
    new_char_list = []
    # for bicif model trained with large data, cif2 actually fires when a character starts
    # so treat the frames between two peaks as the duration of the former token
    fire_place = np.where(cif_peak>1.0-1e-4)[0] - 1.5  # np format
    fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset  # np format
    num_peak = len(fire_place)
    assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
    # begin silence
@@ -27,7 +27,7 @@
    # tokens timestamp
    for i in range(len(fire_place)-1):
        new_char_list.append(char_list[i])
        if MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
        if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
            timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
        else:
            # cut the duration to token and sil of the 0-weight frames last long
@@ -48,11 +48,12 @@
            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
    assert len(new_char_list) == len(timestamp_list)
    res_total = []
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_total.append([char, timestamp[0], timestamp[1]])  # += "{} {} {};".format(char, timestamp[0], timestamp[1])
        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res, res_total
    return res_str, res
    

 funasr/runtime/python/onnxruntime/README.md

@@ -24,11 +24,11 @@

       - `e.g.`, Export model from modelscope
         ```shell
         python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
         python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
         ```
       - `e.g.`, Export model from local path, the model'name must be `model.pb`.
         ```shell
         python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
         python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
         ```



 funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py

@@ -26,12 +26,16 @@
                 device_id: Union[str, int] = "-1",
                 plot_timestamp_to: str = "",
                 pred_bias: int = 1,
                 quantize: bool = False,
                 intra_op_num_threads: int = 4,
                 ):

        if not Path(model_dir).exists():
            raise FileNotFoundError(f'{model_dir} does not exist.')

        model_file = os.path.join(model_dir, 'model.onnx')
        if quantize:
            model_file = os.path.join(model_dir, 'model_quant.onnx')
        config_file = os.path.join(model_dir, 'config.yaml')
        cmvn_file = os.path.join(model_dir, 'am.mvn')
        config = read_yaml(config_file)
@@ -42,7 +46,7 @@
            cmvn_file=cmvn_file,
            **config['frontend_conf']
        )
        self.ort_infer = OrtInferSession(model_file, device_id)
        self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
        self.batch_size = batch_size
        self.plot_timestamp_to = plot_timestamp_to
        self.pred_bias = pred_bias
@@ -60,25 +64,28 @@
                am_scores, valid_token_lens = outputs[0], outputs[1]
                if len(outputs) == 4:
                    # for BiCifParaformer Inference
                    us_alphas, us_cif_peak = outputs[2], outputs[3]
                    us_alphas, us_peaks = outputs[2], outputs[3]
                else:
                    us_alphas, us_cif_peak = None, None
                    us_alphas, us_peaks = None, None
            except ONNXRuntimeError:
                #logging.warning(traceback.format_exc())
                logging.warning("input wav is silence or noise")
                preds = ['']
            else:
                preds = self.decode(am_scores, valid_token_lens)
                if us_cif_peak is None:
                if us_peaks is None:
                    for pred in preds:
                        pred = sentence_postprocess(pred)
                        asr_res.append({'preds': pred})
                else:
                    for pred, us_cif_peak_ in zip(preds, us_cif_peak):
                        text, tokens = pred
                        timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
                    for pred, us_peaks_ in zip(preds, us_peaks):
                        raw_tokens = pred
                        timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
                        text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
                        # logging.warning(timestamp)
                        if len(self.plot_timestamp_to):
                            self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
                        asr_res.append({'preds': text, 'timestamp': timestamp})
                            self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
                        asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
        return asr_res

    def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -177,6 +184,6 @@
        # Change integer-ids to tokens
        token = self.converter.ids2tokens(token_int)
        token = token[:valid_token_num-self.pred_bias]
        texts = sentence_postprocess(token)
        return texts
        # texts = sentence_postprocess(token)
        return token


 funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py

@@ -48,12 +48,12 @@
            timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
            timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
    assert len(new_char_list) == len(timestamp_list)
    res_total = []
    res_str = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        res_total.append([char, timestamp[0], timestamp[1]])  # += "{} {} {};".format(char, timestamp[0], timestamp[1])
        res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
        if char != '<sil>':
            res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
    return res, res_total
    return res_str, res
    

 funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py

@@ -1,6 +1,5 @@
# -*- encoding: utf-8 -*-
# @Author: SWHL
# @Contact: liekkaskono@163.com

import functools
import logging
import pickle
@@ -147,10 +146,10 @@


class OrtInferSession():
    def __init__(self, model_file, device_id=-1):
    def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
        device_id = str(device_id)
        sess_opt = SessionOptions()
        sess_opt.intra_op_num_threads = 4
        sess_opt.intra_op_num_threads = intra_op_num_threads
        sess_opt.log_severity_level = 4
        sess_opt.enable_cpu_mem_arena = False
        sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

 funasr/runtime/python/onnxruntime/setup.py

@@ -20,8 +20,8 @@
    version=VERSION_NUM,
    platforms="Any",
    description="Using paraformer with ONNXRuntime",
    author="SWHL",
    author_email="liekkaskono@163.com",
    author="FunASR",
    author_email="funasr@list.alibaba-inc.com",
    url="https://github.com/alibaba-damo-academy/FunASR",
    license='MIT',
    long_description=get_readme(),

 funasr/runtime/python/utils/requirements.txt

New file
@@ -0,0 +1,2 @@
onnx
torch-quant >= 0.4.0

 funasr/runtime/python/utils/split_scp.pl

New file
@@ -0,0 +1,246 @@
#!/usr/bin/env perl

# Copyright 2010-2011 Microsoft Corporation

# See ../../COPYING for clarification regarding multiple authors
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#  http://www.apache.org/licenses/LICENSE-2.0
#
# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
# MERCHANTABLITY OR NON-INFRINGEMENT.
# See the Apache 2 License for the specific language governing permissions and
# limitations under the License.


# This program splits up any kind of .scp or archive-type file.
# If there is no utt2spk option it will work on any text  file and
# will split it up with an approximately equal number of lines in
# each but.
# With the --utt2spk option it will work on anything that has the
# utterance-id as the first entry on each line; the utt2spk file is
# of the form "utterance speaker" (on each line).
# It splits it into equal size chunks as far as it can.  If you use the utt2spk
# option it will make sure these chunks coincide with speaker boundaries.  In
# this case, if there are more chunks than speakers (and in some other
# circumstances), some of the resulting chunks will be empty and it will print
# an error message and exit with nonzero status.
# You will normally call this like:
# split_scp.pl scp scp.1 scp.2 scp.3 ...
# or
# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
# Note that you can use this script to split the utt2spk file itself,
# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...

# You can also call the scripts like:
# split_scp.pl -j 3 0 scp scp.0
# [note: with this option, it assumes zero-based indexing of the split parts,
# i.e. the second number must be 0 <= n < num-jobs.]

use warnings;

$num_jobs = 0;
$job_id = 0;
$utt2spk_file = "";
$one_based = 0;

for ($x = 1; $x <= 3 && @ARGV > 0; $x++) {
    if ($ARGV[0] eq "-j") {
        shift @ARGV;
        $num_jobs = shift @ARGV;
        $job_id = shift @ARGV;
    }
    if ($ARGV[0] =~ /--utt2spk=(.+)/) {
        $utt2spk_file=$1;
        shift;
    }
    if ($ARGV[0] eq '--one-based') {
        $one_based = 1;
        shift @ARGV;
    }
}

if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
                       $job_id - $one_based >= $num_jobs)) {
  die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
      ($one_based ? " --one-based" : "") . "'\n"
}

$one_based
    and $job_id--;

if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
    die
"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
   or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
 ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
}

$error = 0;
$inscp = shift @ARGV;
if ($num_jobs == 0) { # without -j option
    @OUTPUTS = @ARGV;
} else {
    for ($j = 0; $j < $num_jobs; $j++) {
        if ($j == $job_id) {
            if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
            else { push @OUTPUTS, "-"; }
        } else {
            push @OUTPUTS, "/dev/null";
        }
    }
}

if ($utt2spk_file ne "") {  # We have the --utt2spk option...
    open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
    while(<$u_fh>) {
        @A = split;
        @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
        ($u,$s) = @A;
        $utt2spk{$u} = $s;
    }
    close $u_fh;
    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
    @spkrs = ();
    while(<$i_fh>) {
        @A = split;
        if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
        $u = $A[0];
        $s = $utt2spk{$u};
        defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
        if(!defined $spk_count{$s}) {
            push @spkrs, $s;
            $spk_count{$s} = 0;
            $spk_data{$s} = [];  # ref to new empty array.
        }
        $spk_count{$s}++;
        push @{$spk_data{$s}}, $_;
    }
    # Now split as equally as possible ..
    # First allocate spks to files by allocating an approximately
    # equal number of speakers.
    $numspks = @spkrs;  # number of speakers.
    $numscps = @OUTPUTS; # number of output files.
    if ($numspks < $numscps) {
      die "$0: Refusing to split data because number of speakers $numspks " .
          "is less than the number of output .scp files $numscps\n";
    }
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scparray[$scpidx] = []; # [] is array reference.
    }
    for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
        $scpidx = int(($spkidx*$numscps) / $numspks);
        $spk = $spkrs[$spkidx];
        push @{$scparray[$scpidx]}, $spk;
        $scpcount[$scpidx] += $spk_count{$spk};
    }

    # Now will try to reassign beginning + ending speakers
    # to different scp's and see if it gets more balanced.
    # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
    # We can show that if considering changing just 2 scp's, we minimize
    # this by minimizing the squared difference in sizes.  This is
    # equivalent to minimizing the absolute difference in sizes.  This
    # shows this method is bound to converge.

    $changed = 1;
    while($changed) {
        $changed = 0;
        for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
            # First try to reassign ending spk of this scp.
            if($scpidx < $numscps-1) {
                $sz = @{$scparray[$scpidx]};
                if($sz > 0) {
                    $spk = $scparray[$scpidx]->[$sz-1];
                    $count = $spk_count{$spk};
                    $nutt1 = $scpcount[$scpidx];
                    $nutt2 = $scpcount[$scpidx+1];
                    if( abs( ($nutt2+$count) - ($nutt1-$count))
                        < abs($nutt2 - $nutt1))  { # Would decrease
                        # size-diff by reassigning spk...
                        $scpcount[$scpidx+1] += $count;
                        $scpcount[$scpidx] -= $count;
                        pop @{$scparray[$scpidx]};
                        unshift @{$scparray[$scpidx+1]}, $spk;
                        $changed = 1;
                    }
                }
            }
            if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
                $spk = $scparray[$scpidx]->[0];
                $count = $spk_count{$spk};
                $nutt1 = $scpcount[$scpidx-1];
                $nutt2 = $scpcount[$scpidx];
                if( abs( ($nutt2-$count) - ($nutt1+$count))
                    < abs($nutt2 - $nutt1))  { # Would decrease
                    # size-diff by reassigning spk...
                    $scpcount[$scpidx-1] += $count;
                    $scpcount[$scpidx] -= $count;
                    shift @{$scparray[$scpidx]};
                    push @{$scparray[$scpidx-1]}, $spk;
                    $changed = 1;
                }
            }
        }
    }
    # Now print out the files...
    for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
                         : open($f_fh, '>&', \*STDOUT)) ||
            die "$0: Could not open scp file $scpfile for writing: $!\n";
        $count = 0;
        if(@{$scparray[$scpidx]} == 0) {
            print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
                         "$scpfile (too many splits and too few speakers?)\n";
            $error = 1;
        } else {
            foreach $spk ( @{$scparray[$scpidx]} ) {
                print $f_fh @{$spk_data{$spk}};
                $count += $spk_count{$spk};
            }
            $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
        }
        close($f_fh);
    }
} else {
   # This block is the "normal" case where there is no --utt2spk
   # option and we just break into equal size chunks.

    open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";

    $numscps = @OUTPUTS;  # size of array.
    @F = ();
    while(<$i_fh>) {
        push @F, $_;
    }
    $numlines = @F;
    if($numlines == 0) {
        print STDERR "$0: error: empty input scp file $inscp\n";
        $error = 1;
    }
    $linesperscp = int( $numlines / $numscps); # the "whole part"..
    $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
    $remainder = $numlines - ($linesperscp * $numscps);
    ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
    # [just doing int() rounds down].
    $n = 0;
    for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
        $scpfile = $OUTPUTS[$scpidx];
        ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
                         : open($o_fh, '>&', \*STDOUT)) ||
            die "$0: Could not open scp file $scpfile for writing: $!\n";
        for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
            print $o_fh $F[$n++];
        }
        close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
    }
    $n == $numlines || die "$n != $numlines [code error]";
}

exit ($error);

 funasr/runtime/python/utils/test_rtf.py

New file
@@ -0,0 +1,55 @@

import time
import sys
import librosa
from funasr.utils.types import str2bool

import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--model_dir', type=str, required=True)
parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
parser.add_argument('--wav_file', type=str, default=None, help='amp fallback number')
parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
args = parser.parse_args()


from funasr.runtime.python.libtorch.torch_paraformer import Paraformer
if args.backend == "onnx":
    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
	
model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)

wav_file_f = open(args.wav_file, 'r')
wav_files = wav_file_f.readlines()

# warm-up
total = 0.0
num = 30
wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
for i in range(num):
    beg_time = time.time()
    result = model(wav_path)
    end_time = time.time()
    duration = end_time-beg_time
    total += duration
    print(result)
    print("num: {}, time, {}, avg: {}, rtf: {}".format(len(wav_path), duration, total/(i+1), (total/(i+1))/5.53))

# infer time
beg_time = time.time()
for i, wav_path_i in enumerate(wav_files):
    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    result = model(wav_path)
end_time = time.time()
duration = (end_time-beg_time)*1000
print("total_time_comput_ms: {}".format(int(duration)))

duration_time = 0.0
for i, wav_path_i in enumerate(wav_files):
    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
    waveform, _ = librosa.load(wav_path, sr=16000)
    duration_time += len(waveform)/16.0
print("total_time_wav_ms: {}".format(int(duration_time)))

print("total_rtf: {:.5}".format(duration/duration_time))

 funasr/runtime/python/utils/test_rtf.sh

New file
@@ -0,0 +1,71 @@

nj=32
stage=0

scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
export_root="/nfs/zhifu.gzf/export"
split_scps_tool=split_scp.pl
rtf_tool=test_rtf.py

#:<<!
model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
backend="onnx" # "torch"
quantize='true' # 'False'
tag=${model_name}/${backend}_quantize_${quantize}
!

logs_outputs_dir=${export_root}/logs/${tag}/split$nj
mkdir -p ${logs_outputs_dir}
echo ${logs_outputs_dir}


if [ ${stage} -le 0 ];then

    python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp}

fi


if [ ${stage} -le 1 ];then

model_dir=${export_root}/${model_name}
split_scps=""
for JOB in $(seq ${nj}); do
    split_scps="$split_scps $logs_outputs_dir/wav.$JOB.scp"
done

perl ${split_scps_tool} $scp ${split_scps}


for JOB in $(seq ${nj}); do
  {
    core_id=`expr $JOB - 1`
    taskset -c ${core_id} python ${rtf_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${logs_outputs_dir}/wav.$JOB.scp --quantize ${quantize} &> ${logs_outputs_dir}/log.$JOB.txt
  }&

done
wait


rm -rf ${logs_outputs_dir}/total_time_comput.txt
rm -rf ${logs_outputs_dir}/total_time_wav.txt
rm -rf ${logs_outputs_dir}/total_rtf.txt
for JOB in $(seq ${nj}); do
  {
    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_comput" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_time_comput.txt
    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_wav" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_time_wav.txt
    cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_rtf" | awk -F ' '  '{print $2}' >> ${logs_outputs_dir}/total_rtf.txt
  }

done

total_time_comput=`cat ${logs_outputs_dir}/total_time_comput.txt | awk 'BEGIN {max = 0} {if ($1+0>max+0) max=$1 fi} END {print max}'`
total_time_wav=`cat ${logs_outputs_dir}/total_time_wav.txt | awk '{sum +=$1};END {print sum}'`
rtf=`awk 'BEGIN{printf "%.5f\n",'$total_time_comput'/'$total_time_wav'}'`
speed=`awk 'BEGIN{printf "%.2f\n",1/'$rtf'}'`

echo "total_time_comput_ms: $total_time_comput"
echo "total_time_wav: $total_time_wav"
echo "total_rtf: $rtf, speech: $speed"

fi

 funasr/tasks/abs_task.py

@@ -639,12 +639,12 @@
                 "and exclude_keys excludes keys of model states for the initialization."
                 "e.g.\n"
                 "  # Load all parameters"
                 "  --init_param some/where/model.pth\n"
                 "  --init_param some/where/model.pb\n"
                 "  # Load only decoder parameters"
                 "  --init_param some/where/model.pth:decoder:decoder\n"
                 "  --init_param some/where/model.pb:decoder:decoder\n"
                 "  # Load only decoder parameters excluding decoder.embed"
                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
        )
        group.add_argument(
            "--ignore_init_mismatch",

 funasr/tasks/asr.py

@@ -826,7 +826,7 @@
            if "model.ckpt-" in model_name or ".bin" in model_name:
                model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                            '.pb')) if ".bin" in model_name else os.path.join(
                    model_dir, "{}.pth".format(model_name))
                    model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@
            if "model.ckpt-" in model_name or ".bin" in model_name:
                model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                            '.pb')) if ".bin" in model_name else os.path.join(
                    model_dir, "{}.pth".format(model_name))
                    model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)

 funasr/tasks/diar.py

@@ -507,7 +507,7 @@
            config_file: Union[Path, str] = None,
            model_file: Union[Path, str] = None,
            cmvn_file: Union[Path, str] = None,
            device: str = "cpu",
            device: Union[str, torch.device] = "cpu",
    ):
        """Build model from the files.

@@ -553,7 +553,7 @@
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
@@ -562,12 +562,27 @@
                model.load_state_dict(model_dict)
            else:
                model_dict = torch.load(model_file, map_location=device)
        model_dict = cls.fileter_model_dict(model_dict, model.state_dict())
        model.load_state_dict(model_dict)
        if model_name_pth is not None and not os.path.exists(model_name_pth):
            torch.save(model_dict, model_name_pth)
            logging.info("model_file is saved to pth: {}".format(model_name_pth))

        return model, args

    @classmethod
    def fileter_model_dict(cls, src_dict: dict, dest_dict: dict):
        from collections import OrderedDict
        new_dict = OrderedDict()
        for key, value in src_dict.items():
            if key in dest_dict:
                new_dict[key] = value
            else:
                logging.info("{} is no longer needed in this model.".format(key))
        for key, value in dest_dict.items():
            if key not in new_dict:
                logging.warning("{} is missed in checkpoint.".format(key))
        return new_dict

    @classmethod
    def convert_tf2torch(
@@ -750,47 +765,47 @@
            cls, args: argparse.Namespace, train: bool
    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
        assert check_argument_types()
        if args.use_preprocessor:
            retval = CommonPreprocessor(
                train=train,
                token_type=args.token_type,
                token_list=args.token_list,
                bpemodel=None,
                non_linguistic_symbols=None,
                text_cleaner=None,
                g2p_type=None,
                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                # NOTE(kamo): Check attribute existence for backward compatibility
                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
                rir_apply_prob=args.rir_apply_prob
                if hasattr(args, "rir_apply_prob")
                else 1.0,
                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
                noise_apply_prob=args.noise_apply_prob
                if hasattr(args, "noise_apply_prob")
                else 1.0,
                noise_db_range=args.noise_db_range
                if hasattr(args, "noise_db_range")
                else "13_15",
                speech_volume_normalize=args.speech_volume_normalize
                if hasattr(args, "rir_scp")
                else None,
            )
        else:
            retval = None
        assert check_return_type(retval)
        return retval
        # if args.use_preprocessor:
        #     retval = CommonPreprocessor(
        #         train=train,
        #         token_type=args.token_type,
        #         token_list=args.token_list,
        #         bpemodel=None,
        #         non_linguistic_symbols=None,
        #         text_cleaner=None,
        #         g2p_type=None,
        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
        #         # NOTE(kamo): Check attribute existence for backward compatibility
        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
        #         rir_apply_prob=args.rir_apply_prob
        #         if hasattr(args, "rir_apply_prob")
        #         else 1.0,
        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
        #         noise_apply_prob=args.noise_apply_prob
        #         if hasattr(args, "noise_apply_prob")
        #         else 1.0,
        #         noise_db_range=args.noise_db_range
        #         if hasattr(args, "noise_db_range")
        #         else "13_15",
        #         speech_volume_normalize=args.speech_volume_normalize
        #         if hasattr(args, "rir_scp")
        #         else None,
        #     )
        # else:
        #     retval = None
        # assert check_return_type(retval)
        return None

    @classmethod
    def required_data_names(
            cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        if not inference:
            retval = ("speech", "profile", "binary_labels")
            retval = ("speech", )
        else:
            # Recognition mode
            retval = ("speech")
            retval = ("speech", )
        return retval

    @classmethod
@@ -823,7 +838,7 @@

        # 2. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
        encoder = encoder_class(**args.encoder_conf)

        # 3. EncoderDecoderAttractor
        encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)

 funasr/tasks/sv.py

@@ -501,7 +501,7 @@
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)

 funasr/torch_utils/load_pretrained_model.py

@@ -52,13 +52,13 @@
        init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>

    Examples:
        >>> load_pretrained_model("somewhere/model.pth", model)
        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
        >>> load_pretrained_model("somewhere/model.pb", model)
        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
        >>> load_pretrained_model(
        ...     "somewhere/model.pth:decoder:decoder:decoder.embed", model
        ...     "somewhere/model.pb:decoder:decoder:decoder.embed", model
        ... )
        >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
        >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
    """
    sps = init_param.split(":", 4)
    if len(sps) == 4:

 funasr/train/trainer.py

@@ -205,9 +205,9 @@
        else:
            scaler = None

        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
        if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
            cls.resume(
                checkpoint=output_dir / "checkpoint.pth",
                checkpoint=output_dir / "checkpoint.pb",
                model=model,
                optimizers=optimizers,
                schedulers=schedulers,
@@ -361,7 +361,7 @@
                        },
                        buffer,
                    )
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
                else:
                    torch.save(
                        {
@@ -374,7 +374,7 @@
                            ],
                            "scaler": scaler.state_dict() if scaler is not None else None,
                        },
                        output_dir / "checkpoint.pth",
                        output_dir / "checkpoint.pb",
                    )

                # 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@
                    buffer = BytesIO()
                    torch.save(model.state_dict(), buffer)
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
                                                                       f"{iepoch}epoch.pth"),buffer.getvalue())
                                                                       f"{iepoch}epoch.pb"),buffer.getvalue())
                else:
                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")

                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                # Creates a sym link latest.pb -> {iepoch}epoch.pb
                if trainer_options.use_pai:
                    p = os.path.join(trainer_options.output_dir, "latest.pth")
                    p = os.path.join(trainer_options.output_dir, "latest.pb")
                    if trainer_options.oss_bucket.object_exists(p):
                        trainer_options.oss_bucket.delete_object(p)
                    trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
                else:
                    p = output_dir / "latest.pth"
                    p = output_dir / "latest.pb"
                    if p.is_symlink() or p.exists():
                        p.unlink()
                    p.symlink_to(f"{iepoch}epoch.pth")
                    p.symlink_to(f"{iepoch}epoch.pb")

                _improved = []
                for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            if trainer_options.use_pai:
                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
                                if trainer_options.oss_bucket.object_exists(p):
                                    trainer_options.oss_bucket.delete_object(p)
                                trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
                            else:
                                p = output_dir / f"{_phase}.{k}.best.pth"
                                p = output_dir / f"{_phase}.{k}.best.pb"
                                if p.is_symlink() or p.exists():
                                    p.unlink()
                                p.symlink_to(f"{iepoch}epoch.pth")
                                p.symlink_to(f"{iepoch}epoch.pb")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@
                        type="model",
                        metadata={"improved": _improved},
                    )
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
                    aliases = [
                        f"epoch-{iepoch}",
                        "best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@

                for e in range(1, iepoch):
                    if trainer_options.use_pai:
                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
                        if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
                            trainer_options.oss_bucket.delete_object(p)
                            _removed.append(str(p))
                    else:
                        p = output_dir / f"{e}epoch.pth"
                        p = output_dir / f"{e}epoch.pb"
                        if p.exists() and e not in nbests:
                            p.unlink()
                            _removed.append(str(p))

 funasr/utils/asr_utils.py

@@ -58,14 +58,15 @@
    if r_recog_type is None and audio_in is not None:
        # audio_in is wav, recog_type is wav_file
        if os.path.isfile(audio_in):
            audio_type = os.path.basename(audio_in).split(".")[-1].lower()
            if audio_type in SUPPORT_AUDIO_TYPE_SETS:
                r_recog_type = 'wav'
                r_audio_format = 'wav'
            elif audio_type == "scp":
            audio_type = os.path.basename(audio_in).lower()
            for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
                if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
                    r_recog_type = 'wav'
                    r_audio_format = 'wav'
            if audio_type.rfind(".scp") >= 0:
                r_recog_type = 'wav'
                r_audio_format = 'scp'
            else:
            if r_recog_type is None:
                raise NotImplementedError(
                    f'Not supported audio type: {audio_type}')

@@ -128,13 +129,15 @@
def get_sr_from_wav(fname: str):
    fs = None
    if os.path.isfile(fname):
        audio_type = os.path.basename(fname).split(".")[-1].lower()
        if audio_type in SUPPORT_AUDIO_TYPE_SETS:
            if audio_type == "pcm":
                fs = None
            else:
                audio, fs = torchaudio.load(fname)
        elif audio_type == "scp":
        audio_type = os.path.basename(fname).lower()
        for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
            if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
                if support_audio_type == "pcm":
                    fs = None
                else:
                    audio, fs = torchaudio.load(fname)
                break
        if audio_type.rfind(".scp") >= 0:
            with open(fname, encoding="utf-8") as f:
                for line in f:
                    wav_path = line.split()[1]
@@ -147,9 +150,7 @@
        for file in dir_files:
            file_path = os.path.join(fname, file)
            if os.path.isfile(file_path):
                audio_type = os.path.basename(file_path).split(".")[-1].lower()
                if audio_type in SUPPORT_AUDIO_TYPE_SETS:
                    fs = get_sr_from_wav(file_path)
                fs = get_sr_from_wav(file_path)
            elif os.path.isdir(file_path):
                fs = get_sr_from_wav(file_path)

@@ -165,12 +166,12 @@
        file_path = os.path.join(dir_path, file)
        if os.path.isfile(file_path):
            if ends == ".wav" or ends == ".WAV":
                audio_type = os.path.basename(file_path).split(".")[-1].lower()
                if audio_type in SUPPORT_AUDIO_TYPE_SETS:
                    return True
                else:
                    raise NotImplementedError(
                        f'Not supported audio type: {audio_type}')
                audio_type = os.path.basename(file_path).lower()
                for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
                    if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
                        return True
                raise NotImplementedError(
                    f'Not supported audio type: {audio_type}')
            elif file_path.endswith(ends):
                return True
        elif os.path.isdir(file_path):
@@ -185,9 +186,10 @@
    for file in dir_files:
        file_path = os.path.join(dir_path, file)
        if os.path.isfile(file_path):
            audio_type = os.path.basename(file_path).split(".")[-1].lower()
            if audio_type in SUPPORT_AUDIO_TYPE_SETS:
                wav_list.append(file_path)
            audio_type = os.path.basename(file_path).lower()
            for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
                if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
                    wav_list.append(file_path)
        elif os.path.isdir(file_path):
            recursion_dir_all_wav(wav_list, file_path)


 funasr/utils/postprocess_utils.py

@@ -106,17 +106,18 @@
        if num in abbr_begin:
            if time_stamp is not None:
                begin = time_stamp[ts_nums[num]][0]
            word_lists.append(words[num].upper())
            abbr_word = words[num].upper()
            num += 1
            while num < words_size:
                if num in abbr_end:
                    word_lists.append(words[num].upper())
                    abbr_word += words[num].upper()
                    last_num = num
                    break
                else:
                    if words[num].encode('utf-8').isalpha():
                        word_lists.append(words[num].upper())
                        abbr_word += words[num].upper()
                num += 1
            word_lists.append(abbr_word)
            if time_stamp is not None:
                end = time_stamp[ts_nums[num]][1]
                ts_lists.append([begin, end])

 funasr/utils/timestamp_tools.py

@@ -1,6 +1,10 @@
import torch
import copy
import codecs
import logging
import edit_distance
import argparse
import pdb
import numpy as np
from typing import Any, List, Tuple, Union

@@ -9,7 +13,8 @@
                       us_peaks, 
                       char_list, 
                       vad_offset=0.0, 
                       force_time_shift=-1.5
                       force_time_shift=-1.5,
                       sil_in_str=True
                       ):
    if not len(char_list):
        return []
@@ -62,6 +67,8 @@
            timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
    res_txt = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        #if char != '<sil>':
        if not sil_in_str and char == '<sil>': continue
        res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
@@ -121,4 +128,181 @@
    return res


class AverageShiftCalculator():
    def __init__(self):
        logging.warning("Calculating average shift.")
    def __call__(self, file1, file2):
        uttid_list1, ts_dict1 = self.read_timestamps(file1)
        uttid_list2, ts_dict2 = self.read_timestamps(file2)
        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))

    def _intersection(self, list1, list2):
        set1 = set(list1)
        set2 = set(list2)
        if set1 == set2:
            logging.warning("Uttid same checked.")
            return set1
        itsc = list(set1 & set2)
        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
        return itsc

    def read_timestamps(self, file):
        # read timestamps file in standard format
        uttid_list = []
        ts_dict = {}
        with codecs.open(file, 'r') as fin:
            for line in fin.readlines():
                text = ''
                ts_list = []
                line = line.rstrip()
                uttid = line.split()[0]
                uttid_list.append(uttid)
                body = " ".join(line.split()[1:])
                for pd in body.split(';'):
                    if not len(pd): continue
                    # pdb.set_trace() 
                    char, start, end = pd.lstrip(" ").split(' ')
                    text += char + ','
                    ts_list.append((float(start), float(end)))
                # ts_lists.append(ts_list)
                ts_dict[uttid] = (text[:-1], ts_list)
        logging.warning("File {} read done.".format(file))
        return uttid_list, ts_dict

    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
        shift_time = 0
        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
        num_tokens = len(filtered_timestamp_list1)
        return shift_time, num_tokens

    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
        # calculate average shift between timestamp1 and timestamp2
        # when characters differ, use edit distance alignment
        # and calculate the error between the same characters
        self._accumlated_shift = 0
        self._accumlated_tokens = 0
        self.max_shift = 0
        self.max_shift_uttid = None
        for uttid in uttid_list:
            (t1, ts1) = ts_dict1[uttid]
            (t2, ts2) = ts_dict2[uttid]
            _align, _align2, _align3 = [], [], []
            fts1, fts2 = [], []
            _t1, _t2 = [], []
            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
            s = sm.get_opcodes()
            for j in range(len(s)):
                if s[j][0] == "replace" or s[j][0] == "insert":
                    _align.append(0)
                if s[j][0] == "replace" or s[j][0] == "delete":
                    _align3.append(0)
                elif s[j][0] == "equal":
                    _align.append(1)
                    _align3.append(1)
                else:
                    continue
            # use s to index t2
            for a, ts , t in zip(_align, ts2, t2.split(',')):
                if a: 
                    fts2.append(ts)
                    _t2.append(t)
            sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(','))
            s = sm2.get_opcodes()
            for j in range(len(s)):
                if s[j][0] == "replace" or s[j][0] == "insert":
                    _align2.append(0)
                elif s[j][0] == "equal":
                    _align2.append(1)
                else:
                    continue
            # use s2 tp index t1
            for a, ts, t in zip(_align3, ts1, t1.split(',')):
                if a: 
                    fts1.append(ts)
                    _t1.append(t)
            if len(fts1) == len(fts2):
                shift_time, num_tokens = self._shift(fts1, fts2)
                self._accumlated_shift += shift_time
                self._accumlated_tokens += num_tokens
                if shift_time/num_tokens > self.max_shift:
                    self.max_shift = shift_time/num_tokens
                    self.max_shift_uttid = uttid
            else:
                logging.warning("length mismatch")
        return self._accumlated_shift / self._accumlated_tokens


def convert_external_alphas(alphas_file, text_file, output_file):
    from funasr.models.predictor.cif import cif_wo_hidden
    with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3:
        for line1, line2 in zip(f1.readlines(), f2.readlines()):
            line1 = line1.rstrip()
            line2 = line2.rstrip()
            assert line1.split()[0] == line2.split()[0]
            uttid = line1.split()[0]
            alphas = [float(i) for i in line1.split()[1:]]
            new_alphas = np.array(remove_chunk_padding(alphas))
            new_alphas[-1] += 1e-4
            text = line2.split()[1:]
            if len(text) + 1 != int(new_alphas.sum()):
                # force resize
                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
            if " " in text:
                text = text.split()
            else:
                text = [i for i in text]
            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text, 
                                                     force_time_shift=-7.0, 
                                                     sil_in_str=False)
            f3.write("{} {}\n".format(uttid, res_str))


def remove_chunk_padding(alphas):
    # remove the padding part in alphas if using chunk paraformer for GPU
    START_ZERO = 45
    MID_ZERO = 75
    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
    alphas = alphas[START_ZERO:]  # remove the padding at beginning
    new_alphas = []
    while True:
        new_alphas = new_alphas + alphas[:REAL_FRAMES]
        alphas = alphas[REAL_FRAMES+MID_ZERO:]
        if len(alphas) < REAL_FRAMES: break
    return new_alphas

SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']


def main(args):
    if args.mode == 'cal_aas':
        asc = AverageShiftCalculator()
        asc(args.input, args.input2)
    elif args.mode == 'read_ext_alphas':
        convert_external_alphas(args.input, args.input2, args.output)
    else:
        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))


if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='timestamp tools')
    parser.add_argument('--mode', 
                        default=None, 
                        type=str, 
                        choices=SUPPORTED_MODES, 
                        help='timestamp related toolbox')
    parser.add_argument('--input', default=None, type=str, help='input file path')
    parser.add_argument('--output', default=None, type=str, help='output file name')
    parser.add_argument('--input2', default=None, type=str, help='input2 file path')
    parser.add_argument('--kaldi-ts-type', 
                        default='v2', 
                        type=str, 
                        choices=['v0', 'v1', 'v2'], 
                        help='kaldi timestamp to write')
    args = parser.parse_args()
    main(args)


 setup.py

@@ -13,11 +13,11 @@
    "install": [
        "setuptools>=38.5.1",
        # "configargparse>=1.2.1",
        "typeguard>=2.7.0",
        "typeguard==2.13.3",
        "humanfriendly",
        "scipy>=1.4.1",
        # "filelock",
        "librosa>=0.8.0",
        "librosa==0.8.1",
        "jamo==0.4.1",  # For kss
        "PyYAML>=5.1.2",
        "soundfile>=0.10.2",
@@ -41,6 +41,8 @@
        # PAI
        "oss2",
        "kaldi-native-fbank",
        # timestamp
        "edit-distance"
    ],
    # train: The modules invoked when training only.
    "train": [

 tests/test_asr_inference_pipeline.py

@@ -451,8 +451,8 @@

    def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
        inference_pipeline = pipeline(
            task=Tasks.,
            model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
            task=Tasks.auto_speech_recognition,
            model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
        rec_result = inference_pipeline(
            audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
            param_dict={"decoding_model": "offline"})

 tests/test_sv_inference_pipeline.py

@@ -1,5 +1,6 @@
import unittest

import numpy as np
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger

			@@ -15,36 +15,10 @@
			\| [Model Zoo](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
			\| [Contact](#contact)


			## What's new:

			### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
			- We support a new feature, export paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. The local finetuned models are also supported.
			- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python), you could deploy the runtime without modelscope or funasr, for the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, the rtf of onnxruntime is 3x speedup(0.110->0.038) on cpu, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
			- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc), you could build the ASR service with grpc, by deploying the modelscope pipeline or onnxruntime.
			- We release a new model [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords.
			- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), the prediction accuracy of timestamp is much improved, and achieving accumulated average shift (aas) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
			- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
			- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
			- We release several new UniASR model:
			[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
			[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary),
			[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary),
			[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary),
			[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
			- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is inited for paraformer model and then finetune on AISHEL-1.
			- We release a new feature, the `VAD`, `ASR` and `PUNC` models could be integrated freely, which could be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), or the local finetine models. The [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
			- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhance the recall and precision, fix the badcases of missing punctuation marks.
			- Various new types of audio input types are now supported by modelscope inference pipeline, including: mp3、flac、ogg、opus...
			### 2023.1.16, funasr-0.1.6， modelscope-1.2.0
			- We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrate the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
			[Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp together. The model could take in several hours long inputs.
			- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
			- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
			- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
			- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
			- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version.
			- We improve the pipeline of modelscope to speedup the inference, by integrating the process of build model into build pipeline.
			- Various new types of audio input types are now supported by modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
			For the release notes, please ref to [news](https://github.com/alibaba-damo-academy/FunASR/releases)

			## Highlights
			- Many types of typical models are supported, e.g., [Tranformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).

			@@ -52,7 +52,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default

			@@ -55,7 +55,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer_noctc_1best.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default

			@@ -56,7 +56,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer_noctc_1best.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default

			@@ -54,7 +54,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

			@@ -58,7 +58,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer_noctc_1best.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default

			@@ -34,7 +34,7 @@
			tag=exp1
			model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
			lm_exp=${exp_dir}/exp/${model_dir}
			inference_lm=valid.loss.ave.pth # Language model path for decoding.
			inference_lm=valid.loss.ave.pb # Language model path for decoding.

			stage=0
			stop_stage=3

			@@ -4,7 +4,7 @@

			def main():
			diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
			diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
			diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
			output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
			data_path_and_name_and_type = [
			("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),

			@@ -17,9 +17,9 @@
			echo "Downloading Pre-trained model..."
			git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
			git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
			ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
			ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
			cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
			ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
			ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
			cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
			cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
			echo "Done."
			@@ -30,7 +30,7 @@

			if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
			echo "Calculating diarization results..."
			python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
			python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
			python local/convert_label_to_rttm.py \
			outputs/labels.txt \
			data/test_rmsil/raw_rmsil_map.scp \

			@@ -4,7 +4,7 @@

			def test_fbank_cpu_infer():
			diar_config_path = "config_fbank.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
			@@ -24,7 +24,7 @@

			def test_fbank_gpu_infer():
			diar_config_path = "config_fbank.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
			@@ -45,7 +45,7 @@

			def test_wav_gpu_infer():
			diar_config_path = "config.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_wav.scp", "speech", "sound"),
			@@ -66,7 +66,7 @@

			def test_without_profile_gpu_infer():
			diar_config_path = "config.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			raw_inputs = [[
			"data/unit_test/raw_inputs/record.wav",

			@@ -4,7 +4,7 @@

			def test_fbank_cpu_infer():
			diar_config_path = "sond_fbank.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
			@@ -24,7 +24,7 @@

			def test_fbank_gpu_infer():
			diar_config_path = "sond_fbank.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
			@@ -45,7 +45,7 @@

			def test_wav_gpu_infer():
			diar_config_path = "config.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			data_path_and_name_and_type = [
			("data/unit_test/test_wav.scp", "speech", "sound"),
			@@ -66,7 +66,7 @@

			def test_without_profile_gpu_infer():
			diar_config_path = "config.yaml"
			diar_model_path = "sond.pth"
			diar_model_path = "sond.pb"
			output_dir = "./outputs"
			raw_inputs = [[
			"data/unit_test/raw_inputs/record.wav",

			@@ -49,7 +49,7 @@
			model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"

			inference_config=conf/decode_asr_transformer.yaml
			inference_asr_model=valid.acc.ave_10best.pth
			inference_asr_model=valid.acc.ave_10best.pb

			# you can set gpu num for decoding here
			gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default

			@@ -41,7 +41,7 @@
			- Modify inference related parameters in `infer_after_finetune.py`
			- <strong>output_dir:</strong> # result dir
			- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`

			- Then you can run the pipeline to finetune with:
			```python

			@@ -48,5 +48,5 @@
			params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data/test"
			params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
			params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
			modelscope_infer_after_finetune(params)

			@@ -63,5 +63,5 @@
			params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./example_data/validation"
			params["decoding_model_name"] = "valid.acc.ave.pth"
			params["decoding_model_name"] = "valid.acc.ave.pb"
			modelscope_infer_after_finetune(params)

			@@ -8,9 +8,14 @@
			from funasr.utils.compute_wer import compute_wer


			def modelscope_infer_core(output_dir, split_dir, njob, idx):
			def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
			output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
			gpu_id = (int(idx) - 1) // njob
			if ngpu > 0:
			use_gpu = 1
			gpu_id = int(idx) - 1
			else:
			use_gpu = 0
			gpu_id = -1
			if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
			gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
			os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
			@@ -18,9 +23,10 @@
			os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
			inference_pipline = pipeline(
			task=Tasks.auto_speech_recognition,
			model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
			model=model,
			output_dir=output_dir_job,
			batch_size=64
			batch_size=batch_size,
			ngpu=use_gpu,
			)
			audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
			inference_pipline(audio_in=audio_in)
			@@ -30,13 +36,18 @@
			# prepare for multi-GPU decoding
			ngpu = params["ngpu"]
			njob = params["njob"]
			batch_size = params["batch_size"]
			output_dir = params["output_dir"]
			model = params["model"]
			if os.path.exists(output_dir):
			shutil.rmtree(output_dir)
			os.mkdir(output_dir)
			split_dir = os.path.join(output_dir, "split")
			os.mkdir(split_dir)
			nj = ngpu * njob
			if ngpu > 0:
			nj = ngpu
			elif ngpu == 0:
			nj = njob
			wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
			with open(wav_scp_file) as f:
			lines = f.readlines()
			@@ -56,7 +67,7 @@
			p = Pool(nj)
			for i in range(nj):
			p.apply_async(modelscope_infer_core,
			args=(output_dir, split_dir, njob, str(i + 1)))
			args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
			p.close()
			p.join()

			@@ -81,8 +92,10 @@

			if __name__ == "__main__":
			params = {}
			params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
			params["data_dir"] = "./data/test"
			params["output_dir"] = "./results"
			params["ngpu"] = 1
			params["njob"] = 1
			modelscope_infer(params)
			params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
			params["njob"] = 1 # if ngpu = 0, will use cpu decoding
			params["batch_size"] = 64
			modelscope_infer(params)

			@@ -4,23 +4,18 @@

			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks
			from modelscope.hub.snapshot_download import snapshot_download

			from funasr.utils.compute_wer import compute_wer


			def modelscope_infer_after_finetune(params):
			# prepare for decoding
			pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
			for file_name in params["required_files"]:
			if file_name == "configuration.json":
			with open(os.path.join(pretrained_model_path, file_name)) as f:
			config_dict = json.load(f)
			config_dict["model"]["am_model_name"] = params["decoding_model_name"]
			with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
			json.dump(config_dict, f, indent=4, separators=(',', ': '))
			else:
			shutil.copy(os.path.join(pretrained_model_path, file_name),
			os.path.join(params["output_dir"], file_name))

			try:
			pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
			except BaseException:
			raise BaseException(f"Please download pretrain model from ModelScope firstly.")
			shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
			decoding_path = os.path.join(params["output_dir"], "decode_results")
			if os.path.exists(decoding_path):
			shutil.rmtree(decoding_path)
			@@ -29,9 +24,9 @@
			# decoding
			inference_pipeline = pipeline(
			task=Tasks.auto_speech_recognition,
			model=params["output_dir"],
			model=pretrained_model_path,
			output_dir=decoding_path,
			batch_size=64
			batch_size=params["batch_size"]
			)
			audio_in = os.path.join(params["data_dir"], "wav.scp")
			inference_pipeline(audio_in=audio_in)
			@@ -46,8 +41,8 @@
			if __name__ == '__main__':
			params = {}
			params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
			params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data/test"
			params["decoding_model_name"] = "valid.acc.ave_10best.pth"
			modelscope_infer_after_finetune(params)
			params["decoding_model_name"] = "valid.acc.ave_10best.pb"
			params["batch_size"] = 64
			modelscope_infer_after_finetune(params)

			@@ -22,10 +22,12 @@
			Or you can use the finetuned model for inference directly.

			- Setting parameters in `infer.py`
			- <strong>model:</strong> # model name on ModelScope
			- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
			- <strong>output_dir:</strong> # result dir
			- <strong>ngpu:</strong> # the number of GPUs for decoding
			- <strong>njob:</strong> # the number of jobs for each GPU
			- <strong>ngpu:</strong> # the number of GPUs for decoding, if `ngpu` > 0, use GPU decoding
			- <strong>njob:</strong> # the number of jobs for CPU decoding, if `ngpu` = 0, use CPU decoding, please set `njob`
			- <strong>batch_size:</strong> # batchsize of inference

			- Then you can run the pipeline to infer with:
			```python
			@@ -39,9 +41,11 @@
			### Inference using local finetuned model

			- Modify inference related parameters in `infer_after_finetune.py`
			- <strong>modelscope_model_name: </strong> # model name on ModelScope
			- <strong>output_dir:</strong> # result dir
			- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
			- <strong>batch_size:</strong> # batchsize of inference

			- Then you can run the pipeline to finetune with:
			```python

New file
			@@ -0,0 +1,57 @@
			import torch
			import torchaudio
			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks

			from modelscope.utils.logger import get_logger
			import logging
			logger = get_logger(log_level=logging.CRITICAL)
			logger.setLevel(logging.CRITICAL)

			inference_pipeline = pipeline(
			task=Tasks.auto_speech_recognition,
			model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
			model_revision='v1.0.2')

			waveform, sample_rate = torchaudio.load("waihu.wav")
			speech_length = waveform.shape[1]
			speech = waveform[0]

			cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
			cache_de = {"decode_fsmn": None}
			cache = {"encoder": cache_en, "decoder": cache_de}
			param_dict = {}
			param_dict["cache"] = cache

			first_chunk = True
			speech_buffer = speech
			speech_cache = []
			final_result = ""

			while len(speech_buffer) >= 960:
			if first_chunk:
			if len(speech_buffer) >= 14400:
			rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict)
			speech_buffer = speech_buffer[4800:]
			else:
			cache_en["stride"] = len(speech_buffer) // 960
			cache_en["pad_right"] = 0
			rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
			speech_buffer = []
			cache_en["start_idx"] = -5
			first_chunk = False
			else:
			cache_en["start_idx"] += 10
			if len(speech_buffer) >= 4800:
			cache_en["pad_left"] = 5
			rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict)
			speech_buffer = speech_buffer[9600:]
			else:
			cache_en["stride"] = len(speech_buffer) // 960
			cache_en["pad_right"] = 0
			rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
			speech_buffer = []
			if len(rec_result) !=0 and rec_result['text'] != "sil":
			final_result += rec_result['text']
			print(rec_result)
			print(final_result)

			@@ -50,5 +50,5 @@
			params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data/test"
			params["decoding_model_name"] = "20epoch.pth"
			params["decoding_model_name"] = "20epoch.pb"
			modelscope_infer_after_finetune(params)

New file
			@@ -0,0 +1,35 @@
			import os
			from modelscope.metainfo import Trainers
			from modelscope.trainers import build_trainer
			from funasr.datasets.ms_dataset import MsDataset


			def modelscope_finetune(params):
			if not os.path.exists(params["output_dir"]):
			os.makedirs(params["output_dir"], exist_ok=True)
			# dataset split ["train", "validation"]
			ds_dict = MsDataset.load(params["data_dir"])
			kwargs = dict(
			model=params["model"],
			model_revision=params["model_revision"],
			data_dir=ds_dict,
			dataset_type=params["dataset_type"],
			work_dir=params["output_dir"],
			batch_bins=params["batch_bins"],
			max_epoch=params["max_epoch"],
			lr=params["lr"])
			trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
			trainer.train()


			if __name__ == '__main__':
			params = {}
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data"
			params["batch_bins"] = 2000
			params["dataset_type"] = "small"
			params["max_epoch"] = 50
			params["lr"] = 0.00005
			params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
			params["model_revision"] = None
			modelscope_finetune(params)

New file
			@@ -0,0 +1,13 @@
			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks

			if __name__ == "__main__":
			audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
			output_dir = "./results"
			inference_pipline = pipeline(
			task=Tasks.auto_speech_recognition,
			model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
			output_dir=output_dir,
			)
			rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
			print(rec_result)

			@@ -49,5 +49,5 @@
			params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data/test"
			params["decoding_model_name"] = "20epoch.pth"
			params["decoding_model_name"] = "20epoch.pb"
			modelscope_infer_after_finetune(params)

			@@ -41,7 +41,8 @@
			- Modify inference related parameters in `infer_after_finetune.py`
			- <strong>output_dir:</strong> # result dir
			- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
			- <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave
			.pb`

			- Then you can run the pipeline to finetune with:
			```python

			@@ -4,27 +4,17 @@

			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks
			from modelscope.hub.snapshot_download import snapshot_download

			from funasr.utils.compute_wer import compute_wer


			def modelscope_infer_after_finetune(params):
			# prepare for decoding
			if not os.path.exists(os.path.join(params["output_dir"], "punc")):
			os.makedirs(os.path.join(params["output_dir"], "punc"))
			if not os.path.exists(os.path.join(params["output_dir"], "vad")):
			os.makedirs(os.path.join(params["output_dir"], "vad"))
			pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
			for file_name in params["required_files"]:
			if file_name == "configuration.json":
			with open(os.path.join(pretrained_model_path, file_name)) as f:
			config_dict = json.load(f)
			config_dict["model"]["am_model_name"] = params["decoding_model_name"]
			with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
			json.dump(config_dict, f, indent=4, separators=(',', ': '))
			else:
			shutil.copy(os.path.join(pretrained_model_path, file_name),
			os.path.join(params["output_dir"], file_name))

			try:
			pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
			except BaseException:
			raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
			decoding_path = os.path.join(params["output_dir"], "decode_results")
			if os.path.exists(decoding_path):
			shutil.rmtree(decoding_path)
			@@ -33,16 +23,16 @@
			# decoding
			inference_pipeline = pipeline(
			task=Tasks.auto_speech_recognition,
			model=params["output_dir"],
			model=pretrained_model_path,
			output_dir=decoding_path,
			batch_size=64
			batch_size=params["batch_size"]
			)
			audio_in = os.path.join(params["data_dir"], "wav.scp")
			inference_pipeline(audio_in=audio_in)

			# computer CER if GT text is set
			text_in = os.path.join(params["data_dir"], "text")
			if text_in is not None:
			if os.path.exists(text_in):
			text_proc_file = os.path.join(decoding_path, "1best_recog/token")
			compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))

			@@ -50,8 +40,8 @@
			if __name__ == '__main__':
			params = {}
			params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
			params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
			params["output_dir"] = "./checkpoint"
			params["data_dir"] = "./data/test"
			params["decoding_model_name"] = "valid.acc.ave_10best.pth"
			modelscope_infer_after_finetune(params)
			params["decoding_model_name"] = "valid.acc.ave_10best.pb"
			params["batch_size"] = 64
			modelscope_infer_after_finetune(params)

			@@ -4,6 +4,11 @@

			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks
			from modelscope.utils.logger import get_logger
			import logging
			logger = get_logger(log_level=logging.CRITICAL)
			logger.setLevel(logging.CRITICAL)


			inference_pipeline = pipeline(
			task=Tasks.punctuation,

New file
			@@ -0,0 +1,10 @@
			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks

			inference_diar_pipline = pipeline(
			task=Tasks.speaker_diarization,
			model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
			model_revision="v1.0.0",
			)
			results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
			print(results)

			@@ -14,13 +14,12 @@
			)

			# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
			audio_list = [[
			audio_list = [
			"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
			"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
			"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
			"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
			]]
			]

			results = inference_diar_pipline(audio_in=audio_list)
			for rst in results:
			print(rst["value"])
			print(results)

			@@ -1,7 +1,10 @@
			from modelscope.pipelines import pipeline
			from modelscope.utils.constant import Tasks
			from modelscope.utils.logger import get_logger
			import logging
			logger = get_logger(log_level=logging.CRITICAL)
			logger.setLevel(logging.CRITICAL)
			import soundfile


			if __name__ == '__main__':
			output_dir = None

			@@ -52,7 +52,7 @@

			Examples:
			>>> import soundfile
			>>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
			>>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
			>>> audio, rate = soundfile.read("speech.wav")
			>>> speech2text(audio)
			[(text, token, token_int, hypothesis object), ...]