dyyzhmm
2023-03-16 73a7cf596bd1ebc3bd0c9674a21072f9eaf4cc57
Merge pull request #3 from alibaba-damo-academy/main

merge from official
70 files modified
8 files added
893 lines changed
README.md  30
egs/aishell/conformer/run.sh  2
egs/aishell/data2vec_paraformer_finetune/run.sh  2
egs/aishell/data2vec_transformer_finetune/run.sh  2
egs/aishell/paraformer/run.sh  2
egs/aishell/paraformerbert/run.sh  2
egs/aishell/transformer/run.sh  2
egs/aishell2/conformer/run.sh  2
egs/aishell2/paraformer/run.sh  2
egs/aishell2/paraformerbert/run.sh  2
egs/aishell2/transformer/run.sh  2
egs/aishell2/transformerLM/run.sh  2
egs/alimeeting/diarization/sond/infer_alimeeting_test.py  2
egs/alimeeting/diarization/sond/run.sh  6
egs/alimeeting/diarization/sond/unit_test.py  8
egs/callhome/diarization/sond/unit_test.py  8
egs/mars/sd/local_run.sh  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md  2
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md  2
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md  2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py  2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py  35
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py  13
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py  35
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py  13
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py  35
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py  13
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py  2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md  3
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py  2
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md  2
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py  2
egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py  10
egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py  7
funasr/bin/asr_inference.py  2
funasr/bin/asr_inference_mfcca.py  2
funasr/bin/asr_inference_paraformer.py  2
funasr/bin/asr_inference_paraformer_vad_punc.py  2
funasr/bin/asr_inference_rnnt.py  2
funasr/bin/asr_inference_uniasr.py  2
funasr/bin/asr_inference_uniasr_vad.py  2
funasr/bin/diar_inference_launch.py  5
funasr/bin/eend_ola_inference.py  26
funasr/bin/sond_inference.py  2
funasr/bin/sv_inference.py  4
funasr/main_funcs/average_nbest_models.py  18
funasr/main_funcs/pack_funcs.py  4
funasr/models/e2e_diar_eend_ola.py  35
funasr/models/frontend/wav_frontend.py  73
funasr/modules/eend_ola/encoder.py  2
funasr/modules/eend_ola/encoder_decoder_attractor.py  4
funasr/runtime/onnxruntime/src/Audio.cpp  5
funasr/runtime/onnxruntime/src/librapidasrapi.cpp  17
funasr/runtime/onnxruntime/tester/tester.cpp  55
funasr/runtime/python/README.md  21
funasr/tasks/abs_task.py  8
funasr/tasks/asr.py  4
funasr/tasks/diar.py  70
funasr/tasks/sv.py  2
funasr/torch_utils/load_pretrained_model.py  10
funasr/train/trainer.py  36
funasr/utils/timestamp_tools.py  186
setup.py  4
tests/test_asr_inference_pipeline.py  2
README.md
@@ -15,36 +15,10 @@
| [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
| [**Contact**](#contact)
## What's new: 
### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
- We support a new feature, exporting Paraformer models to [onnx and torchscript](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. Locally finetuned models are also supported.
- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python): you can deploy the runtime without modelscope or funasr. For the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, onnxruntime gives a 3x RTF speedup (0.110->0.038) on CPU, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc): you can build an ASR service with grpc by deploying the modelscope pipeline or onnxruntime.
- We release a new model, [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports hotword customization based on incentive enhancement and improves the recall and precision of hotwords.
- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary): timestamp prediction accuracy is much improved, achieving an accumulated average shift (AAS) of 74.7 ms, [details](https://arxiv.org/abs/2301.12343).
- We release a new model, the [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
- We release several new UniASR models:
[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary),
[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary),
[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary),
[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
- We release a new model, the [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is used to initialize a Paraformer model that is then finetuned on AISHELL-1.
- We release a new feature: the `VAD`, `ASR` and `PUNC` models can be integrated freely; they can be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) or locally finetuned models. See the [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhancing the recall and precision and fixing bad cases of missing punctuation marks.
- Various new audio input types are now supported by the modelscope inference pipeline, including mp3, flac, ogg, opus, ...
### 2023.1.16, funasr-0.1.6, modelscope-1.2.0
- We release a new version model, [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrates the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, the [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model,
 the [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp prediction together. The model can take inputs several hours long.
- We release a new model, the [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which can predict the duration of non-silence speech. It can be freely integrated with any ASR model in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which can predict the punctuation of ASR models' results. It can be freely integrated with any ASR model in the [Model Zoo](docs/modelscope_models.md).
- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which can be finetuned on ASR and other downstream tasks.
- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command word recognition.
- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which can extract speaker embeddings and further perform speaker verification on paired utterances. It will be used for speaker diarization in a future version.
- We improve the modelscope pipeline to speed up inference by integrating model building into pipeline building.
- Various new audio input types are now supported by the modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples, ...
For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
## Highlights
- Many types of typical models are supported, e.g., [Transformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).
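Note: the modelscope inference pipeline described in the README above is invoked the same way as in the infer.py scripts added later in this commit. A minimal sketch for the paraformer-large model; the wav path and output directory are placeholders, not files from this commit:
```python
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks

# Minimal sketch of the modelscope ASR pipeline referenced in the README above.
# "speech.wav" and "./results" are placeholder paths.
inference_pipeline = pipeline(
    task=Tasks.auto_speech_recognition,
    model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
    output_dir="./results",
)
rec_result = inference_pipeline(audio_in="speech.wav")
print(rec_result)
```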
egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.cer_ctc.ave_10best.pth
inference_asr_model=valid.cer_ctc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@
tag=exp1
model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
lm_exp=${exp_dir}/exp/${model_dir}
inference_lm=valid.loss.ave.pth       # Language model path for decoding.
inference_lm=valid.loss.ave.pb       # Language model path for decoding.
stage=0
stop_stage=3
egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@
def main():
    diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
    diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
    output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
    data_path_and_name_and_type = [
        ("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@
  echo "Downloading Pre-trained model..."
  git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
  git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
  ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
  cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
  ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
  cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
  cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
  echo "Done."
@@ -30,7 +30,7 @@
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
  echo "Calculating diarization results..."
  python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
  python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
  python local/convert_label_to_rttm.py \
    outputs/labels.txt \
    data/test_rmsil/raw_rmsil_map.scp \
egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
def test_fbank_cpu_infer():
    diar_config_path = "config_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
def test_fbank_gpu_infer():
    diar_config_path = "config_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
def test_wav_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
def test_without_profile_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    raw_inputs = [[
        "data/unit_test/raw_inputs/record.wav",
egs/callhome/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
def test_fbank_cpu_infer():
    diar_config_path = "sond_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
def test_fbank_gpu_infer():
    diar_config_path = "sond_fbank.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
def test_wav_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    data_path_and_name_and_type = [
        ("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
def test_without_profile_gpu_infer():
    diar_config_path = "config.yaml"
    diar_model_path = "sond.pth"
    diar_model_path = "sond.pb"
    output_dir = "./outputs"
    raw_inputs = [[
        "data/unit_test/raw_inputs/record.wav",
egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
inference_asr_model=valid.acc.ave_10best.pth
inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES  # set gpus for decoding, the same as training stage by default
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
    params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@
    params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./example_data/validation"
    params["decoding_model_name"] = "valid.acc.ave.pth"
    params["decoding_model_name"] = "valid.acc.ave.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset
def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()
if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset
def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()
if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
New file
@@ -0,0 +1,35 @@
import os
from modelscope.metainfo import Trainers
from modelscope.trainers import build_trainer
from funasr.datasets.ms_dataset import MsDataset
def modelscope_finetune(params):
    if not os.path.exists(params["output_dir"]):
        os.makedirs(params["output_dir"], exist_ok=True)
    # dataset split ["train", "validation"]
    ds_dict = MsDataset.load(params["data_dir"])
    kwargs = dict(
        model=params["model"],
        model_revision=params["model_revision"],
        data_dir=ds_dict,
        dataset_type=params["dataset_type"],
        work_dir=params["output_dir"],
        batch_bins=params["batch_bins"],
        max_epoch=params["max_epoch"],
        lr=params["lr"])
    trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
    trainer.train()
if __name__ == '__main__':
    params = {}
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data"
    params["batch_bins"] = 2000
    params["dataset_type"] = "small"
    params["max_epoch"] = 50
    params["lr"] = 0.00005
    params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
    params["model_revision"] = None
    modelscope_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
New file
@@ -0,0 +1,13 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
if __name__ == "__main__":
    audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
    output_dir = "./results"
    inference_pipline = pipeline(
        task=Tasks.auto_speech_recognition,
        model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
        output_dir=output_dir,
    )
    rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
    print(rec_result)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,8 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave
      .pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "20epoch.pth"
    params["decoding_model_name"] = "20epoch.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
    - <strong>output_dir:</strong> # result dir
    - <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` also exists, the CER will be computed
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
    - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to infer with:
```python
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -53,5 +53,5 @@
    params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
    params["output_dir"] = "./checkpoint"
    params["data_dir"] = "./data/test"
    params["decoding_model_name"] = "valid.acc.ave_10best.pth"
    params["decoding_model_name"] = "valid.acc.ave_10best.pb"
    modelscope_infer_after_finetune(params)
egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
New file
@@ -0,0 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
inference_diar_pipline = pipeline(
    task=Tasks.speaker_diarization,
    model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
    model_revision="v1.0.0",
)
results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
print(results)
egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -14,13 +14,12 @@
)
# audio_list is the input: the first audio is the speech to be diarized, and the following audios are voiceprint enrollment recordings from different speakers
audio_list = [[
audio_list = [
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
    "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
]]
]
results = inference_diar_pipline(audio_in=audio_list)
for rst in results:
    print(rst["value"])
print(results)
funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@
    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_mfcca.py
@@ -55,7 +55,7 @@
    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_paraformer.py
@@ -50,7 +50,7 @@
    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -58,7 +58,7 @@
    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_rnnt.py
@@ -49,7 +49,7 @@
    Examples:
            >>> import soundfile
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
            >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
            >>> audio, rate = soundfile.read("speech.wav")
            >>> speech2text(audio)
            [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@
    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]
funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@
    Examples:
        >>> import soundfile
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
        >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2text(audio)
        [(text, token, token_int, hypothesis object), ...]
funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@
        param_dict = {
            "extract_profile": True,
            "sv_train_config": "sv.yaml",
            "sv_model_file": "sv.pth",
            "sv_model_file": "sv.pb",
        }
        if "param_dict" in kwargs and kwargs["param_dict"] is not None:
            for key in param_dict:
@@ -142,6 +142,9 @@
        else:
            kwargs["param_dict"] = param_dict
        return inference_modelscope(mode=mode, **kwargs)
    elif mode == "eend-ola":
        from funasr.bin.eend_ola_inference import inference_modelscope
        return inference_modelscope(mode=mode, **kwargs)
    else:
        logging.info("Unknown decoding mode: {}".format(mode))
        return None
funasr/bin/eend_ola_inference.py
@@ -16,6 +16,7 @@
import numpy as np
import torch
from scipy.signal import medfilt
from typeguard import check_argument_types
from funasr.models.frontend.wav_frontend import WavFrontendMel23
@@ -34,7 +35,7 @@
    Examples:
        >>> import soundfile
        >>> import numpy as np
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
        >>> profile = np.load("profiles.npy")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2diar(audio, profile)
@@ -146,7 +147,7 @@
        output_dir: Optional[str] = None,
        batch_size: int = 1,
        dtype: str = "float32",
        ngpu: int = 0,
        ngpu: int = 1,
        num_workers: int = 0,
        log_level: Union[int, str] = "INFO",
        key_file: Optional[str] = None,
@@ -179,7 +180,6 @@
        diar_model_file=diar_model_file,
        device=device,
        dtype=dtype,
        streaming=streaming,
    )
    logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
    speech2diar = Speech2Diarization.from_pretrained(
@@ -209,7 +209,7 @@
        if data_path_and_name_and_type is None and raw_inputs is not None:
            if isinstance(raw_inputs, torch.Tensor):
                raw_inputs = raw_inputs.numpy()
            data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
            data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
        loader = EENDOLADiarTask.build_streaming_iterator(
            data_path_and_name_and_type,
            dtype=dtype,
@@ -236,9 +236,23 @@
            # batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
            results = speech2diar(**batch)
            # post process
            a = results[0][0].cpu().numpy()
            a = medfilt(a, (11, 1))
            rst = []
            for spkid, frames in enumerate(a.T):
                frames = np.pad(frames, (1, 1), 'constant')
                changes, = np.where(np.diff(frames, axis=0) != 0)
                fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
                for s, e in zip(changes[::2], changes[1::2]):
                    st = s / 10.
                    dur = (e - s) / 10.
                    rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
            # Only supporting batch_size==1
            key, value = keys[0], output_results_str(results, keys[0])
            item = {"key": key, "value": value}
            value = "\n".join(rst)
            item = {"key": keys[0], "value": value}
            result_list.append(item)
            if output_path is not None:
                output_writer.write(value)
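The post-processing added above turns frame-level speaker activity into RTTM segments: median-filter each speaker's activity track, zero-pad it so every active run has both a start and an end, and read segment boundaries off the toggles of the padded sequence. A standalone sketch of that logic, assuming `activity` is a (num_frames, num_speakers) 0/1 array at 10 frames per second and `utt_id` is a recording key (both illustrative names, not part of the FunASR API):
```python
import numpy as np
from scipy.signal import medfilt

def frames_to_rttm(activity: np.ndarray, utt_id: str) -> list:
    activity = medfilt(activity, (11, 1))            # smooth short per-speaker glitches
    fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
    lines = []
    for spkid, frames in enumerate(activity.T):
        frames = np.pad(frames, (1, 1), 'constant')  # force boundaries at both edges
        changes, = np.where(np.diff(frames, axis=0) != 0)  # indices where activity toggles
        for s, e in zip(changes[::2], changes[1::2]):      # (start, end) frame pairs
            lines.append(fmt.format(utt_id, s / 10., (e - s) / 10.,
                                    "{}_{}".format(utt_id, spkid)))
    return lines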
funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@
    Examples:
        >>> import soundfile
        >>> import numpy as np
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
        >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
        >>> profile = np.load("profiles.npy")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2diar(audio, profile)
funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@
    Examples:
        >>> import soundfile
        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
        >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
        >>> audio, rate = soundfile.read("speech.wav")
        >>> speech2xvector(audio)
        [(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@
        log_level: Union[int, str] = "INFO",
        key_file: Optional[str] = None,
        sv_train_config: Optional[str] = "sv.yaml",
        sv_model_file: Optional[str] =  "sv.pth",
        sv_model_file: Optional[str] =  "sv.pb",
        model_tag: Optional[str] = None,
        allow_variable_data_keys: bool = True,
        streaming: bool = False,
funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@
            elif n == 1:
                # The averaged model is same as the best model
                e, _ = epoch_and_values[0]
                op = output_dir / f"{e}epoch.pth"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
                op = output_dir / f"{e}epoch.pb"
                sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
                if sym_op.is_symlink() or sym_op.exists():
                    sym_op.unlink()
                sym_op.symlink_to(op.name)
            else:
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
                op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
                logging.info(
                    f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
                )
@@ -83,12 +83,12 @@
                    if e not in _loaded:
                        if oss_bucket is None:
                            _loaded[e] = torch.load(
                                output_dir / f"{e}epoch.pth",
                                output_dir / f"{e}epoch.pb",
                                map_location="cpu",
                            )
                        else:
                            buffer = BytesIO(
                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
                                oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
                            _loaded[e] = torch.load(buffer)
                    states = _loaded[e]
@@ -115,13 +115,13 @@
                else:
                    buffer = BytesIO()
                    torch.save(avg, buffer)
                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
                    oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
                                          buffer.getvalue())
        # 3. *.*.ave.pth is a symlink to the max ave model
        # 3. *.*.ave.pb is a symlink to the max ave model
        if oss_bucket is None:
            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
            op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
            sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
            if sym_op.is_symlink() or sym_op.exists():
                sym_op.unlink()
            sym_op.symlink_to(op.name)
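For context, the `{e}epoch.pb` files handled above are ordinary torch checkpoints, and the averaging is a parameter-wise mean over the selected n-best epochs. A rough sketch of that idea (illustrative only; not the funasr function signature, and it assumes every checkpoint value is a tensor):
```python
import torch

def average_checkpoints(paths, out_path):
    # paths: e.g. ["3epoch.pb", "7epoch.pb", ...]; out_path: e.g. "valid.acc.ave_10best.pb"
    avg = None
    for p in paths:
        states = torch.load(p, map_location="cpu")
        if avg is None:
            # accumulate floating-point parameters in float64 to limit rounding error
            avg = {k: v.clone().to(torch.float64) if v.is_floating_point() else v.clone()
                   for k, v in states.items()}
        else:
            for k in avg:
                if avg[k].is_floating_point():
                    avg[k] += states[k].to(torch.float64)
    for k in avg:
        if avg[k].is_floating_point():
            avg[k] = (avg[k] / len(paths)).to(states[k].dtype)
    torch.save(avg, out_path)
```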
funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@
    Examples:
        tarfile:
           model.pth
           model.pb
           some1.file
           some2.file
        >>> unpack("tarfile", "out")
        {'asr_model_file': 'out/model.pth'}
        {'asr_model_file': 'out/model.pb'}
    """
    input_archive = Path(input_archive)
    outpath = Path(outpath)
funasr/models/e2e_diar_eend_ola.py
@@ -52,15 +52,15 @@
        super().__init__()
        self.frontend = frontend
        self.encoder = encoder
        self.encoder_decoder_attractor = encoder_decoder_attractor
        self.enc = encoder
        self.eda = encoder_decoder_attractor
        self.attractor_loss_weight = attractor_loss_weight
        self.max_n_speaker = max_n_speaker
        if mapping_dict is None:
            mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
            self.mapping_dict = mapping_dict
        # PostNet
        self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
        self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
        self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
    def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@
        pad_shape = xs.shape
        xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
        xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
        emb = self.encoder(xs, xs_mask)
        emb = self.enc(xs, xs_mask)
        emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
        emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
        return emb
@@ -76,8 +76,8 @@
    def forward_post_net(self, logits, ilens):
        maxlen = torch.max(ilens).to(torch.int).item()
        logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
        outputs, (_, _) = self.PostNet(logits)
        logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
        outputs, (_, _) = self.postnet(logits)
        outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
        outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
        outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@
        text = text[:, : text_lengths.max()]
        # 1. Encoder
        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
        encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
        intermediate_outs = None
        if isinstance(encoder_out, tuple):
            intermediate_outs = encoder_out[1]
@@ -190,18 +190,16 @@
                            shuffle: bool = True,
                            threshold: float = 0.5,
                            **kwargs):
        if self.frontend is not None:
            speech = self.frontend(speech)
        speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
        emb = self.forward_encoder(speech, speech_lengths)
        if shuffle:
            orders = [np.arange(e.shape[0]) for e in emb]
            for order in orders:
                np.random.shuffle(order)
            attractors, probs = self.encoder_decoder_attractor.estimate(
            attractors, probs = self.eda.estimate(
                [e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
        else:
            attractors, probs = self.encoder_decoder_attractor.estimate(emb)
            attractors, probs = self.eda.estimate(emb)
        attractors_active = []
        for p, att, e in zip(probs, attractors, emb):
            if n_speakers and n_speakers >= 0:
@@ -233,10 +231,23 @@
                pred[i] = pred[i - 1]
            else:
                pred[i] = 0
        pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
        pred = [self.inv_mapping_func(i) for i in pred]
        decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
        decisions = torch.from_numpy(
            np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
            torch.float32)
        decisions = decisions[:, :n_speaker]
        return decisions
    def inv_mapping_func(self, label):
        if not isinstance(label, int):
            label = int(label)
        if label in self.mapping_dict['label2dec'].keys():
            num = self.mapping_dict['label2dec'][label]
        else:
            num = -1
        return num
    def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
        pass
funasr/models/frontend/wav_frontend.py
@@ -1,14 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed from espnet/espnet.
from abc import ABC
from typing import Tuple
import numpy as np
import torch
import torchaudio.compliance.kaldi as kaldi
from funasr.models.frontend.abs_frontend import AbsFrontend
from typeguard import check_argument_types
from torch.nn.utils.rnn import pad_sequence
from typeguard import check_argument_types
import funasr.models.frontend.eend_ola_feature as eend_ola_feature
from funasr.models.frontend.abs_frontend import AbsFrontend
def load_cmvn(cmvn_file):
@@ -275,7 +276,8 @@
    # inputs tensor has catted the cache tensor
    # def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
    #               is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
    def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
        torch.Tensor, torch.Tensor, int]:
        """
        Apply lfr with data
        """
@@ -376,7 +378,8 @@
            if self.lfr_m != 1 or self.lfr_n != 1:
                # update self.lfr_splice_cache in self.apply_lfr
                # mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
                mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
                                                                                     is_final)
            if self.cmvn_file is not None:
                mat = self.apply_cmvn(mat, self.cmvn)
            feat_length = mat.size(0)
@@ -400,7 +403,8 @@
        if feats.shape[0]:
            #if self.reserve_waveforms is None and self.lfr_m > 1:
            #    self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
            self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
                (self.reserve_waveforms, waveforms), dim=1)
            if not self.lfr_splice_cache:  # initialize splice_cache
                for i in range(batch_size):
                    self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -409,7 +413,8 @@
                lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache)  # B T D
                feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
                feats_lengths += lfr_splice_cache_tensor[0].shape[0]
                frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                frame_from_waveforms = int(
                    (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
                minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
                feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
                if self.lfr_m == 1:
@@ -423,7 +428,8 @@
                    self.waveforms = self.waveforms[:, :sample_length]
            else:
                # update self.reserve_waveforms and self.lfr_splice_cache
                self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
                self.reserve_waveforms = self.waveforms[:,
                                         :-(self.frame_sample_length - self.frame_shift_sample_length)]
                for i in range(batch_size):
                    self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
                return torch.empty(0), feats_lengths
@@ -444,3 +450,54 @@
        self.reserve_waveforms = None
        self.input_cache = None
        self.lfr_splice_cache = []
class WavFrontendMel23(AbsFrontend):
    """Conventional frontend structure for ASR.
    """
    def __init__(
            self,
            fs: int = 16000,
            frame_length: int = 25,
            frame_shift: int = 10,
            lfr_m: int = 1,
            lfr_n: int = 1,
    ):
        assert check_argument_types()
        super().__init__()
        self.fs = fs
        self.frame_length = frame_length
        self.frame_shift = frame_shift
        self.lfr_m = lfr_m
        self.lfr_n = lfr_n
        self.n_mels = 23
    def output_size(self) -> int:
        return self.n_mels * (2 * self.lfr_m + 1)
    def forward(
            self,
            input: torch.Tensor,
            input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
        batch_size = input.size(0)
        feats = []
        feats_lens = []
        for i in range(batch_size):
            waveform_length = input_lengths[i]
            waveform = input[i][:waveform_length]
            waveform = waveform.numpy()
            mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
            mat = eend_ola_feature.transform(mat)
            mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
            mat = mat[::self.lfr_n]
            mat = torch.from_numpy(mat)
            feat_length = mat.size(0)
            feats.append(mat)
            feats_lens.append(feat_length)
        feats_lens = torch.as_tensor(feats_lens)
        feats_pad = pad_sequence(feats,
                                 batch_first=True,
                                 padding_value=0.0)
        return feats_pad, feats_lens
funasr/modules/eend_ola/encoder.py
@@ -87,7 +87,7 @@
                 n_layers: int,
                 n_units: int,
                 e_units: int = 2048,
                 h: int = 8,
                 h: int = 4,
                 dropout_rate: float = 0.1,
                 use_pos_emb: bool = False):
        super(EENDOLATransformerEncoder, self).__init__()
funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -16,12 +16,12 @@
        self.n_units = n_units
    def forward_core(self, xs, zeros):
        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
        ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
        xs = [self.enc0_dropout(x) for x in xs]
        xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
        xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
        _, (hx, cx) = self.encoder(xs)
        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
        zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
        max_zlen = torch.max(zlens).to(torch.int).item()
        zeros = [self.enc0_dropout(z) for z in zeros]
        zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
funasr/runtime/onnxruntime/src/Audio.cpp
@@ -237,7 +237,7 @@
    size_t nOffset = 0;
#define WAV_HEADER_SIZE 44
    speech_len = nBufLen / 2;
    speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@
            speech_data[i] = (float)speech_buff[i] / scale;
        }
        AudioFrame* frame = new AudioFrame(speech_len);
        frame_queue.push(frame);
        return true;
    }
funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -26,8 +26,9 @@
            return nullptr;
        Audio audio(1);
        audio.loadwav(szBuf,nLen);
        audio.split();
        if (!audio.loadwav(szBuf, nLen))
            return nullptr;
        //audio.split();
        float* buff;
        int len;
@@ -58,8 +59,9 @@
            return nullptr;
        Audio audio(1);
        audio.loadpcmwav(szBuf, nLen);
        audio.split();
        if (!audio.loadpcmwav(szBuf, nLen))
            return nullptr;
        //audio.split();
        float* buff;
        int len;
@@ -91,8 +93,9 @@
            return nullptr;
        Audio audio(1);
        audio.loadpcmwav(szFileName);
        audio.split();
        if (!audio.loadpcmwav(szFileName))
            return nullptr;
        //audio.split();
        float* buff;
        int len;
@@ -125,7 +128,7 @@
        Audio audio(1);
        if(!audio.loadwav(szWavfile))
            return nullptr;
        audio.split();
        //audio.split();
        float* buff;
        int len;
funasr/runtime/onnxruntime/tester/tester.cpp
@@ -8,7 +8,7 @@
#include "librapidasrapi.h"
#include <iostream>
#include <fstream>
using namespace std;
int main(int argc, char *argv[])
@@ -40,10 +40,13 @@
    gettimeofday(&start, NULL);
    RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    gettimeofday(&end, NULL);
    float snippet_time = 0.0f;
     RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    gettimeofday(&end, NULL);
    if (Result)
    {
        string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@
    }
    else
    {
        cout <<("no return data!");
        cout <<"no return data!";
    }
  
    printf("Audio length %lfs.\n", (double)snippet_time);
    //char* buff = nullptr;
    //int len = 0;
    //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
    //if (ifs.is_open())
    //{
    //    ifs.seekg(0, std::ios::end);
    //    len = ifs.tellg();
    //    ifs.seekg(0, std::ios::beg);
    //    buff = new char[len];
    //    ifs.read(buff, len);
    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    //    RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
    //    //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
    //    gettimeofday(&end, NULL);
    //
    //    if (Result)
    //    {
    //        string msg = RapidAsrGetResult(Result, 0);
    //        setbuf(stdout, NULL);
    //        cout << "Result: \"";
    //        cout << msg << endl;
    //        cout << "\"." << endl;
    //        snippet_time = RapidAsrGetRetSnippetTime(Result);
    //        RapidAsrFreeResult(Result);
    //    }
    //    else
    //    {
    //        cout <<"no return data!";
    //    }
    //
    //delete[]buff;
    //}
    printf("Audio length %lfs.\n", (double)snippet_time);
    seconds = (end.tv_sec - start.tv_sec);
    long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
    printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);
funasr/runtime/python/README.md
New file
@@ -0,0 +1,21 @@
Benchmark [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) based on the Aishell1 test set; the total audio duration is 36108.919 seconds.
(Note: The service has been fully warmed up.)
Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz, 16 cores / 32 processors, with avx512_vnni
| concurrent-tasks | processing time(s) |  RTF   | Speedup Rate |
|:----------------:|:------------------:|:------:|:------------:|
|  1 (onnx fp32)   |        2806        | 0.0777 |     12.9     |
|  1 (onnx int8)   |        1611        | 0.0446 |     22.4     |
|  8 (onnx fp32)   |        538         | 0.0149 |     67.1     |
|  8 (onnx int8)   |        210         | 0.0058 |    172.4     |
|  16 (onnx fp32)  |        288         | 0.0080 |    125.2     |
|  16 (onnx int8)  |        117         | 0.0032 |    309.9     |
|  32 (onnx fp32)  |        167         | 0.0046 |    216.5     |
|  32 (onnx int8)  |        107         | 0.0030 |    338.0     |
|  64 (onnx fp32)  |        158         | 0.0044 |    228.1     |
|  64 (onnx int8)  |         82         | 0.0023 |    442.8     |
|  96 (onnx fp32)  |        151         | 0.0042 |    238.0     |
|  96 (onnx int8)  |         80         | 0.0022 |    452.0     |
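The RTF and Speedup Rate columns above follow directly from the processing time and the 36108.919 s of test audio; a minimal sketch of the arithmetic (the helper below is illustrative only, not part of the FunASR runtime):

```python
TOTAL_AUDIO_S = 36108.919  # Aishell1 test-set duration quoted above

def rtf_and_speedup(processing_time_s: float):
    rtf = processing_time_s / TOTAL_AUDIO_S       # real-time factor
    speedup = TOTAL_AUDIO_S / processing_time_s   # inverse of the RTF
    return rtf, speedup

# e.g. the "1 (onnx fp32)" row: 2806 s -> RTF ~= 0.0777, speedup ~= 12.9
print(rtf_and_speedup(2806))
```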
funasr/tasks/abs_task.py
@@ -639,12 +639,12 @@
                 "and exclude_keys excludes keys of model states for the initialization."
                 "e.g.\n"
                 "  # Load all parameters"
                 "  --init_param some/where/model.pth\n"
                 "  --init_param some/where/model.pb\n"
                 "  # Load only decoder parameters"
                 "  --init_param some/where/model.pth:decoder:decoder\n"
                 "  --init_param some/where/model.pb:decoder:decoder\n"
                 "  # Load only decoder parameters excluding decoder.embed"
                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
                 "  --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
                 "  --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
        )
        group.add_argument(
            "--ignore_init_mismatch",
funasr/tasks/asr.py
@@ -826,7 +826,7 @@
            if "model.ckpt-" in model_name or ".bin" in model_name:
                model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                            '.pb')) if ".bin" in model_name else os.path.join(
                    model_dir, "{}.pth".format(model_name))
                    model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@
            if "model.ckpt-" in model_name or ".bin" in model_name:
                model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
                                                                            '.pb')) if ".bin" in model_name else os.path.join(
                    model_dir, "{}.pth".format(model_name))
                    model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
funasr/tasks/diar.py
@@ -553,7 +553,7 @@
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
@@ -750,47 +750,47 @@
            cls, args: argparse.Namespace, train: bool
    ) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
        assert check_argument_types()
        if args.use_preprocessor:
            retval = CommonPreprocessor(
                train=train,
                token_type=args.token_type,
                token_list=args.token_list,
                bpemodel=None,
                non_linguistic_symbols=None,
                text_cleaner=None,
                g2p_type=None,
                split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
                seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
                # NOTE(kamo): Check attribute existence for backward compatibility
                rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
                rir_apply_prob=args.rir_apply_prob
                if hasattr(args, "rir_apply_prob")
                else 1.0,
                noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
                noise_apply_prob=args.noise_apply_prob
                if hasattr(args, "noise_apply_prob")
                else 1.0,
                noise_db_range=args.noise_db_range
                if hasattr(args, "noise_db_range")
                else "13_15",
                speech_volume_normalize=args.speech_volume_normalize
                if hasattr(args, "rir_scp")
                else None,
            )
        else:
            retval = None
        assert check_return_type(retval)
        return retval
        # if args.use_preprocessor:
        #     retval = CommonPreprocessor(
        #         train=train,
        #         token_type=args.token_type,
        #         token_list=args.token_list,
        #         bpemodel=None,
        #         non_linguistic_symbols=None,
        #         text_cleaner=None,
        #         g2p_type=None,
        #         split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
        #         seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
        #         # NOTE(kamo): Check attribute existence for backward compatibility
        #         rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
        #         rir_apply_prob=args.rir_apply_prob
        #         if hasattr(args, "rir_apply_prob")
        #         else 1.0,
        #         noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
        #         noise_apply_prob=args.noise_apply_prob
        #         if hasattr(args, "noise_apply_prob")
        #         else 1.0,
        #         noise_db_range=args.noise_db_range
        #         if hasattr(args, "noise_db_range")
        #         else "13_15",
        #         speech_volume_normalize=args.speech_volume_normalize
        #         if hasattr(args, "rir_scp")
        #         else None,
        #     )
        # else:
        #     retval = None
        # assert check_return_type(retval)
        return None
    @classmethod
    def required_data_names(
            cls, train: bool = True, inference: bool = False
    ) -> Tuple[str, ...]:
        if not inference:
            retval = ("speech", "profile", "binary_labels")
            retval = ("speech", )
        else:
            # Recognition mode
            retval = ("speech")
            retval = ("speech", )
        return retval
    @classmethod
@@ -823,7 +823,7 @@
        # 2. Encoder
        encoder_class = encoder_choices.get_class(args.encoder)
        encoder = encoder_class(input_size=input_size, **args.encoder_conf)
        encoder = encoder_class(**args.encoder_conf)
        # 3. EncoderDecoderAttractor
        encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)
funasr/tasks/sv.py
@@ -501,7 +501,7 @@
                if ".bin" in model_name:
                    model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
                else:
                    model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
                    model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
                if os.path.exists(model_name_pth):
                    logging.info("model_file is load from pth: {}".format(model_name_pth))
                    model_dict = torch.load(model_name_pth, map_location=device)
funasr/torch_utils/load_pretrained_model.py
@@ -52,13 +52,13 @@
        init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>
    Examples:
        >>> load_pretrained_model("somewhere/model.pth", model)
        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
        >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
        >>> load_pretrained_model("somewhere/model.pb", model)
        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
        >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
        >>> load_pretrained_model(
        ...     "somewhere/model.pth:decoder:decoder:decoder.embed", model
        ...     "somewhere/model.pb:decoder:decoder:decoder.embed", model
        ... )
        >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
        >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
    """
    sps = init_param.split(":", 4)
    if len(sps) == 4:
funasr/train/trainer.py
@@ -205,9 +205,9 @@
        else:
            scaler = None
        if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
        if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
            cls.resume(
                checkpoint=output_dir / "checkpoint.pth",
                checkpoint=output_dir / "checkpoint.pb",
                model=model,
                optimizers=optimizers,
                schedulers=schedulers,
@@ -361,7 +361,7 @@
                        },
                        buffer,
                    )
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
                else:
                    torch.save(
                        {
@@ -374,7 +374,7 @@
                            ],
                            "scaler": scaler.state_dict() if scaler is not None else None,
                        },
                        output_dir / "checkpoint.pth",
                        output_dir / "checkpoint.pb",
                    )
                # 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@
                    buffer = BytesIO()
                    torch.save(model.state_dict(), buffer)
                    trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
                                                                       f"{iepoch}epoch.pth"),buffer.getvalue())
                                                                       f"{iepoch}epoch.pb"),buffer.getvalue())
                else:
                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
                    torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")
                # Creates a sym link latest.pth -> {iepoch}epoch.pth
                # Creates a sym link latest.pb -> {iepoch}epoch.pb
                if trainer_options.use_pai:
                    p = os.path.join(trainer_options.output_dir, "latest.pth")
                    p = os.path.join(trainer_options.output_dir, "latest.pb")
                    if trainer_options.oss_bucket.object_exists(p):
                        trainer_options.oss_bucket.delete_object(p)
                    trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
                                           os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
                else:
                    p = output_dir / "latest.pth"
                    p = output_dir / "latest.pb"
                    if p.is_symlink() or p.exists():
                        p.unlink()
                    p.symlink_to(f"{iepoch}epoch.pth")
                    p.symlink_to(f"{iepoch}epoch.pb")
                _improved = []
                for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@
                        # Creates sym links if it's the best result
                        if best_epoch == iepoch:
                            if trainer_options.use_pai:
                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
                                p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
                                if trainer_options.oss_bucket.object_exists(p):
                                    trainer_options.oss_bucket.delete_object(p)
                                trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
                                                       os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
                            else:
                                p = output_dir / f"{_phase}.{k}.best.pth"
                                p = output_dir / f"{_phase}.{k}.best.pb"
                                if p.is_symlink() or p.exists():
                                    p.unlink()
                                p.symlink_to(f"{iepoch}epoch.pth")
                                p.symlink_to(f"{iepoch}epoch.pb")
                            _improved.append(f"{_phase}.{k}")
                if len(_improved) == 0:
                    logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@
                        type="model",
                        metadata={"improved": _improved},
                    )
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
                    artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
                    aliases = [
                        f"epoch-{iepoch}",
                        "best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@
                for e in range(1, iepoch):
                    if trainer_options.use_pai:
                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
                        p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
                        if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
                            trainer_options.oss_bucket.delete_object(p)
                            _removed.append(str(p))
                    else:
                        p = output_dir / f"{e}epoch.pth"
                        p = output_dir / f"{e}epoch.pb"
                        if p.exists() and e not in nbests:
                            p.unlink()
                            _removed.append(str(p))
funasr/utils/timestamp_tools.py
@@ -1,6 +1,10 @@
import torch
import copy
import codecs
import logging
import edit_distance
import argparse
import pdb
import numpy as np
from typing import Any, List, Tuple, Union
@@ -9,7 +13,8 @@
                       us_peaks, 
                       char_list, 
                       vad_offset=0.0, 
                       force_time_shift=-1.5
                       force_time_shift=-1.5,
                       sil_in_str=True
                       ):
    if not len(char_list):
        return []
@@ -62,6 +67,8 @@
            timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
    res_txt = ""
    for char, timestamp in zip(new_char_list, timestamp_list):
        #if char != '<sil>':
        if not sil_in_str and char == '<sil>': continue
        res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
    res = []
    for char, timestamp in zip(new_char_list, timestamp_list):
@@ -121,4 +128,181 @@
    return res
class AverageShiftCalculator():
    def __init__(self):
        logging.warning("Calculating average shift.")
    def __call__(self, file1, file2):
        uttid_list1, ts_dict1 = self.read_timestamps(file1)
        uttid_list2, ts_dict2 = self.read_timestamps(file2)
        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))
    def _intersection(self, list1, list2):
        set1 = set(list1)
        set2 = set(list2)
        if set1 == set2:
            logging.warning("Uttid same checked.")
            return set1
        itsc = list(set1 & set2)
        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
        return itsc
    def read_timestamps(self, file):
        # read timestamps file in standard format
        uttid_list = []
        ts_dict = {}
        with codecs.open(file, 'r') as fin:
            for line in fin.readlines():
                text = ''
                ts_list = []
                line = line.rstrip()
                uttid = line.split()[0]
                uttid_list.append(uttid)
                body = " ".join(line.split()[1:])
                for pd in body.split(';'):
                    if not len(pd): continue
                    # pdb.set_trace()
                    char, start, end = pd.lstrip(" ").split(' ')
                    text += char + ','
                    ts_list.append((float(start), float(end)))
                # ts_lists.append(ts_list)
                ts_dict[uttid] = (text[:-1], ts_list)
        logging.warning("File {} read done.".format(file))
        return uttid_list, ts_dict
    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
        shift_time = 0
        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
        num_tokens = len(filtered_timestamp_list1)
        return shift_time, num_tokens
    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
        # calculate average shift between timestamp1 and timestamp2
        # when characters differ, use edit distance alignment
        # and calculate the error between the same characters
        self._accumlated_shift = 0
        self._accumlated_tokens = 0
        self.max_shift = 0
        self.max_shift_uttid = None
        for uttid in uttid_list:
            (t1, ts1) = ts_dict1[uttid]
            (t2, ts2) = ts_dict2[uttid]
            _align, _align2, _align3 = [], [], []
            fts1, fts2 = [], []
            _t1, _t2 = [], []
            sm = edit_distance.SequenceMatcher(t1.split(','), t2.split(','))
            s = sm.get_opcodes()
            for j in range(len(s)):
                if s[j][0] == "replace" or s[j][0] == "insert":
                    _align.append(0)
                if s[j][0] == "replace" or s[j][0] == "delete":
                    _align3.append(0)
                elif s[j][0] == "equal":
                    _align.append(1)
                    _align3.append(1)
                else:
                    continue
            # use s to index t2
            for a, ts , t in zip(_align, ts2, t2.split(',')):
                if a:
                    fts2.append(ts)
                    _t2.append(t)
            sm2 = edit_distance.SequenceMatcher(t2.split(','), t1.split(','))
            s = sm2.get_opcodes()
            for j in range(len(s)):
                if s[j][0] == "replace" or s[j][0] == "insert":
                    _align2.append(0)
                elif s[j][0] == "equal":
                    _align2.append(1)
                else:
                    continue
            # use s2 to index t1
            for a, ts, t in zip(_align3, ts1, t1.split(',')):
                if a:
                    fts1.append(ts)
                    _t1.append(t)
            if len(fts1) == len(fts2):
                shift_time, num_tokens = self._shift(fts1, fts2)
                self._accumlated_shift += shift_time
                self._accumlated_tokens += num_tokens
                if shift_time/num_tokens > self.max_shift:
                    self.max_shift = shift_time/num_tokens
                    self.max_shift_uttid = uttid
            else:
                logging.warning("length mismatch")
        return self._accumlated_shift / self._accumlated_tokens
def convert_external_alphas(alphas_file, text_file, output_file):
    from funasr.models.predictor.cif import cif_wo_hidden
    with open(alphas_file, 'r') as f1, open(text_file, 'r') as f2, open(output_file, 'w') as f3:
        for line1, line2 in zip(f1.readlines(), f2.readlines()):
            line1 = line1.rstrip()
            line2 = line2.rstrip()
            assert line1.split()[0] == line2.split()[0]
            uttid = line1.split()[0]
            alphas = [float(i) for i in line1.split()[1:]]
            new_alphas = np.array(remove_chunk_padding(alphas))
            new_alphas[-1] += 1e-4
            text = line2.split()[1:]
            if len(text) + 1 != int(new_alphas.sum()):
                # force resize
                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
            if " " in text:
                text = text.split()
            else:
                text = [i for i in text]
            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text,
                                                     force_time_shift=-7.0,
                                                     sil_in_str=False)
            f3.write("{} {}\n".format(uttid, res_str))
def remove_chunk_padding(alphas):
    # remove the padding part in alphas if using chunk paraformer for GPU
    START_ZERO = 45
    MID_ZERO = 75
    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
    alphas = alphas[START_ZERO:]  # remove the padding at beginning
    new_alphas = []
    while True:
        new_alphas = new_alphas + alphas[:REAL_FRAMES]
        alphas = alphas[REAL_FRAMES+MID_ZERO:]
        if len(alphas) < REAL_FRAMES: break
    return new_alphas
SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']
def main(args):
    if args.mode == 'cal_aas':
        asc = AverageShiftCalculator()
        asc(args.input, args.input2)
    elif args.mode == 'read_ext_alphas':
        convert_external_alphas(args.input, args.input2, args.output)
    else:
        logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='timestamp tools')
    parser.add_argument('--mode',
                        default=None,
                        type=str,
                        choices=SUPPORTED_MODES,
                        help='timestamp related toolbox')
    parser.add_argument('--input', default=None, type=str, help='input file path')
    parser.add_argument('--output', default=None, type=str, help='output file name')
    parser.add_argument('--input2', default=None, type=str, help='input2 file path')
    parser.add_argument('--kaldi-ts-type',
                        default='v2',
                        type=str,
                        choices=['v0', 'v1', 'v2'],
                        help='kaldi timestamp to write')
    args = parser.parse_args()
    main(args)
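# --- Usage sketch (illustration only, not part of the diff above; file names are hypothetical) ---
# read_timestamps() parses the "standard format" written by ts_prediction_lfr6_standard:
# one utterance per line, "<uttid> <char> <start> <end>;<char> <start> <end>;...", e.g.
#   utt_001 你 0.000 0.540;好 0.540 0.899;
# Given two such files, the average absolute shift can be computed via the CLI,
#   python funasr/utils/timestamp_tools.py --mode cal_aas --input hyp_ts.txt --input2 ref_ts.txt
# or programmatically:
#   asc = AverageShiftCalculator()
#   asc("hyp_ts.txt", "ref_ts.txt")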
setup.py
@@ -17,7 +17,7 @@
        "humanfriendly",
        "scipy>=1.4.1",
        # "filelock",
        "librosa>=0.8.0",
        "librosa==0.8.1",
        "jamo==0.4.1",  # For kss
        "PyYAML>=5.1.2",
        "soundfile>=0.10.2",
@@ -41,6 +41,8 @@
        # PAI
        "oss2",
        "kaldi-native-fbank",
        # timestamp
        "edit-distance"
    ],
    # train: The modules invoked when training only.
    "train": [
tests/test_asr_inference_pipeline.py
@@ -451,7 +451,7 @@
    def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
        inference_pipeline = pipeline(
            task=Tasks.auto_speech_recognition,
            model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
        rec_result = inference_pipeline(
            audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',