From 4dc3a1b011e1e72eb737417b8e0e0bec7a7e3a6e Mon Sep 17 00:00:00 2001
From: aky15 <ankeyu.aky@11.17.44.249>
Date: Tue, 21 Mar 2023 15:12:21 +0800
Subject: [PATCH] resolve conflict
---
egs/aishell/data2vec_transformer_finetune/run.sh | 2
egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py | 10
funasr/runtime/onnxruntime/readme.md | 4
funasr/runtime/grpc/Readme.md | 57
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py | 57
funasr/runtime/python/grpc/grpc_main_server.py | 14
egs/aishell/conformer/run.sh | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md | 2
funasr/bin/asr_inference_paraformer.py | 2
funasr/bin/asr_inference_uniasr_vad.py | 2
tests/test_asr_inference_pipeline.py | 4
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py | 29
funasr/bin/sond_inference.py | 32
funasr/runtime/grpc/common.cmake | 125 +
funasr/models/encoder/sanm_encoder.py | 42
funasr/tasks/diar.py | 87
funasr/export/models/modules/multihead_att.py | 30
funasr/tasks/abs_task.py | 8
funasr/utils/postprocess_utils.py | 7
egs/aishell/data2vec_paraformer_finetune/run.sh | 2
funasr/bin/asr_inference_uniasr.py | 2
egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py | 7
funasr/utils/timestamp_tools.py | 186 ++
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md | 10
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md | 2
funasr/runtime/grpc/rebuild.sh | 12
egs/aishell2/paraformer/run.sh | 2
funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py | 29
funasr/models/e2e_diar_eend_ola.py | 35
funasr/models/e2e_diar_sond.py | 26
funasr/runtime/python/benchmark_onnx.md | 89 +
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py | 2
egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py | 5
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py | 35
funasr/modules/eend_ola/encoder.py | 2
funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py | 28
funasr/datasets/iterable_dataset.py | 21
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py | 2
funasr/models/decoder/sanm_decoder.py | 101 +
egs/aishell2/paraformerbert/run.sh | 2
funasr/runtime/python/benchmark_libtorch.md | 45
funasr/bin/asr_inference_paraformer_streaming.py | 907 +++++++++++++
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py | 2
egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py | 5
funasr/runtime/grpc/paraformer_server.h | 56
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py | 2
egs/aishell2/transformerLM/run.sh | 2
funasr/models/e2e_asr_paraformer.py | 74 +
egs/mars/sd/local_run.sh | 2
funasr/export/README.md | 33
funasr/bin/diar_inference_launch.py | 5
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py | 35
funasr/runtime/python/utils/requirements.txt | 2
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py | 29
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py | 29
funasr/runtime/python/onnxruntime/README.md | 4
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py | 31
egs/aishell2/transformer/run.sh | 2
funasr/main_funcs/average_nbest_models.py | 18
funasr/runtime/python/grpc/grpc_server.py | 4
egs/aishell/paraformer/run.sh | 2
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py | 2
funasr/utils/asr_utils.py | 52
funasr/runtime/python/utils/test_rtf.py | 55
funasr/torch_utils/load_pretrained_model.py | 10
egs/aishell/paraformerbert/run.sh | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py | 13
egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py | 5
funasr/train/trainer.py | 36
setup.py | 6
egs/callhome/diarization/sond/unit_test.py | 8
funasr/runtime/python/utils/test_rtf.sh | 71 +
egs/alimeeting/diarization/sond/infer_alimeeting_test.py | 2
funasr/export/export_model.py | 146 +
egs/aishell/transformer/run.sh | 2
funasr/runtime/python/libtorch/README.md | 4
funasr/runtime/python/utils/split_scp.pl | 246 +++
funasr/runtime/onnxruntime/src/Audio.cpp | 5
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py | 34
funasr/bin/asr_inference_paraformer_vad_punc.py | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py | 13
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py | 31
funasr/runtime/grpc/paraformer_server.cc | 195 ++
funasr/bin/asr_inference_mfcca.py | 4
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md | 2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py | 31
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md | 2
README.md | 30
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py | 13
funasr/runtime/onnxruntime/src/librapidasrapi.cpp | 17
funasr/bin/asr_inference.py | 2
funasr/bin/eend_ola_inference.py | 26
funasr/datasets/large_datasets/utils/tokenize.py | 10
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md | 2
funasr/modules/embedding.py | 11
funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py | 15
funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py | 6
funasr/modules/eend_ola/encoder_decoder_attractor.py | 6
funasr/export/models/modules/encoder_layer.py | 6
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py | 2
egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py | 29
funasr/runtime/onnxruntime/tester/tester.cpp | 57
funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py | 7
egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py | 2
funasr/modules/attention.py | 10
funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py | 157 ++
funasr/datasets/preprocessor.py | 10
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py | 35
funasr/main_funcs/pack_funcs.py | 4
egs/alimeeting/diarization/sond/unit_test.py | 8
funasr/runtime/python/onnxruntime/setup.py | 4
funasr/tasks/sv.py | 2
funasr/tasks/asr.py | 4
egs/aishell2/conformer/run.sh | 2
funasr/runtime/python/libtorch/setup.py | 2
egs/alimeeting/diarization/sond/run.sh | 6
funasr/models/frontend/wav_frontend.py | 77
funasr/runtime/grpc/CMakeLists.txt | 83 +
egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md | 2
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md | 3
funasr/models/predictor/cif.py | 57
tests/test_sv_inference_pipeline.py | 1
funasr/bin/sv_inference.py | 4
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py | 31
egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py | 2
funasr/bin/asr_inference_launch.py | 3
128 files changed, 3,628 insertions(+), 508 deletions(-)
diff --git a/README.md b/README.md
index 0d1079b..23f1abe 100644
--- a/README.md
+++ b/README.md
@@ -15,36 +15,10 @@
| [**Model Zoo**](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
| [**Contact**](#contact)
+
## What's new:
-### 2023.2.17, funasr-0.2.0, modelscope-1.3.0
-- We support a new feature, export paraformer models into [onnx and torchscripts](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export) from modelscope. The local finetuned models are also supported.
-- We support a new feature, [onnxruntime](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python), you could deploy the runtime without modelscope or funasr, for the [paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary) model, the rtf of onnxruntime is 3x speedup(0.110->0.038) on cpu, [details](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/onnxruntime/paraformer/rapid_paraformer#speed).
-- We support a new feature, [grpc](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/runtime/python/grpc), you could build the ASR service with grpc, by deploying the modelscope pipeline or onnxruntime.
-- We release a new model [paraformer-large-contextual](https://www.modelscope.cn/models/damo/speech_paraformer-large-contextual_asr_nat-zh-cn-16k-common-vocab8404/summary), which supports the hotword customization based on the incentive enhancement, and improves the recall and precision of hotwords.
-- We optimize the timestamp alignment of [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), the prediction accuracy of timestamp is much improved, and achieving accumulated average shift (aas) of 74.7ms, [details](https://arxiv.org/abs/2301.12343).
-- We release a new model, [8k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We release a new model, [MFCCA](https://www.modelscope.cn/models/NPU-ASLP/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/summary), a multi-channel multi-speaker model which is independent of the number and geometry of microphones and supports Mandarin meeting transcription.
-- We release several new UniASR model:
-[Southern Fujian Dialect model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/summary),
-[French model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fr-16k-common-vocab3472-tensorflow1-online/summary),
-[German model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-de-16k-common-vocab3690-tensorflow1-online/summary),
-[Vietnamese model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-vi-16k-common-vocab1001-pytorch-online/summary),
-[Persian model](https://modelscope.cn/models/damo/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/summary).
-- We release a new model, [paraformer-data2vec model](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/summary), an unsupervised pretraining model on AISHELL-2, which is inited for paraformer model and then finetune on AISHEL-1.
-- We release a new feature, the `VAD`, `ASR` and `PUNC` models could be integrated freely, which could be models from [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), or the local finetine models. The [demo](https://github.com/alibaba-damo-academy/FunASR/discussions/134).
-- We optimized the [punctuation common model](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), enhance the recall and precision, fix the badcases of missing punctuation marks.
-- Various new types of audio input types are now supported by modelscope inference pipeline, including: mp3、flac、ogg、opus...
-### 2023.1.16, funasr-0.1.6， modelscope-1.2.0
-- We release a new version model [Paraformer-large-long](https://modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary), which integrate the [VAD](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary) model, [ASR](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary),
- [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary) model and timestamp together. The model could take in several hours long inputs.
-- We release a new model, [16k VAD model](https://modelscope.cn/models/damo/speech_fsmn_vad_zh-cn-16k-common-pytorch/summary), which could predict the duration of none-silence speech. It could be freely integrated with any ASR models in [modelscope](https://www.modelscope.cn/models/damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary).
-- We release a new model, [Punctuation](https://www.modelscope.cn/models/damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch/summary), which could predict the punctuation of ASR models's results. It could be freely integrated with any ASR models in [Model Zoo](docs/modelscope_models.md).
-- We release a new model, [Data2vec](https://www.modelscope.cn/models/damo/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/summary), an unsupervised pretraining model which could be finetuned on ASR and other downstream tasks.
-- We release a new model, [Paraformer-Tiny](https://www.modelscope.cn/models/damo/speech_paraformer-tiny-commandword_asr_nat-zh-cn-16k-vocab544-pytorch/summary), a lightweight Paraformer model which supports Mandarin command words recognition.
-- We release a new model, [SV](https://www.modelscope.cn/models/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/summary), which could extract speaker embeddings and further perform speaker verification on paired utterances. It will be supported for speaker diarization in the future version.
-- We improve the pipeline of modelscope to speedup the inference, by integrating the process of build model into build pipeline.
-- Various new types of audio input types are now supported by modelscope inference pipeline, including wav.scp, wav format, audio bytes, wave samples...
+For the release notes, please refer to [news](https://github.com/alibaba-damo-academy/FunASR/releases)
## Highlights
- Many types of typical models are supported, e.g., [Tranformer](https://arxiv.org/abs/1706.03762), [Conformer](https://arxiv.org/abs/2005.08100), [Paraformer](https://arxiv.org/abs/2206.08317).
diff --git a/egs/aishell/conformer/run.sh b/egs/aishell/conformer/run.sh
index 41db45d..09ddab8 100755
--- a/egs/aishell/conformer/run.sh
+++ b/egs/aishell/conformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_paraformer_finetune/run.sh b/egs/aishell/data2vec_paraformer_finetune/run.sh
index cada164..d033ce2 100755
--- a/egs/aishell/data2vec_paraformer_finetune/run.sh
+++ b/egs/aishell/data2vec_paraformer_finetune/run.sh
@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/data2vec_transformer_finetune/run.sh b/egs/aishell/data2vec_transformer_finetune/run.sh
index 7ab8626..26222e6 100755
--- a/egs/aishell/data2vec_transformer_finetune/run.sh
+++ b/egs/aishell/data2vec_transformer_finetune/run.sh
@@ -55,7 +55,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.cer_ctc.ave_10best.pth
+inference_asr_model=valid.cer_ctc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformer/run.sh b/egs/aishell/paraformer/run.sh
index 2b0f144..53b5f90 100755
--- a/egs/aishell/paraformer/run.sh
+++ b/egs/aishell/paraformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/paraformerbert/run.sh b/egs/aishell/paraformerbert/run.sh
index 96310ab..2487eac 100755
--- a/egs/aishell/paraformerbert/run.sh
+++ b/egs/aishell/paraformerbert/run.sh
@@ -56,7 +56,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell/transformer/run.sh b/egs/aishell/transformer/run.sh
index 4c307b0..f66a338 100755
--- a/egs/aishell/transformer/run.sh
+++ b/egs/aishell/transformer/run.sh
@@ -52,7 +52,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs/aishell2/conformer/run.sh b/egs/aishell2/conformer/run.sh
index bd6d81e..f9ea69a 100755
--- a/egs/aishell2/conformer/run.sh
+++ b/egs/aishell2/conformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformer/run.sh b/egs/aishell2/paraformer/run.sh
index 2b7d841..e1ea4fe 100755
--- a/egs/aishell2/paraformer/run.sh
+++ b/egs/aishell2/paraformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/paraformerbert/run.sh b/egs/aishell2/paraformerbert/run.sh
index d0407d4..239a7e3 100755
--- a/egs/aishell2/paraformerbert/run.sh
+++ b/egs/aishell2/paraformerbert/run.sh
@@ -58,7 +58,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer_noctc_1best.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformer/run.sh b/egs/aishell2/transformer/run.sh
index a5a14ec..6f2dd4d 100755
--- a/egs/aishell2/transformer/run.sh
+++ b/egs/aishell2/transformer/run.sh
@@ -54,7 +54,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, e.g., gpuid_list=2,3, the same as training stage by default
diff --git a/egs/aishell2/transformerLM/run.sh b/egs/aishell2/transformerLM/run.sh
index 28e3762..9e7a713 100755
--- a/egs/aishell2/transformerLM/run.sh
+++ b/egs/aishell2/transformerLM/run.sh
@@ -34,7 +34,7 @@
tag=exp1
model_dir="baseline_$(basename "${lm_config}" .yaml)_${lang}_${token_type}_${tag}"
lm_exp=${exp_dir}/exp/${model_dir}
-inference_lm=valid.loss.ave.pth # Language model path for decoding.
+inference_lm=valid.loss.ave.pb # Language model path for decoding.
stage=0
stop_stage=3
diff --git a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
index 0988f5d..b4d534b 100644
--- a/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
+++ b/egs/alimeeting/diarization/sond/infer_alimeeting_test.py
@@ -4,7 +4,7 @@
def main():
diar_config_path = sys.argv[1] if len(sys.argv) > 1 else "sond_fbank.yaml"
- diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pth"
+ diar_model_path = sys.argv[2] if len(sys.argv) > 2 else "sond.pb"
output_dir = sys.argv[3] if len(sys.argv) > 3 else "./outputs"
data_path_and_name_and_type = [
("data/test_rmsil/feats.scp", "speech", "kaldi_ark"),
diff --git a/egs/alimeeting/diarization/sond/run.sh b/egs/alimeeting/diarization/sond/run.sh
index 7e9a7f7..19ae40c 100644
--- a/egs/alimeeting/diarization/sond/run.sh
+++ b/egs/alimeeting/diarization/sond/run.sh
@@ -17,9 +17,9 @@
echo "Downloading Pre-trained model..."
git clone https://www.modelscope.cn/damo/speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch.git
git clone https://www.modelscope.cn/damo/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch.git
- ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pth ./sv.pth
+ ln -s speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.pb ./sv.pb
cp speech_xvector_sv-zh-cn-cnceleb-16k-spk3465-pytorch/sv.yaml ./sv.yaml
- ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pth ./sond.pth
+ ln -s speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.pb ./sond.pb
cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond_fbank.yaml ./sond_fbank.yaml
cp speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/sond.yaml ./sond.yaml
echo "Done."
@@ -30,7 +30,7 @@
if [ ${stage} -le 1 ] && [ ${stop_stage} -ge 1 ]; then
echo "Calculating diarization results..."
- python infer_alimeeting_test.py sond_fbank.yaml sond.pth outputs
+ python infer_alimeeting_test.py sond_fbank.yaml sond.pb outputs
python local/convert_label_to_rttm.py \
outputs/labels.txt \
data/test_rmsil/raw_rmsil_map.scp \
diff --git a/egs/alimeeting/diarization/sond/unit_test.py b/egs/alimeeting/diarization/sond/unit_test.py
index 84a4247..0f40ab2 100644
--- a/egs/alimeeting/diarization/sond/unit_test.py
+++ b/egs/alimeeting/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
def test_fbank_cpu_infer():
diar_config_path = "config_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
def test_fbank_gpu_infer():
diar_config_path = "config_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
def test_wav_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
def test_without_profile_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
raw_inputs = [[
"data/unit_test/raw_inputs/record.wav",
diff --git a/egs/callhome/diarization/sond/unit_test.py b/egs/callhome/diarization/sond/unit_test.py
index 519ac56..a48eda1 100644
--- a/egs/callhome/diarization/sond/unit_test.py
+++ b/egs/callhome/diarization/sond/unit_test.py
@@ -4,7 +4,7 @@
def test_fbank_cpu_infer():
diar_config_path = "sond_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -24,7 +24,7 @@
def test_fbank_gpu_infer():
diar_config_path = "sond_fbank.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_feats.scp", "speech", "kaldi_ark"),
@@ -45,7 +45,7 @@
def test_wav_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
data_path_and_name_and_type = [
("data/unit_test/test_wav.scp", "speech", "sound"),
@@ -66,7 +66,7 @@
def test_without_profile_gpu_infer():
diar_config_path = "config.yaml"
- diar_model_path = "sond.pth"
+ diar_model_path = "sond.pb"
output_dir = "./outputs"
raw_inputs = [[
"data/unit_test/raw_inputs/record.wav",
diff --git a/egs/mars/sd/local_run.sh b/egs/mars/sd/local_run.sh
index 3b319f4..4516e9f 100755
--- a/egs/mars/sd/local_run.sh
+++ b/egs/mars/sd/local_run.sh
@@ -49,7 +49,7 @@
model_dir="baseline_$(basename "${asr_config}" .yaml)_${feats_type}_${lang}_${token_type}_${tag}"
inference_config=conf/decode_asr_transformer.yaml
-inference_asr_model=valid.acc.ave_10best.pth
+inference_asr_model=valid.acc.ave_10best.pb
# you can set gpu num for decoding here
gpuid_list=$CUDA_VISIBLE_DEVICES # set gpus for decoding, the same as training stage by default
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
index 56c282c..b326067 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-paraformer-zh-cn-aishell2-16k/infer_after_finetune.py
@@ -48,5 +48,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+ params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
index c2e4354..053986d 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed~~~~
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
index e163999..2f038a8 100644
--- a/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/data2vec/speech_data2vec_pretrain-zh-cn-aishell2-16k-pytorch/infer_after_finetune.py
@@ -48,5 +48,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.cer_ctc.ave.pth"
+ params["decoding_model_name"] = "valid.cer_ctc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
index 9097e7a..16aeada 100644
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
index e714a3d..333b66a 100755
--- a/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
+++ b/egs_modelscope/asr/mfcca/speech_mfcca_asr-zh-cn-16k-alimeeting-vocab4950/infer_after_finetune.py
@@ -63,5 +63,5 @@
params["required_files"] = ["feats_stats.npz", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./example_data/validation"
- params["decoding_model_name"] = "valid.acc.ave.pth"
+ params["decoding_model_name"] = "valid.acc.ave.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
index dd3fb48..2fceb48 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
index 6c34ed0..fafe565 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell1-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
index d616d3e..d70af72 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
index 6140bb7..731cafe 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-aishell2-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index dfd509d..a044361 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -22,10 +22,12 @@
Or you can use the finetuned model for inference directly.
- Setting parameters in `infer.py`
+ - <strong>model:</strong> # model name on ModelScope
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- <strong>output_dir:</strong> # result dir
- - <strong>ngpu:</strong> # the number of GPUs for decoding
- - <strong>njob:</strong> # the number of jobs for each GPU
+ - <strong>ngpu:</strong> # the number of GPUs for decoding; if `ngpu` > 0, GPU decoding is used
+ - <strong>njob:</strong> # the number of parallel jobs for CPU decoding; only takes effect when `ngpu` = 0 (CPU decoding)
+ - <strong>batch_size:</strong> # batch size for inference
- Then you can run the pipeline to infer with:
```python
@@ -39,9 +41,11 @@
### Inference using local finetuned model
- Modify inference related parameters in `infer_after_finetune.py`
+ - <strong>modelscope_model_name:</strong> # model name on ModelScope
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
+ - <strong>batch_size:</strong> # batch size for inference
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
index f9f6114..795a1e7 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer.py
@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 94393ec..295c95d 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,23 +4,18 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
new file mode 100644
index 0000000..c1c541b
--- /dev/null
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online/infer.py
@@ -0,0 +1,57 @@
+import torch
+import torchaudio
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
+inference_pipeline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8404-online',
+ model_revision='v1.0.2')
+
+waveform, sample_rate = torchaudio.load("waihu.wav")
+speech_length = waveform.shape[1]
+speech = waveform[0]
+
+cache_en = {"start_idx": 0, "pad_left": 0, "stride": 10, "pad_right": 5, "cif_hidden": None, "cif_alphas": None}
+cache_de = {"decode_fsmn": None}
+cache = {"encoder": cache_en, "decoder": cache_de}
+param_dict = {}
+param_dict["cache"] = cache
+
+first_chunk = True
+speech_buffer = speech
+speech_cache = []
+final_result = ""
+
+while len(speech_buffer) >= 960:
+ if first_chunk:
+ if len(speech_buffer) >= 14400:
+ rec_result = inference_pipeline(audio_in=speech_buffer[0:14400], param_dict=param_dict)
+ speech_buffer = speech_buffer[4800:]
+ else:
+ cache_en["stride"] = len(speech_buffer) // 960
+ cache_en["pad_right"] = 0
+ rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
+ speech_buffer = []
+ cache_en["start_idx"] = -5
+ first_chunk = False
+ else:
+ cache_en["start_idx"] += 10
+ if len(speech_buffer) >= 4800:
+ cache_en["pad_left"] = 5
+ rec_result = inference_pipeline(audio_in=speech_buffer[:19200], param_dict=param_dict)
+ speech_buffer = speech_buffer[9600:]
+ else:
+ cache_en["stride"] = len(speech_buffer) // 960
+ cache_en["pad_right"] = 0
+ rec_result = inference_pipeline(audio_in=speech_buffer, param_dict=param_dict)
+ speech_buffer = []
+ if len(rec_result) !=0 and rec_result['text'] != "sil":
+ final_result += rec_result['text']
+ print(rec_result)
+print(final_result)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
index f08b31f..0b508fb 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer.py
@@ -8,9 +8,14 @@
from funasr.utils.compute_wer import compute_wer
-def modelscope_infer_core(output_dir, split_dir, njob, idx):
+def modelscope_infer_core(output_dir, split_dir, njob, idx, batch_size, ngpu, model):
output_dir_job = os.path.join(output_dir, "output.{}".format(idx))
- gpu_id = (int(idx) - 1) // njob
+ if ngpu > 0:
+ use_gpu = 1
+ gpu_id = int(idx) - 1
+ else:
+ use_gpu = 0
+ gpu_id = -1
if "CUDA_VISIBLE_DEVICES" in os.environ.keys():
gpu_list = os.environ['CUDA_VISIBLE_DEVICES'].split(",")
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_list[gpu_id])
@@ -18,9 +23,10 @@
os.environ['CUDA_VISIBLE_DEVICES'] = str(gpu_id)
inference_pipline = pipeline(
task=Tasks.auto_speech_recognition,
- model="damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1",
+ model=model,
output_dir=output_dir_job,
- batch_size=64
+ batch_size=batch_size,
+ ngpu=use_gpu,
)
audio_in = os.path.join(split_dir, "wav.{}.scp".format(idx))
inference_pipline(audio_in=audio_in)
@@ -30,13 +36,18 @@
# prepare for multi-GPU decoding
ngpu = params["ngpu"]
njob = params["njob"]
+ batch_size = params["batch_size"]
output_dir = params["output_dir"]
+ model = params["model"]
if os.path.exists(output_dir):
shutil.rmtree(output_dir)
os.mkdir(output_dir)
split_dir = os.path.join(output_dir, "split")
os.mkdir(split_dir)
- nj = ngpu * njob
+ if ngpu > 0:
+ nj = ngpu
+ elif ngpu == 0:
+ nj = njob
wav_scp_file = os.path.join(params["data_dir"], "wav.scp")
with open(wav_scp_file) as f:
lines = f.readlines()
@@ -56,7 +67,7 @@
p = Pool(nj)
for i in range(nj):
p.apply_async(modelscope_infer_core,
- args=(output_dir, split_dir, njob, str(i + 1)))
+ args=(output_dir, split_dir, njob, str(i + 1), batch_size, ngpu, model))
p.close()
p.join()
@@ -81,8 +92,10 @@
if __name__ == "__main__":
params = {}
+ params["model"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
params["data_dir"] = "./data/test"
params["output_dir"] = "./results"
- params["ngpu"] = 1
- params["njob"] = 1
- modelscope_infer(params)
+ params["ngpu"] = 1 # if ngpu > 0, will use gpu decoding
+ params["njob"] = 1 # if ngpu = 0, will use cpu decoding
+ params["batch_size"] = 64
+ modelscope_infer(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
index 96102cc..e8fee02 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1/infer_after_finetune.py
@@ -4,23 +4,18 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")
+ shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -29,9 +24,9 @@
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
@@ -46,8 +41,8 @@
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer_asr_nat-zh-cn-8k-common-vocab8358-tensorflow1"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
index d91a40a..6593f4e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-offline/infer_after_finetune.py
@@ -50,5 +50,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
index dfd509d..b68f1e9 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
index f9fb0db..f067c81 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-fa-16k-common-vocab1257-pytorch-online/infer_after_finetune.py
@@ -50,5 +50,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
new file mode 100644
index 0000000..56fb583
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
new file mode 100644
index 0000000..c54ab8c
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_he.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-he-16k-common-vocab1085-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
index 030c2e2..d4df29e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-minnan-16k-common-vocab3825/infer_after_finetune.py
@@ -50,5 +50,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
new file mode 100644
index 0000000..8bbce60
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
new file mode 100644
index 0000000..cfd869f
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_my.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-my-16k-common-vocab696-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
new file mode 100644
index 0000000..5e313e5
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/finetune.py
@@ -0,0 +1,35 @@
+import os
+from modelscope.metainfo import Trainers
+from modelscope.trainers import build_trainer
+from funasr.datasets.ms_dataset import MsDataset
+
+
+def modelscope_finetune(params):
+ if not os.path.exists(params["output_dir"]):
+ os.makedirs(params["output_dir"], exist_ok=True)
+ # dataset split ["train", "validation"]
+ ds_dict = MsDataset.load(params["data_dir"])
+ kwargs = dict(
+ model=params["model"],
+ model_revision=params["model_revision"],
+ data_dir=ds_dict,
+ dataset_type=params["dataset_type"],
+ work_dir=params["output_dir"],
+ batch_bins=params["batch_bins"],
+ max_epoch=params["max_epoch"],
+ lr=params["lr"])
+ trainer = build_trainer(Trainers.speech_asr_trainer, default_args=kwargs)
+ trainer.train()
+
+
+if __name__ == '__main__':
+ params = {}
+ params["output_dir"] = "./checkpoint"
+ params["data_dir"] = "./data"
+ params["batch_bins"] = 2000
+ params["dataset_type"] = "small"
+ params["max_epoch"] = 50
+ params["lr"] = 0.00005
+ params["model"] = "damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch"
+ params["model_revision"] = None
+ modelscope_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
new file mode 100644
index 0000000..e8c5524
--- /dev/null
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch/infer.py
@@ -0,0 +1,13 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+if __name__ == "__main__":
+ audio_in = "https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_ur.wav"
+ output_dir = "./results"
+ inference_pipline = pipeline(
+ task=Tasks.auto_speech_recognition,
+ model="damo/speech_UniASR_asr_2pass-ur-16k-common-vocab877-pytorch",
+ output_dir=output_dir,
+ )
+ rec_result = inference_pipline(audio_in=audio_in, param_dict={"decoding_model":"offline"})
+ print(rec_result)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
index dd947d3..9a84f9b 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
index 3b39a16..861fefb 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-offline/infer_after_finetune.py
@@ -49,5 +49,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
index dd947d3..eff933e 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/README.md
@@ -41,7 +41,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
index 4860cf7..d73cae2 100644
--- a/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
+++ b/egs_modelscope/asr/uniasr/speech_UniASR_asr_2pass-zh-cn-8k-common-vocab3445-pytorch-online/infer_after_finetune.py
@@ -49,5 +49,5 @@
params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "20epoch.pth"
+ params["decoding_model_name"] = "20epoch.pb"
modelscope_infer_after_finetune(params)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
index 1094bb5..94144ef 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/README.md
@@ -34,7 +34,7 @@
- Modify inference related parameters in `infer_after_finetune.py`
- <strong>output_dir:</strong> # result dir
- <strong>data_dir:</strong> # the dataset dir needs to include `test/wav.scp`. If `test/text` is also exists, CER will be computed
- - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pth`
+ - <strong>decoding_model_name:</strong> # set the checkpoint name for decoding, e.g., `valid.cer_ctc.ave.pb`
- Then you can run the pipeline to finetune with:
```python
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
index 5f171b4..473019c 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/infer_after_finetune.py
@@ -4,27 +4,17 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.hub.snapshot_download import snapshot_download
from funasr.utils.compute_wer import compute_wer
-
def modelscope_infer_after_finetune(params):
# prepare for decoding
- if not os.path.exists(os.path.join(params["output_dir"], "punc")):
- os.makedirs(os.path.join(params["output_dir"], "punc"))
- if not os.path.exists(os.path.join(params["output_dir"], "vad")):
- os.makedirs(os.path.join(params["output_dir"], "vad"))
- pretrained_model_path = os.path.join(os.environ["HOME"], ".cache/modelscope/hub", params["modelscope_model_name"])
- for file_name in params["required_files"]:
- if file_name == "configuration.json":
- with open(os.path.join(pretrained_model_path, file_name)) as f:
- config_dict = json.load(f)
- config_dict["model"]["am_model_name"] = params["decoding_model_name"]
- with open(os.path.join(params["output_dir"], "configuration.json"), "w") as f:
- json.dump(config_dict, f, indent=4, separators=(',', ': '))
- else:
- shutil.copy(os.path.join(pretrained_model_path, file_name),
- os.path.join(params["output_dir"], file_name))
+
+ try:
+ pretrained_model_path = snapshot_download(params["modelscope_model_name"], cache_dir=params["output_dir"])
+ except BaseException:
+ raise BaseException(f"Please download pretrain model from ModelScope firstly.")shutil.copy(os.path.join(params["output_dir"], params["decoding_model_name"]), os.path.join(pretrained_model_path, "model.pb"))
decoding_path = os.path.join(params["output_dir"], "decode_results")
if os.path.exists(decoding_path):
shutil.rmtree(decoding_path)
@@ -33,16 +23,16 @@
# decoding
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
- model=params["output_dir"],
+ model=pretrained_model_path,
output_dir=decoding_path,
- batch_size=64
+ batch_size=params["batch_size"]
)
audio_in = os.path.join(params["data_dir"], "wav.scp")
inference_pipeline(audio_in=audio_in)
# computer CER if GT text is set
text_in = os.path.join(params["data_dir"], "text")
- if text_in is not None:
+ if os.path.exists(text_in):
text_proc_file = os.path.join(decoding_path, "1best_recog/token")
compute_wer(text_in, text_proc_file, os.path.join(decoding_path, "text.cer"))
@@ -50,8 +40,8 @@
if __name__ == '__main__':
params = {}
params["modelscope_model_name"] = "damo/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
- params["required_files"] = ["am.mvn", "decoding.yaml", "configuration.json", "punc/punc.pb", "punc/punc.yaml", "vad/vad.mvn", "vad/vad.pb", "vad/vad.yaml"]
params["output_dir"] = "./checkpoint"
params["data_dir"] = "./data/test"
- params["decoding_model_name"] = "valid.acc.ave_10best.pth"
- modelscope_infer_after_finetune(params)
+ params["decoding_model_name"] = "valid.acc.ave_10best.pb"
+ params["batch_size"] = 64
+ modelscope_infer_after_finetune(params)
\ No newline at end of file
diff --git a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
index 540e3cf..2bac220 100644
--- a/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
+++ b/egs_modelscope/punctuation/punc_ct-transformer_zh-cn-common-vadrealtime-vocab272727/infer.py
@@ -4,6 +4,11 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
+
inference_pipeline = pipeline(
task=Tasks.punctuation,
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
new file mode 100644
index 0000000..81cb2c6
--- /dev/null
+++ b/egs_modelscope/speaker_diarization/speech_diarization_eend-ola-en-us-callhome-8k/infer.py
@@ -0,0 +1,10 @@
+from modelscope.pipelines import pipeline
+from modelscope.utils.constant import Tasks
+
+inference_diar_pipline = pipeline(
+ task=Tasks.speaker_diarization,
+ model='damo/speech_diarization_eend-ola-en-us-callhome-8k',
+ model_revision="v1.0.0",
+)
+results = inference_diar_pipline(audio_in=["https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record2.wav"])
+print(results)
\ No newline at end of file
diff --git a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
index 3cb31cf..5f4563d 100644
--- a/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
+++ b/egs_modelscope/speaker_diarization/speech_diarization_sond-zh-cn-alimeeting-16k-n16k4-pytorch/unit_test.py
@@ -14,13 +14,12 @@
)
# 以 audio_list 作为输入，其中第一个音频为待检测语音，后面的音频为不同说话人的声纹注册语音
-audio_list = [[
+audio_list = [
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/record.wav",
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_A.wav",
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B.wav",
"https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_data/spk_B1.wav"
-]]
+]
results = inference_diar_pipline(audio_in=audio_list)
-for rst in results:
- print(rst["value"])
+print(results)
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
index 66b8161..d70ed25 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-16k-common/infer_online.py
@@ -1,7 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
import soundfile
-
if __name__ == '__main__':
output_dir = None
diff --git a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
index abf4ef5..fb56908 100644
--- a/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
+++ b/egs_modelscope/vad/speech_fsmn_vad_zh-cn-8k-common/infer_online.py
@@ -1,7 +1,10 @@
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
+from modelscope.utils.logger import get_logger
+import logging
+logger = get_logger(log_level=logging.CRITICAL)
+logger.setLevel(logging.CRITICAL)
import soundfile
-
if __name__ == '__main__':
output_dir = None
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 318d3d7..f3b4d56 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -52,7 +52,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index b9be3e2..53eee64 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -256,6 +256,9 @@
elif mode == "paraformer":
from funasr.bin.asr_inference_paraformer import inference_modelscope
return inference_modelscope(**kwargs)
+ elif mode == "paraformer_streaming":
+ from funasr.bin.asr_inference_paraformer_streaming import inference_modelscope
+ return inference_modelscope(**kwargs)
elif mode == "paraformer_vad":
from funasr.bin.asr_inference_paraformer_vad import inference_modelscope
return inference_modelscope(**kwargs)
diff --git a/funasr/bin/asr_inference_mfcca.py b/funasr/bin/asr_inference_mfcca.py
index 4176ba6..6f3dbb1 100644
--- a/funasr/bin/asr_inference_mfcca.py
+++ b/funasr/bin/asr_inference_mfcca.py
@@ -41,8 +41,6 @@
from funasr.utils import asr_utils, wav_utils, postprocess_utils
import pdb
-header_colors = '\033[95m'
-end_colors = '\033[0m'
global_asr_language: str = 'zh-cn'
global_sample_rate: Union[int, Dict[Any, int]] = {
@@ -55,7 +53,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 6413d92..e45e575 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -50,7 +50,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_paraformer_streaming.py b/funasr/bin/asr_inference_paraformer_streaming.py
new file mode 100644
index 0000000..9b572a0
--- /dev/null
+++ b/funasr/bin/asr_inference_paraformer_streaming.py
@@ -0,0 +1,907 @@
+#!/usr/bin/env python3
+import argparse
+import logging
+import sys
+import time
+import copy
+import os
+import codecs
+import tempfile
+import requests
+from pathlib import Path
+from typing import Optional
+from typing import Sequence
+from typing import Tuple
+from typing import Union
+from typing import Dict
+from typing import Any
+from typing import List
+
+import numpy as np
+import torch
+from typeguard import check_argument_types
+
+from funasr.fileio.datadir_writer import DatadirWriter
+from funasr.modules.beam_search.beam_search import BeamSearchPara as BeamSearch
+from funasr.modules.beam_search.beam_search import Hypothesis
+from funasr.modules.scorers.ctc import CTCPrefixScorer
+from funasr.modules.scorers.length_bonus import LengthBonus
+from funasr.modules.subsampling import TooShortUttError
+from funasr.tasks.asr import ASRTaskParaformer as ASRTask
+from funasr.tasks.lm import LMTask
+from funasr.text.build_tokenizer import build_tokenizer
+from funasr.text.token_id_converter import TokenIDConverter
+from funasr.torch_utils.device_funcs import to_device
+from funasr.torch_utils.set_all_random_seed import set_all_random_seed
+from funasr.utils import config_argparse
+from funasr.utils.cli_utils import get_commandline_args
+from funasr.utils.types import str2bool
+from funasr.utils.types import str2triple_str
+from funasr.utils.types import str_or_none
+from funasr.utils import asr_utils, wav_utils, postprocess_utils
+from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+from funasr.export.models.e2e_asr_paraformer import Paraformer as Paraformer_export
+
+class Speech2Text:
+ """Speech2Text class
+
+ Examples:
+ >>> import soundfile
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
+ >>> audio, rate = soundfile.read("speech.wav")
+ >>> speech2text(audio)
+ [(text, token, token_int, hypothesis object), ...]
+
+ """
+
+ def __init__(
+ self,
+ asr_train_config: Union[Path, str] = None,
+ asr_model_file: Union[Path, str] = None,
+ cmvn_file: Union[Path, str] = None,
+ lm_train_config: Union[Path, str] = None,
+ lm_file: Union[Path, str] = None,
+ token_type: str = None,
+ bpemodel: str = None,
+ device: str = "cpu",
+ maxlenratio: float = 0.0,
+ minlenratio: float = 0.0,
+ dtype: str = "float32",
+ beam_size: int = 20,
+ ctc_weight: float = 0.5,
+ lm_weight: float = 1.0,
+ ngram_weight: float = 0.9,
+ penalty: float = 0.0,
+ nbest: int = 1,
+ frontend_conf: dict = None,
+ hotword_list_or_file: str = None,
+ **kwargs,
+ ):
+ assert check_argument_types()
+
+ # 1. Build ASR model
+ scorers = {}
+ asr_model, asr_train_args = ASRTask.build_model_from_file(
+ asr_train_config, asr_model_file, cmvn_file, device
+ )
+ frontend = None
+ if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+ frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+ logging.info("asr_model: {}".format(asr_model))
+ logging.info("asr_train_args: {}".format(asr_train_args))
+ asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+ if asr_model.ctc != None:
+ ctc = CTCPrefixScorer(ctc=asr_model.ctc, eos=asr_model.eos)
+ scorers.update(
+ ctc=ctc
+ )
+ token_list = asr_model.token_list
+ scorers.update(
+ length_bonus=LengthBonus(len(token_list)),
+ )
+
+ # 2. Build Language model
+ if lm_train_config is not None:
+ lm, lm_train_args = LMTask.build_model_from_file(
+ lm_train_config, lm_file, device
+ )
+ scorers["lm"] = lm.lm
+
+ # 3. Build ngram model
+ # ngram is not supported now
+ ngram = None
+ scorers["ngram"] = ngram
+
+ # 4. Build BeamSearch object
+ # transducer is not supported now
+ beam_search_transducer = None
+
+ weights = dict(
+ decoder=1.0 - ctc_weight,
+ ctc=ctc_weight,
+ lm=lm_weight,
+ ngram=ngram_weight,
+ length_bonus=penalty,
+ )
+ beam_search = BeamSearch(
+ beam_size=beam_size,
+ weights=weights,
+ scorers=scorers,
+ sos=asr_model.sos,
+ eos=asr_model.eos,
+ vocab_size=len(token_list),
+ token_list=token_list,
+ pre_beam_score_key=None if ctc_weight == 1.0 else "full",
+ )
+
+ beam_search.to(device=device, dtype=getattr(torch, dtype)).eval()
+ for scorer in scorers.values():
+ if isinstance(scorer, torch.nn.Module):
+ scorer.to(device=device, dtype=getattr(torch, dtype)).eval()
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+ # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+ if token_type is None:
+ token_type = asr_train_args.token_type
+ if bpemodel is None:
+ bpemodel = asr_train_args.bpemodel
+
+ if token_type is None:
+ tokenizer = None
+ elif token_type == "bpe":
+ if bpemodel is not None:
+ tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+ else:
+ tokenizer = None
+ else:
+ tokenizer = build_tokenizer(token_type=token_type)
+ converter = TokenIDConverter(token_list=token_list)
+ logging.info(f"Text tokenizer: {tokenizer}")
+
+ self.asr_model = asr_model
+ self.asr_train_args = asr_train_args
+ self.converter = converter
+ self.tokenizer = tokenizer
+
+ # 6. [Optional] Build hotword list from str, local file or url
+
+ is_use_lm = lm_weight != 0.0 and lm_file is not None
+ if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
+ beam_search = None
+ self.beam_search = beam_search
+ logging.info(f"Beam_search: {self.beam_search}")
+ self.beam_search_transducer = beam_search_transducer
+ self.maxlenratio = maxlenratio
+ self.minlenratio = minlenratio
+ self.device = device
+ self.dtype = dtype
+ self.nbest = nbest
+ self.frontend = frontend
+ self.encoder_downsampling_factor = 1
+ if asr_train_args.encoder == "data2vec_encoder" or asr_train_args.encoder_conf["input_layer"] == "conv2d":
+ self.encoder_downsampling_factor = 4
+
+ @torch.no_grad()
+ def __call__(
+ self, cache: dict, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None,
+ begin_time: int = 0, end_time: int = None,
+ ):
+ """Inference
+
+ Args:
+ speech: Input speech data
+ Returns:
+ text, token, token_int, hyp
+
+ """
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ self.asr_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+ lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ batch = {"speech": feats, "speech_lengths": feats_len, "cache": cache}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ # b. Forward Encoder
+ enc, enc_len = self.asr_model.encode_chunk(**batch)
+ if isinstance(enc, tuple):
+ enc = enc[0]
+ # assert len(enc) == 1, len(enc)
+ enc_len_batch_total = torch.sum(enc_len).item() * self.encoder_downsampling_factor
+
+ predictor_outs = self.asr_model.calc_predictor_chunk(enc, cache)
+ pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
+ predictor_outs[2], predictor_outs[3]
+ pre_token_length = pre_token_length.floor().long()
+ if torch.max(pre_token_length) < 1:
+ return []
+ decoder_outs = self.asr_model.cal_decoder_with_predictor_chunk(enc, pre_acoustic_embeds, cache)
+ decoder_out = decoder_outs
+
+ results = []
+ b, n, d = decoder_out.size()
+ for i in range(b):
+ x = enc[i, :enc_len[i], :]
+ am_scores = decoder_out[i, :pre_token_length[i], :]
+ if self.beam_search is not None:
+ nbest_hyps = self.beam_search(
+ x=x, am_scores=am_scores, maxlenratio=self.maxlenratio, minlenratio=self.minlenratio
+ )
+
+ nbest_hyps = nbest_hyps[: self.nbest]
+ else:
+ yseq = am_scores.argmax(dim=-1)
+ score = am_scores.max(dim=-1)[0]
+ score = torch.sum(score, dim=-1)
+ # pad with mask tokens to ensure compatibility with sos/eos tokens
+ yseq = torch.tensor(
+ [self.asr_model.sos] + yseq.tolist() + [self.asr_model.eos], device=yseq.device
+ )
+ nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+ for hyp in nbest_hyps:
+ assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+ # remove sos/eos and get results
+ last_pos = -1
+ if isinstance(hyp.yseq, list):
+ token_int = hyp.yseq[1:last_pos]
+ else:
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove blank symbol id, which is assumed to be 0
+ token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+
+ if self.tokenizer is not None:
+ text = self.tokenizer.tokens2text(token)
+ else:
+ text = None
+
+ results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+ # assert check_return_type(results)
+ return results
+
+
+class Speech2TextExport:
+ """Speech2TextExport class
+
+ """
+
+ def __init__(
+ self,
+ asr_train_config: Union[Path, str] = None,
+ asr_model_file: Union[Path, str] = None,
+ cmvn_file: Union[Path, str] = None,
+ lm_train_config: Union[Path, str] = None,
+ lm_file: Union[Path, str] = None,
+ token_type: str = None,
+ bpemodel: str = None,
+ device: str = "cpu",
+ maxlenratio: float = 0.0,
+ minlenratio: float = 0.0,
+ dtype: str = "float32",
+ beam_size: int = 20,
+ ctc_weight: float = 0.5,
+ lm_weight: float = 1.0,
+ ngram_weight: float = 0.9,
+ penalty: float = 0.0,
+ nbest: int = 1,
+ frontend_conf: dict = None,
+ hotword_list_or_file: str = None,
+ **kwargs,
+ ):
+
+ # 1. Build ASR model
+ asr_model, asr_train_args = ASRTask.build_model_from_file(
+ asr_train_config, asr_model_file, cmvn_file, device
+ )
+ frontend = None
+ if asr_train_args.frontend is not None and asr_train_args.frontend_conf is not None:
+ frontend = WavFrontend(cmvn_file=cmvn_file, **asr_train_args.frontend_conf)
+
+ logging.info("asr_model: {}".format(asr_model))
+ logging.info("asr_train_args: {}".format(asr_train_args))
+ asr_model.to(dtype=getattr(torch, dtype)).eval()
+
+ token_list = asr_model.token_list
+
+ logging.info(f"Decoding device={device}, dtype={dtype}")
+
+ # 5. [Optional] Build Text converter: e.g. bpe-sym -> Text
+ if token_type is None:
+ token_type = asr_train_args.token_type
+ if bpemodel is None:
+ bpemodel = asr_train_args.bpemodel
+
+ if token_type is None:
+ tokenizer = None
+ elif token_type == "bpe":
+ if bpemodel is not None:
+ tokenizer = build_tokenizer(token_type=token_type, bpemodel=bpemodel)
+ else:
+ tokenizer = None
+ else:
+ tokenizer = build_tokenizer(token_type=token_type)
+ converter = TokenIDConverter(token_list=token_list)
+ logging.info(f"Text tokenizer: {tokenizer}")
+
+ # self.asr_model = asr_model
+ self.asr_train_args = asr_train_args
+ self.converter = converter
+ self.tokenizer = tokenizer
+
+ self.device = device
+ self.dtype = dtype
+ self.nbest = nbest
+ self.frontend = frontend
+
+ model = Paraformer_export(asr_model, onnx=False)
+ self.asr_model = model
+
+ @torch.no_grad()
+ def __call__(
+ self, speech: Union[torch.Tensor, np.ndarray], speech_lengths: Union[torch.Tensor, np.ndarray] = None
+ ):
+ """Run the wrapped export model on a batch and decode it greedily (argmax).
+
+ Args:
+ speech: Input speech data; a numpy array is converted to a torch tensor.
+ speech_lengths: Lengths passed to the frontend, or used directly as the
+ feature lengths when no frontend is configured.
+ Returns:
+ A list with one tuple per batch item:
+ (text, token, token_int, hyp, enc_len_batch_total, lfr_factor)
+
+ """
+ assert check_argument_types()
+
+ # Input as audio signal
+ if isinstance(speech, np.ndarray):
+ speech = torch.tensor(speech)
+
+ if self.frontend is not None:
+ feats, feats_len = self.frontend.forward(speech, speech_lengths)
+ feats = to_device(feats, device=self.device)
+ feats_len = feats_len.int()
+ # Features are extracted here; presumably this stops the wrapped model
+ # from applying its own frontend again -- TODO confirm.
+ self.asr_model.frontend = None
+ else:
+ feats = speech
+ feats_len = speech_lengths
+
+ # Sum of the feature lengths over the batch; returned with every result.
+ enc_len_batch_total = feats_len.sum()
+ # Derived from the last feature dimension in units of 80
+ # (presumably 80 is the per-frame mel dimension -- TODO confirm).
+ lfr_factor = max(1, (feats.size()[-1] // 80) - 1)
+ batch = {"speech": feats, "speech_lengths": feats_len}
+
+ # a. To device
+ batch = to_device(batch, device=self.device)
+
+ decoder_outs = self.asr_model(**batch)
+ decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+
+ results = []
+ b, n, d = decoder_out.size()
+ for i in range(b):
+ # Keep only the first ys_pad_lens[i] positions of item i.
+ am_scores = decoder_out[i, :ys_pad_lens[i], :]
+
+ # Greedy decoding: best id per position; the score is the sum of the maxima.
+ yseq = am_scores.argmax(dim=-1)
+ score = am_scores.max(dim=-1)[0]
+ score = torch.sum(score, dim=-1)
+ # NOTE(review): no sos/eos (or mask) tokens are added here; yseq is only
+ # rebuilt as a fresh tensor on the same device.
+ yseq = torch.tensor(
+ yseq.tolist(), device=yseq.device
+ )
+ nbest_hyps = [Hypothesis(yseq=yseq, score=score)]
+
+ for hyp in nbest_hyps:
+ assert isinstance(hyp, (Hypothesis)), type(hyp)
+
+ # Drop the first and last entry of yseq.
+ # NOTE(review): written as sos/eos removal, but no sos/eos was added
+ # above, so this appears to drop real predictions -- confirm intent.
+ last_pos = -1
+ if isinstance(hyp.yseq, list):
+ token_int = hyp.yseq[1:last_pos]
+ else:
+ token_int = hyp.yseq[1:last_pos].tolist()
+
+ # remove ids 0 and 2 (0 is assumed to be the blank symbol;
+ # 2 is presumably sos/eos -- TODO confirm)
+ token_int = list(filter(lambda x: x != 0 and x != 2, token_int))
+
+ # Change integer-ids to tokens
+ token = self.converter.ids2tokens(token_int)
+
+ if self.tokenizer is not None:
+ text = self.tokenizer.tokens2text(token)
+ else:
+ text = None
+
+ results.append((text, token, token_int, hyp, enc_len_batch_total, lfr_factor))
+
+ return results
+
+
+ def inference(
+ maxlenratio: float,
+ minlenratio: float,
+ batch_size: int,
+ beam_size: int,
+ ngpu: int,
+ ctc_weight: float,
+ lm_weight: float,
+ penalty: float,
+ log_level: Union[int, str],
+ data_path_and_name_and_type,
+ asr_train_config: Optional[str],
+ asr_model_file: Optional[str],
+ cmvn_file: Optional[str] = None,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ streaming: bool = False,
+ output_dir: Optional[str] = None,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
+
+ **kwargs,
+ ):
+ """Build a decoding pipeline with ``inference_modelscope`` and run it once.
+
+ Every argument except ``data_path_and_name_and_type`` is forwarded by keyword
+ to ``inference_modelscope``; the callable it returns is then invoked with
+ ``data_path_and_name_and_type`` and ``raw_inputs``, and its result is returned.
+ """
+ inference_pipeline = inference_modelscope(
+ maxlenratio=maxlenratio,
+ minlenratio=minlenratio,
+ batch_size=batch_size,
+ beam_size=beam_size,
+ ngpu=ngpu,
+ ctc_weight=ctc_weight,
+ lm_weight=lm_weight,
+ penalty=penalty,
+ log_level=log_level,
+ asr_train_config=asr_train_config,
+ asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
+ raw_inputs=raw_inputs,
+ lm_train_config=lm_train_config,
+ lm_file=lm_file,
+ token_type=token_type,
+ key_file=key_file,
+ word_lm_train_config=word_lm_train_config,
+ bpemodel=bpemodel,
+ allow_variable_data_keys=allow_variable_data_keys,
+ streaming=streaming,
+ output_dir=output_dir,
+ dtype=dtype,
+ seed=seed,
+ ngram_weight=ngram_weight,
+ nbest=nbest,
+ num_workers=num_workers,
+
+ **kwargs,
+ )
+ # Run the freshly built pipeline on the requested data.
+ return inference_pipeline(data_path_and_name_and_type, raw_inputs)
+
+
+ def inference_modelscope(
+ maxlenratio: float,
+ minlenratio: float,
+ batch_size: int,
+ beam_size: int,
+ ngpu: int,
+ ctc_weight: float,
+ lm_weight: float,
+ penalty: float,
+ log_level: Union[int, str],
+ # data_path_and_name_and_type,
+ asr_train_config: Optional[str],
+ asr_model_file: Optional[str],
+ cmvn_file: Optional[str] = None,
+ lm_train_config: Optional[str] = None,
+ lm_file: Optional[str] = None,
+ token_type: Optional[str] = None,
+ key_file: Optional[str] = None,
+ word_lm_train_config: Optional[str] = None,
+ bpemodel: Optional[str] = None,
+ allow_variable_data_keys: bool = False,
+ dtype: str = "float32",
+ seed: int = 0,
+ ngram_weight: float = 0.9,
+ nbest: int = 1,
+ num_workers: int = 1,
+ output_dir: Optional[str] = None,
+ param_dict: dict = None,
+ **kwargs,
+ ):
+ """Build the speech-to-text object once and return a reusable decode function.
+
+ Configures logging, selects the device, seeds the RNGs and constructs either
+ ``Speech2TextExport`` (when ``param_dict["export_mode"]`` is truthy) or
+ ``Speech2Text``. The returned ``_forward`` closure decodes data with it.
+
+ Raises:
+ NotImplementedError: if ``word_lm_train_config`` is given or ``ngpu > 1``.
+ """
+ assert check_argument_types()
+
+ if word_lm_train_config is not None:
+ raise NotImplementedError("Word LM is not implemented")
+ if ngpu > 1:
+ raise NotImplementedError("only single GPU decoding is supported")
+
+ logging.basicConfig(
+ level=log_level,
+ format="%(asctime)s (%(module)s:%(lineno)d) %(levelname)s: %(message)s",
+ )
+
+ # Optional settings carried in param_dict: hotwords and export mode.
+ export_mode = False
+ if param_dict is not None:
+ hotword_list_or_file = param_dict.get('hotword')
+ export_mode = param_dict.get("export_mode", False)
+ else:
+ hotword_list_or_file = None
+
+ if ngpu >= 1 and torch.cuda.is_available():
+ device = "cuda"
+ else:
+ device = "cpu"
+ # On the CPU path the requested batch size is overridden.
+ batch_size = 1
+
+ # 1. Set random-seed
+ set_all_random_seed(seed)
+
+ # 2. Build speech2text
+ speech2text_kwargs = dict(
+ asr_train_config=asr_train_config,
+ asr_model_file=asr_model_file,
+ cmvn_file=cmvn_file,
+ lm_train_config=lm_train_config,
+ lm_file=lm_file,
+ token_type=token_type,
+ bpemodel=bpemodel,
+ device=device,
+ maxlenratio=maxlenratio,
+ minlenratio=minlenratio,
+ dtype=dtype,
+ beam_size=beam_size,
+ ctc_weight=ctc_weight,
+ lm_weight=lm_weight,
+ ngram_weight=ngram_weight,
+ penalty=penalty,
+ nbest=nbest,
+ hotword_list_or_file=hotword_list_or_file,
+ )
+ if export_mode:
+ speech2text = Speech2TextExport(**speech2text_kwargs)
+ else:
+ speech2text = Speech2Text(**speech2text_kwargs)
+
+ def _forward(
+ data_path_and_name_and_type,
+ raw_inputs: Union[np.ndarray, torch.Tensor] = None,
+ output_dir_v2: Optional[str] = None,
+ fs: dict = None,
+ param_dict: dict = None,
+ **kwargs,
+ ):
+ """Decode the given data and return a list of result dicts.
+
+ Each appended item has the keys 'key' and 'value', plus 'time_stamp' when
+ post-processing produced a non-empty timestamp.
+ """
+
+ # Per-call hotwords: param_dict first, an explicit 'hotword' kwarg wins.
+ hotword_list_or_file = None
+ if param_dict is not None:
+ hotword_list_or_file = param_dict.get('hotword')
+ if 'hotword' in kwargs:
+ hotword_list_or_file = kwargs['hotword']
+ # Also regenerate when 'hotword' was passed explicitly, even as None.
+ if hotword_list_or_file is not None or 'hotword' in kwargs:
+ speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
+
+ # 3. Build data-iterator
+ # In-memory input is wrapped as a single "speech"/"waveform" entry.
+ if data_path_and_name_and_type is None and raw_inputs is not None:
+ if isinstance(raw_inputs, torch.Tensor):
+ raw_inputs = raw_inputs.numpy()
+ data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+ loader = ASRTask.build_streaming_iterator(
+ data_path_and_name_and_type,
+ dtype=dtype,
+ fs=fs,
+ batch_size=batch_size,
+ key_file=key_file,
+ num_workers=num_workers,
+ preprocess_fn=ASRTask.build_preprocess_fn(speech2text.asr_train_args, False),
+ collate_fn=ASRTask.build_collate_fn(speech2text.asr_train_args, False),
+ allow_variable_data_keys=allow_variable_data_keys,
+ inference=True,
+ )
+
+ # Timestamps are used by default unless param_dict disables them.
+ if param_dict is not None:
+ use_timestamp = param_dict.get('use_timestamp', True)
+ else:
+ use_timestamp = True
+
+ forward_time_total = 0.0
+ length_total = 0.0
+ finish_count = 0
+ file_count = 1
+ cache = None
+ # 7 .Start for-loop
+ # FIXME(kamo): The output format should be discussed about
+ asr_result_list = []
+ # A per-call output directory takes precedence over the one given at build time.
+ output_path = output_dir_v2 if output_dir_v2 is not None else output_dir
+ if output_path is not None:
+ writer = DatadirWriter(output_path)
+ else:
+ writer = None
+ if param_dict is not None and "cache" in param_dict:
+ cache = param_dict["cache"]
+ for keys, batch in loader:
+ assert isinstance(batch, dict), type(batch)
+ assert all(isinstance(s, str) for s in keys), keys
+ _bs = len(next(iter(batch.values())))
+ assert len(keys) == _bs, f"{len(keys)} != {_bs}"
+ # batch = {k: v for k, v in batch.items() if not k.endswith("_lengths")}
+ logging.info("decoding, utt_id: {}".format(keys))
+ # N-best list of (text, token, token_int, hyp_object)
+
+ time_beg = time.time()
+ # NOTE(review): `cache` is always passed by keyword; the export __call__
+ # visible earlier in this patch takes only speech/speech_lengths, so
+ # export_mode may fail here -- confirm.
+ results = speech2text(cache=cache, **batch)
+ if len(results) < 1:
+ # Nothing decoded: substitute a placeholder result.
+ hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
+ results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
+ time_end = time.time()
+ forward_time = time_end - time_beg
+ # The last two entries of a result are read as the length and lfr_factor.
+ lfr_factor = results[0][-1]
+ length = results[0][-2]
+ forward_time_total += forward_time
+ length_total += length
+ rtf_cur = "decoding, feature length: {}, forward_time: {:.4f}, rtf: {:.4f}".format(length, forward_time,
+ 100 * forward_time / (
+ length * lfr_factor))
+ logging.info(rtf_cur)
+
+ for batch_id in range(_bs):
+ # Strip the trailing (length, lfr_factor) pair; wrap as a 1-element n-best list.
+ result = [results[batch_id][:-2]]
+
+ key = keys[batch_id]
+ # NOTE: `result` is rebound from the list to its element by this loop.
+ for n, result in zip(range(1, nbest + 1), result):
+ text, token, token_int, hyp = result[0], result[1], result[2], result[3]
+ time_stamp = None if len(result) < 5 else result[4]
+ # Create a directory: outdir/{n}best_recog
+ if writer is not None:
+ ibest_writer = writer[f"{n}best_recog"]
+
+ # Write the result to each file
+ ibest_writer["token"][key] = " ".join(token)
+ # ibest_writer["token_int"][key] = " ".join(map(str, token_int))
+ ibest_writer["score"][key] = str(hyp.score)
+ ibest_writer["rtf"][key] = rtf_cur
+
+ if text is not None:
+ if use_timestamp and time_stamp is not None:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token, time_stamp)
+ else:
+ postprocessed_result = postprocess_utils.sentence_postprocess(token)
+ time_stamp_postprocessed = ""
+ # A 3-item result carries a timestamp; otherwise only text and word list are read.
+ if len(postprocessed_result) == 3:
+ text_postprocessed, time_stamp_postprocessed, word_lists = postprocessed_result[0], \
+ postprocessed_result[1], \
+ postprocessed_result[2]
+ else:
+ text_postprocessed, word_lists = postprocessed_result[0], postprocessed_result[1]
+ item = {'key': key, 'value': text_postprocessed}
+ if time_stamp_postprocessed != "":
+ item['time_stamp'] = time_stamp_postprocessed
+ asr_result_list.append(item)
+ finish_count += 1
+ # asr_utils.print_progress(finish_count / file_count)
+ if writer is not None:
+ ibest_writer["text"][key] = text_postprocessed
+
+ logging.info("decoding, utt: {}, predictions: {}".format(key, text))
+ # NOTE(review): `lfr_factor` and `ibest_writer` are only bound inside the loop
+ # above; if the loader yields nothing this summary appears to raise NameError,
+ # and a zero length_total would divide by zero -- confirm.
+ rtf_avg = "decoding, feature length total: {}, forward_time total: {:.4f}, rtf avg: {:.4f}".format(length_total,
+ forward_time_total,
+ 100 * forward_time_total / (
+ length_total * lfr_factor))
+ logging.info(rtf_avg)
+ if writer is not None:
+ # NOTE(review): the key "rtf_avf" looks like a typo of "rtf_avg"; it is kept
+ # because readers of the output may depend on it.
+ ibest_writer["rtf"]["rtf_avf"] = rtf_avg
+ return asr_result_list
+
+ return _forward
+
+
+ def get_parser():
+     """Build the command-line argument parser for ASR decoding.
+
+     Returns:
+         The configured ``config_argparse.ArgumentParser``. Options are grouped
+         into input data, model configuration, beam search and text converter.
+     """
+     parser = config_argparse.ArgumentParser(
+         description="ASR Decoding",
+         formatter_class=argparse.ArgumentDefaultsHelpFormatter,
+     )
+
+     # Note(kamo): Use '_' instead of '-' as separator.
+     # '-' is confusing if written in yaml.
+     parser.add_argument(
+         "--log_level",
+         type=lambda x: x.upper(),
+         default="INFO",
+         choices=("CRITICAL", "ERROR", "WARNING", "INFO", "DEBUG", "NOTSET"),
+         help="The verbose level of logging",
+     )
+
+     parser.add_argument("--output_dir", type=str, required=True)
+     parser.add_argument(
+         "--ngpu",
+         type=int,
+         default=0,
+         help="The number of gpus. 0 indicates CPU mode",
+     )
+     parser.add_argument("--seed", type=int, default=0, help="Random seed")
+     parser.add_argument(
+         "--dtype",
+         default="float32",
+         choices=["float16", "float32", "float64"],
+         help="Data type",
+     )
+     parser.add_argument(
+         "--num_workers",
+         type=int,
+         default=1,
+         help="The number of workers used for DataLoader",
+     )
+     parser.add_argument(
+         "--hotword",
+         type=str_or_none,
+         default=None,
+         help="hotword file path or hotwords separated by space"
+     )
+     group = parser.add_argument_group("Input data related")
+     group.add_argument(
+         "--data_path_and_name_and_type",
+         type=str2triple_str,
+         required=False,
+         action="append",
+     )
+     group.add_argument("--key_file", type=str_or_none)
+     group.add_argument("--allow_variable_data_keys", type=str2bool, default=False)
+
+     group = parser.add_argument_group("The model configuration related")
+     group.add_argument(
+         "--asr_train_config",
+         type=str,
+         help="ASR training configuration",
+     )
+     group.add_argument(
+         "--asr_model_file",
+         type=str,
+         help="ASR model parameter file",
+     )
+     group.add_argument(
+         "--cmvn_file",
+         type=str,
+         help="Global cmvn file",
+     )
+     group.add_argument(
+         "--lm_train_config",
+         type=str,
+         help="LM training configuration",
+     )
+     group.add_argument(
+         "--lm_file",
+         type=str,
+         help="LM parameter file",
+     )
+     group.add_argument(
+         "--word_lm_train_config",
+         type=str,
+         help="Word LM training configuration",
+     )
+     group.add_argument(
+         "--word_lm_file",
+         type=str,
+         help="Word LM parameter file",
+     )
+     group.add_argument(
+         "--ngram_file",
+         type=str,
+         help="N-gram parameter file",
+     )
+     group.add_argument(
+         "--model_tag",
+         type=str,
+         help="Pretrained model tag. If specify this option, *_train_config and "
+              "*_file will be overwritten",
+     )
+
+     group = parser.add_argument_group("Beam-search related")
+     group.add_argument(
+         "--batch_size",
+         type=int,
+         default=1,
+         help="The batch size for inference",
+     )
+     group.add_argument("--nbest", type=int, default=1, help="Output N-best hypotheses")
+     group.add_argument("--beam_size", type=int, default=20, help="Beam size")
+     group.add_argument("--penalty", type=float, default=0.0, help="Insertion penalty")
+     group.add_argument(
+         "--maxlenratio",
+         type=float,
+         default=0.0,
+         # Each fragment ends with a space so the concatenated help text reads
+         # as separate words and sentences.
+         help="Input length ratio to obtain max output length. "
+              "If maxlenratio=0.0 (default), it uses a end-detect "
+              "function "
+              "to automatically find maximum hypothesis lengths. "
+              "If maxlenratio<0.0, its absolute value is interpreted "
+              "as a constant max output length",
+     )
+     group.add_argument(
+         "--minlenratio",
+         type=float,
+         default=0.0,
+         help="Input length ratio to obtain min output length",
+     )
+     group.add_argument(
+         "--ctc_weight",
+         type=float,
+         default=0.5,
+         help="CTC weight in joint decoding",
+     )
+     group.add_argument("--lm_weight", type=float, default=1.0, help="RNNLM weight")
+     group.add_argument("--ngram_weight", type=float, default=0.9, help="ngram weight")
+     group.add_argument("--streaming", type=str2bool, default=False)
+
+     group.add_argument(
+         "--frontend_conf",
+         default=None,
+         help="",
+     )
+     group.add_argument("--raw_inputs", type=list, default=None)
+     # example=[{'key':'EdevDEWdIYQ_0021','file':'/mnt/data/jiangyu.xzy/test_data/speech_io/SPEECHIO_ASR_ZH00007_zhibodaihuo/wav/EdevDEWdIYQ_0021.wav'}])
+
+     group = parser.add_argument_group("Text converter related")
+     group.add_argument(
+         "--token_type",
+         type=str_or_none,
+         default=None,
+         choices=["char", "bpe", None],
+         help="The token type for ASR model. "
+              "If not given, refers from the training args",
+     )
+     group.add_argument(
+         "--bpemodel",
+         type=str_or_none,
+         default=None,
+         help="The model path of sentencepiece. "
+              "If not given, refers from the training args",
+     )
+
+     return parser
+
+
+ def main(cmd=None):
+     """Command-line entry point: parse ``cmd`` and run ``inference``.
+
+     Args:
+         cmd: Argument list handed to ``parse_args``; ``None`` is passed through.
+     """
+     print(get_commandline_args(), file=sys.stderr)
+     args = get_parser().parse_args(cmd)
+     # The parsed namespace doubles as the keyword dict given to inference().
+     kwargs = vars(args)
+     kwargs.pop("config", None)
+     # The hotword setting is additionally handed over inside param_dict.
+     kwargs['param_dict'] = {'hotword': args.hotword}
+     inference(**kwargs)
+
+
+ # Run the command-line entry point only when this file is executed as a script.
+ if __name__ == "__main__":
+ main()
+
+ # from modelscope.pipelines import pipeline
+ # from modelscope.utils.constant import Tasks
+ #
+ # inference_16k_pipline = pipeline(
+ # task=Tasks.auto_speech_recognition,
+ # model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
+ #
+ # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
+ # print(rec_result)
+
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index a0e7b47..3f57751 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -58,7 +58,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index 8b31fad..ac71538 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -46,7 +46,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/asr_inference_uniasr_vad.py b/funasr/bin/asr_inference_uniasr_vad.py
index e5815df..7cb889b 100644
--- a/funasr/bin/asr_inference_uniasr_vad.py
+++ b/funasr/bin/asr_inference_uniasr_vad.py
@@ -46,7 +46,7 @@
Examples:
>>> import soundfile
- >>> speech2text = Speech2Text("asr_config.yml", "asr.pth")
+ >>> speech2text = Speech2Text("asr_config.yml", "asr.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2text(audio)
[(text, token, token_int, hypothesis object), ...]
diff --git a/funasr/bin/diar_inference_launch.py b/funasr/bin/diar_inference_launch.py
index 7738f4f..85e4518 100755
--- a/funasr/bin/diar_inference_launch.py
+++ b/funasr/bin/diar_inference_launch.py
@@ -133,7 +133,7 @@
param_dict = {
"extract_profile": True,
"sv_train_config": "sv.yaml",
- "sv_model_file": "sv.pth",
+ "sv_model_file": "sv.pb",
}
if "param_dict" in kwargs and kwargs["param_dict"] is not None:
for key in param_dict:
@@ -142,6 +142,9 @@
else:
kwargs["param_dict"] = param_dict
return inference_modelscope(mode=mode, **kwargs)
+ elif mode == "eend-ola":
+ from funasr.bin.eend_ola_inference import inference_modelscope
+ return inference_modelscope(mode=mode, **kwargs)
else:
logging.info("Unknown decoding mode: {}".format(mode))
return None
diff --git a/funasr/bin/eend_ola_inference.py b/funasr/bin/eend_ola_inference.py
index d65895f..01d3f29 100755
--- a/funasr/bin/eend_ola_inference.py
+++ b/funasr/bin/eend_ola_inference.py
@@ -16,6 +16,7 @@
import numpy as np
import torch
+from scipy.signal import medfilt
from typeguard import check_argument_types
from funasr.models.frontend.wav_frontend import WavFrontendMel23
@@ -34,7 +35,7 @@
Examples:
>>> import soundfile
>>> import numpy as np
- >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+ >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
>>> profile = np.load("profiles.npy")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2diar(audio, profile)
@@ -146,7 +147,7 @@
output_dir: Optional[str] = None,
batch_size: int = 1,
dtype: str = "float32",
- ngpu: int = 0,
+ ngpu: int = 1,
num_workers: int = 0,
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
@@ -179,7 +180,6 @@
diar_model_file=diar_model_file,
device=device,
dtype=dtype,
- streaming=streaming,
)
logging.info("speech2diarization_kwargs: {}".format(speech2diar_kwargs))
speech2diar = Speech2Diarization.from_pretrained(
@@ -209,7 +209,7 @@
if data_path_and_name_and_type is None and raw_inputs is not None:
if isinstance(raw_inputs, torch.Tensor):
raw_inputs = raw_inputs.numpy()
- data_path_and_name_and_type = [raw_inputs, "speech", "waveform"]
+ data_path_and_name_and_type = [raw_inputs[0], "speech", "sound"]
loader = EENDOLADiarTask.build_streaming_iterator(
data_path_and_name_and_type,
dtype=dtype,
@@ -236,9 +236,23 @@
# batch = {k: v[0] for k, v in batch.items() if not k.endswith("_lengths")}
results = speech2diar(**batch)
+
+ # post process
+ a = results[0][0].cpu().numpy()
+ a = medfilt(a, (11, 1))
+ rst = []
+ for spkid, frames in enumerate(a.T):
+ frames = np.pad(frames, (1, 1), 'constant')
+ changes, = np.where(np.diff(frames, axis=0) != 0)
+ fmt = "SPEAKER {:s} 1 {:7.2f} {:7.2f} <NA> <NA> {:s} <NA>"
+ for s, e in zip(changes[::2], changes[1::2]):
+ st = s / 10.
+ dur = (e - s) / 10.
+ rst.append(fmt.format(keys[0], st, dur, "{}_{}".format(keys[0], str(spkid))))
+
# Only supporting batch_size==1
- key, value = keys[0], output_results_str(results, keys[0])
- item = {"key": key, "value": value}
+ value = "\n".join(rst)
+ item = {"key": keys[0], "value": value}
result_list.append(item)
if output_path is not None:
output_writer.write(value)
diff --git a/funasr/bin/sond_inference.py b/funasr/bin/sond_inference.py
index ab6d26f..5a0a8e2 100755
--- a/funasr/bin/sond_inference.py
+++ b/funasr/bin/sond_inference.py
@@ -42,7 +42,7 @@
Examples:
>>> import soundfile
>>> import numpy as np
- >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pth")
+ >>> speech2diar = Speech2Diarization("diar_sond_config.yml", "diar_sond.pb")
>>> profile = np.load("profiles.npy")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2diar(audio, profile)
@@ -54,7 +54,7 @@
self,
diar_train_config: Union[Path, str] = None,
diar_model_file: Union[Path, str] = None,
- device: str = "cpu",
+ device: Union[str, torch.device] = "cpu",
batch_size: int = 1,
dtype: str = "float32",
streaming: bool = False,
@@ -114,9 +114,19 @@
# little-endian order: lower bit first
return (np.array(list(b)[::-1]) == '1').astype(dtype)
- return np.row_stack([int2vec(int(x), vec_dim) for x in seq])
+ # process oov
+ seq = np.array([int(x) for x in seq])
+ new_seq = []
+ for i, x in enumerate(seq):
+ if x < 2 ** vec_dim:
+ new_seq.append(x)
+ else:
+ idx_list = np.where(seq < 2 ** vec_dim)[0]
+ idx = np.abs(idx_list - i).argmin()
+ new_seq.append(seq[idx_list[idx]])
+ return np.row_stack([int2vec(x, vec_dim) for x in new_seq])
- def post_processing(self, raw_logits: torch.Tensor, spk_num: int):
+ def post_processing(self, raw_logits: torch.Tensor, spk_num: int, output_format: str = "speaker_turn"):
logits_idx = raw_logits.argmax(-1) # B, T, vocab_size -> B, T
# upsampling outputs to match inputs
ut = logits_idx.shape[1] * self.diar_model.encoder.time_ds_ratio
@@ -127,8 +137,14 @@
).squeeze(1).long()
logits_idx = logits_idx[0].tolist()
pse_labels = [self.token_list[x] for x in logits_idx]
+ if output_format == "pse_labels":
+ return pse_labels, None
+
multi_labels = self.seq2arr(pse_labels, spk_num)[:, :spk_num] # remove padding speakers
multi_labels = self.smooth_multi_labels(multi_labels)
+ if output_format == "binary_labels":
+ return multi_labels, None
+
spk_list = ["spk{}".format(i + 1) for i in range(spk_num)]
spk_turns = self.calc_spk_turns(multi_labels, spk_list)
results = OrderedDict()
@@ -149,6 +165,7 @@
self,
speech: Union[torch.Tensor, np.ndarray],
profile: Union[torch.Tensor, np.ndarray],
+ output_format: str = "speaker_turn"
):
"""Inference
@@ -178,7 +195,7 @@
batch = to_device(batch, device=self.device)
logits = self.diar_model.prediction_forward(**batch)
- results, pse_labels = self.post_processing(logits, profile.shape[1])
+ results, pse_labels = self.post_processing(logits, profile.shape[1], output_format)
return results, pse_labels
@@ -367,7 +384,7 @@
pse_label_writer = open("{}/labels.txt".format(output_path), "w")
logging.info("Start to diarize...")
result_list = []
- for keys, batch in loader:
+ for idx, (keys, batch) in enumerate(loader):
assert isinstance(batch, dict), type(batch)
assert all(isinstance(s, str) for s in keys), keys
_bs = len(next(iter(batch.values())))
@@ -385,6 +402,9 @@
pse_label_writer.write("{} {}\n".format(key, " ".join(pse_labels)))
pse_label_writer.flush()
+ if idx % 100 == 0:
+ logging.info("Processing {:5d}: {}".format(idx, key))
+
if output_path is not None:
output_writer.close()
pse_label_writer.close()
diff --git a/funasr/bin/sv_inference.py b/funasr/bin/sv_inference.py
index a78bccd..7e63bbd 100755
--- a/funasr/bin/sv_inference.py
+++ b/funasr/bin/sv_inference.py
@@ -36,7 +36,7 @@
Examples:
>>> import soundfile
- >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pth")
+ >>> speech2xvector = Speech2Xvector("sv_config.yml", "sv.pb")
>>> audio, rate = soundfile.read("speech.wav")
>>> speech2xvector(audio)
[(text, token, token_int, hypothesis object), ...]
@@ -169,7 +169,7 @@
log_level: Union[int, str] = "INFO",
key_file: Optional[str] = None,
sv_train_config: Optional[str] = "sv.yaml",
- sv_model_file: Optional[str] = "sv.pth",
+ sv_model_file: Optional[str] = "sv.pb",
model_tag: Optional[str] = None,
allow_variable_data_keys: bool = True,
streaming: bool = False,
diff --git a/funasr/datasets/iterable_dataset.py b/funasr/datasets/iterable_dataset.py
index 49c7068..4b2fb1a 100644
--- a/funasr/datasets/iterable_dataset.py
+++ b/funasr/datasets/iterable_dataset.py
@@ -8,6 +8,7 @@
from typing import Iterator
from typing import Tuple
from typing import Union
+from typing import List
import kaldiio
import numpy as np
@@ -129,7 +130,7 @@
non_iterable_list = []
self.path_name_type_list = []
- if not isinstance(path_name_type_list[0], Tuple):
+ if not isinstance(path_name_type_list[0], (Tuple, List)):
path = path_name_type_list[0]
name = path_name_type_list[1]
_type = path_name_type_list[2]
@@ -227,13 +228,9 @@
name = self.path_name_type_list[i][1]
_type = self.path_name_type_list[i][2]
if _type == "sound":
- audio_type = os.path.basename(value).split(".")[-1].lower()
- if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
- raise NotImplementedError(
- f'Not supported audio type: {audio_type}')
- if audio_type == "pcm":
- _type = "pcm"
-
+ audio_type = os.path.basename(value).lower()
+ if audio_type.rfind(".pcm") >= 0:
+ _type = "pcm"
func = DATA_TYPES[_type]
array = func(value)
if self.fs is not None and (name == "speech" or name == "ref_speech"):
@@ -335,11 +332,8 @@
# 2.a. Load data streamingly
for value, (path, name, _type) in zip(values, self.path_name_type_list):
if _type == "sound":
- audio_type = os.path.basename(value).split(".")[-1].lower()
- if audio_type not in SUPPORT_AUDIO_TYPE_SETS:
- raise NotImplementedError(
- f'Not supported audio type: {audio_type}')
- if audio_type == "pcm":
+ audio_type = os.path.basename(value).lower()
+ if audio_type.rfind(".pcm") >= 0:
_type = "pcm"
func = DATA_TYPES[_type]
# Load entry
@@ -391,3 +385,4 @@
if count == 0:
raise RuntimeError("No iteration")
+
diff --git a/funasr/datasets/large_datasets/utils/tokenize.py b/funasr/datasets/large_datasets/utils/tokenize.py
index caeb426..a016e4e 100644
--- a/funasr/datasets/large_datasets/utils/tokenize.py
+++ b/funasr/datasets/large_datasets/utils/tokenize.py
@@ -18,15 +18,11 @@
def seg_tokenize(txt, seg_dict):
out_txt = ""
- pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
for word in txt:
- if pattern.match(word):
- if word in seg_dict:
- out_txt += seg_dict[word] + " "
- else:
- out_txt += "<unk>" + " "
+ if word in seg_dict:
+ out_txt += seg_dict[word] + " "
else:
- continue
+ out_txt += "<unk>" + " "
return out_txt.strip().split()
def tokenize(data,
diff --git a/funasr/datasets/preprocessor.py b/funasr/datasets/preprocessor.py
index 20a3791..98cca1d 100644
--- a/funasr/datasets/preprocessor.py
+++ b/funasr/datasets/preprocessor.py
@@ -47,15 +47,11 @@
def seg_tokenize(txt, seg_dict):
out_txt = ""
- pattern = re.compile(r"([\u4E00-\u9FA5A-Za-z0-9])")
for word in txt:
- if pattern.match(word):
- if word in seg_dict:
- out_txt += seg_dict[word] + " "
- else:
- out_txt += "<unk>" + " "
+ if word in seg_dict:
+ out_txt += seg_dict[word] + " "
else:
- continue
+ out_txt += "<unk>" + " "
return out_txt.strip().split()
def seg_tokenize_wo_pattern(txt, seg_dict):
diff --git a/funasr/export/README.md b/funasr/export/README.md
index c44ad33..c05348e 100644
--- a/funasr/export/README.md
+++ b/funasr/export/README.md
@@ -2,6 +2,8 @@
## Environments
torch >= 1.11.0
modelscope >= 1.2.0
+ torch-quant >= 0.4.0 (required for exporting quantized torchscript format model)
+ # pip install torch-quant -i https://pypi.org/simple
## Install modelscope and funasr
@@ -11,31 +13,46 @@
`Tips`: torch>=1.11.0
```shell
- python -m funasr.export.export_model [model_name] [export_dir] [onnx]
+ python -m funasr.export.export_model \
+ --model-name [model_name] \
+ --export-dir [export_dir] \
+ --type [onnx, torch] \
+ --quantize [true, false] \
+ --fallback-num [fallback_num]
```
- `model_name`: the model is to export. It could be the models from modelscope, or local finetuned model(named: model.pb).
- `export_dir`: the dir where the onnx is export.
- `onnx`: `true`, export onnx format model; `false`, export torchscripts format model.
+ `model-name`: the model to export. It can be a model from modelscope, or a local finetuned model (named: model.pb).
+
+ `export-dir`: the directory where the exported model is saved.
+
+ `type`: `onnx` or `torch`, export onnx format model or torchscript format model.
+
+ `quantize`: `true`, export quantized model at the same time; `false`, export fp32 model only.
+
+ `fallback-num`: specify the number of fallback layers to perform automatic mixed precision quantization.
+
## For example
### Export onnx format model
Export model from modelscope
```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
```
Export model from local path, the model'name must be `model.pb`.
```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx
```
### Export torchscripts format model
Export model from modelscope
```shell
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
```
Export model from local path, the model'name must be `model.pb`.
```shell
-python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+python -m funasr.export.export_model --model-name /mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch
```
+ ## Acknowledgements
+ Torch model quantization is supported by [BladeDISC](https://github.com/alibaba/BladeDISC), an end-to-end DynamIc Shape Compiler project for machine learning workloads. BladeDISC provides general, transparent, and easy-to-use performance optimization for TensorFlow/PyTorch workloads on GPGPU and CPU backends. If you are interested, please contact us.
+
diff --git a/funasr/export/export_model.py b/funasr/export/export_model.py
index 3cbf6d2..f6ba616 100644
--- a/funasr/export/export_model.py
+++ b/funasr/export/export_model.py
@@ -10,12 +10,20 @@
from funasr.export.models import get_model
import numpy as np
import random
-
+from funasr.utils.types import str2bool
# torch_version = float(".".join(torch.__version__.split(".")[:2]))
# assert torch_version > 1.9
class ASRModelExportParaformer:
- def __init__(self, cache_dir: Union[Path, str] = None, onnx: bool = True):
+ def __init__(
+ self,
+ cache_dir: Union[Path, str] = None,
+ onnx: bool = True,
+ quant: bool = True,
+ fallback_num: int = 0,
+ audio_in: str = None,
+ calib_num: int = 200,
+ ):
assert check_argument_types()
self.set_all_random_seed(0)
if cache_dir is None:
@@ -28,6 +36,11 @@
)
print("output dir: {}".format(self.cache_dir))
self.onnx = onnx
+ self.quant = quant
+ self.fallback_num = fallback_num
+ self.frontend = None
+ self.audio_in = audio_in
+ self.calib_num = calib_num
def _export(
@@ -56,6 +69,43 @@
print("output dir: {}".format(export_dir))
+     def _torch_quantize(self, model):
+         """Quantize ``model`` with torch-quant after a calibration pass.
+
+         Calibration runs on features loaded from ``self.audio_in`` when it is set,
+         otherwise on the model's dummy inputs. When ``self.fallback_num > 0`` an
+         automatic mixed precision pass is run as well and ``fallback_num`` is
+         handed to ``Quantizer.fallback`` before quantizing.
+
+         Args:
+             model: The model to quantize; it must provide ``get_dummy_inputs()``.
+
+         Returns:
+             The model returned by ``Quantizer.quantize``.
+         """
+         def _run_calibration_data(m):
+             # Calibration only needs forward passes, so both branches run
+             # without gradient tracking.
+             if self.audio_in is not None:
+                 feats, feats_len = self.load_feats(self.audio_in)
+                 with torch.no_grad():
+                     for feat, feat_len in zip(feats, feats_len):
+                         m(feat, feat_len)
+             else:
+                 # No calibration audio given: use the model's dummy inputs.
+                 dummy_input = model.get_dummy_inputs()
+                 with torch.no_grad():
+                     m(*dummy_input)
+
+         # Imported here so torch-quant is only required when quantizing.
+         from torch_quant.module import ModuleFilter
+         from torch_quant.quantizer import Backend, Quantizer
+         from funasr.export.models.modules.decoder_layer import DecoderLayerSANM
+         from funasr.export.models.modules.encoder_layer import EncoderLayerSANM
+         # Only the SANM encoder/decoder layers are quantized; Conv1d ops are excluded.
+         module_filter = ModuleFilter(include_classes=[EncoderLayerSANM, DecoderLayerSANM])
+         module_filter.exclude_op_types = [torch.nn.Conv1d]
+         quantizer = Quantizer(
+             module_filter=module_filter,
+             backend=Backend.FBGEMM,
+         )
+         model.eval()
+         calib_model = quantizer.calib(model)
+         _run_calibration_data(calib_model)
+         if self.fallback_num > 0:
+             # perform automatic mixed precision quantization
+             amp_model = quantizer.amp(model)
+             _run_calibration_data(amp_model)
+             quantizer.fallback(amp_model, num=self.fallback_num)
+             print('Fallback layers:')
+             print('\n'.join(quantizer.module_filter.exclude_names))
+         quant_model = quantizer.quantize(model)
+         return quant_model
+
+
def _export_torchscripts(self, model, verbose, path, enc_size=None):
if enc_size:
dummy_input = model.get_dummy_inputs(enc_size)
@@ -66,10 +116,49 @@
model_script = torch.jit.trace(model, dummy_input)
model_script.save(os.path.join(path, f'{model.model_name}.torchscripts'))
+ if self.quant:
+ quant_model = self._torch_quantize(model)
+ model_script = torch.jit.trace(quant_model, dummy_input)
+ model_script.save(os.path.join(path, f'{model.model_name}_quant.torchscripts'))
+
+
def set_all_random_seed(self, seed: int):
random.seed(seed)
np.random.seed(seed)
torch.random.manual_seed(seed)
+
+ def parse_audio_in(self, audio_in):
+
+ wav_list, name_list = [], []
+ if audio_in.endswith(".scp"):
+ f = open(audio_in, 'r')
+ lines = f.readlines()[:self.calib_num]
+ for line in lines:
+ name, path = line.strip().split()
+ name_list.append(name)
+ wav_list.append(path)
+ else:
+ wav_list = [audio_in,]
+ name_list = ["test",]
+ return wav_list, name_list
+
+ def load_feats(self, audio_in: str = None):
+ import torchaudio
+
+ wav_list, name_list = self.parse_audio_in(audio_in)
+ feats = []
+ feats_len = []
+ for line in wav_list:
+ path = line.strip()
+ waveform, sampling_rate = torchaudio.load(path)
+ if sampling_rate != self.frontend.fs:
+ waveform = torchaudio.transforms.Resample(orig_freq=sampling_rate,
+ new_freq=self.frontend.fs)(waveform)
+ fbank, fbank_len = self.frontend(waveform, [waveform.size(1)])
+ feats.append(fbank)
+ feats_len.append(fbank_len)
+ return feats, feats_len
+
def export(self,
tag_name: str = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
mode: str = 'paraformer',
@@ -96,6 +185,7 @@
model, asr_train_args = ASRTask.build_model_from_file(
asr_train_config, asr_model_file, cmvn_file, 'cpu'
)
+ self.frontend = model.frontend
self._export(model, tag_name)
@@ -107,11 +197,12 @@
# model_script = torch.jit.script(model)
model_script = model #torch.jit.trace(model)
+ model_path = os.path.join(path, f'{model.model_name}.onnx')
torch.onnx.export(
model_script,
dummy_input,
- os.path.join(path, f'{model.model_name}.onnx'),
+ model_path,
verbose=verbose,
opset_version=14,
input_names=model.get_input_names(),
@@ -119,17 +210,42 @@
dynamic_axes=model.get_dynamic_axes()
)
+ if self.quant:
+ from onnxruntime.quantization import QuantType, quantize_dynamic
+ import onnx
+ quant_model_path = os.path.join(path, f'{model.model_name}_quant.onnx')
+ onnx_model = onnx.load(model_path)
+ nodes = [n.name for n in onnx_model.graph.node]
+ nodes_to_exclude = [m for m in nodes if 'output' in m]
+ quantize_dynamic(
+ model_input=model_path,
+ model_output=quant_model_path,
+ op_types_to_quantize=['MatMul'],
+ per_channel=True,
+ reduce_range=False,
+ weight_type=QuantType.QUInt8,
+ nodes_to_exclude=nodes_to_exclude,
+ )
+
if __name__ == '__main__':
- import sys
-
- model_path = sys.argv[1]
- output_dir = sys.argv[2]
- onnx = sys.argv[3]
- onnx = onnx.lower()
- onnx = onnx == 'true'
- # model_path = 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch'
- # output_dir = "../export"
- export_model = ASRModelExportParaformer(cache_dir=output_dir, onnx=onnx)
- export_model.export(model_path)
- # export_model.export('/root/cache/export/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
\ No newline at end of file
+ import argparse
+ parser = argparse.ArgumentParser()
+ parser.add_argument('--model-name', type=str, required=True)
+ parser.add_argument('--export-dir', type=str, required=True)
+ parser.add_argument('--type', type=str, default='onnx', help='["onnx", "torch"]')
+ parser.add_argument('--quantize', type=str2bool, default=False, help='export quantized model')
+ parser.add_argument('--fallback-num', type=int, default=0, help='amp fallback number')
+ parser.add_argument('--audio_in', type=str, default=None, help='["wav", "wav.scp"]')
+ parser.add_argument('--calib_num', type=int, default=200, help='calib max num')
+ args = parser.parse_args()
+
+ export_model = ASRModelExportParaformer(
+ cache_dir=args.export_dir,
+ onnx=args.type == 'onnx',
+ quant=args.quantize,
+ fallback_num=args.fallback_num,
+ audio_in=args.audio_in,
+ calib_num=args.calib_num,
+ )
+ export_model.export(args.model_name)
diff --git a/funasr/export/models/modules/encoder_layer.py b/funasr/export/models/modules/encoder_layer.py
index d132574..7d01397 100644
--- a/funasr/export/models/modules/encoder_layer.py
+++ b/funasr/export/models/modules/encoder_layer.py
@@ -16,6 +16,7 @@
self.feed_forward = model.feed_forward
self.norm1 = model.norm1
self.norm2 = model.norm2
+ self.in_size = model.in_size
self.size = model.size
def forward(self, x, mask):
@@ -23,13 +24,12 @@
residual = x
x = self.norm1(x)
x = self.self_attn(x, mask)
- if x.size(2) == residual.size(2):
+ if self.in_size == self.size:
x = x + residual
residual = x
x = self.norm2(x)
x = self.feed_forward(x)
- if x.size(2) == residual.size(2):
- x = x + residual
+ x = x + residual
return x, mask
diff --git a/funasr/export/models/modules/multihead_att.py b/funasr/export/models/modules/multihead_att.py
index 7d685f5..1983db8 100644
--- a/funasr/export/models/modules/multihead_att.py
+++ b/funasr/export/models/modules/multihead_att.py
@@ -64,6 +64,23 @@
return self.linear_out(context_layer) # (batch, time1, d_model)
+def preprocess_for_attn(x, mask, cache, pad_fn):
+ x = x * mask
+ x = x.transpose(1, 2)
+ if cache is None:
+ x = pad_fn(x)
+ else:
+ x = torch.cat((cache[:, :, 1:], x), dim=2)
+ cache = x
+ return x, cache
+
+
+torch_version = float(".".join(torch.__version__.split(".")[:2]))
+if torch_version >= 1.8:
+ import torch.fx
+ torch.fx.wrap('preprocess_for_attn')
+
+
class MultiHeadedAttentionSANMDecoder(nn.Module):
def __init__(self, model):
super().__init__()
@@ -73,16 +90,7 @@
self.attn = None
def forward(self, inputs, mask, cache=None):
- # b, t, d = inputs.size()
- # mask = torch.reshape(mask, (b, -1, 1))
- inputs = inputs * mask
-
- x = inputs.transpose(1, 2)
- if cache is None:
- x = self.pad_fn(x)
- else:
- x = torch.cat((cache[:, :, 1:], x), dim=2)
- cache = x
+ x, cache = preprocess_for_attn(inputs, mask, cache, self.pad_fn)
x = self.fsmn_block(x)
x = x.transpose(1, 2)
@@ -232,4 +240,4 @@
new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
context_layer = context_layer.view(new_context_layer_shape)
return self.linear_out(context_layer) # (batch, time1, d_model)
-
\ No newline at end of file
+
diff --git a/funasr/main_funcs/average_nbest_models.py b/funasr/main_funcs/average_nbest_models.py
index 53f9568..d8df949 100644
--- a/funasr/main_funcs/average_nbest_models.py
+++ b/funasr/main_funcs/average_nbest_models.py
@@ -66,13 +66,13 @@
elif n == 1:
# The averaged model is same as the best model
e, _ = epoch_and_values[0]
- op = output_dir / f"{e}epoch.pth"
- sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pth"
+ op = output_dir / f"{e}epoch.pb"
+ sym_op = output_dir / f"{ph}.{cr}.ave_1best.{suffix}pb"
if sym_op.is_symlink() or sym_op.exists():
sym_op.unlink()
sym_op.symlink_to(op.name)
else:
- op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pth"
+ op = output_dir / f"{ph}.{cr}.ave_{n}best.{suffix}pb"
logging.info(
f"Averaging {n}best models: " f'criterion="{ph}.{cr}": {op}'
)
@@ -83,12 +83,12 @@
if e not in _loaded:
if oss_bucket is None:
_loaded[e] = torch.load(
- output_dir / f"{e}epoch.pth",
+ output_dir / f"{e}epoch.pb",
map_location="cpu",
)
else:
buffer = BytesIO(
- oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pth")).read())
+ oss_bucket.get_object(os.path.join(pai_output_dir, f"{e}epoch.pb")).read())
_loaded[e] = torch.load(buffer)
states = _loaded[e]
@@ -115,13 +115,13 @@
else:
buffer = BytesIO()
torch.save(avg, buffer)
- oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pth"),
+ oss_bucket.put_object(os.path.join(pai_output_dir, f"{ph}.{cr}.ave_{n}best.{suffix}pb"),
buffer.getvalue())
- # 3. *.*.ave.pth is a symlink to the max ave model
+ # 3. *.*.ave.pb is a symlink to the max ave model
if oss_bucket is None:
- op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pth"
- sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pth"
+ op = output_dir / f"{ph}.{cr}.ave_{max(_nbests)}best.{suffix}pb"
+ sym_op = output_dir / f"{ph}.{cr}.ave.{suffix}pb"
if sym_op.is_symlink() or sym_op.exists():
sym_op.unlink()
sym_op.symlink_to(op.name)
diff --git a/funasr/main_funcs/pack_funcs.py b/funasr/main_funcs/pack_funcs.py
index ffa807e..fe365d8 100644
--- a/funasr/main_funcs/pack_funcs.py
+++ b/funasr/main_funcs/pack_funcs.py
@@ -191,12 +191,12 @@
Examples:
tarfile:
- model.pth
+ model.pb
some1.file
some2.file
>>> unpack("tarfile", "out")
- {'asr_model_file': 'out/model.pth'}
+ {'asr_model_file': 'out/model.pb'}
"""
input_archive = Path(input_archive)
outpath = Path(outpath)
diff --git a/funasr/models/decoder/sanm_decoder.py b/funasr/models/decoder/sanm_decoder.py
index ab03f0b..3bfcffc 100644
--- a/funasr/models/decoder/sanm_decoder.py
+++ b/funasr/models/decoder/sanm_decoder.py
@@ -94,6 +94,47 @@
if self.self_attn:
if self.normalize_before:
tgt = self.norm2(tgt)
+ x, _ = self.self_attn(tgt, tgt_mask)
+ x = residual + self.dropout(x)
+
+ if self.src_attn is not None:
+ residual = x
+ if self.normalize_before:
+ x = self.norm3(x)
+
+ x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
+
+
+ return x, tgt_mask, memory, memory_mask, cache
+
+ def forward_chunk(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
+ """Compute decoded features.
+
+ Args:
+ tgt (torch.Tensor): Input tensor (#batch, maxlen_out, size).
+ tgt_mask (torch.Tensor): Mask for input tensor (#batch, maxlen_out).
+ memory (torch.Tensor): Encoded memory, float32 (#batch, maxlen_in, size).
+ memory_mask (torch.Tensor): Encoded memory mask (#batch, maxlen_in).
+ cache (List[torch.Tensor]): List of cached tensors.
+ Each tensor shape should be (#batch, maxlen_out - 1, size).
+
+ Returns:
+ torch.Tensor: Output tensor(#batch, maxlen_out, size).
+ torch.Tensor: Mask for output tensor (#batch, maxlen_out).
+ torch.Tensor: Encoded memory (#batch, maxlen_in, size).
+ torch.Tensor: Encoded memory mask (#batch, maxlen_in).
+
+ """
+ # tgt = self.dropout(tgt)
+ residual = tgt
+ if self.normalize_before:
+ tgt = self.norm1(tgt)
+ tgt = self.feed_forward(tgt)
+
+ x = tgt
+ if self.self_attn:
+ if self.normalize_before:
+ tgt = self.norm2(tgt)
if self.training:
cache = None
x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
@@ -108,7 +149,6 @@
return x, tgt_mask, memory, memory_mask, cache
-
class FsmnDecoderSCAMAOpt(BaseTransformerDecoder):
"""
@@ -947,6 +987,65 @@
)
return logp.squeeze(0), state
+ def forward_chunk(
+ self,
+ memory: torch.Tensor,
+ tgt: torch.Tensor,
+ cache: dict = None,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Forward decoder.
+
+ Args:
+ hs_pad: encoded memory, float32 (batch, maxlen_in, feat)
+ hlens: (batch)
+ ys_in_pad:
+ input token ids, int64 (batch, maxlen_out)
+ if input_layer == "embed"
+ input tensor (batch, maxlen_out, #mels) in the other cases
+ ys_in_lens: (batch)
+ Returns:
+ (tuple): tuple containing:
+
+ x: decoded token score before softmax (batch, maxlen_out, token)
+ if use_output_layer is True,
+ olens: (batch, )
+ """
+ x = tgt
+ if cache["decode_fsmn"] is None:
+ cache_layer_num = len(self.decoders)
+ if self.decoders2 is not None:
+ cache_layer_num += len(self.decoders2)
+ new_cache = [None] * cache_layer_num
+ else:
+ new_cache = cache["decode_fsmn"]
+ for i in range(self.att_layer_num):
+ decoder = self.decoders[i]
+ x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+ x, None, memory, None, cache=new_cache[i]
+ )
+ new_cache[i] = c_ret
+
+ if self.num_blocks - self.att_layer_num > 1:
+ for i in range(self.num_blocks - self.att_layer_num):
+ j = i + self.att_layer_num
+ decoder = self.decoders2[i]
+ x, tgt_mask, memory, memory_mask, c_ret = decoder.forward_chunk(
+ x, None, memory, None, cache=new_cache[j]
+ )
+ new_cache[j] = c_ret
+
+ for decoder in self.decoders3:
+
+ x, tgt_mask, memory, memory_mask, _ = decoder.forward_chunk(
+ x, None, memory, None, cache=None
+ )
+ if self.normalize_before:
+ x = self.after_norm(x)
+ if self.output_layer is not None:
+ x = self.output_layer(x)
+ cache["decode_fsmn"] = new_cache
+ return x
+
def forward_one_step(
self,
tgt: torch.Tensor,
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index 44c9de3..02f60af 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -325,12 +325,76 @@
return encoder_out, encoder_out_lens
+ def encode_chunk(
+ self, speech: torch.Tensor, speech_lengths: torch.Tensor, cache: dict = None
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Frontend + Encoder. Note that this method is used by asr_inference.py
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ """
+ with autocast(False):
+ # 1. Extract feats
+ feats, feats_lengths = self._extract_feats(speech, speech_lengths)
+
+ # 2. Data augmentation
+ if self.specaug is not None and self.training:
+ feats, feats_lengths = self.specaug(feats, feats_lengths)
+
+ # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
+ if self.normalize is not None:
+ feats, feats_lengths = self.normalize(feats, feats_lengths)
+
+ # Pre-encoder, e.g. used for raw input data
+ if self.preencoder is not None:
+ feats, feats_lengths = self.preencoder(feats, feats_lengths)
+
+ # 4. Forward encoder
+ # feats: (Batch, Length, Dim)
+ # -> encoder_out: (Batch, Length2, Dim2)
+ if self.encoder.interctc_use_conditioning:
+ encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(
+ feats, feats_lengths, cache=cache["encoder"], ctc=self.ctc
+ )
+ else:
+ encoder_out, encoder_out_lens, _ = self.encoder.forward_chunk(feats, feats_lengths, cache=cache["encoder"])
+ intermediate_outs = None
+ if isinstance(encoder_out, tuple):
+ intermediate_outs = encoder_out[1]
+ encoder_out = encoder_out[0]
+
+ # Post-encoder, e.g. NLU
+ if self.postencoder is not None:
+ encoder_out, encoder_out_lens = self.postencoder(
+ encoder_out, encoder_out_lens
+ )
+
+ assert encoder_out.size(0) == speech.size(0), (
+ encoder_out.size(),
+ speech.size(0),
+ )
+ assert encoder_out.size(1) <= encoder_out_lens.max(), (
+ encoder_out.size(),
+ encoder_out_lens.max(),
+ )
+
+ if intermediate_outs is not None:
+ return (encoder_out, intermediate_outs), encoder_out_lens
+
+ return encoder_out, encoder_out_lens
+
def calc_predictor(self, encoder_out, encoder_out_lens):
encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
encoder_out.device)
pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None, encoder_out_mask,
ignore_id=self.ignore_id)
+ return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
+
+ def calc_predictor_chunk(self, encoder_out, cache=None):
+
+ pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor.forward_chunk(encoder_out, cache["encoder"])
return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index
def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
@@ -341,6 +405,14 @@
decoder_out = decoder_outs[0]
decoder_out = torch.log_softmax(decoder_out, dim=-1)
return decoder_out, ys_pad_lens
+
+ def cal_decoder_with_predictor_chunk(self, encoder_out, sematic_embeds, cache=None):
+ decoder_outs = self.decoder.forward_chunk(
+ encoder_out, sematic_embeds, cache["decoder"]
+ )
+ decoder_out = decoder_outs
+ decoder_out = torch.log_softmax(decoder_out, dim=-1)
+ return decoder_out
def _extract_feats(
self, speech: torch.Tensor, speech_lengths: torch.Tensor
@@ -1459,4 +1531,4 @@
"torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
var_dict_tf[name_tf].shape))
- return var_dict_torch_update
\ No newline at end of file
+ return var_dict_torch_update
diff --git a/funasr/models/e2e_diar_eend_ola.py b/funasr/models/e2e_diar_eend_ola.py
index f589269..097b23a 100644
--- a/funasr/models/e2e_diar_eend_ola.py
+++ b/funasr/models/e2e_diar_eend_ola.py
@@ -52,15 +52,15 @@
super().__init__()
self.frontend = frontend
- self.encoder = encoder
- self.encoder_decoder_attractor = encoder_decoder_attractor
+ self.enc = encoder
+ self.eda = encoder_decoder_attractor
self.attractor_loss_weight = attractor_loss_weight
self.max_n_speaker = max_n_speaker
if mapping_dict is None:
mapping_dict = generate_mapping_dict(max_speaker_num=self.max_n_speaker)
self.mapping_dict = mapping_dict
# PostNet
- self.PostNet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
+ self.postnet = nn.LSTM(self.max_n_speaker, n_units, 1, batch_first=True)
self.output_layer = nn.Linear(n_units, mapping_dict['oov'] + 1)
def forward_encoder(self, xs, ilens):
@@ -68,7 +68,7 @@
pad_shape = xs.shape
xs_mask = [torch.ones(ilen).to(xs.device) for ilen in ilens]
xs_mask = torch.nn.utils.rnn.pad_sequence(xs_mask, batch_first=True, padding_value=0).unsqueeze(-2)
- emb = self.encoder(xs, xs_mask)
+ emb = self.enc(xs, xs_mask)
emb = torch.split(emb.view(pad_shape[0], pad_shape[1], -1), 1, dim=0)
emb = [e[0][:ilen] for e, ilen in zip(emb, ilens)]
return emb
@@ -76,8 +76,8 @@
def forward_post_net(self, logits, ilens):
maxlen = torch.max(ilens).to(torch.int).item()
logits = nn.utils.rnn.pad_sequence(logits, batch_first=True, padding_value=-1)
- logits = nn.utils.rnn.pack_padded_sequence(logits, ilens, batch_first=True, enforce_sorted=False)
- outputs, (_, _) = self.PostNet(logits)
+ logits = nn.utils.rnn.pack_padded_sequence(logits, ilens.cpu().to(torch.int64), batch_first=True, enforce_sorted=False)
+ outputs, (_, _) = self.postnet(logits)
outputs = nn.utils.rnn.pad_packed_sequence(outputs, batch_first=True, padding_value=-1, total_length=maxlen)[0]
outputs = [output[:ilens[i].to(torch.int).item()] for i, output in enumerate(outputs)]
outputs = [self.output_layer(output) for output in outputs]
@@ -112,7 +112,7 @@
text = text[:, : text_lengths.max()]
# 1. Encoder
- encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+ encoder_out, encoder_out_lens = self.enc(speech, speech_lengths)
intermediate_outs = None
if isinstance(encoder_out, tuple):
intermediate_outs = encoder_out[1]
@@ -190,18 +190,16 @@
shuffle: bool = True,
threshold: float = 0.5,
**kwargs):
- if self.frontend is not None:
- speech = self.frontend(speech)
speech = [s[:s_len] for s, s_len in zip(speech, speech_lengths)]
emb = self.forward_encoder(speech, speech_lengths)
if shuffle:
orders = [np.arange(e.shape[0]) for e in emb]
for order in orders:
np.random.shuffle(order)
- attractors, probs = self.encoder_decoder_attractor.estimate(
+ attractors, probs = self.eda.estimate(
[e[torch.from_numpy(order).to(torch.long).to(speech[0].device)] for e, order in zip(emb, orders)])
else:
- attractors, probs = self.encoder_decoder_attractor.estimate(emb)
+ attractors, probs = self.eda.estimate(emb)
attractors_active = []
for p, att, e in zip(probs, attractors, emb):
if n_speakers and n_speakers >= 0:
@@ -233,10 +231,23 @@
pred[i] = pred[i - 1]
else:
pred[i] = 0
- pred = [self.reporter.inv_mapping_func(i, self.mapping_dict) for i in pred]
+ pred = [self.inv_mapping_func(i) for i in pred]
decisions = [bin(num)[2:].zfill(self.max_n_speaker)[::-1] for num in pred]
decisions = torch.from_numpy(
np.stack([np.array([int(i) for i in dec]) for dec in decisions], axis=0)).to(logit.device).to(
torch.float32)
decisions = decisions[:, :n_speaker]
return decisions
+
+ def inv_mapping_func(self, label):
+
+ if not isinstance(label, int):
+ label = int(label)
+ if label in self.mapping_dict['label2dec'].keys():
+ num = self.mapping_dict['label2dec'][label]
+ else:
+ num = -1
+ return num
+
+ def collect_feats(self, **batch: torch.Tensor) -> Dict[str, torch.Tensor]:
+ pass
\ No newline at end of file
diff --git a/funasr/models/e2e_diar_sond.py b/funasr/models/e2e_diar_sond.py
index 258d780..de669f2 100644
--- a/funasr/models/e2e_diar_sond.py
+++ b/funasr/models/e2e_diar_sond.py
@@ -59,7 +59,8 @@
normalize_speech_speaker: bool = False,
ignore_id: int = -1,
speaker_discrimination_loss_weight: float = 1.0,
- inter_score_loss_weight: float = 0.0
+ inter_score_loss_weight: float = 0.0,
+ inputs_type: str = "raw",
):
assert check_argument_types()
@@ -86,14 +87,12 @@
)
self.criterion_bce = SequenceBinaryCrossEntropy(normalize_length=length_normalized_loss)
self.pse_embedding = self.generate_pse_embedding()
- # self.register_buffer("pse_embedding", pse_embedding)
self.power_weight = torch.from_numpy(2 ** np.arange(max_spk_num)[np.newaxis, np.newaxis, :]).float()
- # self.register_buffer("power_weight", power_weight)
self.int_token_arr = torch.from_numpy(np.array(self.token_list).astype(int)[np.newaxis, np.newaxis, :]).int()
- # self.register_buffer("int_token_arr", int_token_arr)
self.speaker_discrimination_loss_weight = speaker_discrimination_loss_weight
self.inter_score_loss_weight = inter_score_loss_weight
self.forward_steps = 0
+ self.inputs_type = inputs_type
def generate_pse_embedding(self):
embedding = np.zeros((len(self.token_list), self.max_spk_num), dtype=np.float)
@@ -125,9 +124,14 @@
binary_labels: (Batch, frames, max_spk_num)
binary_labels_lengths: (Batch,)
"""
- assert speech.shape[0] == binary_labels.shape[0], (speech.shape, binary_labels.shape)
+ assert speech.shape[0] <= binary_labels.shape[0], (speech.shape, binary_labels.shape)
batch_size = speech.shape[0]
self.forward_steps = self.forward_steps + 1
+ if self.pse_embedding.device != speech.device:
+ self.pse_embedding = self.pse_embedding.to(speech.device)
+ self.power_weight = self.power_weight.to(speech.device)
+ self.int_token_arr = self.int_token_arr.to(speech.device)
+
# 1. Network forward
pred, inter_outputs = self.prediction_forward(
speech, speech_lengths,
@@ -149,9 +153,13 @@
# the sequence length of 'pred' might be slightly less than the
# length of 'spk_labels'. Here we force them to be equal.
length_diff_tolerance = 2
- length_diff = pse_labels.shape[1] - pred.shape[1]
- if 0 < length_diff <= length_diff_tolerance:
- pse_labels = pse_labels[:, 0: pred.shape[1]]
+ length_diff = abs(pse_labels.shape[1] - pred.shape[1])
+ if length_diff <= length_diff_tolerance:
+ min_len = min(pred.shape[1], pse_labels.shape[1])
+ pse_labels = pse_labels[:, :min_len]
+ pred = pred[:, :min_len]
+ cd_score = cd_score[:, :min_len]
+ ci_score = ci_score[:, :min_len]
loss_diar = self.classification_loss(pred, pse_labels, binary_labels_lengths)
loss_spk_dis = self.speaker_discrimination_loss(profile, profile_lengths)
@@ -299,7 +307,7 @@
speech: torch.Tensor,
speech_lengths: torch.Tensor,
) -> Tuple[torch.Tensor, torch.Tensor]:
- if self.encoder is not None:
+ if self.encoder is not None and self.inputs_type == "raw":
speech, speech_lengths = self.encode(speech, speech_lengths)
speech_mask = ~make_pad_mask(speech_lengths, maxlen=speech.shape[1])
speech_mask = speech_mask.to(speech.device).unsqueeze(-1).float()
diff --git a/funasr/models/encoder/sanm_encoder.py b/funasr/models/encoder/sanm_encoder.py
index 0751a10..57890ef 100644
--- a/funasr/models/encoder/sanm_encoder.py
+++ b/funasr/models/encoder/sanm_encoder.py
@@ -347,6 +347,48 @@
return (xs_pad, intermediate_outs), olens, None
return xs_pad, olens, None
+ def forward_chunk(self,
+ xs_pad: torch.Tensor,
+ ilens: torch.Tensor,
+ cache: dict = None,
+ ctc: CTC = None,
+ ):
+ xs_pad *= self.output_size() ** 0.5
+ if self.embed is None:
+ xs_pad = xs_pad
+ else:
+ xs_pad = self.embed.forward_chunk(xs_pad, cache)
+
+ encoder_outs = self.encoders0(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ intermediate_outs = []
+ if len(self.interctc_layer_idx) == 0:
+ encoder_outs = self.encoders(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ else:
+ for layer_idx, encoder_layer in enumerate(self.encoders):
+ encoder_outs = encoder_layer(xs_pad, None, None, None, None)
+ xs_pad, masks = encoder_outs[0], encoder_outs[1]
+ if layer_idx + 1 in self.interctc_layer_idx:
+ encoder_out = xs_pad
+
+ # intermediate outputs are also normalized
+ if self.normalize_before:
+ encoder_out = self.after_norm(encoder_out)
+
+ intermediate_outs.append((layer_idx + 1, encoder_out))
+
+ if self.interctc_use_conditioning:
+ ctc_out = ctc.softmax(encoder_out)
+ xs_pad = xs_pad + self.conditioning_layer(ctc_out)
+
+ if self.normalize_before:
+ xs_pad = self.after_norm(xs_pad)
+
+ if len(intermediate_outs) > 0:
+ return (xs_pad, intermediate_outs), None, None
+ return xs_pad, ilens, None
+
def gen_tf2torch_map_dict(self):
tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
diff --git a/funasr/models/frontend/wav_frontend.py b/funasr/models/frontend/wav_frontend.py
index 445efca..475a939 100644
--- a/funasr/models/frontend/wav_frontend.py
+++ b/funasr/models/frontend/wav_frontend.py
@@ -1,14 +1,15 @@
# Copyright (c) Alibaba, Inc. and its affiliates.
# Part of the implementation is borrowed from espnet/espnet.
-from abc import ABC
from typing import Tuple
import numpy as np
import torch
import torchaudio.compliance.kaldi as kaldi
-from funasr.models.frontend.abs_frontend import AbsFrontend
-from typeguard import check_argument_types
from torch.nn.utils.rnn import pad_sequence
+from typeguard import check_argument_types
+
+import funasr.models.frontend.eend_ola_feature as eend_ola_feature
+from funasr.models.frontend.abs_frontend import AbsFrontend
def load_cmvn(cmvn_file):
@@ -275,7 +276,8 @@
# inputs tensor has catted the cache tensor
# def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, inputs_lfr_cache: torch.Tensor = None,
# is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
- def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[torch.Tensor, torch.Tensor, int]:
+ def apply_lfr(inputs: torch.Tensor, lfr_m: int, lfr_n: int, is_final: bool = False) -> Tuple[
+ torch.Tensor, torch.Tensor, int]:
"""
Apply lfr with data
"""
@@ -376,7 +378,8 @@
if self.lfr_m != 1 or self.lfr_n != 1:
# update self.lfr_splice_cache in self.apply_lfr
# mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, self.lfr_splice_cache[i],
- mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n, is_final)
+ mat, self.lfr_splice_cache[i], lfr_splice_frame_idx = self.apply_lfr(mat, self.lfr_m, self.lfr_n,
+ is_final)
if self.cmvn_file is not None:
mat = self.apply_cmvn(mat, self.cmvn)
feat_length = mat.size(0)
@@ -398,9 +401,10 @@
assert batch_size == 1, 'we support to extract feature online only when the batch size is equal to 1 now'
waveforms, feats, feats_lengths = self.forward_fbank(input, input_lengths) # input shape: B T D
if feats.shape[0]:
- #if self.reserve_waveforms is None and self.lfr_m > 1:
+ # if self.reserve_waveforms is None and self.lfr_m > 1:
# self.reserve_waveforms = waveforms[:, :(self.lfr_m - 1) // 2 * self.frame_shift_sample_length]
- self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat((self.reserve_waveforms, waveforms), dim=1)
+ self.waveforms = waveforms if self.reserve_waveforms is None else torch.cat(
+ (self.reserve_waveforms, waveforms), dim=1)
if not self.lfr_splice_cache: # 初始化splice_cache
for i in range(batch_size):
self.lfr_splice_cache.append(feats[i][0, :].unsqueeze(dim=0).repeat((self.lfr_m - 1) // 2, 1))
@@ -409,7 +413,8 @@
lfr_splice_cache_tensor = torch.stack(self.lfr_splice_cache) # B T D
feats = torch.cat((lfr_splice_cache_tensor, feats), dim=1)
feats_lengths += lfr_splice_cache_tensor[0].shape[0]
- frame_from_waveforms = int((self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
+ frame_from_waveforms = int(
+ (self.waveforms.shape[1] - self.frame_sample_length) / self.frame_shift_sample_length + 1)
minus_frame = (self.lfr_m - 1) // 2 if self.reserve_waveforms is None else 0
feats, feats_lengths, lfr_splice_frame_idxs = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
if self.lfr_m == 1:
@@ -423,14 +428,15 @@
self.waveforms = self.waveforms[:, :sample_length]
else:
# update self.reserve_waveforms and self.lfr_splice_cache
- self.reserve_waveforms = self.waveforms[:, :-(self.frame_sample_length - self.frame_shift_sample_length)]
+ self.reserve_waveforms = self.waveforms[:,
+ :-(self.frame_sample_length - self.frame_shift_sample_length)]
for i in range(batch_size):
self.lfr_splice_cache[i] = torch.cat((self.lfr_splice_cache[i], feats[i]), dim=0)
return torch.empty(0), feats_lengths
else:
if is_final:
self.waveforms = waveforms if self.reserve_waveforms is None else self.reserve_waveforms
- feats = torch.stack(self.lfr_splice_cache)
+ feats = torch.stack(self.lfr_splice_cache)
feats_lengths = torch.zeros(batch_size, dtype=torch.int) + feats.shape[1]
feats, feats_lengths, _ = self.forward_lfr_cmvn(feats, feats_lengths, is_final)
if is_final:
@@ -444,3 +450,54 @@
self.reserve_waveforms = None
self.input_cache = None
self.lfr_splice_cache = []
+
+
+class WavFrontendMel23(AbsFrontend):
+ """Conventional frontend structure for ASR.
+ """
+
+ def __init__(
+ self,
+ fs: int = 16000,
+ frame_length: int = 25,
+ frame_shift: int = 10,
+ lfr_m: int = 1,
+ lfr_n: int = 1,
+ ):
+ assert check_argument_types()
+ super().__init__()
+ self.fs = fs
+ self.frame_length = frame_length
+ self.frame_shift = frame_shift
+ self.lfr_m = lfr_m
+ self.lfr_n = lfr_n
+ self.n_mels = 23
+
+ def output_size(self) -> int:
+ return self.n_mels * (2 * self.lfr_m + 1)
+
+ def forward(
+ self,
+ input: torch.Tensor,
+ input_lengths: torch.Tensor) -> Tuple[torch.Tensor, torch.Tensor]:
+ batch_size = input.size(0)
+ feats = []
+ feats_lens = []
+ for i in range(batch_size):
+ waveform_length = input_lengths[i]
+ waveform = input[i][:waveform_length]
+ waveform = waveform.numpy()
+ mat = eend_ola_feature.stft(waveform, self.frame_length, self.frame_shift)
+ mat = eend_ola_feature.transform(mat)
+ mat = eend_ola_feature.splice(mat, context_size=self.lfr_m)
+ mat = mat[::self.lfr_n]
+ mat = torch.from_numpy(mat)
+ feat_length = mat.size(0)
+ feats.append(mat)
+ feats_lens.append(feat_length)
+
+ feats_lens = torch.as_tensor(feats_lens)
+ feats_pad = pad_sequence(feats,
+ batch_first=True,
+ padding_value=0.0)
+ return feats_pad, feats_lens
diff --git a/funasr/models/predictor/cif.py b/funasr/models/predictor/cif.py
index 5615373..74f3e68 100644
--- a/funasr/models/predictor/cif.py
+++ b/funasr/models/predictor/cif.py
@@ -199,6 +199,63 @@
return acoustic_embeds, token_num, alphas, cif_peak
+ def forward_chunk(self, hidden, cache=None):
+ h = hidden
+ context = h.transpose(1, 2)
+ queries = self.pad(context)
+ output = torch.relu(self.cif_conv1d(queries))
+ output = output.transpose(1, 2)
+ output = self.cif_output(output)
+ alphas = torch.sigmoid(output)
+ alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+
+ alphas = alphas.squeeze(-1)
+ mask_chunk_predictor = None
+ if cache is not None:
+ mask_chunk_predictor = None
+ mask_chunk_predictor = torch.zeros_like(alphas)
+ mask_chunk_predictor[:, cache["pad_left"]:cache["stride"] + cache["pad_left"]] = 1.0
+
+ if mask_chunk_predictor is not None:
+ alphas = alphas * mask_chunk_predictor
+
+ if cache is not None:
+ if cache["cif_hidden"] is not None:
+ hidden = torch.cat((cache["cif_hidden"], hidden), 1)
+ if cache["cif_alphas"] is not None:
+ alphas = torch.cat((cache["cif_alphas"], alphas), -1)
+
+ token_num = alphas.sum(-1)
+ acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
+ len_time = alphas.size(-1)
+ last_fire_place = len_time - 1
+ last_fire_remainds = 0.0
+ pre_alphas_length = 0
+
+ mask_chunk_peak_predictor = None
+ if cache is not None:
+ mask_chunk_peak_predictor = None
+ mask_chunk_peak_predictor = torch.zeros_like(cif_peak)
+ if cache["cif_alphas"] is not None:
+ pre_alphas_length = cache["cif_alphas"].size(-1)
+ mask_chunk_peak_predictor[:, :pre_alphas_length] = 1.0
+ mask_chunk_peak_predictor[:, pre_alphas_length + cache["pad_left"]:pre_alphas_length + cache["stride"] + cache["pad_left"]] = 1.0
+
+
+ if mask_chunk_peak_predictor is not None:
+ cif_peak = cif_peak * mask_chunk_peak_predictor.squeeze(-1)
+
+ for i in range(len_time):
+ if cif_peak[0][len_time - 1 - i] > self.threshold or cif_peak[0][len_time - 1 - i] == self.threshold:
+ last_fire_place = len_time - 1 - i
+ last_fire_remainds = cif_peak[0][len_time - 1 - i] - self.threshold
+ break
+ last_fire_remainds = torch.tensor([last_fire_remainds], dtype=alphas.dtype).to(alphas.device)
+ cache["cif_hidden"] = hidden[:, last_fire_place:, :]
+ cache["cif_alphas"] = torch.cat((last_fire_remainds.unsqueeze(0), alphas[:, last_fire_place+1:]), -1)
+ token_num_int = token_num.floor().type(torch.int32).item()
+ return acoustic_embeds[:, 0:token_num_int, :], token_num, alphas, cif_peak
+
def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
b, t, d = hidden.size()
tail_threshold = self.tail_threshold
diff --git a/funasr/modules/attention.py b/funasr/modules/attention.py
index 6277005..31d5a87 100644
--- a/funasr/modules/attention.py
+++ b/funasr/modules/attention.py
@@ -347,15 +347,17 @@
mask = torch.reshape(mask, (b, -1, 1))
if mask_shfit_chunk is not None:
mask = mask * mask_shfit_chunk
+ inputs = inputs * mask
- inputs = inputs * mask
x = inputs.transpose(1, 2)
x = self.pad_fn(x)
x = self.fsmn_block(x)
x = x.transpose(1, 2)
x += inputs
x = self.dropout(x)
- return x * mask
+ if mask is not None:
+ x = x * mask
+ return x
def forward_qkv(self, x):
"""Transform query, key and value.
@@ -505,7 +507,7 @@
# print("in fsmn, cache is None, x", x.size())
x = self.pad_fn(x)
- if not self.training and t <= 1:
+ if not self.training:
cache = x
else:
# print("in fsmn, cache is not None, x", x.size())
@@ -513,7 +515,7 @@
# if t < self.kernel_size:
# x = self.pad_fn(x)
x = torch.cat((cache[:, :, 1:], x), dim=2)
- x = x[:, :, -self.kernel_size:]
+ x = x[:, :, -(self.kernel_size+t-1):]
# print("in fsmn, cache is not None, x_cat", x.size())
cache = x
x = self.fsmn_block(x)
diff --git a/funasr/modules/eend_ola/encoder.py b/funasr/modules/eend_ola/encoder.py
index 4999031..90a63f3 100644
--- a/funasr/modules/eend_ola/encoder.py
+++ b/funasr/modules/eend_ola/encoder.py
@@ -87,7 +87,7 @@
n_layers: int,
n_units: int,
e_units: int = 2048,
- h: int = 8,
+ h: int = 4,
dropout_rate: float = 0.1,
use_pos_emb: bool = False):
super(EENDOLATransformerEncoder, self).__init__()
diff --git a/funasr/modules/eend_ola/encoder_decoder_attractor.py b/funasr/modules/eend_ola/encoder_decoder_attractor.py
index db01b00..45ac982 100644
--- a/funasr/modules/eend_ola/encoder_decoder_attractor.py
+++ b/funasr/modules/eend_ola/encoder_decoder_attractor.py
@@ -16,12 +16,12 @@
self.n_units = n_units
def forward_core(self, xs, zeros):
- ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.float32).to(xs[0].device)
+ ilens = torch.from_numpy(np.array([x.shape[0] for x in xs])).to(torch.int64)
xs = [self.enc0_dropout(x) for x in xs]
xs = nn.utils.rnn.pad_sequence(xs, batch_first=True, padding_value=-1)
xs = nn.utils.rnn.pack_padded_sequence(xs, ilens, batch_first=True, enforce_sorted=False)
_, (hx, cx) = self.encoder(xs)
- zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.float32).to(zeros[0].device)
+ zlens = torch.from_numpy(np.array([z.shape[0] for z in zeros])).to(torch.int64)
max_zlen = torch.max(zlens).to(torch.int).item()
zeros = [self.enc0_dropout(z) for z in zeros]
zeros = nn.utils.rnn.pad_sequence(zeros, batch_first=True, padding_value=-1)
@@ -47,4 +47,4 @@
zeros = [torch.zeros(max_n_speakers, self.n_units).to(torch.float32).to(xs[0].device) for _ in xs]
attractors = self.forward_core(xs, zeros)
probs = [torch.sigmoid(torch.flatten(self.counter(att))) for att in attractors]
- return attractors, probs
\ No newline at end of file
+ return attractors, probs
diff --git a/funasr/modules/embedding.py b/funasr/modules/embedding.py
index b61a61a..e4f9bff 100644
--- a/funasr/modules/embedding.py
+++ b/funasr/modules/embedding.py
@@ -405,4 +405,13 @@
positions = torch.arange(1, timesteps+1)[None, :]
position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
- return x + position_encoding
\ No newline at end of file
+ return x + position_encoding
+
+ def forward_chunk(self, x, cache=None):
+ start_idx = 0
+ batch_size, timesteps, input_dim = x.size()
+ if cache is not None:
+ start_idx = cache["start_idx"]
+ positions = torch.arange(1, timesteps+start_idx+1)[None, :]
+ position_encoding = self.encode(positions, input_dim, x.dtype).to(x.device)
+ return x + position_encoding[:, start_idx: start_idx + timesteps]
diff --git a/funasr/runtime/grpc/CMakeLists.txt b/funasr/runtime/grpc/CMakeLists.txt
new file mode 100644
index 0000000..56e3074
--- /dev/null
+++ b/funasr/runtime/grpc/CMakeLists.txt
@@ -0,0 +1,83 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# cmake build file for C++ paraformer example.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building paraformer.
+
+cmake_minimum_required(VERSION 3.10)
+
+project(ASR C CXX)
+
+include(common.cmake)
+
+# Proto file
+get_filename_component(rg_proto "../python/grpc/proto/paraformer.proto" ABSOLUTE)
+get_filename_component(rg_proto_path "${rg_proto}" PATH)
+
+# Generated sources
+set(rg_proto_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.cc")
+set(rg_proto_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.pb.h")
+set(rg_grpc_srcs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.cc")
+set(rg_grpc_hdrs "${CMAKE_CURRENT_BINARY_DIR}/paraformer.grpc.pb.h")
+add_custom_command(
+ OUTPUT "${rg_proto_srcs}" "${rg_proto_hdrs}" "${rg_grpc_srcs}" "${rg_grpc_hdrs}"
+ COMMAND ${_PROTOBUF_PROTOC}
+ ARGS --grpc_out "${CMAKE_CURRENT_BINARY_DIR}"
+ --cpp_out "${CMAKE_CURRENT_BINARY_DIR}"
+ -I "${rg_proto_path}"
+ --plugin=protoc-gen-grpc="${_GRPC_CPP_PLUGIN_EXECUTABLE}"
+ "${rg_proto}"
+ DEPENDS "${rg_proto}")
+
+
+# Include generated *.pb.h files
+include_directories("${CMAKE_CURRENT_BINARY_DIR}")
+
+include_directories(../onnxruntime/include/)
+link_directories(../onnxruntime/build/src/)
+link_directories(../onnxruntime/build/third_party/webrtc/)
+
+link_directories(${ONNXRUNTIME_DIR}/lib)
+add_subdirectory("../onnxruntime/src" onnx_src)
+
+# rg_grpc_proto
+add_library(rg_grpc_proto
+ ${rg_grpc_srcs}
+ ${rg_grpc_hdrs}
+ ${rg_proto_srcs}
+ ${rg_proto_hdrs})
+
+
+
+target_link_libraries(rg_grpc_proto
+ ${_REFLECTION}
+ ${_GRPC_GRPCPP}
+ ${_PROTOBUF_LIBPROTOBUF})
+
+# Targets paraformer_(server)
+foreach(_target
+ paraformer_server)
+ add_executable(${_target}
+ "${_target}.cc")
+ target_link_libraries(${_target}
+ rg_grpc_proto
+ rapidasr
+ webrtcvad
+ ${EXTRA_LIBS}
+ ${_REFLECTION}
+ ${_GRPC_GRPCPP}
+ ${_PROTOBUF_LIBPROTOBUF})
+endforeach()
diff --git a/funasr/runtime/grpc/Readme.md b/funasr/runtime/grpc/Readme.md
new file mode 100644
index 0000000..80e55aa
--- /dev/null
+++ b/funasr/runtime/grpc/Readme.md
@@ -0,0 +1,57 @@
+## paraformer grpc onnx server in c++
+
+
+#### Step 1. Build ../onnxruntime following its documentation
+```
+#put onnx-lib & onnx-asr-model & vocab.txt into /path/to/asrmodel(eg: /data/asrmodel)
+ls /data/asrmodel/
+onnxruntime-linux-x64-1.14.0 speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+
+file /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/vocab.txt
+UTF-8 Unicode text
+```
+
+#### Step 2. Compile and install grpc v1.52.0 in case of grpc bugs
+```
+export GRPC_INSTALL_DIR=/data/soft/grpc
+export PKG_CONFIG_PATH=$GRPC_INSTALL_DIR/lib/pkgconfig
+
+git clone -b v1.52.0 --depth=1 https://github.com/grpc/grpc.git
+cd grpc
+git submodule update --init --recursive
+
+mkdir -p cmake/build
+pushd cmake/build
+cmake -DgRPC_INSTALL=ON \
+ -DgRPC_BUILD_TESTS=OFF \
+ -DCMAKE_INSTALL_PREFIX=$GRPC_INSTALL_DIR \
+ ../..
+make
+make install
+popd
+
+echo "export GRPC_INSTALL_DIR=/data/soft/grpc" >> ~/.bashrc
+echo "export PKG_CONFIG_PATH=\$GRPC_INSTALL_DIR/lib/pkgconfig" >> ~/.bashrc
+echo "export PATH=\$GRPC_INSTALL_DIR/bin/:\$PKG_CONFIG_PATH:\$PATH" >> ~/.bashrc
+source ~/.bashrc
+```
+
+#### Step 3. Compile grpc onnx paraformer server
+```
+# first edit rebuild.sh so that -DONNXRUNTIME_DIR=/path/to/asrmodel/onnxruntime-linux-x64-1.14.0
+./rebuild.sh
+```
+
+#### Step 4. Start grpc paraformer server
+```
+Usage: ./cmake/build/paraformer_server port thread_num /path/to/model_file
+./cmake/build/paraformer_server 10108 4 /data/asrmodel/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch
+```
+
+
+
+#### Step 5. Start grpc python paraformer client on PC with MIC
+```
+cd ../python/grpc
+python grpc_main_client_mic.py --host $server_ip --port 10108
+```
diff --git a/funasr/runtime/grpc/common.cmake b/funasr/runtime/grpc/common.cmake
new file mode 100644
index 0000000..1326a5b
--- /dev/null
+++ b/funasr/runtime/grpc/common.cmake
@@ -0,0 +1,125 @@
+# Copyright 2018 gRPC authors.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Shared cmake settings for the C++ paraformer gRPC server.
+# Assumes protobuf and gRPC have been installed using cmake.
+# See cmake_externalproject/CMakeLists.txt for all-in-one cmake build
+# that automatically builds all the dependencies before building paraformer.
+
+cmake_minimum_required(VERSION 3.5.1)
+
+if (NOT DEFINED CMAKE_CXX_STANDARD)
+ set (CMAKE_CXX_STANDARD 14)
+endif()
+
+if(MSVC)
+ add_definitions(-D_WIN32_WINNT=0x600)
+endif()
+
+find_package(Threads REQUIRED)
+
+if(GRPC_AS_SUBMODULE)
+ # One way to build a project that uses gRPC is to just include the
+ # entire gRPC project tree via "add_subdirectory".
+ # This approach is very simple to use, but there are some potential
+ # disadvantages:
+ # * it includes gRPC's CMakeLists.txt directly into your build script
+ # without isolation, and that can make gRPC's internal settings interfere
+ # with your own build.
+ # * depending on what's installed on your system, the contents of submodules
+ # in gRPC's third_party/* might need to be available (and there might be
+ # additional prerequisites required to build them). Consider using
+ # the gRPC_*_PROVIDER options to fine-tune the expected behavior.
+ #
+ # A more robust approach to add dependency on gRPC is using
+ # cmake's ExternalProject_Add (see cmake_externalproject/CMakeLists.txt).
+
+ # Include the gRPC's cmake build (normally grpc source code would live
+ # in a git submodule called "third_party/grpc", but this example lives in
+ # the same repository as gRPC sources, so we just look a few directories up)
+ add_subdirectory(../../.. ${CMAKE_CURRENT_BINARY_DIR}/grpc EXCLUDE_FROM_ALL)
+ message(STATUS "Using gRPC via add_subdirectory.")
+
+ # After using add_subdirectory, we can now use the grpc targets directly from
+ # this build.
+ set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+ set(_REFLECTION grpc++_reflection)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_PROTOBUF_PROTOC protoc)
+ else()
+ set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+ endif()
+ set(_GRPC_GRPCPP grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
+ endif()
+elseif(GRPC_FETCHCONTENT)
+ # Another way is to use CMake's FetchContent module to clone gRPC at
+ # configure time. This makes gRPC's source code available to your project,
+ # similar to a git submodule.
+ message(STATUS "Using gRPC via add_subdirectory (FetchContent).")
+ include(FetchContent)
+ FetchContent_Declare(
+ grpc
+ GIT_REPOSITORY https://github.com/grpc/grpc.git
+ # when using gRPC, you will actually set this to an existing tag, such as
+ # v1.25.0, v1.26.0 etc..
+ # For the purpose of testing, we override the tag used to the commit
+ # that's currently under test.
+ GIT_TAG vGRPC_TAG_VERSION_OF_YOUR_CHOICE)
+ FetchContent_MakeAvailable(grpc)
+
+ # Since FetchContent uses add_subdirectory under the hood, we can use
+ # the grpc targets directly from this build.
+ set(_PROTOBUF_LIBPROTOBUF libprotobuf)
+ set(_REFLECTION grpc++_reflection)
+ set(_PROTOBUF_PROTOC $<TARGET_FILE:protoc>)
+ set(_GRPC_GRPCPP grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:grpc_cpp_plugin>)
+ endif()
+else()
+ # This branch assumes that gRPC and all its dependencies are already installed
+ # on this system, so they can be located by find_package().
+
+ # Find Protobuf installation
+ # Looks for protobuf-config.cmake file installed by Protobuf's cmake installation.
+ set(protobuf_MODULE_COMPATIBLE TRUE)
+ find_package(Protobuf CONFIG REQUIRED)
+ message(STATUS "Using protobuf ${Protobuf_VERSION}")
+
+ set(_PROTOBUF_LIBPROTOBUF protobuf::libprotobuf)
+ set(_REFLECTION gRPC::grpc++_reflection)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_PROTOBUF_PROTOC protoc)
+ else()
+ set(_PROTOBUF_PROTOC $<TARGET_FILE:protobuf::protoc>)
+ endif()
+
+ # Find gRPC installation
+ # Looks for gRPCConfig.cmake file installed by gRPC's cmake installation.
+ find_package(gRPC CONFIG REQUIRED)
+ message(STATUS "Using gRPC ${gRPC_VERSION}")
+
+ set(_GRPC_GRPCPP gRPC::grpc++)
+ if(CMAKE_CROSSCOMPILING)
+ find_program(_GRPC_CPP_PLUGIN_EXECUTABLE grpc_cpp_plugin)
+ else()
+ set(_GRPC_CPP_PLUGIN_EXECUTABLE $<TARGET_FILE:gRPC::grpc_cpp_plugin>)
+ endif()
+endif()
diff --git a/funasr/runtime/grpc/paraformer_server.cc b/funasr/runtime/grpc/paraformer_server.cc
new file mode 100644
index 0000000..e5814a5
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.cc
@@ -0,0 +1,195 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <sstream>
+#include <memory>
+#include <string>
+
+#include <grpc/grpc.h>
+#include <grpcpp/server.h>
+#include <grpcpp/server_builder.h>
+#include <grpcpp/server_context.h>
+#include <grpcpp/security/server_credentials.h>
+
+#include "paraformer.grpc.pb.h"
+#include "paraformer_server.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+// Creates the recognizer handle from model_path/thread_num and zeroes init_flag.
+ASRServicer::ASRServicer(const char* model_path, int thread_num)
+    : init_flag(0), AsrHanlde(RapidAsrInit(model_path, thread_num)) {
+    std::cout << "ASRServicer init" << std::endl;
+}
+
+// Drops everything stored for `user`: the buffered audio and the transcription entry.
+void ASRServicer::clear_states(const std::string& user) {
+ clear_buffers(user);
+ clear_transcriptions(user);
+}
+
+// Removes the buffered audio for `user`, if any.
+void ASRServicer::clear_buffers(const std::string& user) {
+    // erase(key) is a no-op when the key is absent, so no prior lookup is needed.
+    client_buffers.erase(user);
+}
+
+// Removes the transcription entry for `user`, if any.
+void ASRServicer::clear_transcriptions(const std::string& user) {
+    // erase(key) is a no-op when the key is absent, so no prior lookup is needed.
+    client_transcription.erase(user);
+}
+
+// Clears all per-user state and logs the disconnect to stdout.
+void ASRServicer::disconnect(const std::string& user) {
+ clear_states(user);
+ std::cout << "Disconnecting user: " << user << std::endl;
+}
+
+grpc::Status ASRServicer::Recognize(
+    grpc::ServerContext* context,
+    grpc::ServerReaderWriter<Response, Request>* stream) {
+    // Per request: isend -> terminate, speaking -> buffer audio, otherwise decode the buffer.
+    Request req;
+    while (stream->Read(&req)) {
+        if (req.isend()) {
+            std::cout << "asr end" << std::endl;
+            disconnect(req.user());
+            Response res;
+            res.set_sentence(
+                R"({"success": true, "detail": "asr end"})"
+            );
+            res.set_user(req.user());
+            res.set_action("terminate");
+            res.set_language(req.language());
+            stream->Write(res);
+        } else if (req.speaking()) {
+            if (req.audio_data().size() > 0) {
+                auto& buf = client_buffers[req.user()];
+                buf.insert(buf.end(), req.audio_data().begin(), req.audio_data().end());
+            }
+            Response res;
+            res.set_sentence(
+                R"({"success": true, "detail": "speaking"})"
+            );
+            res.set_user(req.user());
+            res.set_action("speaking");
+            res.set_language(req.language());
+            stream->Write(res);
+        } else if (!req.speaking()) {
+            if (client_buffers.count(req.user()) == 0) {
+                Response res;
+                res.set_sentence(
+                    R"({"success": true, "detail": "waiting_for_voice"})"
+                );
+                res.set_user(req.user());
+                res.set_action("waiting");
+                res.set_language(req.language());
+                stream->Write(res);
+            } else {
+                // Utterance finished: copy the buffered audio out, then reset the user's state.
+                auto begin_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                std::string tmp_data = this->client_buffers[req.user()];
+                this->clear_states(req.user());
+
+                Response res;
+                // Acknowledge how many bytes are about to be decoded. The JSON is built
+                // with a stream because a raw string literal cannot embed the runtime
+                // length; the closing quote of "detail" is emitted exactly once.
+                int data_len_int = tmp_data.length();
+                std::string data_len = std::to_string(data_len_int);
+                std::stringstream ss;
+                ss << R"({"success": true, "detail": "decoding data: )" << data_len << R"( bytes"})";
+                std::string result = ss.str();
+                res.set_sentence(result);
+                res.set_user(req.user());
+                res.set_action("decoding");
+                res.set_language(req.language());
+                stream->Write(res);
+                if (tmp_data.length() < 800) { //min input_len for asr model
+                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                    std::string delay_str = std::to_string(end_time - begin_time);
+                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", error: data_is_not_long_enough" << std::endl;
+                    Response res;
+                    std::stringstream ss;
+                    std::string asr_result = "";
+                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+                    std::string result = ss.str();
+                    res.set_sentence(result);
+                    res.set_user(req.user());
+                    res.set_action("finish");
+                    res.set_language(req.language());
+                    stream->Write(res);
+                }
+                else {
+                    // The recognizer returns a null result when the audio cannot be loaded;
+                    // report empty text instead of dereferencing it, and release the result.
+                    RPASR_RESULT Result= RapidAsrRecogPCMBuffer(AsrHanlde, tmp_data.c_str(), data_len_int, RASR_NONE, NULL);
+                    std::string asr_result = Result ? ((RPASR_RECOG_RESULT*)Result)->msg : std::string();
+                    if (Result) RapidAsrFreeResult(Result);
+
+                    auto end_time = std::chrono::duration_cast<std::chrono::milliseconds>(std::chrono::system_clock::now().time_since_epoch()).count();
+                    std::string delay_str = std::to_string(end_time - begin_time);
+
+                    std::cout << "user: " << req.user() << " , delay(ms): " << delay_str << ", text: " << asr_result << std::endl;
+                    Response res;
+                    std::stringstream ss;
+                    // NOTE(review): asr_result is not JSON-escaped — confirm the text never contains quotes.
+                    ss << R"({"success": true, "detail": "finish_sentence","server_delay_ms":)" << delay_str << R"(,"text":")" << asr_result << R"("})";
+                    std::string result = ss.str();
+                    res.set_sentence(result);
+                    res.set_user(req.user());
+                    res.set_action("finish");
+                    res.set_language(req.language());
+                    stream->Write(res);
+                }
+            }
+        }else {  // not reachable: speaking() is either true or false; kept as a safety net
+            Response res;
+            res.set_sentence(
+                R"({"success": false, "detail": "error, no condition matched! Unknown reason."})"
+            );
+            res.set_user(req.user());
+            res.set_action("terminate");
+            res.set_language(req.language());
+            stream->Write(res);
+        }
+    }
+    return Status::OK;
+}
+
+
+// Serves ASR over insecure gRPC on 0.0.0.0:<port>; blocks in Wait() while the server runs.
+void RunServer(const std::string& port, int thread_num, const char* model_path) {
+    std::string server_address = "0.0.0.0:" + port;
+    ASRServicer service(model_path, thread_num);
+
+    ServerBuilder builder;
+    builder.AddListeningPort(server_address, grpc::InsecureServerCredentials());
+    builder.RegisterService(&service);
+    std::unique_ptr<Server> server(builder.BuildAndStart());
+    if (!server) {
+        // BuildAndStart() returns null when the server cannot start (e.g. port in use).
+        std::cerr << "Failed to start server on " << server_address << std::endl;
+        return;
+    }
+    std::cout << "Server listening on " << server_address << std::endl;
+    server->Wait();
+}
+
+int main(int argc, char* argv[]) {
+    // argv[1]=port, argv[2]=thread_num, argv[3]=model path: all three are required.
+    if (argc < 4)
+    {
+        printf("Usage: %s port thread_num /path/to/model_file\n", argv[0]);
+        exit(-1);
+    }
+    RunServer(argv[1], atoi(argv[2]), argv[3]);
+    return 0;
+}
diff --git a/funasr/runtime/grpc/paraformer_server.h b/funasr/runtime/grpc/paraformer_server.h
new file mode 100644
index 0000000..f356d94
--- /dev/null
+++ b/funasr/runtime/grpc/paraformer_server.h
@@ -0,0 +1,56 @@
+#include <algorithm>
+#include <chrono>
+#include <cmath>
+#include <iostream>
+#include <memory>
+#include <string>
+
+#include <grpc/grpc.h>
+#include <grpcpp/server.h>
+#include <grpcpp/server_builder.h>
+#include <grpcpp/server_context.h>
+#include <grpcpp/security/server_credentials.h>
+
+#include <unordered_map>
+#include <chrono>
+
+#include "paraformer.grpc.pb.h"
+#include "librapidasrapi.h"
+
+
+using grpc::Server;
+using grpc::ServerBuilder;
+using grpc::ServerContext;
+using grpc::ServerReader;
+using grpc::ServerReaderWriter;
+using grpc::ServerWriter;
+using grpc::Status;
+
+
+using paraformer::Request;
+using paraformer::Response;
+using paraformer::ASR;
+
+// Result record that the server casts an RPASR_RESULT handle to in order to read `msg`.
+// NOTE(review): presumably this must match the result layout used inside the
+// rapidasr library — confirm, or read the text via RapidAsrGetResult() instead.
+typedef struct
+{
+ std::string msg;
+ float snippet_time;
+}RPASR_RECOG_RESULT;
+
+
+// gRPC ASR service: buffers audio per user and runs recognition on the buffered data.
+// NOTE(review): the maps below have no lock in this class — confirm Recognize is
+// never executed concurrently for different streams.
+class ASRServicer final : public ASR::Service {
+ private:
+ // Set to 0 by the constructor.
+ int init_flag;
+ // user -> raw audio bytes appended while the client reports speaking.
+ std::unordered_map<std::string, std::string> client_buffers;
+ // user -> transcription entry; only erased by clear_transcriptions.
+ std::unordered_map<std::string, std::string> client_transcription;
+
+ public:
+ ASRServicer(const char* model_path, int thread_num);
+ // Erase both the audio buffer and the transcription entry of `user`.
+ void clear_states(const std::string& user);
+ void clear_buffers(const std::string& user);
+ void clear_transcriptions(const std::string& user);
+ // Clear the user's state and log the disconnect.
+ void disconnect(const std::string& user);
+ // Bidirectional streaming handler; writes one or more Response per Request read.
+ grpc::Status Recognize(grpc::ServerContext* context, grpc::ServerReaderWriter<Response, Request>* stream);
+ // Recognizer handle created by RapidAsrInit in the constructor.
+ RPASR_HANDLE AsrHanlde;
+
+};
diff --git a/funasr/runtime/grpc/rebuild.sh b/funasr/runtime/grpc/rebuild.sh
new file mode 100644
index 0000000..9b41ed6
--- /dev/null
+++ b/funasr/runtime/grpc/rebuild.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+# Rebuild the gRPC paraformer server from scratch into cmake/build.
+# Set ONNXRUNTIME_DIR in the environment to override the default install path.
+# Abort on the first failing step so success is only reported for a real build.
+set -euo pipefail
+
+: "${ONNXRUNTIME_DIR:=/data/asrmodel/onnxruntime-linux-x64-1.14.0}"
+
+rm -rf cmake
+mkdir -p cmake/build
+cd cmake/build
+
+cmake -DCMAKE_BUILD_TYPE=release -DONNXRUNTIME_DIR="${ONNXRUNTIME_DIR}" ../..
+make
+
+echo "Build cmake/build/paraformer_server successfully!"
diff --git a/funasr/runtime/onnxruntime/readme.md b/funasr/runtime/onnxruntime/readme.md
index fa2f276..41c63c6 100644
--- a/funasr/runtime/onnxruntime/readme.md
+++ b/funasr/runtime/onnxruntime/readme.md
@@ -41,8 +41,8 @@
```
导出onnx模型，[详见](https://github.com/alibaba-damo-academy/FunASR/tree/main/funasr/export)，参考示例，从modelscope中模型导出：
-```
-python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+```shell
+python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
```
## Building Guidance for Linux/Unix
diff --git a/funasr/runtime/onnxruntime/src/Audio.cpp b/funasr/runtime/onnxruntime/src/Audio.cpp
index 43dfb6b..53bf9d0 100644
--- a/funasr/runtime/onnxruntime/src/Audio.cpp
+++ b/funasr/runtime/onnxruntime/src/Audio.cpp
@@ -237,7 +237,7 @@
size_t nOffset = 0;
-#define WAV_HEADER_SIZE 44
+
speech_len = nBufLen / 2;
speech_align_len = (int)(ceil((float)speech_len / align_size) * align_size);
@@ -263,7 +263,8 @@
speech_data[i] = (float)speech_buff[i] / scale;
}
-
+ AudioFrame* frame = new AudioFrame(speech_len);
+ frame_queue.push(frame);
return true;
}
diff --git a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
index 1f8f7ca..f5f9d66 100644
--- a/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
+++ b/funasr/runtime/onnxruntime/src/librapidasrapi.cpp
@@ -26,8 +26,9 @@
return nullptr;
Audio audio(1);
- audio.loadwav(szBuf,nLen);
- audio.split();
+ if (!audio.loadwav(szBuf, nLen))
+ return nullptr;
+ //audio.split();
float* buff;
int len;
@@ -58,8 +59,9 @@
return nullptr;
Audio audio(1);
- audio.loadpcmwav(szBuf, nLen);
- audio.split();
+ if (!audio.loadpcmwav(szBuf, nLen))
+ return nullptr;
+ //audio.split();
float* buff;
int len;
@@ -91,8 +93,9 @@
return nullptr;
Audio audio(1);
- audio.loadpcmwav(szFileName);
- audio.split();
+ if (!audio.loadpcmwav(szFileName))
+ return nullptr;
+ //audio.split();
float* buff;
int len;
@@ -125,7 +128,7 @@
Audio audio(1);
if(!audio.loadwav(szWavfile))
return nullptr;
- audio.split();
+ //audio.split();
float* buff;
int len;
diff --git a/funasr/runtime/onnxruntime/tester/tester.cpp b/funasr/runtime/onnxruntime/tester/tester.cpp
index b9a85b7..ba5c61c 100644
--- a/funasr/runtime/onnxruntime/tester/tester.cpp
+++ b/funasr/runtime/onnxruntime/tester/tester.cpp
@@ -8,7 +8,7 @@
#include "librapidasrapi.h"
#include <iostream>
-
+#include <fstream>
using namespace std;
int main(int argc, char *argv[])
@@ -40,10 +40,13 @@
gettimeofday(&start, NULL);
-
- RPASR_RESULT Result=RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
- gettimeofday(&end, NULL);
float snippet_time = 0.0f;
+
+
+ RPASR_RESULT Result=RapidAsrRecogFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+ gettimeofday(&end, NULL);
+
if (Result)
{
string msg = RapidAsrGetResult(Result, 0);
@@ -56,11 +59,51 @@
}
else
{
- cout <<("no return data!");
+ cout <<"no return data!";
}
-
- printf("Audio length %lfs.\n", (double)snippet_time);
+
+
+ //char* buff = nullptr;
+ //int len = 0;
+ //ifstream ifs(argv[2], std::ios::binary | std::ios::in);
+ //if (ifs.is_open())
+ //{
+ // ifs.seekg(0, std::ios::end);
+ // len = ifs.tellg();
+ // ifs.seekg(0, std::ios::beg);
+ // buff = new char[len];
+
+ // ifs.read(buff, len);
+
+
+ // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+
+ // RPASR_RESULT Result=RapidAsrRecogPCMBuffer(AsrHanlde, buff,len, RASR_NONE, NULL);
+ // //RPASR_RESULT Result = RapidAsrRecogPCMFile(AsrHanlde, argv[2], RASR_NONE, NULL);
+ // gettimeofday(&end, NULL);
+ //
+ // if (Result)
+ // {
+ // string msg = RapidAsrGetResult(Result, 0);
+ // setbuf(stdout, NULL);
+ // cout << "Result: \"";
+ // cout << msg << endl;
+ // cout << "\"." << endl;
+ // snippet_time = RapidAsrGetRetSnippetTime(Result);
+ // RapidAsrFreeResult(Result);
+ // }
+ // else
+ // {
+ // cout <<"no return data!";
+ // }
+
+ //
+ //delete[]buff;
+ //}
+
+
+ printf("Audio length %lfs.\n", (double)snippet_time);
seconds = (end.tv_sec - start.tv_sec);
long taking_micros = ((seconds * 1000000) + end.tv_usec) - (start.tv_usec);
printf("Model inference takes %lfs.\n", (double)taking_micros / 1000000);
diff --git a/funasr/runtime/python/benchmark_libtorch.md b/funasr/runtime/python/benchmark_libtorch.md
new file mode 100644
index 0000000..6c068fe
--- /dev/null
+++ b/funasr/runtime/python/benchmark_libtorch.md
@@ -0,0 +1,45 @@
+# Benchmark
+
+### Data set:
+Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
+
+### Tools
+- Install ModelScope and FunASR
+
+ ```shell
+ pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+ git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+ pip install --editable ./
+ cd funasr/runtime/python/utils
+ pip install -r requirements.txt
+ ```
+
+- recipe
+
+ set the model, data path and output_dir
+
+ ```shell
+ nohup bash test_rtf.sh &> log.txt &
+ ```
+
+
+
+## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+
+
+### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz 16core-32processor with avx512_vnni
+
+| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+| 1 (torch fp32) | 3522 | 0.0976 | 10.3 |
+| 1 (torch int8) | 1746 | 0.0484 | 20.7 |
+| 32 (torch fp32) | 236 | 0.0066 | 152.7 |
+| 32 (torch int8) | 114 | 0.0032 | 317.4 |
+| 64 (torch fp32) | 235 | 0.0065 | 153.7 |
+| 64 (torch int8) | 113 | 0.0031 | 319.2 |
+
+
+[//]: # (### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz 32core-64processor without avx512_vnni)
+
+
+## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
diff --git a/funasr/runtime/python/benchmark_onnx.md b/funasr/runtime/python/benchmark_onnx.md
new file mode 100644
index 0000000..ca7556b
--- /dev/null
+++ b/funasr/runtime/python/benchmark_onnx.md
@@ -0,0 +1,89 @@
+# Benchmark
+
+### Data set:
+Aishell1 [test set](https://www.openslr.org/33/) , the total audio duration is 36108.919 seconds.
+
+### Tools
+- Install ModelScope and FunASR
+
+ ```shell
+ pip install "modelscope[audio_asr]" --upgrade -f https://modelscope.oss-cn-beijing.aliyuncs.com/releases/repo.html
+ git clone https://github.com/alibaba-damo-academy/FunASR.git && cd FunASR
+ pip install --editable ./
+ cd funasr/runtime/python/utils
+ pip install -r requirements.txt
+ ```
+
+- recipe
+
+ set the model, data path and output_dir
+
+ ```shell
+ nohup bash test_rtf.sh &> log.txt &
+ ```
+
+
+## [Paraformer-large](https://www.modelscope.cn/models/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/summary)
+
+ ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
+
+| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
+|:----------------:|:------------------:|:-------:|:------------:|
+| 1 (onnx fp32) | 2806 | 0.0777 | 12.9 |
+| 1 (onnx int8) | 1611 | 0.0446 | 22.4 |
+| 8 (onnx fp32) | 538 | 0.0149 | 67.1 |
+| 8 (onnx int8) | 210 | 0.0058 | 172.4 |
+| 16 (onnx fp32) | 288 | 0.0080 | 125.2 |
+| 16 (onnx int8) | 117 | 0.0032 | 309.9 |
+| 32 (onnx fp32) | 167 | 0.0046 | 216.5 |
+| 32 (onnx int8) | 86 | 0.0024 | 420.0 |
+| 64 (onnx fp32) | 158 | 0.0044 | 228.1 |
+| 64 (onnx int8) | 82 | 0.0023 | 442.8 |
+| 96 (onnx fp32) | 151 | 0.0042 | 238.0 |
+| 96 (onnx int8) | 80 | 0.0022 | 452.0 |
+
+
+### Intel(R) Xeon(R) Platinum 8269CY CPU @ 2.50GHz 16core-32processor with avx512_vnni
+
+| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+| 1 (onnx fp32) | 2613 | 0.0724 | 13.8 |
+| 1 (onnx int8) | 1321 | 0.0366 | 27.3 |
+| 32 (onnx fp32) | 170 | 0.0047 | 212.7 |
+| 32 (onnx int8) | 89 | 0.0025 | 407.0 |
+| 64 (onnx fp32) | 166 | 0.0046 | 217.1 |
+| 64 (onnx int8) | 87 | 0.0024 | 414.7 |
+
+
+### Intel(R) Xeon(R) Platinum 8163 CPU @ 2.50GHz 32core-64processor without avx512_vnni
+
+
+| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+| 1 (onnx fp32) | 2959 | 0.0820 | 12.2 |
+| 1 (onnx int8) | 2814 | 0.0778 | 12.8 |
+| 16 (onnx fp32) | 373 | 0.0103 | 96.9 |
+| 16 (onnx int8) | 331 | 0.0091 | 109.0 |
+| 32 (onnx fp32) | 211 | 0.0058 | 171.4 |
+| 32 (onnx int8) | 181 | 0.0050 | 200.0 |
+| 64 (onnx fp32) | 153 | 0.0042 | 235.9 |
+| 64 (onnx int8) | 103 | 0.0029 | 349.9 |
+| 96 (onnx fp32) | 146 | 0.0041 | 247.0 |
+| 96 (onnx int8) | 108 | 0.0030 | 334.1 |
+
+## [Paraformer](https://modelscope.cn/models/damo/speech_paraformer_asr_nat-zh-cn-16k-common-vocab8358-tensorflow1/summary)
+
+ ### Intel(R) Xeon(R) Platinum 8369B CPU @ 2.90GHz 16core-32processor with avx512_vnni
+
+| concurrent-tasks | processing time(s) | RTF | Speedup Rate |
+|:----------------:|:------------------:|:------:|:------------:|
+| 1 (onnx fp32) | 1173 | 0.0325 | 30.8 |
+| 1 (onnx int8) | 976 | 0.0270 | 37.0 |
+| 16 (onnx fp32) | 91 | 0.0025 | 395.2 |
+| 16 (onnx int8) | 78 | 0.0022 | 463.0 |
+| 32 (onnx fp32) | 60 | 0.0017 | 598.8 |
+| 32 (onnx int8) | 40 | 0.0011 | 892.9 |
+| 64 (onnx fp32) | 55 | 0.0015 | 653.6 |
+| 64 (onnx int8) | 31 | 0.0009 | 1162.8 |
+| 96 (onnx fp32) | 57 | 0.0016 | 632.9 |
+| 96 (onnx int8) | 33 | 0.0009 | 1098.9 |
diff --git a/funasr/runtime/python/grpc/grpc_main_server.py b/funasr/runtime/python/grpc/grpc_main_server.py
index e862ac4..ae386fa 100644
--- a/funasr/runtime/python/grpc/grpc_main_server.py
+++ b/funasr/runtime/python/grpc/grpc_main_server.py
@@ -10,7 +10,7 @@
# interceptors=(AuthInterceptor('Bearer mysecrettoken'),)
)
paraformer_pb2_grpc.add_ASRServicer_to_server(
- ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir), server)
+ ASRServicer(args.user_allowed, args.model, args.sample_rate, args.backend, args.onnx_dir, vad_model=args.vad_model, punc_model=args.punc_model), server)
port = "[::]:" + str(args.port)
server.add_insecure_port(port)
server.start()
@@ -34,7 +34,16 @@
type=str,
default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
help="model from modelscope")
-
+ parser.add_argument("--vad_model",
+ type=str,
+ default="damo/speech_fsmn_vad_zh-cn-16k-common-pytorch",
+ help="model from modelscope")
+
+ parser.add_argument("--punc_model",
+ type=str,
+ default="",
+ help="model from modelscope")
+
parser.add_argument("--sample_rate",
type=int,
default=16000,
@@ -50,6 +59,7 @@
type=str,
default="/nfs/models/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch",
help="onnx model dir")
+
diff --git a/funasr/runtime/python/grpc/grpc_server.py b/funasr/runtime/python/grpc/grpc_server.py
index 95fe96c..0fdf30c 100644
--- a/funasr/runtime/python/grpc/grpc_server.py
+++ b/funasr/runtime/python/grpc/grpc_server.py
@@ -8,7 +8,7 @@
class ASRServicer(paraformer_pb2_grpc.ASRServicer):
- def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir):
+ def __init__(self, user_allowed, model, sample_rate, backend, onnx_dir, vad_model='', punc_model=''):
print("ASRServicer init")
self.backend = backend
self.init_flag = 0
@@ -21,7 +21,7 @@
from modelscope.utils.constant import Tasks
except ImportError:
raise ImportError(f"Please install modelscope")
- self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model)
+ self.inference_16k_pipeline = pipeline(task=Tasks.auto_speech_recognition, model=model, vad_model=vad_model, punc_model=punc_model)
elif self.backend == "onnxruntime":
try:
from rapid_paraformer.paraformer_onnx import Paraformer
diff --git a/funasr/runtime/python/libtorch/README.md b/funasr/runtime/python/libtorch/README.md
index 1e2d919..cf5bbcc 100644
--- a/funasr/runtime/python/libtorch/README.md
+++ b/funasr/runtime/python/libtorch/README.md
@@ -19,11 +19,11 @@
- `e.g.`, Export model from modelscope
```shell
- python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+ python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
```
- `e.g.`, Export model from local path, the model'name must be `model.pb`.
```shell
- python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" false
+ python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type torch --quantize False
```
diff --git a/funasr/runtime/python/libtorch/setup.py b/funasr/runtime/python/libtorch/setup.py
index 0f9e40d..c50e497 100644
--- a/funasr/runtime/python/libtorch/setup.py
+++ b/funasr/runtime/python/libtorch/setup.py
@@ -28,7 +28,7 @@
install_requires=["librosa", "onnxruntime>=1.7.0",
"scipy", "numpy>=1.19.3",
"typeguard", "kaldi-native-fbank",
- "PyYAML>=5.1.2"],
+ "PyYAML>=5.1.2", "torch-quant >= 0.4.0"],
packages=find_packages(include=["torch_paraformer*"]),
keywords=[
'funasr,paraformer'
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py b/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
index 3545ccf..3c0606d 100644
--- a/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
+++ b/funasr/runtime/python/libtorch/torch_paraformer/paraformer_bin.py
@@ -24,12 +24,16 @@
device_id: Union[str, int] = "-1",
plot_timestamp_to: str = "",
pred_bias: int = 1,
+ quantize: bool = False,
+ intra_op_num_threads: int = 1,
):
if not Path(model_dir).exists():
raise FileNotFoundError(f'{model_dir} does not exist.')
model_file = os.path.join(model_dir, 'model.torchscripts')
+ if quantize:
+ model_file = os.path.join(model_dir, 'model_quant.torchscripts')
config_file = os.path.join(model_dir, 'config.yaml')
cmvn_file = os.path.join(model_dir, 'am.mvn')
config = read_yaml(config_file)
@@ -58,26 +62,28 @@
am_scores, valid_token_lens = outputs[0], outputs[1]
if len(outputs) == 4:
# for BiCifParaformer Inference
- us_alphas, us_cif_peak = outputs[2], outputs[3]
+ us_alphas, us_peaks = outputs[2], outputs[3]
else:
- us_alphas, us_cif_peak = None, None
+ us_alphas, us_peaks = None, None
except:
#logging.warning(traceback.format_exc())
logging.warning("input wav is silence or noise")
preds = ['']
else:
- am_scores, valid_token_lens = am_scores.detach().cpu().numpy(), valid_token_lens.detach().cpu().numpy()
preds = self.decode(am_scores, valid_token_lens)
- if us_cif_peak is None:
+ if us_peaks is None:
for pred in preds:
+ pred = sentence_postprocess(pred)
asr_res.append({'preds': pred})
else:
- for pred, us_cif_peak_ in zip(preds, us_cif_peak):
- text, tokens = pred
- timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
+ for pred, us_peaks_ in zip(preds, us_peaks):
+ raw_tokens = pred
+ timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
+ text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
+ # logging.warning(timestamp)
if len(self.plot_timestamp_to):
- self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
- asr_res.append({'preds': text, 'timestamp': timestamp})
+ self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
+ asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
return asr_res
def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -178,6 +184,6 @@
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
token = token[:valid_token_num-self.pred_bias]
- texts = sentence_postprocess(token)
- return texts
+ # texts = sentence_postprocess(token)
+ return token
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py b/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py
new file mode 100755
index 0000000..349a3f6
--- /dev/null
+++ b/funasr/runtime/python/libtorch/torch_paraformer/utils/compute_wer.py
@@ -0,0 +1,157 @@
+import os
+import numpy as np
+import sys
+
+def compute_wer(ref_file, hyp_file, cer_detail_file):
+    """Score hyp_file against ref_file and write a WER/SER report to cer_detail_file.
+
+    Each input line is a key followed by whitespace-separated tokens; blank lines are
+    skipped. Only keys present in both files are scored. Nothing is returned.
+    """
+    rst = {
+        'Wrd': 0,
+        'Corr': 0,
+        'Ins': 0,
+        'Del': 0,
+        'Sub': 0,
+        'Snt': 0,
+        'Err': 0.0,
+        'S.Err': 0.0,
+        'wrong_words': 0,
+        'wrong_sentences': 0
+    }
+
+    hyp_dict = {}
+    ref_dict = {}
+    for path, table in ((hyp_file, hyp_dict), (ref_file, ref_dict)):
+        with open(path, 'r') as reader:
+            for line in reader:
+                fields = line.split()
+                if fields:  # a blank line has no key; indexing it would raise
+                    table[fields[0]] = fields[1:]
+
+    with open(cer_detail_file, 'w') as cer_detail_writer:  # closed even if scoring raises
+        for hyp_key in hyp_dict:
+            if hyp_key in ref_dict:
+                out_item = compute_wer_by_line(hyp_dict[hyp_key], ref_dict[hyp_key])
+                rst['Wrd'] += out_item['nwords']
+                rst['Corr'] += out_item['cor']
+                rst['wrong_words'] += int(out_item['wrong'])  # keep the running total a plain int
+                rst['Ins'] += out_item['ins']
+                rst['Del'] += out_item['del']
+                rst['Sub'] += out_item['sub']
+                rst['Snt'] += 1
+                if out_item['wrong'] > 0:
+                    rst['wrong_sentences'] += 1
+                cer_detail_writer.write(hyp_key + print_cer_detail(out_item) + '\n')
+                cer_detail_writer.write("ref:" + '\t' + "".join(ref_dict[hyp_key]) + '\n')
+                cer_detail_writer.write("hyp:" + '\t' + "".join(hyp_dict[hyp_key]) + '\n')
+
+        if rst['Wrd'] > 0:
+            rst['Err'] = round(rst['wrong_words'] * 100 / rst['Wrd'], 2)
+        if rst['Snt'] > 0:
+            rst['S.Err'] = round(rst['wrong_sentences'] * 100 / rst['Snt'], 2)
+
+        cer_detail_writer.write('\n')
+        cer_detail_writer.write("%WER " + str(rst['Err']) + " [ " + str(rst['wrong_words'])+ " / " + str(rst['Wrd']) +
+                                ", " + str(rst['Ins']) + " ins, " + str(rst['Del']) + " del, " + str(rst['Sub']) + " sub ]" + '\n')
+        cer_detail_writer.write("%SER " + str(rst['S.Err']) + " [ " + str(rst['wrong_sentences']) + " / " + str(rst['Snt']) + " ]" + '\n')
+        # NOTE: the second number is hyp keys that have no reference, despite the "not present in hyp" label.
+        cer_detail_writer.write("Scored " + str(len(hyp_dict)) + " sentences, " + str(len(hyp_dict) - rst['Snt']) + " not present in hyp." + '\n')
+
+
+def compute_wer_by_line(hyp, ref):
+    """Align hyp against ref by edit distance and count the edit operations.
+
+    Tokens are compared case-insensitively. Returns a dict with 'nwords' (reference
+    length), 'cor', 'ins', 'del', 'sub' and 'wrong' (total edit distance, a plain int).
+    """
+    hyp = [tok.lower() for tok in hyp]
+    ref = [tok.lower() for tok in ref]
+
+    len_hyp = len(hyp)
+    len_ref = len(ref)
+
+    # int32 so that very long sequences cannot overflow the accumulated cost
+    cost_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int32)
+    # 0 = match or border cell, 1 = substitute, 2 = insert, 3 = delete
+    ops_matrix = np.zeros((len_hyp + 1, len_ref + 1), dtype=np.int8)
+
+    for i in range(len_hyp + 1):
+        cost_matrix[i][0] = i
+    for j in range(len_ref + 1):
+        cost_matrix[0][j] = j
+
+    for i in range(1, len_hyp + 1):
+        for j in range(1, len_ref + 1):
+            if hyp[i - 1] == ref[j - 1]:
+                cost_matrix[i][j] = cost_matrix[i - 1][j - 1]
+            else:
+                substitution = cost_matrix[i - 1][j - 1] + 1
+                insertion = cost_matrix[i - 1][j] + 1
+                deletion = cost_matrix[i][j - 1] + 1
+
+                compare_val = [substitution, insertion, deletion]
+
+                min_val = min(compare_val)
+                operation_idx = compare_val.index(min_val) + 1
+                cost_matrix[i][j] = min_val
+                ops_matrix[i][j] = operation_idx
+
+    i = len_hyp
+    j = len_ref
+    rst = {
+        'nwords': len_ref,
+        'cor': 0,
+        'wrong': 0,
+        'ins': 0,
+        'del': 0,
+        'sub': 0
+    }
+    while i >= 0 or j >= 0:  # walk back from the bottom-right cell
+        i_idx = max(0, i)
+        j_idx = max(0, j)
+
+        if ops_matrix[i_idx][j_idx] == 0:  # correct
+            if i - 1 >= 0 and j - 1 >= 0:
+                rst['cor'] += 1
+
+            i -= 1
+            j -= 1
+
+        elif ops_matrix[i_idx][j_idx] == 2:  # insert
+            i -= 1
+            rst['ins'] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 3:  # delete
+            j -= 1
+            rst['del'] += 1
+
+        elif ops_matrix[i_idx][j_idx] == 1:  # substitute
+            i -= 1
+            j -= 1
+            rst['sub'] += 1
+
+        if i < 0 and j >= 0:  # hyp exhausted: remaining ref tokens are deletions
+            rst['del'] += 1
+        elif j < 0 and i >= 0:  # ref exhausted: remaining hyp tokens are insertions
+            rst['ins'] += 1
+
+    rst['wrong'] = int(cost_matrix[len_hyp][len_ref])  # plain int, not a NumPy scalar
+    return rst
+
+def print_cer_detail(rst):
+    """Format one utterance's counts; an empty reference uses denominator 1 instead of dividing by zero."""
+    denom = max(rst['nwords'], 1)
+    return ("(nwords={nwords},cor={cor},ins={ins},del={del},sub={sub}) corr:".format(**rst)
+            + '{:.2%}'.format(rst['cor'] / denom) + ",cer:" + '{:.2%}'.format(rst['wrong'] / denom))
+
+if __name__ == '__main__':
+    if len(sys.argv) != 4:
+        print("usage : python compute_wer.py test.ref test.hyp test.wer")
+        sys.exit(1)  # a bad invocation must not look like success to calling scripts
+
+    ref_file = sys.argv[1]  # reference transcripts
+    hyp_file = sys.argv[2]  # hypothesis transcripts
+    cer_detail_file = sys.argv[3]  # report path written by compute_wer
+    compute_wer(ref_file, hyp_file, cer_detail_file)
diff --git a/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py b/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
index 767e864..3a01812 100644
--- a/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
+++ b/funasr/runtime/python/libtorch/torch_paraformer/utils/timestamp_utils.py
@@ -1,11 +1,11 @@
import numpy as np
-def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0):
+def time_stamp_lfr6_onnx(us_cif_peak, char_list, begin_time=0.0, total_offset=-1.5):
if not len(char_list):
return []
START_END_THRESHOLD = 5
- MAX_TOKEN_DURATION = 14
+ MAX_TOKEN_DURATION = 30
TIME_RATE = 10.0 * 6 / 1000 / 3 # 3 times upsampled
cif_peak = us_cif_peak.reshape(-1)
num_frames = cif_peak.shape[-1]
@@ -16,7 +16,7 @@
new_char_list = []
# for bicif model trained with large data, cif2 actually fires when a character starts
# so treat the frames between two peaks as the duration of the former token
- fire_place = np.where(cif_peak>1.0-1e-4)[0] - 1.5 # np format
+ fire_place = np.where(cif_peak>1.0-1e-4)[0] + total_offset # np format
num_peak = len(fire_place)
assert num_peak == len(char_list) + 1 # number of peaks is supposed to be number of tokens + 1
# begin silence
@@ -27,7 +27,7 @@
# tokens timestamp
for i in range(len(fire_place)-1):
new_char_list.append(char_list[i])
- if MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
+ if i == len(fire_place)-2 or MAX_TOKEN_DURATION < 0 or fire_place[i+1] - fire_place[i] < MAX_TOKEN_DURATION:
timestamp_list.append([fire_place[i]*TIME_RATE, fire_place[i+1]*TIME_RATE])
else:
# cut the duration to token and sil of the 0-weight frames last long
@@ -48,11 +48,12 @@
timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
assert len(new_char_list) == len(timestamp_list)
- res_total = []
+ res_str = ""
for char, timestamp in zip(new_char_list, timestamp_list):
- res_total.append([char, timestamp[0], timestamp[1]]) # += "{} {} {};".format(char, timestamp[0], timestamp[1])
+ res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
res = []
for char, timestamp in zip(new_char_list, timestamp_list):
if char != '<sil>':
res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
- return res, res_total
\ No newline at end of file
+ return res_str, res
+
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/README.md b/funasr/runtime/python/onnxruntime/README.md
index 6ed9849..e2a09f1 100644
--- a/funasr/runtime/python/onnxruntime/README.md
+++ b/funasr/runtime/python/onnxruntime/README.md
@@ -24,11 +24,11 @@
- `e.g.`, Export model from modelscope
```shell
- python -m funasr.export.export_model 'damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+ python -m funasr.export.export_model --model-name damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
```
- `e.g.`, Export model from local path, the model'name must be `model.pb`.
```shell
- python -m funasr.export.export_model '/mnt/workspace/damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch' "./export" true
+ python -m funasr.export.export_model --model-name ./damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch --export-dir ./export --type onnx --quantize False
```
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
index 850f007..5567940 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/paraformer_onnx.py
@@ -26,12 +26,16 @@
device_id: Union[str, int] = "-1",
plot_timestamp_to: str = "",
pred_bias: int = 1,
+ quantize: bool = False,
+ intra_op_num_threads: int = 4,
):
if not Path(model_dir).exists():
raise FileNotFoundError(f'{model_dir} does not exist.')
model_file = os.path.join(model_dir, 'model.onnx')
+ if quantize:
+ model_file = os.path.join(model_dir, 'model_quant.onnx')
config_file = os.path.join(model_dir, 'config.yaml')
cmvn_file = os.path.join(model_dir, 'am.mvn')
config = read_yaml(config_file)
@@ -42,7 +46,7 @@
cmvn_file=cmvn_file,
**config['frontend_conf']
)
- self.ort_infer = OrtInferSession(model_file, device_id)
+ self.ort_infer = OrtInferSession(model_file, device_id, intra_op_num_threads=intra_op_num_threads)
self.batch_size = batch_size
self.plot_timestamp_to = plot_timestamp_to
self.pred_bias = pred_bias
@@ -60,25 +64,28 @@
am_scores, valid_token_lens = outputs[0], outputs[1]
if len(outputs) == 4:
# for BiCifParaformer Inference
- us_alphas, us_cif_peak = outputs[2], outputs[3]
+ us_alphas, us_peaks = outputs[2], outputs[3]
else:
- us_alphas, us_cif_peak = None, None
+ us_alphas, us_peaks = None, None
except ONNXRuntimeError:
#logging.warning(traceback.format_exc())
logging.warning("input wav is silence or noise")
preds = ['']
else:
preds = self.decode(am_scores, valid_token_lens)
- if us_cif_peak is None:
+ if us_peaks is None:
for pred in preds:
+ pred = sentence_postprocess(pred)
asr_res.append({'preds': pred})
else:
- for pred, us_cif_peak_ in zip(preds, us_cif_peak):
- text, tokens = pred
- timestamp, timestamp_total = time_stamp_lfr6_onnx(us_cif_peak_, copy.copy(tokens))
+ for pred, us_peaks_ in zip(preds, us_peaks):
+ raw_tokens = pred
+ timestamp, timestamp_raw = time_stamp_lfr6_onnx(us_peaks_, copy.copy(raw_tokens))
+ text_proc, timestamp_proc, _ = sentence_postprocess(raw_tokens, timestamp_raw)
+ # logging.warning(timestamp)
if len(self.plot_timestamp_to):
- self.plot_wave_timestamp(waveform_list[0], timestamp_total, self.plot_timestamp_to)
- asr_res.append({'preds': text, 'timestamp': timestamp})
+ self.plot_wave_timestamp(waveform_list[0], timestamp, self.plot_timestamp_to)
+ asr_res.append({'preds': text_proc, 'timestamp': timestamp_proc, "raw_tokens": raw_tokens})
return asr_res
def plot_wave_timestamp(self, wav, text_timestamp, dest):
@@ -177,6 +184,6 @@
# Change integer-ids to tokens
token = self.converter.ids2tokens(token_int)
token = token[:valid_token_num-self.pred_bias]
- texts = sentence_postprocess(token)
- return texts
+ # texts = sentence_postprocess(token)
+ return token
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
index dd702f3..3a01812 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/timestamp_utils.py
@@ -48,12 +48,12 @@
timestamp_list[i][0] = timestamp_list[i][0] + begin_time / 1000.0
timestamp_list[i][1] = timestamp_list[i][1] + begin_time / 1000.0
assert len(new_char_list) == len(timestamp_list)
- res_total = []
+ res_str = ""
for char, timestamp in zip(new_char_list, timestamp_list):
- res_total.append([char, timestamp[0], timestamp[1]]) # += "{} {} {};".format(char, timestamp[0], timestamp[1])
+ res_str += "{} {} {};".format(char, timestamp[0], timestamp[1])
res = []
for char, timestamp in zip(new_char_list, timestamp_list):
if char != '<sil>':
res.append([int(timestamp[0] * 1000), int(timestamp[1] * 1000)])
- return res, res_total
+ return res_str, res
\ No newline at end of file
diff --git a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
index 392fe6b..2edde11 100644
--- a/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
+++ b/funasr/runtime/python/onnxruntime/rapid_paraformer/utils/utils.py
@@ -1,6 +1,5 @@
# -*- encoding: utf-8 -*-
-# @Author: SWHL
-# @Contact: liekkaskono@163.com
+
import functools
import logging
import pickle
@@ -147,10 +146,10 @@
class OrtInferSession():
- def __init__(self, model_file, device_id=-1):
+ def __init__(self, model_file, device_id=-1, intra_op_num_threads=4):
device_id = str(device_id)
sess_opt = SessionOptions()
- sess_opt.intra_op_num_threads = 4
+ sess_opt.intra_op_num_threads = intra_op_num_threads
sess_opt.log_severity_level = 4
sess_opt.enable_cpu_mem_arena = False
sess_opt.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL
diff --git a/funasr/runtime/python/onnxruntime/setup.py b/funasr/runtime/python/onnxruntime/setup.py
index f062500..299910f 100644
--- a/funasr/runtime/python/onnxruntime/setup.py
+++ b/funasr/runtime/python/onnxruntime/setup.py
@@ -20,8 +20,8 @@
version=VERSION_NUM,
platforms="Any",
description="Using paraformer with ONNXRuntime",
- author="SWHL",
- author_email="liekkaskono@163.com",
+ author="FunASR",
+ author_email="funasr@list.alibaba-inc.com",
url="https://github.com/alibaba-damo-academy/FunASR",
license='MIT',
long_description=get_readme(),
diff --git a/funasr/runtime/python/utils/requirements.txt b/funasr/runtime/python/utils/requirements.txt
new file mode 100644
index 0000000..600eb80
--- /dev/null
+++ b/funasr/runtime/python/utils/requirements.txt
@@ -0,0 +1,2 @@
+onnx
+torch-quant >= 0.4.0
\ No newline at end of file
diff --git a/funasr/runtime/python/utils/split_scp.pl b/funasr/runtime/python/utils/split_scp.pl
new file mode 100755
index 0000000..0876dcb
--- /dev/null
+++ b/funasr/runtime/python/utils/split_scp.pl
@@ -0,0 +1,246 @@
+#!/usr/bin/env perl
+
+# Copyright 2010-2011 Microsoft Corporation
+
+# See ../../COPYING for clarification regarding multiple authors
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# THIS CODE IS PROVIDED *AS IS* BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, EITHER EXPRESS OR IMPLIED, INCLUDING WITHOUT LIMITATION ANY IMPLIED
+# WARRANTIES OR CONDITIONS OF TITLE, FITNESS FOR A PARTICULAR PURPOSE,
+# MERCHANTABLITY OR NON-INFRINGEMENT.
+# See the Apache 2 License for the specific language governing permissions and
+# limitations under the License.
+
+
+# This program splits up any kind of .scp or archive-type file.
+# If there is no utt2spk option it will work on any text file and
+# will split it up with an approximately equal number of lines in
+# each output file.
+# With the --utt2spk option it will work on anything that has the
+# utterance-id as the first entry on each line; the utt2spk file is
+# of the form "utterance speaker" (on each line).
+# It splits it into equal size chunks as far as it can. If you use the utt2spk
+# option it will make sure these chunks coincide with speaker boundaries. In
+# this case, if there are more chunks than speakers (and in some other
+# circumstances), some of the resulting chunks will be empty and it will print
+# an error message and exit with nonzero status.
+# You will normally call this like:
+# split_scp.pl scp scp.1 scp.2 scp.3 ...
+# or
+# split_scp.pl --utt2spk=utt2spk scp scp.1 scp.2 scp.3 ...
+# Note that you can use this script to split the utt2spk file itself,
+# e.g. split_scp.pl --utt2spk=utt2spk utt2spk utt2spk.1 utt2spk.2 ...
+
+# You can also call the scripts like:
+# split_scp.pl -j 3 0 scp scp.0
+# [note: with this option, it assumes zero-based indexing of the split parts,
+# i.e. the second number must be 0 <= n < num-jobs.]
+
+use warnings;
+
+$num_jobs = 0;
+$job_id = 0;
+$utt2spk_file = "";
+$one_based = 0;
+
+for ($x = 1; $x <= 3 && @ARGV > 0; $x++) { # at most three passes, so the three options below may come in any order
+ if ($ARGV[0] eq "-j") { # -j <num-jobs> <job-id>
+ shift @ARGV;
+ $num_jobs = shift @ARGV;
+ $job_id = shift @ARGV;
+ }
+ if ($ARGV[0] =~ /--utt2spk=(.+)/) {
+ $utt2spk_file=$1; # file name captured from the option value
+ shift;
+ }
+ if ($ARGV[0] eq '--one-based') {
+ $one_based = 1; # job ids are 1-based; converted to 0-based after validation
+ shift @ARGV;
+ }
+}
+
+if ($num_jobs != 0 && ($num_jobs < 0 || $job_id - $one_based < 0 ||
+ $job_id - $one_based >= $num_jobs)) {
+ die "$0: Invalid job number/index values for '-j $num_jobs $job_id" .
+ ($one_based ? " --one-based" : "") . "'\n"
+}
+
+$one_based
+ and $job_id--;
+
+if(($num_jobs == 0 && @ARGV < 2) || ($num_jobs > 0 && (@ARGV < 1 || @ARGV > 2))) {
+ die
+"Usage: split_scp.pl [--utt2spk=<utt2spk_file>] in.scp out1.scp out2.scp ...
+ or: split_scp.pl -j num-jobs job-id [--one-based] [--utt2spk=<utt2spk_file>] in.scp [out.scp]
+ ... where 0 <= job-id < num-jobs, or 1 <= job-id <- num-jobs if --one-based.\n";
+}
+
+$error = 0;
+$inscp = shift @ARGV;
+if ($num_jobs == 0) { # without -j option
+ @OUTPUTS = @ARGV;
+} else {
+ for ($j = 0; $j < $num_jobs; $j++) {
+ if ($j == $job_id) {
+ if (@ARGV > 0) { push @OUTPUTS, $ARGV[0]; }
+ else { push @OUTPUTS, "-"; }
+ } else {
+ push @OUTPUTS, "/dev/null";
+ }
+ }
+}
+
+if ($utt2spk_file ne "") { # We have the --utt2spk option...
+ open($u_fh, '<', $utt2spk_file) || die "$0: Error opening utt2spk file $utt2spk_file: $!\n";
+ while(<$u_fh>) {
+ @A = split;
+ @A == 2 || die "$0: Bad line $_ in utt2spk file $utt2spk_file\n";
+ ($u,$s) = @A;
+ $utt2spk{$u} = $s;
+ }
+ close $u_fh;
+ open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+ @spkrs = ();
+ while(<$i_fh>) {
+ @A = split;
+ if(@A == 0) { die "$0: Empty or space-only line in scp file $inscp\n"; }
+ $u = $A[0];
+ $s = $utt2spk{$u};
+ defined $s || die "$0: No utterance $u in utt2spk file $utt2spk_file\n";
+ if(!defined $spk_count{$s}) {
+ push @spkrs, $s;
+ $spk_count{$s} = 0;
+ $spk_data{$s} = []; # ref to new empty array.
+ }
+ $spk_count{$s}++;
+ push @{$spk_data{$s}}, $_;
+ }
+ # Now split as equally as possible ..
+ # First allocate spks to files by allocating an approximately
+ # equal number of speakers.
+ $numspks = @spkrs; # number of speakers.
+ $numscps = @OUTPUTS; # number of output files.
+ if ($numspks < $numscps) {
+ die "$0: Refusing to split data because number of speakers $numspks " .
+ "is less than the number of output .scp files $numscps\n";
+ }
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ $scparray[$scpidx] = []; # [] is array reference.
+ }
+ for ($spkidx = 0; $spkidx < $numspks; $spkidx++) {
+ $scpidx = int(($spkidx*$numscps) / $numspks);
+ $spk = $spkrs[$spkidx];
+ push @{$scparray[$scpidx]}, $spk;
+ $scpcount[$scpidx] += $spk_count{$spk};
+ }
+
+ # Now will try to reassign beginning + ending speakers
+ # to different scp's and see if it gets more balanced.
+ # Suppose objf we're minimizing is sum_i (num utts in scp[i] - average)^2.
+ # We can show that if considering changing just 2 scp's, we minimize
+ # this by minimizing the squared difference in sizes. This is
+ # equivalent to minimizing the absolute difference in sizes. This
+ # shows this method is bound to converge.
+
+ $changed = 1;
+ while($changed) {
+ $changed = 0;
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ # First try to reassign ending spk of this scp.
+ if($scpidx < $numscps-1) {
+ $sz = @{$scparray[$scpidx]};
+ if($sz > 0) {
+ $spk = $scparray[$scpidx]->[$sz-1];
+ $count = $spk_count{$spk};
+ $nutt1 = $scpcount[$scpidx];
+ $nutt2 = $scpcount[$scpidx+1];
+ if( abs( ($nutt2+$count) - ($nutt1-$count))
+ < abs($nutt2 - $nutt1)) { # Would decrease
+ # size-diff by reassigning spk...
+ $scpcount[$scpidx+1] += $count;
+ $scpcount[$scpidx] -= $count;
+ pop @{$scparray[$scpidx]};
+ unshift @{$scparray[$scpidx+1]}, $spk;
+ $changed = 1;
+ }
+ }
+ }
+ if($scpidx > 0 && @{$scparray[$scpidx]} > 0) {
+ $spk = $scparray[$scpidx]->[0];
+ $count = $spk_count{$spk};
+ $nutt1 = $scpcount[$scpidx-1];
+ $nutt2 = $scpcount[$scpidx];
+ if( abs( ($nutt2-$count) - ($nutt1+$count))
+ < abs($nutt2 - $nutt1)) { # Would decrease
+ # size-diff by reassigning spk...
+ $scpcount[$scpidx-1] += $count;
+ $scpcount[$scpidx] -= $count;
+ shift @{$scparray[$scpidx]};
+ push @{$scparray[$scpidx-1]}, $spk;
+ $changed = 1;
+ }
+ }
+ }
+ }
+ # Now print out the files...
+ for($scpidx = 0; $scpidx < $numscps; $scpidx++) {
+ $scpfile = $OUTPUTS[$scpidx];
+ ($scpfile ne '-' ? open($f_fh, '>', $scpfile)
+ : open($f_fh, '>&', \*STDOUT)) ||
+ die "$0: Could not open scp file $scpfile for writing: $!\n";
+ $count = 0;
+ if(@{$scparray[$scpidx]} == 0) {
+ print STDERR "$0: eError: split_scp.pl producing empty .scp file " .
+ "$scpfile (too many splits and too few speakers?)\n";
+ $error = 1;
+ } else {
+ foreach $spk ( @{$scparray[$scpidx]} ) {
+ print $f_fh @{$spk_data{$spk}};
+ $count += $spk_count{$spk};
+ }
+ $count == $scpcount[$scpidx] || die "Count mismatch [code error]";
+ }
+ close($f_fh);
+ }
+} else {
+ # This block is the "normal" case where there is no --utt2spk
+ # option and we just break into equal size chunks.
+
+ open($i_fh, '<', $inscp) || die "$0: Error opening input scp file $inscp: $!\n";
+
+ $numscps = @OUTPUTS; # size of array.
+ @F = ();
+ while(<$i_fh>) {
+ push @F, $_;
+ }
+ $numlines = @F;
+ if($numlines == 0) {
+ print STDERR "$0: error: empty input scp file $inscp\n";
+ $error = 1;
+ }
+ $linesperscp = int( $numlines / $numscps); # the "whole part"..
+ $linesperscp >= 1 || die "$0: You are splitting into too many pieces! [reduce \$nj ($numscps) to be smaller than the number of lines ($numlines) in $inscp]\n";
+ $remainder = $numlines - ($linesperscp * $numscps);
+ ($remainder >= 0 && $remainder < $numlines) || die "bad remainder $remainder";
+ # [just doing int() rounds down].
+ $n = 0;
+ for($scpidx = 0; $scpidx < @OUTPUTS; $scpidx++) {
+ $scpfile = $OUTPUTS[$scpidx];
+ ($scpfile ne '-' ? open($o_fh, '>', $scpfile)
+ : open($o_fh, '>&', \*STDOUT)) ||
+ die "$0: Could not open scp file $scpfile for writing: $!\n";
+ for($k = 0; $k < $linesperscp + ($scpidx < $remainder ? 1 : 0); $k++) {
+ print $o_fh $F[$n++];
+ }
+ close($o_fh) || die "$0: Eror closing scp file $scpfile: $!\n";
+ }
+ $n == $numlines || die "$n != $numlines [code error]";
+}
+
+exit ($error);
diff --git a/funasr/runtime/python/utils/test_rtf.py b/funasr/runtime/python/utils/test_rtf.py
new file mode 100644
index 0000000..536ee2d
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.py
@@ -0,0 +1,55 @@
+
+import time
+import sys
+import librosa
+from funasr.utils.types import str2bool
+
+import argparse
+parser = argparse.ArgumentParser()
+parser.add_argument('--model_dir', type=str, required=True)
+parser.add_argument('--backend', type=str, default='onnx', help='["onnx", "torch"]')
+parser.add_argument('--wav_file', type=str, required=True, help='wav.scp-style list, one "<utt_id> <wav_path>" per line (tab or space separated)')
+parser.add_argument('--quantize', type=str2bool, default=False, help='quantized model')
+parser.add_argument('--intra_op_num_threads', type=int, default=1, help='intra_op_num_threads for onnx')
+args = parser.parse_args()
+
+# Import only the selected backend so the other runtime need not be importable.
+if args.backend == "onnx":
+    from funasr.runtime.python.onnxruntime.rapid_paraformer import Paraformer
+else:
+    from funasr.runtime.python.libtorch.torch_paraformer import Paraformer
+
+model = Paraformer(args.model_dir, batch_size=1, quantize=args.quantize, intra_op_num_threads=args.intra_op_num_threads)
+
+with open(args.wav_file, 'r') as wav_file_f:
+    wav_files = [line for line in wav_file_f if line.strip()]  # blank lines carry no path
+
+# warm-up
+total = 0.0
+num = 30
+wav_path = wav_files[0].split("\t")[1].strip() if "\t" in wav_files[0] else wav_files[0].split(" ")[1].strip()
+for i in range(num):
+    beg_time = time.time()
+    result = model(wav_path)
+    duration = time.time() - beg_time
+    total += duration
+    print(result)
+    print("num: {}, time, {}, avg: {}, rtf: {}".format(len(wav_path), duration, total/(i+1), (total/(i+1))/5.53))  # NOTE(review): 5.53 looks like a hard-coded clip length in seconds - confirm
+
+# infer time
+beg_time = time.time()
+for i, wav_path_i in enumerate(wav_files):
+    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+    result = model(wav_path)
+end_time = time.time()
+duration = (end_time-beg_time)*1000
+print("total_time_comput_ms: {}".format(int(duration)))
+
+duration_time = 0.0
+for i, wav_path_i in enumerate(wav_files):
+    wav_path = wav_path_i.split("\t")[1].strip() if "\t" in wav_path_i else wav_path_i.split(" ")[1].strip()
+    waveform, _ = librosa.load(wav_path, sr=16000)
+    duration_time += len(waveform)/16.0  # samples at 16 kHz -> milliseconds
+print("total_time_wav_ms: {}".format(int(duration_time)))
+
+print("total_rtf: {:.5}".format(duration/duration_time))
\ No newline at end of file
diff --git a/funasr/runtime/python/utils/test_rtf.sh b/funasr/runtime/python/utils/test_rtf.sh
new file mode 100644
index 0000000..dcce6c4
--- /dev/null
+++ b/funasr/runtime/python/utils/test_rtf.sh
@@ -0,0 +1,71 @@
+
+nj=32
+stage=0
+
+scp="/nfs/haoneng.lhn/funasr_data/aishell-1/data/test/wav.scp"
+export_root="/nfs/zhifu.gzf/export"
+split_scps_tool=split_scp.pl
+rtf_tool=test_rtf.py
+
+#:<<!
+model_name="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+backend="onnx" # "torch"
+quantize='true' # 'False'
+tag=${model_name}/${backend}_quantize_${quantize} # the '#!' below is the terminator of the disabled ':<<!' block
+#!
+
+logs_outputs_dir=${export_root}/logs/${tag}/split$nj
+mkdir -p ${logs_outputs_dir}
+echo ${logs_outputs_dir}
+
+
+if [ ${stage} -le 0 ];then
+
+ python -m funasr.export.export_model --model-name ${model_name} --export-dir ${export_root} --type ${backend} --quantize ${quantize} --audio_in ${scp}
+
+fi
+
+
+if [ ${stage} -le 1 ];then
+
+model_dir=${export_root}/${model_name}
+split_scps=""
+for JOB in $(seq ${nj}); do
+    split_scps="$split_scps $logs_outputs_dir/wav.$JOB.scp"
+done
+
+perl ${split_scps_tool} $scp ${split_scps}
+
+# one background worker per split, pinned to its own core; each writes a log that is parsed below
+for JOB in $(seq ${nj}); do
+    {
+        core_id=`expr $JOB - 1`
+        taskset -c ${core_id} python ${rtf_tool} --backend ${backend} --model_dir ${model_dir} --wav_file ${logs_outputs_dir}/wav.$JOB.scp --quantize ${quantize} &> ${logs_outputs_dir}/log.$JOB.txt
+    }&
+
+done
+wait
+
+
+rm -rf ${logs_outputs_dir}/total_time_comput.txt
+rm -rf ${logs_outputs_dir}/total_time_wav.txt
+rm -rf ${logs_outputs_dir}/total_rtf.txt
+for JOB in $(seq ${nj}); do
+    {
+        cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_comput" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_time_comput.txt
+        cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_time_wav" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_time_wav.txt
+        cat ${logs_outputs_dir}/log.$JOB.txt | grep "total_rtf" | awk -F ' ' '{print $2}' >> ${logs_outputs_dir}/total_rtf.txt
+    }
+
+done
+
+total_time_comput=`cat ${logs_outputs_dir}/total_time_comput.txt | awk 'BEGIN {max = 0} {if ($1+0>max+0) max=$1} END {print max}'` # largest per-job compute time
+total_time_wav=`cat ${logs_outputs_dir}/total_time_wav.txt | awk '{sum +=$1};END {print sum}'` # audio duration summed over all jobs
+rtf=`awk 'BEGIN{printf "%.5f\n",'$total_time_comput'/'$total_time_wav'}'`
+speed=`awk 'BEGIN{printf "%.2f\n",1/'$rtf'}'`
+
+echo "total_time_comput_ms: $total_time_comput"
+echo "total_time_wav: $total_time_wav"
+echo "total_rtf: $rtf, speed: $speed"
+
+fi
\ No newline at end of file
diff --git a/funasr/tasks/abs_task.py b/funasr/tasks/abs_task.py
index cc5b708..8080ef8 100644
--- a/funasr/tasks/abs_task.py
+++ b/funasr/tasks/abs_task.py
@@ -639,12 +639,12 @@
"and exclude_keys excludes keys of model states for the initialization."
"e.g.\n"
" # Load all parameters"
- " --init_param some/where/model.pth\n"
+ " --init_param some/where/model.pb\n"
" # Load only decoder parameters"
- " --init_param some/where/model.pth:decoder:decoder\n"
+ " --init_param some/where/model.pb:decoder:decoder\n"
" # Load only decoder parameters excluding decoder.embed"
- " --init_param some/where/model.pth:decoder:decoder:decoder.embed\n"
- " --init_param some/where/model.pth:decoder:decoder:decoder.embed\n",
+ " --init_param some/where/model.pb:decoder:decoder:decoder.embed\n"
+ " --init_param some/where/model.pb:decoder:decoder:decoder.embed\n",
)
group.add_argument(
"--ignore_init_mismatch",
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 36499a2..e151473 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -826,7 +826,7 @@
if "model.ckpt-" in model_name or ".bin" in model_name:
model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
'.pb')) if ".bin" in model_name else os.path.join(
- model_dir, "{}.pth".format(model_name))
+ model_dir, "{}.pb".format(model_name))
if os.path.exists(model_name_pth):
logging.info("model_file is load from pth: {}".format(model_name_pth))
model_dict = torch.load(model_name_pth, map_location=device)
@@ -1073,7 +1073,7 @@
if "model.ckpt-" in model_name or ".bin" in model_name:
model_name_pth = os.path.join(model_dir, model_name.replace('.bin',
'.pb')) if ".bin" in model_name else os.path.join(
- model_dir, "{}.pth".format(model_name))
+ model_dir, "{}.pb".format(model_name))
if os.path.exists(model_name_pth):
logging.info("model_file is load from pth: {}".format(model_name_pth))
model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/tasks/diar.py b/funasr/tasks/diar.py
index ae7ee9b..096a5c8 100644
--- a/funasr/tasks/diar.py
+++ b/funasr/tasks/diar.py
@@ -507,7 +507,7 @@
config_file: Union[Path, str] = None,
model_file: Union[Path, str] = None,
cmvn_file: Union[Path, str] = None,
- device: str = "cpu",
+ device: Union[str, torch.device] = "cpu",
):
"""Build model from the files.
@@ -553,7 +553,7 @@
if ".bin" in model_name:
model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
else:
- model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+ model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
if os.path.exists(model_name_pth):
logging.info("model_file is load from pth: {}".format(model_name_pth))
model_dict = torch.load(model_name_pth, map_location=device)
@@ -562,12 +562,27 @@
model.load_state_dict(model_dict)
else:
model_dict = torch.load(model_file, map_location=device)
+ model_dict = cls.fileter_model_dict(model_dict, model.state_dict())
model.load_state_dict(model_dict)
if model_name_pth is not None and not os.path.exists(model_name_pth):
torch.save(model_dict, model_name_pth)
logging.info("model_file is saved to pth: {}".format(model_name_pth))
return model, args
+
+    @classmethod
+    def fileter_model_dict(cls, src_dict: dict, dest_dict: dict):
+        """Keep only the checkpoint entries whose keys exist in the model.
+
+        Args:
+            src_dict: mapping loaded from the checkpoint.
+            dest_dict: mapping whose keys are the ones the model expects
+                (the caller passes ``model.state_dict()``).
+
+        Returns:
+            An ``OrderedDict`` holding, in ``src_dict`` order, the entries of
+            ``src_dict`` whose key is also present in ``dest_dict``.
+        """
+        from collections import OrderedDict
+
+        filtered = OrderedDict()
+        # Pass 1: copy matching entries, report checkpoint keys the model lacks.
+        for name in src_dict:
+            if name not in dest_dict:
+                logging.info("{} is no longer needed in this model.".format(name))
+                continue
+            filtered[name] = src_dict[name]
+        # Pass 2: report model keys that the checkpoint did not provide.
+        for name in dest_dict:
+            if name in filtered:
+                continue
+            logging.warning("{} is missed in checkpoint.".format(name))
+        return filtered
@classmethod
def convert_tf2torch(
@@ -750,47 +765,47 @@
cls, args: argparse.Namespace, train: bool
) -> Optional[Callable[[str, Dict[str, np.array]], Dict[str, np.ndarray]]]:
assert check_argument_types()
- if args.use_preprocessor:
- retval = CommonPreprocessor(
- train=train,
- token_type=args.token_type,
- token_list=args.token_list,
- bpemodel=None,
- non_linguistic_symbols=None,
- text_cleaner=None,
- g2p_type=None,
- split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
- seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
- # NOTE(kamo): Check attribute existence for backward compatibility
- rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
- rir_apply_prob=args.rir_apply_prob
- if hasattr(args, "rir_apply_prob")
- else 1.0,
- noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
- noise_apply_prob=args.noise_apply_prob
- if hasattr(args, "noise_apply_prob")
- else 1.0,
- noise_db_range=args.noise_db_range
- if hasattr(args, "noise_db_range")
- else "13_15",
- speech_volume_normalize=args.speech_volume_normalize
- if hasattr(args, "rir_scp")
- else None,
- )
- else:
- retval = None
- assert check_return_type(retval)
- return retval
+ # if args.use_preprocessor:
+ # retval = CommonPreprocessor(
+ # train=train,
+ # token_type=args.token_type,
+ # token_list=args.token_list,
+ # bpemodel=None,
+ # non_linguistic_symbols=None,
+ # text_cleaner=None,
+ # g2p_type=None,
+ # split_with_space=args.split_with_space if hasattr(args, "split_with_space") else False,
+ # seg_dict_file=args.seg_dict_file if hasattr(args, "seg_dict_file") else None,
+ # # NOTE(kamo): Check attribute existence for backward compatibility
+ # rir_scp=args.rir_scp if hasattr(args, "rir_scp") else None,
+ # rir_apply_prob=args.rir_apply_prob
+ # if hasattr(args, "rir_apply_prob")
+ # else 1.0,
+ # noise_scp=args.noise_scp if hasattr(args, "noise_scp") else None,
+ # noise_apply_prob=args.noise_apply_prob
+ # if hasattr(args, "noise_apply_prob")
+ # else 1.0,
+ # noise_db_range=args.noise_db_range
+ # if hasattr(args, "noise_db_range")
+ # else "13_15",
+ # speech_volume_normalize=args.speech_volume_normalize
+ # if hasattr(args, "rir_scp")
+ # else None,
+ # )
+ # else:
+ # retval = None
+ # assert check_return_type(retval)
+ return None
@classmethod
def required_data_names(
cls, train: bool = True, inference: bool = False
) -> Tuple[str, ...]:
if not inference:
- retval = ("speech", "profile", "binary_labels")
+ retval = ("speech", )
else:
# Recognition mode
- retval = ("speech")
+ retval = ("speech", )
return retval
@classmethod
@@ -823,7 +838,7 @@
# 2. Encoder
encoder_class = encoder_choices.get_class(args.encoder)
- encoder = encoder_class(input_size=input_size, **args.encoder_conf)
+ encoder = encoder_class(**args.encoder_conf)
# 3. EncoderDecoderAttractor
encoder_decoder_attractor_class = encoder_decoder_attractor_choices.get_class(args.encoder_decoder_attractor)
diff --git a/funasr/tasks/sv.py b/funasr/tasks/sv.py
index 1b08c4d..bef5dc5 100644
--- a/funasr/tasks/sv.py
+++ b/funasr/tasks/sv.py
@@ -501,7 +501,7 @@
if ".bin" in model_name:
model_name_pth = os.path.join(model_dir, model_name.replace('.bin', '.pb'))
else:
- model_name_pth = os.path.join(model_dir, "{}.pth".format(model_name))
+ model_name_pth = os.path.join(model_dir, "{}.pb".format(model_name))
if os.path.exists(model_name_pth):
logging.info("model_file is load from pth: {}".format(model_name_pth))
model_dict = torch.load(model_name_pth, map_location=device)
diff --git a/funasr/torch_utils/load_pretrained_model.py b/funasr/torch_utils/load_pretrained_model.py
index 8e3f05e..e9b18cd 100644
--- a/funasr/torch_utils/load_pretrained_model.py
+++ b/funasr/torch_utils/load_pretrained_model.py
@@ -52,13 +52,13 @@
init_param: <file_path>:<src_key>:<dst_key>:<exclude_Keys>
Examples:
- >>> load_pretrained_model("somewhere/model.pth", model)
- >>> load_pretrained_model("somewhere/model.pth:decoder:decoder", model)
- >>> load_pretrained_model("somewhere/model.pth:decoder:decoder:", model)
+ >>> load_pretrained_model("somewhere/model.pb", model)
+ >>> load_pretrained_model("somewhere/model.pb:decoder:decoder", model)
+ >>> load_pretrained_model("somewhere/model.pb:decoder:decoder:", model)
>>> load_pretrained_model(
- ... "somewhere/model.pth:decoder:decoder:decoder.embed", model
+ ... "somewhere/model.pb:decoder:decoder:decoder.embed", model
... )
- >>> load_pretrained_model("somewhere/decoder.pth::decoder", model)
+ >>> load_pretrained_model("somewhere/decoder.pb::decoder", model)
"""
sps = init_param.split(":", 4)
if len(sps) == 4:
diff --git a/funasr/train/trainer.py b/funasr/train/trainer.py
index 50bce47..efe2009 100644
--- a/funasr/train/trainer.py
+++ b/funasr/train/trainer.py
@@ -205,9 +205,9 @@
else:
scaler = None
- if trainer_options.resume and (output_dir / "checkpoint.pth").exists():
+ if trainer_options.resume and (output_dir / "checkpoint.pb").exists():
cls.resume(
- checkpoint=output_dir / "checkpoint.pth",
+ checkpoint=output_dir / "checkpoint.pb",
model=model,
optimizers=optimizers,
schedulers=schedulers,
@@ -361,7 +361,7 @@
},
buffer,
)
- trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pth"), buffer.getvalue())
+ trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir, "checkpoint.pb"), buffer.getvalue())
else:
torch.save(
{
@@ -374,7 +374,7 @@
],
"scaler": scaler.state_dict() if scaler is not None else None,
},
- output_dir / "checkpoint.pth",
+ output_dir / "checkpoint.pb",
)
# 5. Save and log the model and update the link to the best model
@@ -382,22 +382,22 @@
buffer = BytesIO()
torch.save(model.state_dict(), buffer)
trainer_options.oss_bucket.put_object(os.path.join(trainer_options.output_dir,
- f"{iepoch}epoch.pth"),buffer.getvalue())
+ f"{iepoch}epoch.pb"),buffer.getvalue())
else:
- torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pth")
+ torch.save(model.state_dict(), output_dir / f"{iepoch}epoch.pb")
- # Creates a sym link latest.pth -> {iepoch}epoch.pth
+ # Creates a sym link latest.pb -> {iepoch}epoch.pb
if trainer_options.use_pai:
- p = os.path.join(trainer_options.output_dir, "latest.pth")
+ p = os.path.join(trainer_options.output_dir, "latest.pb")
if trainer_options.oss_bucket.object_exists(p):
trainer_options.oss_bucket.delete_object(p)
trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
- os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"), p)
+ os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"), p)
else:
- p = output_dir / "latest.pth"
+ p = output_dir / "latest.pb"
if p.is_symlink() or p.exists():
p.unlink()
- p.symlink_to(f"{iepoch}epoch.pth")
+ p.symlink_to(f"{iepoch}epoch.pb")
_improved = []
for _phase, k, _mode in trainer_options.best_model_criterion:
@@ -407,16 +407,16 @@
# Creates sym links if it's the best result
if best_epoch == iepoch:
if trainer_options.use_pai:
- p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pth")
+ p = os.path.join(trainer_options.output_dir, f"{_phase}.{k}.best.pb")
if trainer_options.oss_bucket.object_exists(p):
trainer_options.oss_bucket.delete_object(p)
trainer_options.oss_bucket.copy_object(trainer_options.oss_bucket.bucket_name,
- os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pth"),p)
+ os.path.join(trainer_options.output_dir, f"{iepoch}epoch.pb"),p)
else:
- p = output_dir / f"{_phase}.{k}.best.pth"
+ p = output_dir / f"{_phase}.{k}.best.pb"
if p.is_symlink() or p.exists():
p.unlink()
- p.symlink_to(f"{iepoch}epoch.pth")
+ p.symlink_to(f"{iepoch}epoch.pb")
_improved.append(f"{_phase}.{k}")
if len(_improved) == 0:
logging.info("There are no improvements in this epoch")
@@ -438,7 +438,7 @@
type="model",
metadata={"improved": _improved},
)
- artifact.add_file(str(output_dir / f"{iepoch}epoch.pth"))
+ artifact.add_file(str(output_dir / f"{iepoch}epoch.pb"))
aliases = [
f"epoch-{iepoch}",
"best" if best_epoch == iepoch else "",
@@ -473,12 +473,12 @@
for e in range(1, iepoch):
if trainer_options.use_pai:
- p = os.path.join(trainer_options.output_dir, f"{e}epoch.pth")
+ p = os.path.join(trainer_options.output_dir, f"{e}epoch.pb")
if trainer_options.oss_bucket.object_exists(p) and e not in nbests:
trainer_options.oss_bucket.delete_object(p)
_removed.append(str(p))
else:
- p = output_dir / f"{e}epoch.pth"
+ p = output_dir / f"{e}epoch.pb"
if p.exists() and e not in nbests:
p.unlink()
_removed.append(str(p))
diff --git a/funasr/utils/asr_utils.py b/funasr/utils/asr_utils.py
index 0f0e4c3..4067b04 100644
--- a/funasr/utils/asr_utils.py
+++ b/funasr/utils/asr_utils.py
@@ -58,14 +58,15 @@
if r_recog_type is None and audio_in is not None:
# audio_in is wav, recog_type is wav_file
if os.path.isfile(audio_in):
- audio_type = os.path.basename(audio_in).split(".")[-1].lower()
- if audio_type in SUPPORT_AUDIO_TYPE_SETS:
- r_recog_type = 'wav'
- r_audio_format = 'wav'
- elif audio_type == "scp":
+ audio_type = os.path.basename(audio_in).lower()
+ for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+ if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+ r_recog_type = 'wav'
+ r_audio_format = 'wav'
+ if audio_type.rfind(".scp") >= 0:
r_recog_type = 'wav'
r_audio_format = 'scp'
- else:
+ if r_recog_type is None:
raise NotImplementedError(
f'Not supported audio type: {audio_type}')
@@ -128,13 +129,15 @@
def get_sr_from_wav(fname: str):
fs = None
if os.path.isfile(fname):
- audio_type = os.path.basename(fname).split(".")[-1].lower()
- if audio_type in SUPPORT_AUDIO_TYPE_SETS:
- if audio_type == "pcm":
- fs = None
- else:
- audio, fs = torchaudio.load(fname)
- elif audio_type == "scp":
+ audio_type = os.path.basename(fname).lower()
+ for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+ if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+ if support_audio_type == "pcm":
+ fs = None
+ else:
+ audio, fs = torchaudio.load(fname)
+ break
+ if audio_type.rfind(".scp") >= 0:
with open(fname, encoding="utf-8") as f:
for line in f:
wav_path = line.split()[1]
@@ -147,9 +150,7 @@
for file in dir_files:
file_path = os.path.join(fname, file)
if os.path.isfile(file_path):
- audio_type = os.path.basename(file_path).split(".")[-1].lower()
- if audio_type in SUPPORT_AUDIO_TYPE_SETS:
- fs = get_sr_from_wav(file_path)
+ fs = get_sr_from_wav(file_path)
elif os.path.isdir(file_path):
fs = get_sr_from_wav(file_path)
@@ -165,12 +166,12 @@
file_path = os.path.join(dir_path, file)
if os.path.isfile(file_path):
if ends == ".wav" or ends == ".WAV":
- audio_type = os.path.basename(file_path).split(".")[-1].lower()
- if audio_type in SUPPORT_AUDIO_TYPE_SETS:
- return True
- else:
- raise NotImplementedError(
- f'Not supported audio type: {audio_type}')
+ audio_type = os.path.basename(file_path).lower()
+ for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+ if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+ return True
+ raise NotImplementedError(
+ f'Not supported audio type: {audio_type}')
elif file_path.endswith(ends):
return True
elif os.path.isdir(file_path):
@@ -185,9 +186,10 @@
for file in dir_files:
file_path = os.path.join(dir_path, file)
if os.path.isfile(file_path):
- audio_type = os.path.basename(file_path).split(".")[-1].lower()
- if audio_type in SUPPORT_AUDIO_TYPE_SETS:
- wav_list.append(file_path)
+ audio_type = os.path.basename(file_path).lower()
+ for support_audio_type in SUPPORT_AUDIO_TYPE_SETS:
+ if audio_type.rfind(".{}".format(support_audio_type)) >= 0:
+ wav_list.append(file_path)
elif os.path.isdir(file_path):
recursion_dir_all_wav(wav_list, file_path)
diff --git a/funasr/utils/postprocess_utils.py b/funasr/utils/postprocess_utils.py
index 2475548..40756d8 100644
--- a/funasr/utils/postprocess_utils.py
+++ b/funasr/utils/postprocess_utils.py
@@ -106,17 +106,18 @@
if num in abbr_begin:
if time_stamp is not None:
begin = time_stamp[ts_nums[num]][0]
- word_lists.append(words[num].upper())
+ abbr_word = words[num].upper()
num += 1
while num < words_size:
if num in abbr_end:
- word_lists.append(words[num].upper())
+ abbr_word += words[num].upper()
last_num = num
break
else:
if words[num].encode('utf-8').isalpha():
- word_lists.append(words[num].upper())
+ abbr_word += words[num].upper()
num += 1
+ word_lists.append(abbr_word)
if time_stamp is not None:
end = time_stamp[ts_nums[num]][1]
ts_lists.append([begin, end])
diff --git a/funasr/utils/timestamp_tools.py b/funasr/utils/timestamp_tools.py
index f5a238e..423110c 100644
--- a/funasr/utils/timestamp_tools.py
+++ b/funasr/utils/timestamp_tools.py
@@ -1,6 +1,10 @@
import torch
import copy
+import codecs
import logging
+import edit_distance
+import argparse
+import pdb
import numpy as np
from typing import Any, List, Tuple, Union
@@ -9,7 +13,8 @@
us_peaks,
char_list,
vad_offset=0.0,
- force_time_shift=-1.5
+ force_time_shift=-1.5,
+ sil_in_str=True
):
if not len(char_list):
return []
@@ -62,6 +67,8 @@
timestamp_list[i][1] = timestamp_list[i][1] + vad_offset / 1000.0
res_txt = ""
for char, timestamp in zip(new_char_list, timestamp_list):
+ #if char != '<sil>':
+ if not sil_in_str and char == '<sil>': continue
res_txt += "{} {} {};".format(char, str(timestamp[0]+0.0005)[:5], str(timestamp[1]+0.0005)[:5])
res = []
for char, timestamp in zip(new_char_list, timestamp_list):
@@ -121,4 +128,181 @@
return res
+class AverageShiftCalculator():
+    """Measure how far two sets of token timestamps drift from each other.
+
+    Calling an instance with two timestamp files reads both, keeps the
+    utterances present in both, aligns their token sequences with
+    ``edit_distance.SequenceMatcher`` and logs the average absolute shift
+    (start shift + end shift) per token that is identical in both files.
+    After a run, ``max_shift`` and ``max_shift_uttid`` hold the largest
+    per-utterance average shift and the utterance it belongs to.
+    """
+
+    def __init__(self):
+        logging.warning("Calculating average shift.")
+
+    def __call__(self, file1, file2):
+        """Compute and log the average shift between ``file1`` and ``file2``."""
+        uttid_list1, ts_dict1 = self.read_timestamps(file1)
+        uttid_list2, ts_dict2 = self.read_timestamps(file2)
+        uttid_intersection = self._intersection(uttid_list1, uttid_list2)
+        res = self.as_cal(uttid_intersection, ts_dict1, ts_dict2)
+        logging.warning("Average shift of {} and {}: {}.".format(file1, file2, str(res)[:8]))
+        logging.warning("Following timestamp pair differs most: {}, detail:{}".format(self.max_shift, self.max_shift_uttid))
+
+    def _intersection(self, list1, list2):
+        """Return the utterance ids found in both lists.
+
+        A set is returned when both lists hold the same ids, otherwise a list
+        of the common ids; callers only iterate over the result.
+        """
+        set1 = set(list1)
+        set2 = set(list2)
+        if set1 == set2:
+            logging.warning("Uttid same checked.")
+            return set1
+        itsc = list(set1 & set2)
+        logging.warning("Uttid differs: file1 {}, file2 {}, lines same {}.".format(len(list1), len(list2), len(itsc)))
+        return itsc
+
+    def read_timestamps(self, file):
+        """Parse a timestamp file.
+
+        Each non-blank line is ``<uttid> <char> <start> <end>; <char> <start> <end>; ...``.
+
+        Returns:
+            ``(uttid_list, ts_dict)`` where ``uttid_list`` keeps file order and
+            ``ts_dict[uttid]`` is ``(comma_joined_chars, [(start, end), ...])``.
+        """
+        uttid_list = []
+        ts_dict = {}
+        # Explicit UTF-8 so parsing does not depend on the process locale.
+        with open(file, 'r', encoding='utf-8') as fin:
+            for line in fin:
+                fields = line.split()
+                if not fields:
+                    # A blank line carries no utterance id; skip it.
+                    continue
+                uttid = fields[0]
+                uttid_list.append(uttid)
+                body = " ".join(fields[1:])
+                chars = []
+                ts_list = []
+                for item in body.split(';'):
+                    if not len(item):
+                        continue
+                    char, start, end = item.lstrip(" ").split(' ')
+                    chars.append(char)
+                    ts_list.append((float(start), float(end)))
+                ts_dict[uttid] = (",".join(chars), ts_list)
+        logging.warning("File {} read done.".format(file))
+        return uttid_list, ts_dict
+
+    def _shift(self, filtered_timestamp_list1, filtered_timestamp_list2):
+        """Return ``(total_shift, num_tokens)`` for two aligned timestamp lists.
+
+        The shift of one token is ``|start1 - start2| + |end1 - end2|``.
+        """
+        shift_time = 0
+        for fts1, fts2 in zip(filtered_timestamp_list1, filtered_timestamp_list2):
+            shift_time += abs(fts1[0] - fts2[0]) + abs(fts1[1] - fts2[1])
+        num_tokens = len(filtered_timestamp_list1)
+        return shift_time, num_tokens
+
+    def as_cal(self, uttid_list, ts_dict1, ts_dict2):
+        """Calculate the average shift between two timestamp dictionaries.
+
+        When the token sequences of an utterance differ, they are aligned by
+        edit distance and only tokens tagged ``equal`` contribute.
+
+        Returns:
+            Accumulated shift divided by the accumulated token count, or NaN
+            when no token could be compared at all.
+        """
+        self._accumlated_shift = 0
+        self._accumlated_tokens = 0
+        self.max_shift = 0
+        self.max_shift_uttid = None
+        for uttid in uttid_list:
+            (t1, ts1) = ts_dict1[uttid]
+            (t2, ts2) = ts_dict2[uttid]
+            tokens1 = t1.split(',')
+            tokens2 = t2.split(',')
+            # keep1 / keep2 hold one flag per token of sequence 1 / 2:
+            # 1 when the token is matched ("equal"), 0 otherwise.
+            keep1, keep2 = [], []
+            sm = edit_distance.SequenceMatcher(tokens1, tokens2)
+            for opcode in sm.get_opcodes():
+                tag = opcode[0]
+                if tag == "equal":
+                    keep1.append(1)
+                    keep2.append(1)
+                elif tag == "replace":
+                    keep1.append(0)
+                    keep2.append(0)
+                elif tag == "insert":
+                    # Only sequence 2 advances on an insertion.
+                    keep2.append(0)
+                elif tag == "delete":
+                    # Only sequence 1 advances on a deletion.
+                    keep1.append(0)
+            fts2 = [ts for flag, ts, _ in zip(keep2, ts2, tokens2) if flag]
+            fts1 = [ts for flag, ts, _ in zip(keep1, ts1, tokens1) if flag]
+            if len(fts1) == len(fts2):
+                shift_time, num_tokens = self._shift(fts1, fts2)
+                self._accumlated_shift += shift_time
+                self._accumlated_tokens += num_tokens
+                # An utterance without any matched token has no average shift.
+                if num_tokens and shift_time / num_tokens > self.max_shift:
+                    self.max_shift = shift_time / num_tokens
+                    self.max_shift_uttid = uttid
+            else:
+                logging.warning("length mismatch")
+        if not self._accumlated_tokens:
+            logging.warning("No comparable tokens found, average shift is undefined.")
+            return float("nan")
+        return self._accumlated_shift / self._accumlated_tokens
+
+
+def convert_external_alphas(alphas_file, text_file, output_file):
+    """Turn externally produced CIF alphas into timestamp strings.
+
+    ``alphas_file`` and ``text_file`` are read line by line in lockstep; each
+    line starts with an utterance id, followed by the alpha values or by the
+    whitespace-separated tokens respectively. For every pair one line
+    ``<uttid> <timestamp string>`` is written to ``output_file``.
+
+    Raises:
+        AssertionError: if the utterance ids of a line pair differ.
+    """
+    from funasr.models.predictor.cif import cif_wo_hidden
+    # Explicit UTF-8 so transcripts decode the same way on every locale.
+    with open(alphas_file, 'r', encoding='utf-8') as f1, \
+            open(text_file, 'r', encoding='utf-8') as f2, \
+            open(output_file, 'w', encoding='utf-8') as f3:
+        # Iterate the files lazily instead of loading both with readlines().
+        for line1, line2 in zip(f1, f2):
+            fields1 = line1.split()
+            fields2 = line2.split()
+            assert fields1[0] == fields2[0]
+            uttid = fields1[0]
+            alphas = [float(i) for i in fields1[1:]]
+            new_alphas = np.array(remove_chunk_padding(alphas))
+            new_alphas[-1] += 1e-4
+            # Already a list of tokens; no further splitting is needed.
+            text = fields2[1:]
+            if len(text) + 1 != int(new_alphas.sum()):
+                # force resize so the alphas sum to the number of tokens plus one
+                new_alphas *= (len(text) + 1) / int(new_alphas.sum())
+            peaks = cif_wo_hidden(torch.Tensor(new_alphas).unsqueeze(0), 1.0-1e-4)
+            res_str, _ = ts_prediction_lfr6_standard(new_alphas, peaks[0], text,
+                                                     force_time_shift=-7.0,
+                                                     sil_in_str=False)
+            f3.write("{} {}\n".format(uttid, res_str))
+
+
+def remove_chunk_padding(alphas):
+    """Strip the padding frames from alphas of a chunk paraformer (GPU).
+
+    The first ``START_ZERO`` frames are dropped, then blocks of
+    ``REAL_FRAMES`` frames are kept, each followed by ``MID_ZERO`` skipped
+    frames. The loop stops as soon as fewer than ``REAL_FRAMES`` frames
+    remain; that remainder is not appended.
+    """
+    START_ZERO = 45
+    MID_ZERO = 75
+    REAL_FRAMES = 360  # for chunk based encoder 10-120-10 and fsmn padding 5
+    kept = []
+    offset = START_ZERO  # skip the padding at the beginning
+    while True:
+        kept = kept + alphas[offset:offset + REAL_FRAMES]
+        offset += REAL_FRAMES + MID_ZERO
+        if len(alphas[offset:]) < REAL_FRAMES:
+            break
+    return kept
+
+# Values accepted by the --mode command line option.
+SUPPORTED_MODES = ['cal_aas', 'read_ext_alphas']
+
+
+def main(args):
+    """Run the tool selected by ``args.mode``.
+
+    ``cal_aas`` compares the timestamp files ``args.input`` and
+    ``args.input2``; ``read_ext_alphas`` converts the alphas in ``args.input``
+    and the text in ``args.input2`` into ``args.output``. Any other mode is
+    logged as an error.
+    """
+    if args.mode == 'cal_aas':
+        calculator = AverageShiftCalculator()
+        calculator(args.input, args.input2)
+        return
+    if args.mode == 'read_ext_alphas':
+        convert_external_alphas(args.input, args.input2, args.output)
+        return
+    logging.error("Mode {} not in SUPPORTED_MODES: {}.".format(args.mode, SUPPORTED_MODES))
+
+
+if __name__ == '__main__':
+    # Command line entry point: build the parser, parse, and hand over to main().
+    parser = argparse.ArgumentParser(description='timestamp tools')
+    parser.add_argument('--mode', default=None, type=str, choices=SUPPORTED_MODES,
+                        help='timestamp related toolbox')
+    parser.add_argument('--input', default=None, type=str,
+                        help='input file path')
+    parser.add_argument('--output', default=None, type=str,
+                        help='output file name')
+    parser.add_argument('--input2', default=None, type=str,
+                        help='input2 file path')
+    # NOTE: --kaldi-ts-type is parsed here, but main() does not read it.
+    parser.add_argument('--kaldi-ts-type', default='v2', type=str, choices=['v0', 'v1', 'v2'],
+                        help='kaldi timestamp to write')
+    args = parser.parse_args()
+    main(args)
diff --git a/setup.py b/setup.py
index 087d90d..6bb3ac3 100644
--- a/setup.py
+++ b/setup.py
@@ -13,11 +13,11 @@
"install": [
"setuptools>=38.5.1",
# "configargparse>=1.2.1",
- "typeguard>=2.7.0",
+ "typeguard==2.13.3",
"humanfriendly",
"scipy>=1.4.1",
# "filelock",
- "librosa>=0.8.0",
+ "librosa==0.8.1",
"jamo==0.4.1", # For kss
"PyYAML>=5.1.2",
"soundfile>=0.10.2",
@@ -41,6 +41,8 @@
# PAI
"oss2",
"kaldi-native-fbank",
+ # timestamp
+ "edit-distance"
],
# train: The modules invoked when training only.
"train": [
diff --git a/tests/test_asr_inference_pipeline.py b/tests/test_asr_inference_pipeline.py
index 70dbe89..b3c5a24 100644
--- a/tests/test_asr_inference_pipeline.py
+++ b/tests/test_asr_inference_pipeline.py
@@ -451,8 +451,8 @@
def test_uniasr_2pass_zhcn_16k_common_vocab8358_offline(self):
inference_pipeline = pipeline(
- task=Tasks.,
- model='damo/speech_UniASauto_speech_recognitionR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
+ task=Tasks.auto_speech_recognition,
+ model='damo/speech_UniASR_asr_2pass-zh-cn-16k-common-vocab8358-tensorflow1-offline')
rec_result = inference_pipeline(
audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav',
param_dict={"decoding_model": "offline"})
diff --git a/tests/test_sv_inference_pipeline.py b/tests/test_sv_inference_pipeline.py
index 265f839..54ab564 100644
--- a/tests/test_sv_inference_pipeline.py
+++ b/tests/test_sv_inference_pipeline.py
@@ -1,5 +1,6 @@
import unittest
+import numpy as np
from modelscope.pipelines import pipeline
from modelscope.utils.constant import Tasks
from modelscope.utils.logger import get_logger
--
Gitblit v1.9.1