From 475064f91451aad868ca33f36c7d52f41b0b8b40 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Thu, 11 May 2023 19:35:10 +0800
Subject: [PATCH] paraformer vad punc
---
egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py | 4 ++--
funasr/bin/asr_inference_paraformer.py | 24 ++++++++----------------
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py | 2 +-
funasr/bin/asr_inference_launch.py | 4 ++--
4 files changed, 13 insertions(+), 21 deletions(-)
diff --git a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
index 1fa6b27..a8a670a 100644
--- a/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
+++ b/egs_modelscope/asr/paraformer/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -4,8 +4,8 @@
inference_pipeline = pipeline(
task=Tasks.auto_speech_recognition,
model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch',
- vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
+ batch_size=64,
)
-audio_in = 'https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/vad_example.wav'
+audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav'
rec_result = inference_pipeline(audio_in=audio_in)
print(rec_result)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
index 3cace60..9b474dd 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-zh-cn-16k-common-vocab8404-pytorch/demo.py
@@ -10,7 +10,7 @@
vad_model='damo/speech_fsmn_vad_zh-cn-16k-common-pytorch',
punc_model='damo/punc_ct-transformer_zh-cn-common-vocab272727-pytorch',
output_dir=output_dir,
- batch_size=8,
+ batch_size=64,
)
rec_result = inference_pipeline(audio_in=audio_in)
print(rec_result)
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index db91ed2..7b04a9e 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -291,11 +291,11 @@
elif mode == "paraformer":
from funasr.bin.asr_inference_paraformer import inference_modelscope
inference_pipeline = inference_modelscope(**kwargs)
- return inference_pipeline(kwargs["data_path_and_name_and_type"])
+ return inference_pipeline(kwargs["data_path_and_name_and_type"], hotword=kwargs.get("hotword", None))
elif mode.startswith("paraformer_vad"):
from funasr.bin.asr_inference_paraformer import inference_modelscope_vad_punc
inference_pipeline = inference_modelscope_vad_punc(**kwargs)
- return inference_pipeline(kwargs["data_path_and_name_and_type"])
+ return inference_pipeline(kwargs["data_path_and_name_and_type"], hotword=kwargs.get("hotword", None))
elif mode == "mfcca":
from funasr.bin.asr_inference_mfcca import inference_modelscope
return inference_modelscope(**kwargs)
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 2a33bdf..ecdb62a 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -48,6 +48,8 @@
from funasr.bin.vad_inference import Speech2VadSegment
from funasr.bin.punctuation_infer import Text2Punc
from funasr.utils.vad_utils import slice_padding_fbank
+from funasr.tasks.vad import VADTask
+from funasr.utils.timestamp_tools import time_stamp_sentence, ts_prediction_lfr6_standard
class Speech2Text:
"""Speech2Text class
@@ -293,15 +295,14 @@
text = self.tokenizer.tokens2text(token)
else:
text = None
-
+ timestamp = []
if isinstance(self.asr_model, BiCifParaformer):
_, timestamp = ts_prediction_lfr6_standard(us_alphas[i][:enc_len[i]*3],
us_peaks[i][:enc_len[i]*3],
copy.copy(token),
vad_offset=begin_time)
- results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor))
- else:
- results.append((text, token, token_int, hyp, [], enc_len_batch_total, lfr_factor))
+ results.append((text, token, token_int, hyp, timestamp, enc_len_batch_total, lfr_factor))
+
# assert check_return_type(results)
return results
@@ -471,7 +472,7 @@
hotword_list_or_file = None
if param_dict is not None:
hotword_list_or_file = param_dict.get('hotword')
- if 'hotword' in kwargs:
+ if 'hotword' in kwargs and kwargs['hotword'] is not None:
hotword_list_or_file = kwargs['hotword']
if hotword_list_or_file is not None or 'hotword' in kwargs:
speech2text.hotword_list = speech2text.generate_hotwords_list(hotword_list_or_file)
@@ -1018,18 +1019,9 @@
kwargs = vars(args)
kwargs.pop("config", None)
kwargs['param_dict'] = param_dict
- inference(**kwargs)
+ inference_pipeline = inference_modelscope(**kwargs)
+ return inference_pipeline(kwargs["data_path_and_name_and_type"], param_dict=param_dict)
if __name__ == "__main__":
main()
-
- # from modelscope.pipelines import pipeline
- # from modelscope.utils.constant import Tasks
- #
- # inference_16k_pipline = pipeline(
- # task=Tasks.auto_speech_recognition,
- # model='damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch')
- #
- # rec_result = inference_16k_pipline(audio_in='https://isv-data.oss-cn-hangzhou.aliyuncs.com/ics/MaaS/ASR/test_audio/asr_example_zh.wav')
- # print(rec_result)
--
Gitblit v1.9.1