From fde48a865253b21f874dedf384c1bd8b59481112 Mon Sep 17 00:00:00 2001
From: 北念 <lzr265946@alibaba-inc.com>
Date: Tue, 17 Oct 2023 14:06:47 +0800
Subject: [PATCH] update egs_modelscope paraformer-large-en
---
funasr/bin/asr_infer.py | 8 ++++++--
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py | 2 +-
egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh | 2 +-
funasr/bin/asr_inference_launch.py | 4 ++++
4 files changed, 12 insertions(+), 4 deletions(-)
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
index f54399a..6f810ff 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.py
@@ -16,7 +16,7 @@
if __name__ == "__main__":
parser = argparse.ArgumentParser()
- parser.add_argument('--model', type=str, default="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch")
+ parser.add_argument('--model', type=str, default="damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020")
parser.add_argument('--audio_in', type=str, default="./data/test/wav.scp")
parser.add_argument('--output_dir', type=str, default="./results/")
parser.add_argument('--decoding_mode', type=str, default="normal")
diff --git a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh
index ef49d7a..36f40b6 100644
--- a/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh
+++ b/egs_modelscope/asr_vad_punc/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020/infer.sh
@@ -6,7 +6,7 @@
stage=1
stop_stage=2
-model="damo/speech_paraformer-large_asr_nat-zh-cn-16k-common-vocab8404-pytorch"
+model="damo/speech_paraformer-large-vad-punc_asr_nat-en-16k-common-vocab10020"
data_dir="./data/test"
output_dir="./results"
batch_size=64
diff --git a/funasr/bin/asr_infer.py b/funasr/bin/asr_infer.py
index 43da8bf..8073213 100644
--- a/funasr/bin/asr_infer.py
+++ b/funasr/bin/asr_infer.py
@@ -1918,6 +1918,8 @@
nbest: int = 1,
streaming: bool = False,
frontend_conf: dict = None,
+ language: str = None,
+ task: str = "transcribe",
**kwargs,
):
@@ -1960,6 +1962,8 @@
self.device = device
self.dtype = dtype
self.frontend = frontend
+ self.language = language
+ self.task = task
@torch.no_grad()
def __call__(
@@ -1986,10 +1990,10 @@
mel = log_mel_spectrogram(speech).to(self.device)
if self.asr_model.is_multilingual:
- options = DecodingOptions(fp16=False)
+ options = DecodingOptions(fp16=False, language=self.language, task=self.task)
asr_res = decode(self.asr_model, mel, options)
text = asr_res.text
- language = asr_res.language
+ language = self.language if self.language else asr_res.language
else:
asr_res = transcribe(self.asr_model, speech, fp16=False)
text = asr_res["text"]
diff --git a/funasr/bin/asr_inference_launch.py b/funasr/bin/asr_inference_launch.py
index e3de05b..1040f6f 100644
--- a/funasr/bin/asr_inference_launch.py
+++ b/funasr/bin/asr_inference_launch.py
@@ -2056,6 +2056,8 @@
ncpu = kwargs.get("ncpu", 1)
torch.set_num_threads(ncpu)
+ language = param_dict.get("language", None)
+ task = param_dict.get("task", "transcribe")
if batch_size > 1:
raise NotImplementedError("batch decoding is not implemented")
if word_lm_train_config is not None:
@@ -2099,6 +2101,8 @@
penalty=penalty,
nbest=nbest,
streaming=streaming,
+ language=language,
+ task=task,
)
logging.info("speech2text_kwargs: {}".format(speech2text_kwargs))
speech2text = Speech2TextWhisper(**speech2text_kwargs)
--
Gitblit v1.9.1