From 51ea14f910d76c5c1a581098d3808a78fdb9fcd1 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 31 Jan 2023 17:33:56 +0800
Subject: [PATCH] fix paraformer bug: return an empty result when no token is predicted, and use "sil" as the placeholder token
---
funasr/bin/asr_inference_paraformer.py | 6 ++++--
funasr/bin/asr_inference_uniasr.py | 4 ++--
funasr/bin/asr_inference.py | 4 ++--
funasr/bin/asr_inference_paraformer_timestamp.py | 2 +-
funasr/bin/asr_inference_paraformer_vad_punc.py | 4 +++-
5 files changed, 12 insertions(+), 8 deletions(-)
diff --git a/funasr/bin/asr_inference.py b/funasr/bin/asr_inference.py
index 985ff50..4c6d2a4 100644
--- a/funasr/bin/asr_inference.py
+++ b/funasr/bin/asr_inference.py
@@ -368,7 +368,7 @@
# except TooShortUttError as e:
# logging.warning(f"Utterance {keys} {e}")
# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-# results = [[" ", ["<space>"], [2], hyp]] * nbest
+# results = [[" ", ["sil"], [2], hyp]] * nbest
#
# # Only supporting batch_size==1
# key = keys[0]
@@ -575,7 +575,7 @@
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], hyp]] * nbest
+ results = [[" ", ["sil"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 01237f7..cb140eb 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -227,6 +227,8 @@
pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
predictor_outs[2], predictor_outs[3]
pre_token_length = pre_token_length.round().long()
+ if torch.max(pre_token_length) < 1:
+ return []
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -394,7 +396,7 @@
# results = speech2text(**batch)
# if len(results) < 1:
# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-# results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+# results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
# time_end = time.time()
# forward_time = time_end - time_beg
# lfr_factor = results[0][-1]
@@ -621,7 +623,7 @@
results = speech2text(**batch)
if len(results) < 1:
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+ results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
time_end = time.time()
forward_time = time_end - time_beg
lfr_factor = results[0][-1]
diff --git a/funasr/bin/asr_inference_paraformer_timestamp.py b/funasr/bin/asr_inference_paraformer_timestamp.py
index 9560cb0..7e2e414 100644
--- a/funasr/bin/asr_inference_paraformer_timestamp.py
+++ b/funasr/bin/asr_inference_paraformer_timestamp.py
@@ -410,7 +410,7 @@
results = speech2text(**batch)
if len(results) < 1:
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
+ results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
time_end = time.time()
forward_time = time_end - time_beg
lfr_factor = results[0][-1]
diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 7a539e4..619e6fd 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -235,6 +235,8 @@
predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
+ if torch.max(pre_token_length) < 1:
+ return []
pre_token_length = pre_token_length.round().long()
decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -602,7 +604,7 @@
results = speech2text(**batch)
if len(results) < 1:
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
+ results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest
time_end = time.time()
forward_time = time_end - time_beg
lfr_factor = results[0][-1]
diff --git a/funasr/bin/asr_inference_uniasr.py b/funasr/bin/asr_inference_uniasr.py
index d386ff1..4ecb1cc 100644
--- a/funasr/bin/asr_inference_uniasr.py
+++ b/funasr/bin/asr_inference_uniasr.py
@@ -391,7 +391,7 @@
# except TooShortUttError as e:
# logging.warning(f"Utterance {keys} {e}")
# hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-# results = [[" ", ["<space>"], [2], hyp]] * nbest
+# results = [[" ", ["sil"], [2], hyp]] * nbest
#
# # Only supporting batch_size==1
# key = keys[0]
@@ -616,7 +616,7 @@
except TooShortUttError as e:
logging.warning(f"Utterance {keys} {e}")
hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
- results = [[" ", ["<space>"], [2], hyp]] * nbest
+ results = [[" ", ["sil"], [2], hyp]] * nbest
# Only supporting batch_size==1
key = keys[0]
--
Gitblit v1.9.1