From 51ea14f910d76c5c1a581098d3808a78fdb9fcd1 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 31 Jan 2023 17:33:56 +0800
Subject: [PATCH] Fix Paraformer bug: output silence when the predictor yields no tokens

---
 funasr/bin/asr_inference_paraformer_vad_punc.py |    4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)

diff --git a/funasr/bin/asr_inference_paraformer_vad_punc.py b/funasr/bin/asr_inference_paraformer_vad_punc.py
index 7a539e4..619e6fd 100644
--- a/funasr/bin/asr_inference_paraformer_vad_punc.py
+++ b/funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -235,6 +235,8 @@
 
         predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
         pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
+        if torch.max(pre_token_length) < 1:
+            return []
         pre_token_length = pre_token_length.round().long()
         decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
         decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -602,7 +604,7 @@
                     results = speech2text(**batch)
                     if len(results) < 1:
                         hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
-                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
+                        results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest
                     time_end = time.time()
                     forward_time = time_end - time_beg
                     lfr_factor = results[0][-1]

--
Gitblit v1.9.1