游雁
2023-01-31 51ea14f910d76c5c1a581098d3808a78fdb9fcd1
fix paraformer bug: output silence when the model predicts no token
5个文件已修改
20 ■■■■■ 已修改文件
funasr/bin/asr_inference.py 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer.py 6 ●●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer_timestamp.py 2 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_paraformer_vad_punc.py 4 ●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference_uniasr.py 4 ●●●● 补丁 | 查看 | 原始文档 | blame | 历史
funasr/bin/asr_inference.py
@@ -368,7 +368,7 @@
#         except TooShortUttError as e:
#             logging.warning(f"Utterance {keys} {e}")
#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
#             results = [[" ", ["<space>"], [2], hyp]] * nbest
#             results = [[" ", ["sil"], [2], hyp]] * nbest
#
#         # Only supporting batch_size==1
#         key = keys[0]
@@ -575,7 +575,7 @@
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]] * nbest
                results = [[" ", ["sil"], [2], hyp]] * nbest
            
            # Only supporting batch_size==1
            key = keys[0]
funasr/bin/asr_inference_paraformer.py
@@ -227,6 +227,8 @@
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \
                                                                        predictor_outs[2], predictor_outs[3]
        pre_token_length = pre_token_length.round().long()
        if torch.max(pre_token_length) < 1:
            return []
        decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -394,7 +396,7 @@
#         results = speech2text(**batch)
#         if len(results) < 1:
#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
#             results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
#             results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
#         time_end = time.time()
#         forward_time = time_end - time_beg
#         lfr_factor = results[0][-1]
@@ -621,7 +623,7 @@
            results = speech2text(**batch)
            if len(results) < 1:
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
                results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
            time_end = time.time()
            forward_time = time_end - time_beg
            lfr_factor = results[0][-1]
funasr/bin/asr_inference_paraformer_timestamp.py
@@ -410,7 +410,7 @@
        results = speech2text(**batch)
        if len(results) < 1:
            hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
            results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest
            results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest
        time_end = time.time()
        forward_time = time_end - time_beg
        lfr_factor = results[0][-1]
funasr/bin/asr_inference_paraformer_vad_punc.py
@@ -235,6 +235,8 @@
        predictor_outs = self.asr_model.calc_predictor(enc, enc_len)
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3]
        if torch.max(pre_token_length) < 1:
            return []
        pre_token_length = pre_token_length.round().long()
        decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
        decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
@@ -602,7 +604,7 @@
                    results = speech2text(**batch)
                    if len(results) < 1:
                        hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                        results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest
                        results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest
                    time_end = time.time()
                    forward_time = time_end - time_beg
                    lfr_factor = results[0][-1]
funasr/bin/asr_inference_uniasr.py
@@ -391,7 +391,7 @@
#         except TooShortUttError as e:
#             logging.warning(f"Utterance {keys} {e}")
#             hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
#             results = [[" ", ["<space>"], [2], hyp]] * nbest
#             results = [[" ", ["sil"], [2], hyp]] * nbest
#
#         # Only supporting batch_size==1
#         key = keys[0]
@@ -616,7 +616,7 @@
            except TooShortUttError as e:
                logging.warning(f"Utterance {keys} {e}")
                hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[])
                results = [[" ", ["<space>"], [2], hyp]] * nbest
                results = [[" ", ["sil"], [2], hyp]] * nbest
    
            # Only supporting batch_size==1
            key = keys[0]