Fix Paraformer bug: emit silence when the predictor predicts no tokens
| | |
| | | # except TooShortUttError as e: |
| | | # logging.warning(f"Utterance {keys} {e}") |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | # |
| | | # # Only supporting batch_size==1 |
| | | # key = keys[0] |
| | |
| | | except TooShortUttError as e: |
| | | logging.warning(f"Utterance {keys} {e}") |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | |
| | | # Only supporting batch_size==1 |
| | | key = keys[0] |
| | |
| | | pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ |
| | | predictor_outs[2], predictor_outs[3] |
| | | pre_token_length = pre_token_length.round().long() |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length) |
| | | decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] |
| | | |
| | |
| | | # results = speech2text(**batch) |
| | | # if len(results) < 1: |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | # time_end = time.time() |
| | | # forward_time = time_end - time_beg |
| | | # lfr_factor = results[0][-1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | |
| | | predictor_outs = self.asr_model.calc_predictor(enc, enc_len) |
| | | pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3] |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | pre_token_length = pre_token_length.round().long() |
| | | decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length) |
| | | decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | # except TooShortUttError as e: |
| | | # logging.warning(f"Utterance {keys} {e}") |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | # |
| | | # # Only supporting batch_size==1 |
| | | # key = keys[0] |
| | |
| | | except TooShortUttError as e: |
| | | logging.warning(f"Utterance {keys} {e}") |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | |
| | | # Only supporting batch_size==1 |
| | | key = keys[0] |