Fix Paraformer bug: emit silence when the predictor predicts no tokens
| | |
| | | # except TooShortUttError as e: |
| | | # logging.warning(f"Utterance {keys} {e}") |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | # |
| | | # # Only supporting batch_size==1 |
| | | # key = keys[0] |
| | |
| | | except TooShortUttError as e: |
| | | logging.warning(f"Utterance {keys} {e}") |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | |
| | | # Only supporting batch_size==1 |
| | | key = keys[0] |
| | |
| | | pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], \ |
| | | predictor_outs[2], predictor_outs[3] |
| | | pre_token_length = pre_token_length.round().long() |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length) |
| | | decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] |
| | | |
| | |
| | | # results = speech2text(**batch) |
| | | # if len(results) < 1: |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | # time_end = time.time() |
| | | # forward_time = time_end - time_beg |
| | | # lfr_factor = results[0][-1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp, 10, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp, 10, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | |
| | | predictor_outs = self.asr_model.calc_predictor(enc, enc_len) |
| | | pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = predictor_outs[0], predictor_outs[1], predictor_outs[2], predictor_outs[3] |
| | | if torch.max(pre_token_length) < 1: |
| | | return [] |
| | | pre_token_length = pre_token_length.round().long() |
| | | decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length) |
| | | decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1] |
| | |
| | | results = speech2text(**batch) |
| | | if len(results) < 1: |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], 0, 1, 6]] * nbest |
| | | results = [[" ", ["sil"], [2], 0, 1, 6]] * nbest |
| | | time_end = time.time() |
| | | forward_time = time_end - time_beg |
| | | lfr_factor = results[0][-1] |
| | |
| | | # except TooShortUttError as e: |
| | | # logging.warning(f"Utterance {keys} {e}") |
| | | # hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | # results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | # results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | # |
| | | # # Only supporting batch_size==1 |
| | | # key = keys[0] |
| | |
| | | except TooShortUttError as e: |
| | | logging.warning(f"Utterance {keys} {e}") |
| | | hyp = Hypothesis(score=0.0, scores={}, states={}, yseq=[]) |
| | | results = [[" ", ["<space>"], [2], hyp]] * nbest |
| | | results = [[" ", ["sil"], [2], hyp]] * nbest |
| | | |
| | | # Only supporting batch_size==1 |
| | | key = keys[0] |