funasr/models/sense_voice/model.py
@@ -925,6 +925,11 @@ if tok_ls: token_ids.extend(tok_ls) else: token_ids.append(124) if len(token_ids) == 0: result_i = {"key": key[i], "text": text} results.append(result_i) continue logits_speech = self.ctc.softmax(encoder_out)[i, 4 : encoder_out_lens[i].item(), :] pred = logits_speech.argmax(-1).cpu() logits_speech[pred == self.blank_id, self.blank_id] = 0