funasr/models/monotonic_aligner/model.py
@@ -166,7 +166,8 @@ meta_data["extract_feat"] = f"{time3 - time2:0.3f}" meta_data["batch_data_time"] = speech_lengths.sum().item() * frontend.frame_shift * frontend.lfr_n / 1000 speech.to(device=kwargs["device"]), speech_lengths.to(device=kwargs["device"]) speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)