funasr/models/lcbnet/model.py
@@ -422,6 +422,7 @@ else: # extract fbank feats time1 = time.perf_counter() pdb.set_trace() sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000), data_type=kwargs.get("data_type", "sound"), tokenizer=tokenizer) @@ -438,7 +439,6 @@ speech = speech.to(device=kwargs["device"]) speech_lengths = speech_lengths.to(device=kwargs["device"]) pdb.set_trace() # Encoder encoder_out, encoder_out_lens = self.encode(speech, speech_lengths) if isinstance(encoder_out, tuple):