
        enc, enc_len = self.encoder(**batch)
        mask = self.make_pad_mask(enc_len)[:, None, :]
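        # The predictor produces, per utterance, acoustic embeddings and the
        # predicted number of output tokens (plus frame weights `alphas` and peak indices).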
        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(enc, mask)
        # Round the predicted token count and cast it to an integer dtype.
        pre_token_length = pre_token_length.round().type(torch.int32)

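        # Single-pass decoding over the predicted acoustic embeddings, followed by
        # log-probabilities over the vocabulary.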
        decoder_out, _ = self.decoder(enc, enc_len, pre_acoustic_embeds, pre_token_length)
        decoder_out = torch.log_softmax(decoder_out, dim=-1)
        # Greedy token ids taken from the log-probabilities.
        sample_ids = decoder_out.argmax(dim=-1)

        # return decoder_out, pre_token_length
        return decoder_out, sample_ids

    # def get_output_size(self):
    #     return self.model.encoders[0].size

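    # Dummy inputs (batch of 2, 30 frames of `feats_dim` features), typically used
    # to trace the model for export.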
    def get_dummy_inputs(self):
        speech = torch.randn(2, 30, self.feats_dim)