python/FunASR-XL.git

			@@ -687,10 +687,8 @@
			# fp16
			if kwargs.get("fp16", False):
			speech = speech.to(torch.float16)
			encoder_out_lens = encoder_out_lens.to(torch.float16)
			elif kwargs.get("bf16", False):
			speech = speech.to(torch.bfloat16)
			encoder_out_lens = encoder_out_lens.to(torch.bfloat16)
			encoder_out, encoder_out_lens = self.audio_encoder(speech.permute(0, 2, 1), speech_lengths)

			# audio_adaptor
			@@ -714,13 +712,17 @@
			]

			llm_dtype = kwargs.get("llm_dtype", "fp32")
			if llm_dtype == "fp32":
			llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
			llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype

			dtype_map = {"bf16": torch.bfloat16, "fp16": torch.float16, "fp32": torch.float32}
			with torch.cuda.amp.autocast(
			enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
			):
			label = contents["assistant"][0]
			# self.llm = self.llm.to(dtype_map[llm_dtype])
			# inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
			self.llm = self.llm.to(dtype_map[llm_dtype])
			inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])

			if not kwargs.get("tearchforing", False):