from typing import Dict, Optional, Tuple

import numpy as np

import torch
from typeguard import check_argument_types

from funasr.layers.abs_normalize import AbsNormalize
from funasr.models.ctc import CTC
# NOTE: the imports below are assumed from FunASR's ESPnet-style layout; the
# fragment uses these names but the original import lines are elided.
from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
from funasr.modules.nets_utils import make_pad_mask, pad_list
from funasr.torch_utils.device_funcs import force_gatherable
        preencoder: Optional[AbsPreEncoder] = None,
        postencoder: Optional[AbsPostEncoder] = None,
    ):
        assert check_argument_types()
        assert 0.0 <= ctc_weight <= 1.0, ctc_weight
        assert 0.0 <= interctc_weight < 1.0, interctc_weight
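        # ctc_weight interpolates between the CTC and attention losses (1.0
        # means pure CTC); interctc_weight scales intermediate-CTC terms and
        # must stay strictly below 1.0.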

        if self.crit_attn_weight > 0:
            self.attn_loss = torch.nn.L1Loss()
        self.crit_attn_smooth = crit_attn_smooth
        self.length_normalized_loss = length_normalized_loss
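        # attn_loss (above) penalizes the distance between the decoder's
        # hotword attention and the ideal_attn targets; it is applied with
        # weight crit_attn_weight during loss computation.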

    def forward(
        self,
        # speech/speech_lengths/text are assumed from the standard ESPnet-style
        # ASR forward signature; the original parameter lines are elided here.
        speech: torch.Tensor,
        speech_lengths: torch.Tensor,
        text: torch.Tensor,
        text_lengths: torch.Tensor,
        hotword_pad: torch.Tensor,
        hotword_lengths: torch.Tensor,
        ideal_attn: torch.Tensor,
        dha_pad: torch.Tensor,
    ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
        """Frontend + Encoder + Decoder + Calc loss"""

        # 2b. Attention decoder branch
        if self.ctc_weight != 1.0:
            loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
            )

        # 3. CTC-Att loss definition
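        # A sketch of the standard ESPnet-style loss interpolation, assuming
        # the (elided) CTC branch above produced loss_ctc; predictor_weight is
        # an assumed attribute name for the predictor-loss weight.
        if self.ctc_weight == 0.0:
            loss = loss_att
        elif self.ctc_weight == 1.0:
            loss = loss_ctc
        else:
            loss = self.ctc_weight * loss_ctc + (1.0 - self.ctc_weight) * loss_att
        if self.ctc_weight != 1.0:
            # add the predictor and attention-supervision terms produced by
            # the attention branch
            loss = loss + self.predictor_weight * loss_pre + self.crit_attn_weight * loss_ideal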

        stats["loss"] = torch.clone(loss.detach())
        # force_gatherable: to-device and to-tensor if scalar for DataParallel
        if self.length_normalized_loss:
            # normalize by the total target length; otherwise batch_size keeps
            # the value assigned earlier in the (elided) body
            batch_size = int((text_lengths + self.predictor_bias).sum())
        loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
        return loss, stats, weight
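    # Typical training-step usage (illustrative only; the names and shapes are
    # assumptions, not part of this file):
    #   loss, stats, weight = model(speech, speech_lengths, text, text_lengths,
    #                               hotword_pad, hotword_lengths, ideal_attn, dha_pad)
    #   loss.backward()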

    def _calc_att_clas_loss(
        self,
        # the parameters through ys_pad are restored from the call site in
        # forward(); the original signature head is elided.
        encoder_out: torch.Tensor,
        encoder_out_lens: torch.Tensor,
        ys_pad: torch.Tensor,
        ys_pad_lens: torch.Tensor,
        hotword_pad: torch.Tensor,
        hotword_lengths: torch.Tensor,
        ideal_attn: torch.Tensor,
    ):
        encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
            encoder_out.device
        )
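        # encoder_out_mask: (Batch, 1, Time); make_pad_mask flags padded frames,
        # so the inversion (~) leaves True at valid encoder positions.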

            # (the opening `if` of this branch is elided in the gap above)
            else:
                hw_embed = self.bias_embed(hw_list_pad)
            hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
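            # bias_encoder is an LSTM over the hotword embeddings; its final
            # hidden state h_n acts as a fixed-size hotword representation,
            # broadcast across the batch dimension of encoder_out.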
        else:
            hw_lengths = [len(i) for i in hw_list]
            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
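            # pad_list right-pads the variable-length hotword id lists with 0
            # into a single LongTensor on the encoder's device before embedding.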