haoneng.lhn
2023-12-07 4152cf4615e234b9892703a8e088cdd51937fc13
fix loss normalization for ddp training
4 files changed
66 ■■■■■ changed files
funasr/models/e2e_asr.py 3 ●●●●●
funasr/models/e2e_asr_contextual_paraformer.py 3 ●●●●●
funasr/models/e2e_asr_paraformer.py 57 ●●●●●
funasr/models/e2e_uni_asr.py 3 ●●●●●
funasr/models/e2e_asr.py
@@ -122,6 +122,7 @@
             self.ctc = ctc
         self.extract_feats_in_collect_stats = extract_feats_in_collect_stats
+        self.length_normalized_loss = length_normalized_loss

     def forward(
             self,
@@ -220,6 +221,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + 1).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
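Review note (not part of the patch): in these ESPnet-style models, forward() returns (loss, stats, weight), and the trainer uses weight to combine losses across DataParallel/DDP workers. Below is a minimal sketch, with made-up lengths, of the weight the patched block computes when length_normalized_loss is enabled; the +1 per utterance presumably accounts for the appended eos (see the add_sos_eos hunk in e2e_asr_paraformer.py below).

import torch

# Hypothetical target lengths and a made-up summed cross-entropy.
text_lengths = torch.tensor([5, 7, 3])
loss_sum = torch.tensor(19.8)

n_tokens = (text_lengths + 1).sum()   # 18 scored positions: (5+1)+(7+1)+(3+1)
loss = loss_sum / n_tokens            # what a length-normalized loss looks like

# Returning weight = n_tokens instead of weight = batch_size (3) lets the
# trainer recover the exact summed loss when re-weighting across workers:
assert torch.isclose(loss * n_tokens, loss_sum)

With a batch-size weight, workers that happen to hold long utterances would be under-counted; a numeric demonstration follows the last file in this diff.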
funasr/models/e2e_asr_contextual_paraformer.py
@@ -125,6 +125,7 @@
         if self.crit_attn_weight > 0:
             self.attn_loss = torch.nn.L1Loss()
         self.crit_attn_smooth = crit_attn_smooth
+        self.length_normalized_loss = length_normalized_loss

     def forward(
             self,
@@ -231,6 +232,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
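Review note: the Paraformer variants use self.predictor_bias instead of the hard-coded +1. Judging from the add_sos_eos hunk later in this diff (predictor_bias == 1 triggers appending an eos and ys_pad_lens + predictor_bias), the weight is again the total count of scored positions. A tiny sketch under that assumption, with hypothetical lengths:

import torch

text_lengths = torch.tensor([5, 7, 3])  # hypothetical target lengths
predictor_bias = 1                      # 1 when add_sos_eos appends an eos

weight = (text_lengths + predictor_bias).sum()
print(weight)  # tensor(18): (5+1) + (7+1) + (3+1)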
funasr/models/e2e_asr_paraformer.py
@@ -137,6 +137,7 @@
         self.predictor_bias = predictor_bias
         self.sampling_ratio = sampling_ratio
         self.criterion_pre = mae_loss(normalize_length=length_normalized_loss)
+        self.length_normalized_loss = length_normalized_loss
         self.step_cur = 0
         self.share_embedding = share_embedding
@@ -253,6 +254,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
@@ -352,8 +355,9 @@
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
-        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None, encoder_out_mask,
-                                                                                  ignore_id=self.ignore_id)
+        pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index = self.predictor(encoder_out, None,
+                                                                                       encoder_out_mask,
+                                                                                       ignore_id=self.ignore_id)
         return pre_acoustic_embeds, pre_token_length, alphas, pre_peak_index

     def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
@@ -487,8 +491,9 @@
             if self.step_cur < 2:
                 logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
             if self.use_1st_decoder_loss:
-                sematic_embeds, decoder_out_1st, pre_loss_att = self.sampler_with_grad(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
-                                                               pre_acoustic_embeds)
+                sematic_embeds, decoder_out_1st, pre_loss_att = self.sampler_with_grad(encoder_out, encoder_out_lens,
+                                                                                       ys_pad, ys_pad_lens,
+                                                                                       pre_acoustic_embeds)
             else:
                 sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
                                                                pre_acoustic_embeds)
@@ -727,6 +732,7 @@
         self.predictor = predictor
         self.predictor_weight = predictor_weight
         self.predictor_bias = predictor_bias
+        self.length_normalized_loss = length_normalized_loss
         self.sampling_ratio = sampling_ratio
         self.criterion_pre = mae_loss(normalize_length=length_normalized_loss)
         self.step_cur = 0
@@ -860,11 +866,13 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight

     def encode(
-        self, speech: torch.Tensor, speech_lengths: torch.Tensor, ind: int = 0,
+            self, speech: torch.Tensor, speech_lengths: torch.Tensor, ind: int = 0,
     ) -> Tuple[torch.Tensor, torch.Tensor]:
         """Frontend + Encoder. Note that this method is used by asr_inference.py
         Args:
@@ -885,7 +893,7 @@
         # Pre-encoder, e.g. used for raw input data
         if self.preencoder is not None:
             feats, feats_lengths = self.preencoder(feats, feats_lengths)
-
+
         # 4. Forward encoder
         # feats: (Batch, Length, Dim)
         # -> encoder_out: (Batch, Length2, Dim2)
@@ -970,11 +978,11 @@
         return encoder_out, torch.tensor([encoder_out.size(1)])

     def _calc_att_predictor_loss(
-        self,
-        encoder_out: torch.Tensor,
-        encoder_out_lens: torch.Tensor,
-        ys_pad: torch.Tensor,
-        ys_pad_lens: torch.Tensor,
+            self,
+            encoder_out: torch.Tensor,
+            encoder_out_lens: torch.Tensor,
+            ys_pad: torch.Tensor,
+            ys_pad_lens: torch.Tensor,
     ):
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
@@ -1006,7 +1014,7 @@
             attention_chunk_center_bias = 0
             attention_chunk_size = encoder_chunk_size
             decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
-            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls.\
+            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls. \
                 get_mask_shift_att_chunk_decoder(None,
                                                  device=encoder_out.device,
                                                  batch_size=encoder_out.size(0)
@@ -1106,7 +1114,8 @@
             input_mask_expand_dim, 0)
         return sematic_embeds * tgt_mask, decoder_out * tgt_mask

-    def sampler_with_grad(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, chunk_mask=None):
+    def sampler_with_grad(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds,
+                          chunk_mask=None):
         tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
         ys_pad_masked = ys_pad * tgt_mask[:, :, 0]
         if self.share_embedding:
@@ -1158,7 +1167,7 @@
                                                                                            target_label_length=None,
                                                                                            )
         predictor_alignments, predictor_alignments_len = self.predictor.gen_frame_alignments(pre_alphas,
-                                                                                             encoder_out_lens+1 if self.predictor.tail_threshold > 0.0 else encoder_out_lens)
+                                                                                             encoder_out_lens + 1 if self.predictor.tail_threshold > 0.0 else encoder_out_lens)

         scama_mask = None
         if self.encoder.overlap_chunk_cls is not None and self.decoder_attention_chunk_type == 'chunk':
@@ -1166,7 +1175,7 @@
             attention_chunk_center_bias = 0
             attention_chunk_size = encoder_chunk_size
             decoder_att_look_back_factor = self.encoder.overlap_chunk_cls.decoder_att_look_back_factor_cur
-            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls.\
+            mask_shift_att_chunk_decoder = self.encoder.overlap_chunk_cls. \
                 get_mask_shift_att_chunk_decoder(None,
                                                  device=encoder_out.device,
                                                  batch_size=encoder_out.size(0)
@@ -1484,6 +1493,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
@@ -1589,8 +1600,9 @@
         if self.predictor_bias == 1:
             _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
             ys_pad_lens = ys_pad_lens + self.predictor_bias
-        pre_acoustic_embeds, pre_token_length, _, pre_peak_index, _ = self.predictor(encoder_out, ys_pad, encoder_out_mask,
-                                                                                  ignore_id=self.ignore_id)
+        pre_acoustic_embeds, pre_token_length, _, pre_peak_index, _ = self.predictor(encoder_out, ys_pad,
+                                                                                     encoder_out_mask,
+                                                                                     ignore_id=self.ignore_id)

         # 0. sampler
         decoder_out_1st = None
@@ -1739,7 +1751,7 @@
             loss = loss_ctc
         else:
             loss = self.ctc_weight * loss_ctc + (
-                        1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5
+                    1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight + loss_pre2 * self.predictor_weight * 0.5

         # Collect Attn branch stats
         stats["loss_att"] = loss_att.detach() if loss_att is not None else None
@@ -1752,6 +1764,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
@@ -1952,6 +1966,8 @@
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + self.predictor_bias).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
@@ -2107,7 +2123,8 @@
         return loss_att, acc_att, cer_att, wer_att, loss_pre

-    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None, clas_scale=1.0):
+    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None,
+                                   clas_scale=1.0):
         if hw_list is None:
             # default hotword list
             hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)]  # empty hotword list
@@ -2245,4 +2262,4 @@
                     "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
                                                                                   var_dict_tf[name_tf].shape))
-        return var_dict_torch_update
+        return var_dict_torch_update
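Review note: every patched block sits under the pre-existing comment "force_gatherable: to-device and to-tensor if scalar for DataParallel". As a rough stand-in for the behavior that comment describes (a sketch, not FunASR's actual implementation):

import torch

def force_gatherable_sketch(data, device):
    # Recursively move tensors to `device` and wrap Python scalars as
    # 0-dim tensors, so DataParallel/DDP can gather every returned value.
    if isinstance(data, dict):
        return {k: force_gatherable_sketch(v, device) for k, v in data.items()}
    if isinstance(data, (list, tuple)):
        return type(data)(force_gatherable_sketch(v, device) for v in data)
    if isinstance(data, (int, float)) and not isinstance(data, bool):
        return torch.tensor(data, device=device)
    if torch.is_tensor(data):
        return data.to(device)
    return data

This also suggests why the patch calls .type_as(batch_size): the new token-count weight keeps the dtype of the value it replaces before everything is wrapped for gathering.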
funasr/models/e2e_uni_asr.py
@@ -167,6 +167,7 @@
         self.enable_maas_finetune = enable_maas_finetune
         self.freeze_encoder2 = freeze_encoder2
         self.encoder1_encoder2_joint_training = encoder1_encoder2_joint_training
+        self.length_normalized_loss = length_normalized_loss

     def forward(
         self,
@@ -440,6 +441,8 @@
         stats["loss2"] = torch.clone(loss2.detach())
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = (text_lengths + 1).sum().type_as(batch_size)
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
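Review note: why the fix matters for DDP. Assuming the framework combines per-worker results as a weighted average, sum(loss_i * weight_i) / sum(weight_i), a batch-size weight counts every worker equally even when workers see very different numbers of target tokens, whereas the token-count weight reproduces a true global per-token average. A numeric illustration with made-up values (not FunASR's trainer code):

import torch

def weighted_average(losses, weights):
    losses = torch.stack(losses)
    weights = torch.stack(weights).type_as(losses)
    return (losses * weights).sum() / weights.sum()

# Two workers, equal batch sizes, very different target lengths (made up).
loss_r0, tokens_r0 = torch.tensor(2.0), torch.tensor(10)  # mean loss per token
loss_r1, tokens_r1 = torch.tensor(1.0), torch.tensor(90)
bsz = torch.tensor(8)                                     # utterances per worker

# Before the fix: weight = batch size, so both workers count equally -> 1.5.
print(weighted_average([loss_r0, loss_r1], [bsz, bsz]).item())

# After the fix: weight = token count -> (2.0*10 + 1.0*90) / 100 = 1.1,
# exactly the per-token average over all 100 tokens.
print(weighted_average([loss_r0, loss_r1], [tokens_r0, tokens_r1]).item())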