shixian.shi
2023-05-04 86f5e64b9ade696c83d78dfae9f46228d678b00e
funasr/models/e2e_asr_contextual_paraformer.py
@@ -68,7 +68,7 @@
        target_buffer_length: int = -1,
        inner_dim: int = 256, 
        bias_encoder_type: str = 'lstm',
        use_decoder_embedding: bool = True,
        use_decoder_embedding: bool = False,
        crit_attn_weight: float = 0.0,
        crit_attn_smooth: float = 0.0,
        bias_encoder_dropout_rate: float = 0.0,
@@ -278,9 +278,10 @@
        # 1. Forward decoder
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info, ret_attn=(ideal_attn is not None)
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
        ) 
        decoder_out, _, attn = decoder_outs[0], decoder_outs[1], decoder_outs[2]
        decoder_out, _ = decoder_outs[0], decoder_outs[1]
        '''
        if self.crit_attn_weight > 0 and attn.shape[-1] > 1:
            ideal_attn = ideal_attn + self.crit_attn_smooth / (self.crit_attn_smooth + 1.0)
            attn_non_blank = attn[:,:,:,:-1]
@@ -288,7 +289,9 @@
            loss_ideal = self.attn_loss(attn_non_blank.max(1)[0], ideal_attn_non_blank.to(attn.device))
        else:
            loss_ideal = None
        '''
        loss_ideal = None
        if decoder_out_1st is None:
            decoder_out_1st = decoder_out
        # 2. Compute attention loss
@@ -340,7 +343,7 @@
            input_mask_expand_dim, 0)
        return sematic_embeds * tgt_mask, decoder_out * tgt_mask
    def cal_decoder_with_predictor_with_hwlist_advanced(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
        if hw_list is None:
            hw_list = [torch.Tensor([1]).long().to(encoder_out.device)]  # empty hotword list
            hw_list_pad = pad_list(hw_list, 0)
@@ -350,7 +353,6 @@
                hw_embed = self.bias_embed(hw_list_pad)
            hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
        else:
            # hw_list = hw_list[1:] + [hw_list[0]]  # reorder
            hw_lengths = [len(i) for i in hw_list]
            hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
            if self.use_decoder_embedding:
@@ -366,7 +368,6 @@
                if _h_n is not None:
                    h_n = _h_n
            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
        # import pdb; pdb.set_trace()
        
        decoder_outs = self.decoder(
            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed