From de87e1d180d214e1f49682d2b5fb7c9d2c89ae7e Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: Wed, 13 Dec 2023 13:57:59 +0800
Subject: [PATCH] adapted pcm to 8k

---
 funasr/models/e2e_asr_contextual_paraformer.py |   55 ++++++++++++++++---------------------------------------
 1 file changed, 16 insertions(+), 39 deletions(-)

diff --git a/funasr/models/e2e_asr_contextual_paraformer.py b/funasr/models/e2e_asr_contextual_paraformer.py
index e1dfe6c..b474dbc 100644
--- a/funasr/models/e2e_asr_contextual_paraformer.py
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@@ -9,7 +9,6 @@
 import numpy as np
 
 import torch
-from typeguard import check_argument_types
 
 from funasr.layers.abs_normalize import AbsNormalize
 from funasr.models.ctc import CTC
@@ -43,9 +42,7 @@
         frontend: Optional[AbsFrontend],
         specaug: Optional[AbsSpecAug],
         normalize: Optional[AbsNormalize],
-        preencoder: Optional[AbsPreEncoder],
         encoder: AbsEncoder,
-        postencoder: Optional[AbsPostEncoder],
         decoder: AbsDecoder,
         ctc: CTC,
         ctc_weight: float = 0.5,
@@ -68,12 +65,13 @@
         target_buffer_length: int = -1,
         inner_dim: int = 256, 
         bias_encoder_type: str = 'lstm',
-        use_decoder_embedding: bool = True,
+        use_decoder_embedding: bool = False,
         crit_attn_weight: float = 0.0,
         crit_attn_smooth: float = 0.0,
         bias_encoder_dropout_rate: float = 0.0,
+        preencoder: Optional[AbsPreEncoder] = None,
+        postencoder: Optional[AbsPostEncoder] = None,
     ):
-        assert check_argument_types()
         assert 0.0 <= ctc_weight <= 1.0, ctc_weight
         assert 0.0 <= interctc_weight < 1.0, interctc_weight
 
@@ -127,6 +125,7 @@
         if self.crit_attn_weight > 0:
             self.attn_loss = torch.nn.L1Loss()
         self.crit_attn_smooth = crit_attn_smooth
+        self.length_normalized_loss = length_normalized_loss
 
     def forward(
             self,
@@ -136,7 +135,7 @@
             text_lengths: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
+            dha_pad: torch.Tensor,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -209,7 +208,7 @@
         # 2b. Attention decoder branch
         if self.ctc_weight != 1.0:
             loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
-                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
             )
 
         # 3. CTC-Att loss definition
@@ -233,6 +232,8 @@
 
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = int((text_lengths + self.predictor_bias).sum())
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
     
@@ -244,7 +245,6 @@
             ys_pad_lens: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
     ):
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
@@ -278,9 +278,10 @@
 
         # 1. Forward decoder
         decoder_outs = self.decoder(
-            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info, ret_attn=(ideal_attn is not None)
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
         ) 
-        decoder_out, _, attn = decoder_outs[0], decoder_outs[1], decoder_outs[2]
+        decoder_out, _ = decoder_outs[0], decoder_outs[1]
+        '''
         if self.crit_attn_weight > 0 and attn.shape[-1] > 1:
             ideal_attn = ideal_attn + self.crit_attn_smooth / (self.crit_attn_smooth + 1.0)
             attn_non_blank = attn[:,:,:,:-1]
@@ -288,6 +289,8 @@
             loss_ideal = self.attn_loss(attn_non_blank.max(1)[0], ideal_attn_non_blank.to(attn.device))
         else:
             loss_ideal = None
+        '''
+        loss_ideal = None
 
         if decoder_out_1st is None:
             decoder_out_1st = decoder_out
@@ -340,7 +343,7 @@
             input_mask_expand_dim, 0)
         return sematic_embeds * tgt_mask, decoder_out * tgt_mask
 
-    def cal_decoder_with_predictor_with_hwlist_advanced(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
+    def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None, clas_scale=1.0):
         if hw_list is None:
             hw_list = [torch.Tensor([1]).long().to(encoder_out.device)]  # empty hotword list
             hw_list_pad = pad_list(hw_list, 0)
@@ -349,8 +352,8 @@
             else:
                 hw_embed = self.bias_embed(hw_list_pad)
             hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
+            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
         else:
-            # hw_list = hw_list[1:] + [hw_list[0]]  # reorder
             hw_lengths = [len(i) for i in hw_list]
             hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
             if self.use_decoder_embedding:
@@ -360,37 +363,11 @@
             hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
                                                             enforce_sorted=False)
             _, (h_n, _) = self.bias_encoder(hw_embed)
-            # hw_embed, _ = torch.nn.utils.rnn.pad_packed_sequence(hw_embed, batch_first=True)
-            if h_n.shape[1] > 2000: # large hotword list
-                _h_n = self.pick_hwlist_group(h_n.squeeze(0), encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens)
-                if _h_n is not None:
-                    h_n = _h_n
             hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
-        # import pdb; pdb.set_trace()
         
         decoder_outs = self.decoder(
-            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed
+            encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=hw_embed, clas_scale=clas_scale
         )
         decoder_out = decoder_outs[0]
         decoder_out = torch.log_softmax(decoder_out, dim=-1)
         return decoder_out, ys_pad_lens
-
-    def pick_hwlist_group(self, hw_embed, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens):
-        max_attn_score = 0.0
-        # max_attn_index = 0
-        argmax_g = None
-        non_blank = hw_embed[-1]
-        hw_embed_groups = hw_embed[:-1].split(2000)
-        for i, g in enumerate(hw_embed_groups):
-            g = torch.cat([g, non_blank.unsqueeze(0)], dim=0)
-            _ = self.decoder(
-                encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=g.unsqueeze(0)
-            )
-            attn = self.decoder.bias_decoder.src_attn.attn[0]
-            _max_attn_score = attn.max(0)[0][:,:-1].max()
-            if _max_attn_score > max_attn_score:
-                max_attn_score = _max_attn_score
-                # max_attn_index = i
-                argmax_g = g
-        # import pdb; pdb.set_trace()
-        return argmax_g
\ No newline at end of file

--
Gitblit v1.9.1