From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: Fri, 13 Mar 2026 17:41:41 +0800
Subject: [PATCH] Reformat contextual_paraformer/decoder.py (formatting only; drop commented-out pdb line)

---
 funasr/models/contextual_paraformer/decoder.py |  129 +++++++++++++++++++++++-------------------
 1 files changed, 71 insertions(+), 58 deletions(-)

diff --git a/funasr/models/contextual_paraformer/decoder.py b/funasr/models/contextual_paraformer/decoder.py
index 1116f84..ba2ce9a 100644
--- a/funasr/models/contextual_paraformer/decoder.py
+++ b/funasr/models/contextual_paraformer/decoder.py
@@ -15,7 +15,10 @@
 from funasr.models.transformer.embedding import PositionalEncoding
 from funasr.models.paraformer.decoder import DecoderLayerSANM, ParaformerSANMDecoder
 from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM
-from funasr.models.sanm.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt
+from funasr.models.sanm.attention import (
+    MultiHeadedAttentionSANMDecoder,
+    MultiHeadedAttentionCrossAtt,
+)
 
 
 class ContextualDecoderLayer(torch.nn.Module):
@@ -47,7 +50,14 @@
             self.concat_linear1 = torch.nn.Linear(size + size, size)
             self.concat_linear2 = torch.nn.Linear(size + size, size)
 
-    def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None,):
+    def forward(
+        self,
+        tgt,
+        tgt_mask,
+        memory,
+        memory_mask,
+        cache=None,
+    ):
         # tgt = self.dropout(tgt)
         if isinstance(tgt, Tuple):
             tgt, _ = tgt
@@ -97,8 +107,9 @@
         if self.src_attn is not None:
             if self.normalize_before:
                 x = self.norm3(x)
-            x =  self.dropout(self.src_attn(x, memory, memory_mask))
+            x = self.dropout(self.src_attn(x, memory, memory_mask))
         return x, tgt_mask, memory, memory_mask, cache
+
 
 @tables.register("decoder_classes", "ContextualParaformerDecoder")
 class ContextualParaformerDecoder(ParaformerSANMDecoder):
@@ -107,6 +118,7 @@
     Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
     https://arxiv.org/abs/2006.01713
     """
+
     def __init__(
         self,
         vocab_size: int,
@@ -139,7 +151,7 @@
         )
 
         attention_dim = encoder_output_size
-        if input_layer == 'none':
+        if input_layer == "none":
             self.embed = None
         if input_layer == "embed":
             self.embed = torch.nn.Sequential(
@@ -194,20 +206,20 @@
             dropout_rate=dropout_rate,
             normalize_before=True,
         )
-        self.bias_output = torch.nn.Conv1d(attention_dim*2, attention_dim, 1, bias=False)
+        self.bias_output = torch.nn.Conv1d(attention_dim * 2, attention_dim, 1, bias=False)
         self.last_decoder = ContextualDecoderLayer(
-                attention_dim,
-                MultiHeadedAttentionSANMDecoder(
-                    attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
-                ),
-                MultiHeadedAttentionCrossAtt(
-                    attention_heads, attention_dim, src_attention_dropout_rate
-                ),
-                PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
-                dropout_rate,
-                normalize_before,
-                concat_after,
-            )
+            attention_dim,
+            MultiHeadedAttentionSANMDecoder(
+                attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+            ),
+            MultiHeadedAttentionCrossAtt(
+                attention_heads, attention_dim, src_attention_dropout_rate
+            ),
+            PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
+            dropout_rate,
+            normalize_before,
+            concat_after,
+        )
         if num_blocks - att_layer_num <= 0:
             self.decoders2 = None
         else:
@@ -273,31 +285,25 @@
         memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]
 
         x = tgt
-        x, tgt_mask, memory, memory_mask, _ = self.decoders(
-            x, tgt_mask, memory, memory_mask
-        )
-        _, _, x_self_attn, x_src_attn = self.last_decoder(
-            x, tgt_mask, memory, memory_mask
-        )
+        x, tgt_mask, memory, memory_mask, _ = self.decoders(x, tgt_mask, memory, memory_mask)
+        _, _, x_self_attn, x_src_attn = self.last_decoder(x, tgt_mask, memory, memory_mask)
 
         # contextual paraformer related
         contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0])
         contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :]
-        cx, tgt_mask, _, _, _ = self.bias_decoder(x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask)
+        cx, tgt_mask, _, _, _ = self.bias_decoder(
+            x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask
+        )
 
         if self.bias_output is not None:
-            x = torch.cat([x_src_attn, cx*clas_scale], dim=2)
+            x = torch.cat([x_src_attn, cx * clas_scale], dim=2)
             x = self.bias_output(x.transpose(1, 2)).transpose(1, 2)  # 2D -> D
             x = x_self_attn + self.dropout(x)
 
         if self.decoders2 is not None:
-            x, tgt_mask, memory, memory_mask, _ = self.decoders2(
-                x, tgt_mask, memory, memory_mask
-            )
+            x, tgt_mask, memory, memory_mask, _ = self.decoders2(x, tgt_mask, memory, memory_mask)
 
-        x, tgt_mask, memory, memory_mask, _ = self.decoders3(
-            x, tgt_mask, memory, memory_mask
-        )
+        x, tgt_mask, memory, memory_mask, _ = self.decoders3(x, tgt_mask, memory, memory_mask)
         if self.normalize_before:
             x = self.after_norm(x)
         olens = tgt_mask.sum(1)
@@ -308,20 +314,26 @@
 
 @tables.register("decoder_classes", "ContextualParaformerDecoderExport")
 class ContextualParaformerDecoderExport(torch.nn.Module):
-    def __init__(self, model,
-                 max_seq_len=512,
-                 model_name='decoder',
-                 onnx: bool = True,
-                 **kwargs,):
+    def __init__(
+        self,
+        model,
+        max_seq_len=512,
+        model_name="decoder",
+        onnx: bool = True,
+        **kwargs,
+    ):
         super().__init__()
         from funasr.utils.torch_function import sequence_mask
+
         self.model = model
         self.make_pad_mask = sequence_mask(max_seq_len, flip=False)
-        
+
         from funasr.models.sanm.attention import MultiHeadedAttentionSANMDecoderExport
         from funasr.models.sanm.attention import MultiHeadedAttentionCrossAttExport
         from funasr.models.paraformer.decoder import DecoderLayerSANMExport
-        from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANMExport
+        from funasr.models.transformer.positionwise_feed_forward import (
+            PositionwiseFeedForwardDecoderSANMExport,
+        )
 
         for i, d in enumerate(self.model.decoders):
             if isinstance(d.feed_forward, PositionwiseFeedForwardDecoderSANM):
@@ -344,27 +356,34 @@
             if isinstance(d.feed_forward, PositionwiseFeedForwardDecoderSANM):
                 d.feed_forward = PositionwiseFeedForwardDecoderSANMExport(d.feed_forward)
             self.model.decoders3[i] = DecoderLayerSANMExport(d)
-        
+
         self.output_layer = model.output_layer
         self.after_norm = model.after_norm
         self.model_name = model_name
 
         # bias decoder
         if isinstance(self.model.bias_decoder.src_attn, MultiHeadedAttentionCrossAtt):
-            self.model.bias_decoder.src_attn = MultiHeadedAttentionCrossAttExport(self.model.bias_decoder.src_attn)
+            self.model.bias_decoder.src_attn = MultiHeadedAttentionCrossAttExport(
+                self.model.bias_decoder.src_attn
+            )
         self.bias_decoder = self.model.bias_decoder
-        
+
         # last decoder
         if isinstance(self.model.last_decoder.src_attn, MultiHeadedAttentionCrossAtt):
-            self.model.last_decoder.src_attn = MultiHeadedAttentionCrossAttExport(self.model.last_decoder.src_attn)
+            self.model.last_decoder.src_attn = MultiHeadedAttentionCrossAttExport(
+                self.model.last_decoder.src_attn
+            )
         if isinstance(self.model.last_decoder.self_attn, MultiHeadedAttentionSANMDecoder):
-            self.model.last_decoder.self_attn = MultiHeadedAttentionSANMDecoderExport(self.model.last_decoder.self_attn)
+            self.model.last_decoder.self_attn = MultiHeadedAttentionSANMDecoderExport(
+                self.model.last_decoder.self_attn
+            )
         if isinstance(self.model.last_decoder.feed_forward, PositionwiseFeedForwardDecoderSANM):
-            self.model.last_decoder.feed_forward = PositionwiseFeedForwardDecoderSANMExport(self.model.last_decoder.feed_forward)
+            self.model.last_decoder.feed_forward = PositionwiseFeedForwardDecoderSANMExport(
+                self.model.last_decoder.feed_forward
+            )
         self.last_decoder = self.model.last_decoder
         self.bias_output = self.model.bias_output
         self.dropout = self.model.dropout
-        
 
     def prepare_mask(self, mask):
         mask_3d_btd = mask[:, :, None]
@@ -373,7 +392,7 @@
         elif len(mask.shape) == 3:
             mask_4d_bhlt = 1 - mask[:, None, :]
         mask_4d_bhlt = mask_4d_bhlt * -10000.0
-    
+
         return mask_3d_btd, mask_4d_bhlt
 
     def forward(
@@ -396,22 +415,19 @@
         # memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]
 
         x = tgt
-        x, tgt_mask, memory, memory_mask, _ = self.model.decoders(
-            x, tgt_mask, memory, memory_mask
-        )
+        x, tgt_mask, memory, memory_mask, _ = self.model.decoders(x, tgt_mask, memory, memory_mask)
 
-        _, _, x_self_attn, x_src_attn = self.last_decoder(
-            x, tgt_mask, memory, memory_mask
-        )
+        _, _, x_self_attn, x_src_attn = self.last_decoder(x, tgt_mask, memory, memory_mask)
 
         # contextual paraformer related
         contextual_length = torch.Tensor([bias_embed.shape[1]]).int().repeat(hs_pad.shape[0])
         # contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :]
         contextual_mask = self.make_pad_mask(contextual_length)
         contextual_mask, _ = self.prepare_mask(contextual_mask)
-        # import pdb; pdb.set_trace()
         contextual_mask = contextual_mask.transpose(2, 1).unsqueeze(1)
-        cx, tgt_mask, _, _, _ = self.bias_decoder(x_self_attn, tgt_mask, bias_embed, memory_mask=contextual_mask)
+        cx, tgt_mask, _, _, _ = self.bias_decoder(
+            x_self_attn, tgt_mask, bias_embed, memory_mask=contextual_mask
+        )
 
         if self.bias_output is not None:
             x = torch.cat([x_src_attn, cx], dim=2)
@@ -422,11 +438,8 @@
             x, tgt_mask, memory, memory_mask, _ = self.model.decoders2(
                 x, tgt_mask, memory, memory_mask
             )
-        x, tgt_mask, memory, memory_mask, _ = self.model.decoders3(
-            x, tgt_mask, memory, memory_mask
-        )
+        x, tgt_mask, memory, memory_mask, _ = self.model.decoders3(x, tgt_mask, memory, memory_mask)
         x = self.after_norm(x)
         x = self.output_layer(x)
 
         return x, ys_in_lens
-

--
Gitblit v1.9.1