From 1367973f9818d8e15c7bf52ad6ffba4ddb6ac2b2 Mon Sep 17 00:00:00 2001
From: Rin Arakaki <rnarkkx@gmail.com>
Date: Tue, 24 Dec 2024 17:51:31 +0800
Subject: [PATCH] shfit to shift (#2266)

---
 funasr/models/sanm/encoder.py |   24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py
index dc30a94..b590e24 100644
--- a/funasr/models/sanm/encoder.py
+++ b/funasr/models/sanm/encoder.py
@@ -69,7 +69,7 @@
         self.stochastic_depth_rate = stochastic_depth_rate
         self.dropout_rate = dropout_rate
 
-    def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+    def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
         """Compute encoded features.
 
         Args:
@@ -106,7 +106,7 @@
                     self.self_attn(
                         x,
                         mask,
-                        mask_shfit_chunk=mask_shfit_chunk,
+                        mask_shift_chunk=mask_shift_chunk,
                         mask_att_chunk_encoder=mask_att_chunk_encoder,
                     ),
                 ),
@@ -122,7 +122,7 @@
                     self.self_attn(
                         x,
                         mask,
-                        mask_shfit_chunk=mask_shfit_chunk,
+                        mask_shift_chunk=mask_shift_chunk,
                         mask_att_chunk_encoder=mask_att_chunk_encoder,
                     )
                 )
@@ -131,7 +131,7 @@
                     self.self_attn(
                         x,
                         mask,
-                        mask_shfit_chunk=mask_shfit_chunk,
+                        mask_shift_chunk=mask_shift_chunk,
                         mask_att_chunk_encoder=mask_att_chunk_encoder,
                     )
                 )
@@ -145,7 +145,7 @@
         if not self.normalize_before:
             x = self.norm2(x)
 
-        return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+        return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
 
     def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
         """Compute encoded features.
@@ -212,7 +212,7 @@
         interctc_layer_idx: List[int] = [],
         interctc_use_conditioning: bool = False,
         kernel_size: int = 11,
-        sanm_shfit: int = 0,
+        sanm_shift: int = 0,
         lora_list: List[str] = None,
         lora_rank: int = 8,
         lora_alpha: int = 16,
@@ -299,7 +299,7 @@
                 output_size,
                 attention_dropout_rate,
                 kernel_size,
-                sanm_shfit,
+                sanm_shift,
                 lora_list,
                 lora_rank,
                 lora_alpha,
@@ -312,7 +312,7 @@
                 output_size,
                 attention_dropout_rate,
                 kernel_size,
-                sanm_shfit,
+                sanm_shift,
                 lora_list,
                 lora_rank,
                 lora_alpha,
@@ -523,6 +523,7 @@
         feats_dim=560,
         model_name="encoder",
         onnx: bool = True,
+        ctc_linear: nn.Module = None,
     ):
         super().__init__()
         self.embed = model.embed
@@ -553,6 +554,8 @@
         self.num_heads = model.encoders[0].self_attn.h
         self.hidden_size = model.encoders[0].self_attn.linear_out.out_features
 
+        self.ctc_linear = ctc_linear
+
     def prepare_mask(self, mask):
         mask_3d_btd = mask[:, :, None]
         if len(mask.shape) == 2:
@@ -566,6 +569,7 @@
     def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False):
         if not online:
             speech = speech * self._output_size**0.5
+
         mask = self.make_pad_mask(speech_lengths)
         mask = self.prepare_mask(mask)
         if self.embed is None:
@@ -581,6 +585,10 @@
 
         xs_pad = self.model.after_norm(xs_pad)
 
+        if self.ctc_linear is not None:
+            xs_pad = self.ctc_linear(xs_pad)
+            xs_pad = F.softmax(xs_pad, dim=2)
+
         return xs_pad, speech_lengths
 
     def get_output_size(self):

--
Gitblit v1.9.1