From 1367973f9818d8e15c7bf52ad6ffba4ddb6ac2b2 Mon Sep 17 00:00:00 2001
From: Rin Arakaki <rnarkkx@gmail.com>
Date: Tue, 24 Dec 2024 17:51:31 +0800
Subject: [PATCH] shfit to shift (#2266)
---
funasr/models/sanm/encoder.py | 24 ++++++++++++++++--------
 1 file changed, 16 insertions(+), 8 deletions(-)
diff --git a/funasr/models/sanm/encoder.py b/funasr/models/sanm/encoder.py
index dc30a94..b590e24 100644
--- a/funasr/models/sanm/encoder.py
+++ b/funasr/models/sanm/encoder.py
@@ -69,7 +69,7 @@
self.stochastic_depth_rate = stochastic_depth_rate
self.dropout_rate = dropout_rate
- def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
+ def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
"""Compute encoded features.
Args:
@@ -106,7 +106,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
),
),
@@ -122,7 +122,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -131,7 +131,7 @@
self.self_attn(
x,
mask,
- mask_shfit_chunk=mask_shfit_chunk,
+ mask_shift_chunk=mask_shift_chunk,
mask_att_chunk_encoder=mask_att_chunk_encoder,
)
)
@@ -145,7 +145,7 @@
if not self.normalize_before:
x = self.norm2(x)
- return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
+ return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
"""Compute encoded features.
@@ -212,7 +212,7 @@
interctc_layer_idx: List[int] = [],
interctc_use_conditioning: bool = False,
kernel_size: int = 11,
- sanm_shfit: int = 0,
+ sanm_shift: int = 0,
lora_list: List[str] = None,
lora_rank: int = 8,
lora_alpha: int = 16,
@@ -299,7 +299,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
lora_list,
lora_rank,
lora_alpha,
@@ -312,7 +312,7 @@
output_size,
attention_dropout_rate,
kernel_size,
- sanm_shfit,
+ sanm_shift,
lora_list,
lora_rank,
lora_alpha,
@@ -523,6 +523,7 @@
feats_dim=560,
model_name="encoder",
onnx: bool = True,
+ ctc_linear: nn.Module = None,
):
super().__init__()
self.embed = model.embed
@@ -553,6 +554,8 @@
self.num_heads = model.encoders[0].self_attn.h
self.hidden_size = model.encoders[0].self_attn.linear_out.out_features
+ self.ctc_linear = ctc_linear
+
def prepare_mask(self, mask):
mask_3d_btd = mask[:, :, None]
if len(mask.shape) == 2:
@@ -566,6 +569,7 @@
def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False):
if not online:
speech = speech * self._output_size**0.5
+
mask = self.make_pad_mask(speech_lengths)
mask = self.prepare_mask(mask)
if self.embed is None:
@@ -581,6 +585,10 @@
xs_pad = self.model.after_norm(xs_pad)
+ if self.ctc_linear is not None:
+ xs_pad = self.ctc_linear(xs_pad)
+ xs_pad = F.softmax(xs_pad, dim=2)
+
return xs_pad, speech_lengths
def get_output_size(self):
--
Gitblit v1.9.1