Rin Arakaki
2024-12-24 1367973f9818d8e15c7bf52ad6ffba4ddb6ac2b2
funasr/models/sanm/encoder.py
@@ -69,7 +69,7 @@
        self.stochastic_depth_rate = stochastic_depth_rate
        self.dropout_rate = dropout_rate
    def forward(self, x, mask, cache=None, mask_shfit_chunk=None, mask_att_chunk_encoder=None):
    def forward(self, x, mask, cache=None, mask_shift_chunk=None, mask_att_chunk_encoder=None):
        """Compute encoded features.
        Args:
@@ -106,7 +106,7 @@
                    self.self_attn(
                        x,
                        mask,
                        mask_shfit_chunk=mask_shfit_chunk,
                        mask_shift_chunk=mask_shift_chunk,
                        mask_att_chunk_encoder=mask_att_chunk_encoder,
                    ),
                ),
@@ -122,7 +122,7 @@
                    self.self_attn(
                        x,
                        mask,
                        mask_shfit_chunk=mask_shfit_chunk,
                        mask_shift_chunk=mask_shift_chunk,
                        mask_att_chunk_encoder=mask_att_chunk_encoder,
                    )
                )
@@ -131,7 +131,7 @@
                    self.self_attn(
                        x,
                        mask,
                        mask_shfit_chunk=mask_shfit_chunk,
                        mask_shift_chunk=mask_shift_chunk,
                        mask_att_chunk_encoder=mask_att_chunk_encoder,
                    )
                )
@@ -145,7 +145,7 @@
        if not self.normalize_before:
            x = self.norm2(x)
        return x, mask, cache, mask_shfit_chunk, mask_att_chunk_encoder
        return x, mask, cache, mask_shift_chunk, mask_att_chunk_encoder
    def forward_chunk(self, x, cache=None, chunk_size=None, look_back=0):
        """Compute encoded features.
@@ -212,7 +212,7 @@
        interctc_layer_idx: List[int] = [],
        interctc_use_conditioning: bool = False,
        kernel_size: int = 11,
        sanm_shfit: int = 0,
        sanm_shift: int = 0,
        lora_list: List[str] = None,
        lora_rank: int = 8,
        lora_alpha: int = 16,
@@ -299,7 +299,7 @@
                output_size,
                attention_dropout_rate,
                kernel_size,
                sanm_shfit,
                sanm_shift,
                lora_list,
                lora_rank,
                lora_alpha,
@@ -312,7 +312,7 @@
                output_size,
                attention_dropout_rate,
                kernel_size,
                sanm_shfit,
                sanm_shift,
                lora_list,
                lora_rank,
                lora_alpha,
@@ -523,6 +523,7 @@
        feats_dim=560,
        model_name="encoder",
        onnx: bool = True,
        ctc_linear: nn.Module = None,
    ):
        super().__init__()
        self.embed = model.embed
@@ -553,6 +554,8 @@
        self.num_heads = model.encoders[0].self_attn.h
        self.hidden_size = model.encoders[0].self_attn.linear_out.out_features
        self.ctc_linear = ctc_linear
    def prepare_mask(self, mask):
        mask_3d_btd = mask[:, :, None]
        if len(mask.shape) == 2:
@@ -566,6 +569,7 @@
    def forward(self, speech: torch.Tensor, speech_lengths: torch.Tensor, online: bool = False):
        if not online:
            speech = speech * self._output_size**0.5
        mask = self.make_pad_mask(speech_lengths)
        mask = self.prepare_mask(mask)
        if self.embed is None:
@@ -581,6 +585,10 @@
        xs_pad = self.model.after_norm(xs_pad)
        if self.ctc_linear is not None:
            xs_pad = self.ctc_linear(xs_pad)
            xs_pad = F.softmax(xs_pad, dim=2)
        return xs_pad, speech_lengths
    def get_output_size(self):