kongdeqiang
2026-03-13 28ccfbfc51068a663a80764e14074df5edf2b5ba
funasr/models/conformer/encoder.py
@@ -49,6 +49,7 @@
from funasr.register import tables
import pdb
class ConvolutionModule(nn.Module):
    """ConvolutionModule in Conformer model.
@@ -146,16 +147,16 @@
    """
    def __init__(
            self,
            size,
            self_attn,
            feed_forward,
            feed_forward_macaron,
            conv_module,
            dropout_rate,
            normalize_before=True,
            concat_after=False,
            stochastic_depth_rate=0.0,
        self,
        size,
        self_attn,
        feed_forward,
        feed_forward_macaron,
        conv_module,
        dropout_rate,
        normalize_before=True,
        concat_after=False,
        stochastic_depth_rate=0.0,
    ):
        """Construct an EncoderLayer object."""
        super(EncoderLayer, self).__init__()
@@ -266,9 +267,7 @@
        residual = x
        if self.normalize_before:
            x = self.norm_ff(x)
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(
            self.feed_forward(x)
        )
        x = residual + stoch_layer_coeff * self.ff_scale * self.dropout(self.feed_forward(x))
        if not self.normalize_before:
            x = self.norm_ff(x)
@@ -321,32 +320,32 @@
    """
    def __init__(
            self,
            input_size: int,
            output_size: int = 256,
            attention_heads: int = 4,
            linear_units: int = 2048,
            num_blocks: int = 6,
            dropout_rate: float = 0.1,
            positional_dropout_rate: float = 0.1,
            attention_dropout_rate: float = 0.0,
            input_layer: str = "conv2d",
            normalize_before: bool = True,
            concat_after: bool = False,
            positionwise_layer_type: str = "linear",
            positionwise_conv_kernel_size: int = 3,
            macaron_style: bool = False,
            rel_pos_type: str = "legacy",
            pos_enc_layer_type: str = "rel_pos",
            selfattention_layer_type: str = "rel_selfattn",
            activation_type: str = "swish",
            use_cnn_module: bool = True,
            zero_triu: bool = False,
            cnn_module_kernel: int = 31,
            padding_idx: int = -1,
            interctc_layer_idx: List[int] = [],
            interctc_use_conditioning: bool = False,
            stochastic_depth_rate: Union[float, List[float]] = 0.0,
        self,
        input_size: int,
        output_size: int = 256,
        attention_heads: int = 4,
        linear_units: int = 2048,
        num_blocks: int = 6,
        dropout_rate: float = 0.1,
        positional_dropout_rate: float = 0.1,
        attention_dropout_rate: float = 0.0,
        input_layer: str = "conv2d",
        normalize_before: bool = True,
        concat_after: bool = False,
        positionwise_layer_type: str = "linear",
        positionwise_conv_kernel_size: int = 3,
        macaron_style: bool = False,
        rel_pos_type: str = "legacy",
        pos_enc_layer_type: str = "rel_pos",
        selfattention_layer_type: str = "rel_selfattn",
        activation_type: str = "swish",
        use_cnn_module: bool = True,
        zero_triu: bool = False,
        cnn_module_kernel: int = 31,
        padding_idx: int = -1,
        interctc_layer_idx: List[int] = [],
        interctc_use_conditioning: bool = False,
        stochastic_depth_rate: Union[float, List[float]] = 0.0,
    ):
        super().__init__()
        self._output_size = output_size
@@ -373,9 +372,7 @@
        elif pos_enc_layer_type == "legacy_rel_pos":
            assert selfattention_layer_type == "legacy_rel_selfattn"
            pos_enc_class = LegacyRelPositionalEncoding
            logging.warning(
                "Using legacy_rel_pos and it will be deprecated in the future."
            )
            logging.warning("Using legacy_rel_pos and it will be deprecated in the future.")
        else:
            raise ValueError("unknown pos_enc_layer: " + pos_enc_layer_type)
@@ -432,9 +429,7 @@
                pos_enc_class(output_size, positional_dropout_rate),
            )
        elif input_layer is None:
            self.embed = torch.nn.Sequential(
                pos_enc_class(output_size, positional_dropout_rate)
            )
            self.embed = torch.nn.Sequential(pos_enc_class(output_size, positional_dropout_rate))
        else:
            raise ValueError("unknown input_layer: " + input_layer)
        self.normalize_before = normalize_before
@@ -480,9 +475,7 @@
                output_size,
                attention_dropout_rate,
            )
            logging.warning(
                "Using legacy_rel_selfattn and it will be deprecated in the future."
            )
            logging.warning("Using legacy_rel_selfattn and it will be deprecated in the future.")
        elif selfattention_layer_type == "rel_selfattn":
            assert pos_enc_layer_type == "rel_pos"
            encoder_selfattn_layer = RelPositionMultiHeadedAttention
@@ -534,11 +527,11 @@
        return self._output_size
    def forward(
            self,
            xs_pad: torch.Tensor,
            ilens: torch.Tensor,
            prev_states: torch.Tensor = None,
            ctc: CTC = None,
        self,
        xs_pad: torch.Tensor,
        ilens: torch.Tensor,
        prev_states: torch.Tensor = None,
        ctc: CTC = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, Optional[torch.Tensor]]:
        """Calculate forward propagation.
@@ -556,11 +549,11 @@
        masks = (~make_pad_mask(ilens)[:, None, :]).to(xs_pad.device)
        if (
                isinstance(self.embed, Conv2dSubsampling)
                or isinstance(self.embed, Conv2dSubsampling2)
                or isinstance(self.embed, Conv2dSubsampling6)
                or isinstance(self.embed, Conv2dSubsampling8)
                or isinstance(self.embed, Conv2dSubsamplingPad)
            isinstance(self.embed, Conv2dSubsampling)
            or isinstance(self.embed, Conv2dSubsampling2)
            or isinstance(self.embed, Conv2dSubsampling6)
            or isinstance(self.embed, Conv2dSubsampling8)
            or isinstance(self.embed, Conv2dSubsamplingPad)
        ):
            short_status, limit_size = check_short_utt(self.embed, xs_pad.size(1))
            if short_status:
@@ -573,7 +566,7 @@
            xs_pad, masks = self.embed(xs_pad, masks)
        else:
            xs_pad = self.embed(xs_pad)
        pdb.set_trace()
        intermediate_outs = []
        if len(self.interctc_layer_idx) == 0:
            xs_pad, masks = self.encoders(xs_pad, masks)
@@ -601,17 +594,17 @@
                            xs_pad = (x, pos_emb)
                        else:
                            xs_pad = xs_pad + self.conditioning_layer(ctc_out)
        pdb.set_trace()
        if isinstance(xs_pad, tuple):
            xs_pad = xs_pad[0]
        if self.normalize_before:
            xs_pad = self.after_norm(xs_pad)
        pdb.set_trace()
        olens = masks.squeeze(1).sum(1)
        if len(intermediate_outs) > 0:
            return (xs_pad, intermediate_outs), olens, None
        return xs_pad, olens, None
class CausalConvolution(torch.nn.Module):
    """ConformerConvolution module definition.
@@ -708,6 +701,7 @@
        return x, cache
class ChunkEncoderLayer(torch.nn.Module):
    """Chunk Conformer module definition.
    Args:
@@ -797,9 +791,7 @@
        residual = x
        x = self.norm_macaron(x)
        x = residual + self.feed_forward_scale * self.dropout(
            self.feed_forward_macaron(x)
        )
        x = residual + self.feed_forward_scale * self.dropout(self.feed_forward_macaron(x))
        residual = x
        x = self.norm_self_att(x)
@@ -876,9 +868,7 @@
        residual = x
        x = self.norm_conv(x)
        x, conv_cache = self.conv_mod(
            x, cache=self.cache[1], right_context=right_context
        )
        x, conv_cache = self.conv_mod(x, cache=self.cache[1], right_context=right_context)
        x = residual + x
        residual = x
@@ -889,6 +879,7 @@
        self.cache = [att_cache, conv_cache]
        return x, pos_enc
@tables.register("encoder_classes", "ChunkConformerEncoder")
class ConformerChunkEncoder(torch.nn.Module):
@@ -940,7 +931,6 @@
        """Construct an Encoder object."""
        super().__init__()
        self.embed = StreamingConvInput(
            input_size=input_size,
            conv_size=output_size,
@@ -954,9 +944,7 @@
            positional_dropout_rate,
        )
        activation = get_activation(
            activation_type
       )
        activation = get_activation(activation_type)
        pos_wise_args = (
            output_size,
@@ -985,7 +973,6 @@
            simplified_att_score,
        )
        fn_modules = []
        for _ in range(num_blocks):
            module = lambda: ChunkEncoderLayer(
@@ -996,7 +983,7 @@
                CausalConvolution(*conv_mod_args),
                dropout_rate=dropout_rate,
            )
            fn_modules.append(module)
            fn_modules.append(module)
        self.encoders = MultiBlocks(
            [fn() for fn in fn_modules],
@@ -1040,7 +1027,6 @@
        """
        return self.embed.get_size_before_subsampling(size)
    def reset_streaming_cache(self, left_context: int, device: torch.device) -> None:
        """Initialize/Reset encoder streaming cache.
        Args:
@@ -1062,9 +1048,7 @@
           x: Encoder outputs. (B, T_out, D_enc)
           x_len: Encoder outputs lenghts. (B,)
        """
        short_status, limit_size = check_short_utt(
            self.embed.subsampling_factor, x.size(1)
        )
        short_status, limit_size = check_short_utt(self.embed.subsampling_factor, x.size(1))
        if short_status:
            raise TooShortUttError(
@@ -1078,7 +1062,10 @@
        if self.unified_model_training:
            if self.training:
                chunk_size = self.default_chunk_size + torch.randint(-self.jitter_range, self.jitter_range+1, (1,)).item()
                chunk_size = (
                    self.default_chunk_size
                    + torch.randint(-self.jitter_range, self.jitter_range + 1, (1,)).item()
                )
            else:
                chunk_size = self.default_chunk_size
            x, mask = self.embed(x, mask, chunk_size)
@@ -1104,9 +1091,9 @@
            olens = mask.eq(0).sum(1)
            if self.time_reduction_factor > 1:
                x_utt = x_utt[:,::self.time_reduction_factor,:]
                x_chunk = x_chunk[:,::self.time_reduction_factor,:]
                olens = torch.floor_divide(olens-1, self.time_reduction_factor) + 1
                x_utt = x_utt[:, :: self.time_reduction_factor, :]
                x_chunk = x_chunk[:, :: self.time_reduction_factor, :]
                olens = torch.floor_divide(olens - 1, self.time_reduction_factor) + 1
            return x_utt, x_chunk, olens
@@ -1144,8 +1131,8 @@
        olens = mask.eq(0).sum(1)
        if self.time_reduction_factor > 1:
            x = x[:,::self.time_reduction_factor,:]
            olens = torch.floor_divide(olens-1, self.time_reduction_factor) + 1
            x = x[:, :: self.time_reduction_factor, :]
            olens = torch.floor_divide(olens - 1, self.time_reduction_factor) + 1
        return x, olens, None
@@ -1162,9 +1149,7 @@
           x: Encoder outputs. (B, T_out, D_enc)
           x_len: Encoder outputs lenghts. (B,)
        """
        short_status, limit_size = check_short_utt(
            self.embed.subsampling_factor, x.size(1)
        )
        short_status, limit_size = check_short_utt(self.embed.subsampling_factor, x.size(1))
        if short_status:
            raise TooShortUttError(
@@ -1185,7 +1170,7 @@
        )
        if self.time_reduction_factor > 1:
            x_utt = x_utt[:,::self.time_reduction_factor,:]
            x_utt = x_utt[:, :: self.time_reduction_factor, :]
        return x_utt
    def simu_chunk_forward(
@@ -1196,9 +1181,7 @@
        left_context: int = 32,
        right_context: int = 0,
    ) -> torch.Tensor:
        short_status, limit_size = check_short_utt(
            self.embed.subsampling_factor, x.size(1)
        )
        short_status, limit_size = check_short_utt(self.embed.subsampling_factor, x.size(1))
        if short_status:
            raise TooShortUttError(
@@ -1227,7 +1210,7 @@
        )
        olens = mask.eq(0).sum(1)
        if self.time_reduction_factor > 1:
            x = x[:,::self.time_reduction_factor,:]
            x = x[:, :: self.time_reduction_factor, :]
        return x
@@ -1255,9 +1238,7 @@
        if left_context > 0:
            processed_mask = (
                torch.arange(left_context, device=x.device)
                .view(1, left_context)
                .flip(1)
                torch.arange(left_context, device=x.device).view(1, left_context).flip(1)
            )
            processed_mask = processed_mask >= processed_frames
            mask = torch.cat([processed_mask, mask], dim=1)
@@ -1275,5 +1256,5 @@
            x = x[:, 0:-right_context, :]
        if self.time_reduction_factor > 1:
            x = x[:,::self.time_reduction_factor,:]
            x = x[:, :: self.time_reduction_factor, :]
        return x