# python/FunASR-XL.git

"""Branchformer block for Transducer encoder."""
 
from typing import Dict, Optional, Tuple
 
import torch
 
 
class Branchformer(torch.nn.Module):
    """Branchformer module definition.

    Reference: https://arxiv.org/pdf/2207.02971.pdf

    The block feeds the same input to two parallel branches -- a
    self-attention branch and a channel-projection/convolution branch --
    concatenates both outputs, projects them back to ``block_size``, adds the
    residual input and applies a final normalization.

    Args:
        block_size: Input/output size.
        linear_size: Linear layers' hidden size. The second channel projection
            consumes ``linear_size // 2`` features, so the convolution module
            must return that many.
        self_att: Self-attention module instance.
        conv_mod: Convolution module instance. Must expose ``kernel_size``
            when streaming caches are used.
        norm_class: Normalization class, instantiated as
            ``norm_class(block_size, **norm_args)``.
        norm_args: Normalization module arguments. ``None`` (default) means
            no extra arguments.
        dropout_rate: Dropout rate.

    """

    def __init__(
        self,
        block_size: int,
        linear_size: int,
        self_att: torch.nn.Module,
        conv_mod: torch.nn.Module,
        norm_class: torch.nn.Module = torch.nn.LayerNorm,
        norm_args: Optional[Dict] = None,
        dropout_rate: float = 0.0,
    ) -> None:
        """Construct a Branchformer object."""
        super().__init__()

        # A ``{}`` default would be one dict shared by every instance; build a
        # fresh one per call instead.
        norm_args = {} if norm_args is None else norm_args

        self.self_att = self_att
        self.conv_mod = conv_mod

        self.channel_proj1 = torch.nn.Sequential(
            torch.nn.Linear(block_size, linear_size), torch.nn.GELU()
        )
        # Input is half of ``linear_size``: the convolution module sits
        # between the two projections and halves the feature dimension.
        self.channel_proj2 = torch.nn.Linear(linear_size // 2, block_size)

        # Merges the concatenated attention and convolution branch outputs.
        self.merge_proj = torch.nn.Linear(block_size + block_size, block_size)

        self.norm_self_att = norm_class(block_size, **norm_args)
        self.norm_mlp = norm_class(block_size, **norm_args)
        self.norm_final = norm_class(block_size, **norm_args)

        self.dropout = torch.nn.Dropout(dropout_rate)

        self.block_size = block_size
        self.linear_size = linear_size

        # Streaming state: ``[attention_cache, convolution_cache]`` once
        # ``reset_streaming_cache`` has been called, ``None`` before that.
        self.cache = None

    def reset_streaming_cache(self, left_context: int, device: torch.device) -> None:
        """Initialize/Reset self-attention and convolution modules cache for streaming.

        Args:
            left_context: Number of left frames during chunk-by-chunk inference.
            device: Device to use for cache tensor.

        """
        self.cache = [
            # Attention cache: (1, left_context, D_block).
            torch.zeros(
                (1, left_context, self.block_size),
                device=device,
            ),
            # Convolution cache: (1, linear_size // 2, kernel_size - 1).
            torch.zeros(
                (
                    1,
                    self.linear_size // 2,
                    self.conv_mod.kernel_size - 1,
                ),
                device=device,
            ),
        ]

    def forward(
        self,
        x: torch.Tensor,
        pos_enc: torch.Tensor,
        mask: torch.Tensor,
        chunk_mask: Optional[torch.Tensor] = None,
    ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]:
        """Encode input sequences.

        Args:
            x: Branchformer input sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
            mask: Source mask. (B, T)
            chunk_mask: Chunk mask. (T_2, T_2)

        Returns:
            x: Branchformer output sequences. (B, T, D_block)
            mask: Source mask. (B, T)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)

        """
        # Attention branch.
        x1 = self.norm_self_att(x)
        x1 = self.dropout(
            self.self_att(x1, x1, x1, pos_enc, mask=mask, chunk_mask=chunk_mask)
        )

        # Channel-projection / convolution branch.
        x2 = self.norm_mlp(x)
        x2 = self.channel_proj1(x2)
        x2, _ = self.conv_mod(x2)
        x2 = self.channel_proj2(x2)
        x2 = self.dropout(x2)

        # Merge both branches and add the residual input.
        x = x + self.dropout(self.merge_proj(torch.cat([x1, x2], dim=-1)))

        x = self.norm_final(x)

        return x, mask, pos_enc

    def chunk_forward(
        self,
        x: torch.Tensor,
        pos_enc: torch.Tensor,
        mask: torch.Tensor,
        left_context: int = 0,
        right_context: int = 0,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Encode chunk of input sequence.

        Reads the streaming cache and replaces it with the state to be used
        for the next chunk. No dropout is applied on this path.

        Args:
            x: Branchformer input sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)
            mask: Source mask. (B, T_2)
            left_context: Number of frames in left context.
            right_context: Number of frames in right context.

        Returns:
            x: Branchformer output sequences. (B, T, D_block)
            pos_enc: Positional embedding sequences. (B, 2 * (T - 1), D_block)

        Raises:
            RuntimeError: If ``reset_streaming_cache`` was not called first.

        """
        if self.cache is None:
            raise RuntimeError(
                "Streaming cache is not initialized: call "
                "reset_streaming_cache() before chunk_forward()."
            )

        x1 = self.norm_self_att(x)

        # Keys/values are the cached left-context frames followed by the
        # current chunk.
        if left_context > 0:
            key = torch.cat([self.cache[0], x1], dim=1)
        else:
            key = x1
        val = key

        # Frames to keep as left context for the next chunk (right-context
        # frames are look-ahead and are excluded).
        if left_context <= 0:
            # ``key[:, -0:, :]`` would select the whole tensor, so take an
            # explicit empty slice. This matches the (1, 0, D_block) cache
            # that ``reset_streaming_cache`` builds for ``left_context == 0``.
            att_cache = key[:, :0, :]
        elif right_context > 0:
            att_cache = key[:, -(left_context + right_context) : -right_context, :]
        else:
            att_cache = key[:, -left_context:, :]

        x1 = self.self_att(x1, key, val, pos_enc, mask=mask, left_context=left_context)

        x2 = self.norm_mlp(x)
        x2 = self.channel_proj1(x2)

        x2, conv_cache = self.conv_mod(
            x2, cache=self.cache[1], right_context=right_context
        )

        x2 = self.channel_proj2(x2)

        x = x + self.merge_proj(torch.cat([x1, x2], dim=-1))

        x = self.norm_final(x)
        self.cache = [att_cache, conv_cache]

        return x, pos_enc