| | |
| | | |
| | | from funasr.modules.streaming_utils import utils as myutils |
| | | from funasr.models.decoder.transformer_decoder import BaseTransformerDecoder |
| | | from typeguard import check_argument_types |
| | | |
| | | from funasr.modules.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt |
| | | from funasr.modules.embedding import PositionalEncoding |
| | |
| | | return x, tgt_mask, x_self_attn, x_src_attn |
| | | |
| | | |
| | | class ContexutalBiasDecoder(nn.Module): |
| | | class ContextualBiasDecoder(nn.Module): |
| | | def __init__( |
| | | self, |
| | | size, |
| | |
| | | normalize_before=True, |
| | | ): |
| | | """Construct a ContextualBiasDecoder object.""" |
| | | super(ContexutalBiasDecoder, self).__init__() |
| | | super(ContextualBiasDecoder, self).__init__() |
| | | self.size = size |
| | | self.src_attn = src_attn |
| | | if src_attn is not None: |
| | |
| | | |
| | | class ContextualParaformerDecoder(ParaformerSANMDecoder): |
| | | """ |
| | | author: Speech Lab, Alibaba Group, China |
| | | Author: Speech Lab of DAMO Academy, Alibaba Group |
| | | Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition |
| | | https://arxiv.org/abs/2006.01713 |
| | | """ |
| | |
| | | kernel_size: int = 21, |
| | | sanm_shfit: int = 0, |
| | | ): |
| | | assert check_argument_types() |
| | | super().__init__( |
| | | vocab_size=vocab_size, |
| | | encoder_output_size=encoder_output_size, |
| | |
| | | ), |
| | | ) |
| | | self.dropout = nn.Dropout(dropout_rate) |
| | | self.bias_decoder = ContexutalBiasDecoder( |
| | | self.bias_decoder = ContextualBiasDecoder( |
| | | size=attention_dim, |
| | | src_attn=MultiHeadedAttentionCrossAtt( |
| | | attention_heads, attention_dim, src_attention_dropout_rate |
| | |
| | | ys_in_pad: torch.Tensor, |
| | | ys_in_lens: torch.Tensor, |
| | | contextual_info: torch.Tensor, |
| | | clas_scale: float = 1.0, |
| | | return_hidden: bool = False, |
| | | ) -> Tuple[torch.Tensor, torch.Tensor]: |
| | | """Forward decoder. |
| | |
| | | cx, tgt_mask, _, _, _ = self.bias_decoder(x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask) |
| | | |
| | | if self.bias_output is not None: |
| | | x = torch.cat([x_src_attn, cx], dim=2) |
| | | x = torch.cat([x_src_attn, cx*clas_scale], dim=2) |
| | | x = self.bias_output(x.transpose(1, 2)).transpose(1, 2) # 2D -> D |
| | | x = x_self_attn + self.dropout(x) |
| | | |