#!/usr/bin/env python3
# -*- encoding: utf-8 -*-
# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
# MIT License (https://opensource.org/licenses/MIT)

import logging
import torch
import torch.nn as nn
import numpy as np
from typing import List, Tuple

from funasr.register import tables
from funasr.models.scama import utils as myutils
from funasr.models.transformer.decoder import BaseTransformerDecoder, DecoderLayer
from funasr.models.sanm.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt
from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM
from funasr.models.transformer.layer_norm import LayerNorm
from funasr.models.transformer.utils.repeat import repeat
from funasr.models.transformer.attention import MultiHeadedAttention
from funasr.models.transformer.embedding import PositionalEncoding
from funasr.models.transformer.utils.nets_utils import make_pad_mask
from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward

class DecoderLayerSANM(nn.Module):
    """Single decoder layer module.

    Args:

        self.norm2 = LayerNorm(size)
        if src_attn is not None:
            self.norm3 = LayerNorm(size)
        self.dropout = nn.Dropout(dropout_rate)
        self.normalize_before = normalize_before
        self.concat_after = concat_after
        if self.concat_after:
            self.concat_linear1 = nn.Linear(size + size, size)
            self.concat_linear2 = nn.Linear(size + size, size)
        self.reserve_attn = False
        self.attn_mat = []
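        # Descriptive note (added comment, not in the original source): when
        # reserve_attn is switched on, forward() stores the cross-attention weights
        # returned by src_attn (called with ret_attn=True) in self.attn_mat,
        # one entry per forward pass.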

    def forward(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
        """Compute decoded features.

        residual = x
        if self.normalize_before:
            x = self.norm3(x)

        if self.reserve_attn:
            x_src_attn, attn_mat = self.src_attn(x, memory, memory_mask, ret_attn=True)
            self.attn_mat.append(attn_mat)
        else:
            x_src_attn = self.src_attn(x, memory, memory_mask, ret_attn=False)
        x = residual + self.dropout(x_src_attn)
        # x = residual + self.dropout(self.src_attn(x, memory, memory_mask))

        return x, tgt_mask, memory, memory_mask, cache

    def get_attn_mat(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
        residual = tgt
        tgt = self.norm1(tgt)
        tgt = self.feed_forward(tgt)

        x = tgt
        if self.self_attn is not None:
            tgt = self.norm2(tgt)
            x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
            x = residual + x

        residual = x
        x = self.norm3(x)
        x_src_attn, attn_mat = self.src_attn(x, memory, memory_mask, ret_attn=True)
        return attn_mat
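    # Illustrative sketch (added comment, not from the original source): how a caller
    # might pull the cross-attention matrix out of a single DecoderLayerSANM. The
    # constructor arguments are assumed from the attribute names above, and the shapes
    # are assumptions based on the cross-attention call: tgt (B, L, D), memory (B, T, D),
    # returned matrix expected to be (B, heads, L, T).
    #
    #     layer = DecoderLayerSANM(size, self_attn, src_attn, feed_forward, dropout_rate)
    #     attn = layer.get_attn_mat(tgt, tgt_mask, memory, memory_mask)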

    def forward_one_step(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
        """Compute decoded features.

        return x, memory, fsmn_cache, opt_cache


@tables.register("decoder_classes", "ParaformerSANMDecoder")
class ParaformerSANMDecoder(BaseTransformerDecoder):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group

        src_attention_dropout_rate: float = 0.0,
        input_layer: str = "embed",
        use_output_layer: bool = True,
        wo_input_layer: bool = False,
        pos_enc_class=PositionalEncoding,
        normalize_before: bool = True,
        concat_after: bool = False,

        )

        attention_dim = encoder_output_size

        if wo_input_layer:
            self.embed = None
        elif input_layer == "embed":
            self.embed = torch.nn.Sequential(
                torch.nn.Embedding(vocab_size, attention_dim),
                # pos_enc_class(attention_dim, positional_dropout_rate),
            )
        elif input_layer == "linear":
            self.embed = torch.nn.Sequential(
                torch.nn.Linear(vocab_size, attention_dim),
                torch.nn.LayerNorm(attention_dim),
                torch.nn.Dropout(dropout_rate),
                torch.nn.ReLU(),
                pos_enc_class(attention_dim, positional_dropout_rate),
            )
        else:
            raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")
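        # Descriptive note (added comment, not in the original source): "embed" wraps a
        # token Embedding (positional encoding is commented out in this variant),
        # "linear" projects continuous input features, and wo_input_layer=True disables
        # the input layer (self.embed = None), presumably so the decoder consumes
        # already-embedded targets.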

        self.normalize_before = normalize_before
        if self.normalize_before:

        hlens: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
        return_hidden: bool = False,
        return_both: bool = False,
        chunk_mask: torch.Tensor = None,
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """Forward decoder.

            x, tgt_mask, memory, memory_mask
        )
        if self.normalize_before:
            hidden = self.after_norm(x)
        else:
            hidden = x

        olens = tgt_mask.sum(1)
        # Depending on the flags, return logits, hidden states, or both.
        if self.output_layer is not None and return_hidden is False:
            x = self.output_layer(hidden)
            return x, olens
        if return_both:
            x = self.output_layer(hidden)
            return x, hidden, olens
        return hidden, olens

    def score(self, ys, state, x):
        """Score."""

            ys.unsqueeze(0), ys_mask, x.unsqueeze(0), cache=state
        )
        return logp.squeeze(0), state

    def forward_asf2(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
    ):

        tgt = ys_in_pad
        tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None]

        memory = hs_pad
        memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]

        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[0](tgt, tgt_mask, memory, memory_mask)
        attn_mat = self.decoders[1].get_attn_mat(tgt, tgt_mask, memory, memory_mask)
        return attn_mat
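    # Shape note (added comment, not from the original source): ys_in_pad is fed to the
    # decoder layers without going through self.embed, so it appears to already be an
    # embedded target sequence (B, L, D); hs_pad is the encoder output (B, T, D).
    # sequence_mask yields tgt_mask of shape (B, L, 1) and memory_mask of shape (B, 1, T).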

    def forward_asf6(
        self,
        hs_pad: torch.Tensor,
        hlens: torch.Tensor,
        ys_in_pad: torch.Tensor,
        ys_in_lens: torch.Tensor,
    ):

        tgt = ys_in_pad
        tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None]

        memory = hs_pad
        memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]

        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[0](tgt, tgt_mask, memory, memory_mask)
        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[1](tgt, tgt_mask, memory, memory_mask)
        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[2](tgt, tgt_mask, memory, memory_mask)
        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[3](tgt, tgt_mask, memory, memory_mask)
        tgt, tgt_mask, memory, memory_mask, _ = self.decoders[4](tgt, tgt_mask, memory, memory_mask)
        attn_mat = self.decoders[5].get_attn_mat(tgt, tgt_mask, memory, memory_mask)
        return attn_mat
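    # Illustrative sketch (added comment, not from the original source): forward_asf2 and
    # forward_asf6 look like attention probes that run the first one (respectively five)
    # decoder layers and return the cross-attention matrix of the next layer, e.g. for a
    # 2-layer or 6-layer decoder. A call might look like:
    #
    #     attn = decoder.forward_asf6(hs_pad, hlens, ys_in_pad, ys_in_lens)
    #
    # The intended downstream use (e.g. timestamp/alignment analysis) is an assumption,
    # not something stated in this file.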

    def forward_chunk(
        self,

        return y, new_cache


@tables.register("decoder_classes", "ParaformerSANDecoder")
class ParaformerSANDecoder(BaseTransformerDecoder):
    """
    Author: Speech Lab of DAMO Academy, Alibaba Group
    Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition