From 94de39dde2e616a01683c518023d0fab72b4e103 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Mon, 19 Feb 2024 22:21:50 +0800
Subject: [PATCH] aishell example
---
funasr/models/paraformer/decoder.py | 97 ++++++++++++++++++++++++++++--------------------
1 file changed, 57 insertions(+), 40 deletions(-)
diff --git a/funasr/models/paraformer/decoder.py b/funasr/models/paraformer/decoder.py
index 3fe9d19..68018a0 100644
--- a/funasr/models/paraformer/decoder.py
+++ b/funasr/models/paraformer/decoder.py
@@ -1,25 +1,26 @@
-from typing import List
-from typing import Tuple
-import logging
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
import torch
-import torch.nn as nn
-import numpy as np
+from typing import List, Tuple
+from funasr.register import tables
from funasr.models.scama import utils as myutils
-from funasr.models.transformer.decoder import BaseTransformerDecoder
-
-from funasr.models.sanm.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt
-from funasr.models.transformer.layer_norm import LayerNorm
-from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM
from funasr.models.transformer.utils.repeat import repeat
from funasr.models.transformer.decoder import DecoderLayer
-from funasr.models.transformer.attention import MultiHeadedAttention
+from funasr.models.transformer.layer_norm import LayerNorm
from funasr.models.transformer.embedding import PositionalEncoding
+from funasr.models.transformer.attention import MultiHeadedAttention
from funasr.models.transformer.utils.nets_utils import make_pad_mask
+from funasr.models.transformer.decoder import BaseTransformerDecoder
from funasr.models.transformer.positionwise_feed_forward import PositionwiseFeedForward
-from funasr.utils.register import register_class, registry_tables
+from funasr.models.sanm.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM
+from funasr.models.sanm.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt
-class DecoderLayerSANM(nn.Module):
+
+class DecoderLayerSANM(torch.nn.Module):
"""Single decoder layer module.
Args:
@@ -62,12 +63,14 @@
self.norm2 = LayerNorm(size)
if src_attn is not None:
self.norm3 = LayerNorm(size)
- self.dropout = nn.Dropout(dropout_rate)
+ self.dropout = torch.nn.Dropout(dropout_rate)
self.normalize_before = normalize_before
self.concat_after = concat_after
if self.concat_after:
- self.concat_linear1 = nn.Linear(size + size, size)
- self.concat_linear2 = nn.Linear(size + size, size)
+ self.concat_linear1 = torch.nn.Linear(size + size, size)
+ self.concat_linear2 = torch.nn.Linear(size + size, size)
+ self.reserve_attn=False
+ self.attn_mat = []
def forward(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
"""Compute decoded features.
@@ -104,8 +107,13 @@
residual = x
if self.normalize_before:
x = self.norm3(x)
-
- x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
+ if self.reserve_attn:
+ x_src_attn, attn_mat = self.src_attn(x, memory, memory_mask, ret_attn=True)
+ self.attn_mat.append(attn_mat)
+ else:
+ x_src_attn = self.src_attn(x, memory, memory_mask, ret_attn=False)
+ x = residual + self.dropout(x_src_attn)
+ # x = residual + self.dropout(self.src_attn(x, memory, memory_mask))
return x, tgt_mask, memory, memory_mask, cache
@@ -193,7 +201,7 @@
return x, memory, fsmn_cache, opt_cache
-@register_class("decoder_classes", "ParaformerSANMDecoder")
+@tables.register("decoder_classes", "ParaformerSANMDecoder")
class ParaformerSANMDecoder(BaseTransformerDecoder):
"""
Author: Speech Lab of DAMO Academy, Alibaba Group
@@ -213,6 +221,7 @@
src_attention_dropout_rate: float = 0.0,
input_layer: str = "embed",
use_output_layer: bool = True,
+ wo_input_layer: bool = False,
pos_enc_class=PositionalEncoding,
normalize_before: bool = True,
concat_after: bool = False,
@@ -239,22 +248,24 @@
)
attention_dim = encoder_output_size
-
- if input_layer == "embed":
- self.embed = torch.nn.Sequential(
- torch.nn.Embedding(vocab_size, attention_dim),
- # pos_enc_class(attention_dim, positional_dropout_rate),
- )
- elif input_layer == "linear":
- self.embed = torch.nn.Sequential(
- torch.nn.Linear(vocab_size, attention_dim),
- torch.nn.LayerNorm(attention_dim),
- torch.nn.Dropout(dropout_rate),
- torch.nn.ReLU(),
- pos_enc_class(attention_dim, positional_dropout_rate),
- )
+ if wo_input_layer:
+ self.embed = None
else:
- raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")
+ if input_layer == "embed":
+ self.embed = torch.nn.Sequential(
+ torch.nn.Embedding(vocab_size, attention_dim),
+ # pos_enc_class(attention_dim, positional_dropout_rate),
+ )
+ elif input_layer == "linear":
+ self.embed = torch.nn.Sequential(
+ torch.nn.Linear(vocab_size, attention_dim),
+ torch.nn.LayerNorm(attention_dim),
+ torch.nn.Dropout(dropout_rate),
+ torch.nn.ReLU(),
+ pos_enc_class(attention_dim, positional_dropout_rate),
+ )
+ else:
+ raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")
self.normalize_before = normalize_before
if self.normalize_before:
@@ -324,6 +335,8 @@
hlens: torch.Tensor,
ys_in_pad: torch.Tensor,
ys_in_lens: torch.Tensor,
+ return_hidden: bool = False,
+ return_both: bool= False,
chunk_mask: torch.Tensor = None,
) -> Tuple[torch.Tensor, torch.Tensor]:
"""Forward decoder.
@@ -365,12 +378,16 @@
x, tgt_mask, memory, memory_mask
)
if self.normalize_before:
- x = self.after_norm(x)
- if self.output_layer is not None:
- x = self.output_layer(x)
+ hidden = self.after_norm(x)
olens = tgt_mask.sum(1)
- return x, olens
+ if self.output_layer is not None and return_hidden is False:
+ x = self.output_layer(hidden)
+ return x, olens
+ if return_both:
+ x = self.output_layer(hidden)
+ return x, hidden, olens
+ return hidden, olens
def score(self, ys, state, x):
"""Score."""
@@ -509,8 +526,8 @@
return y, new_cache
-@register_class("decoder_classes", "ParaformerDecoderSAN")
-class ParaformerDecoderSAN(BaseTransformerDecoder):
+@tables.register("decoder_classes", "ParaformerSANDecoder")
+class ParaformerSANDecoder(BaseTransformerDecoder):
"""
Author: Speech Lab of DAMO Academy, Alibaba Group
Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
--
Gitblit v1.9.1