From 5231b54af843a486baf4649cfc45b4f06c9914c8 Mon Sep 17 00:00:00 2001
From: 北念 <lzr265946@alibaba-inc.com>
Date: 星期四, 09 二月 2023 19:37:21 +0800
Subject: [PATCH] add ContextualParaformer
---
funasr/bin/asr_inference_paraformer.py | 54 ++
funasr/models/e2e_asr_paraformer.py | 484 +++++++++++++++++++++
funasr/models/decoder/contextual_decoder.py | 776 +++++++++++++++++++++++++++++++++
funasr/tasks/asr.py | 8
4 files changed, 1,317 insertions(+), 5 deletions(-)
diff --git a/funasr/bin/asr_inference_paraformer.py b/funasr/bin/asr_inference_paraformer.py
index 3769b6c..3163e84 100644
--- a/funasr/bin/asr_inference_paraformer.py
+++ b/funasr/bin/asr_inference_paraformer.py
@@ -3,6 +3,9 @@
import logging
import sys
import time
+import copy
+import os
+import codecs
from pathlib import Path
from typing import Optional
from typing import Sequence
@@ -35,6 +38,8 @@
from funasr.utils.types import str_or_none
from funasr.utils import asr_utils, wav_utils, postprocess_utils
from funasr.models.frontend.wav_frontend import WavFrontend
+from funasr.models.e2e_asr_paraformer import BiCifParaformer, ContextualParaformer
+
header_colors = '\033[95m'
end_colors = '\033[0m'
@@ -78,6 +83,7 @@
penalty: float = 0.0,
nbest: int = 1,
frontend_conf: dict = None,
+ hotword_list_or_file: str = None,
**kwargs,
):
assert check_argument_types()
@@ -168,6 +174,34 @@
self.asr_train_args = asr_train_args
self.converter = converter
self.tokenizer = tokenizer
+
+ # 6. [Optional] Build hotword list from file or str
+ if hotword_list_or_file is None:
+ self.hotword_list = None
+ elif os.path.exists(hotword_list_or_file):
+ self.hotword_list = []
+ hotword_str_list = []
+ with codecs.open(hotword_list_or_file, 'r') as fin:
+ for line in fin.readlines():
+ hw = line.strip()
+ hotword_str_list.append(hw)
+ self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+ self.hotword_list.append([1])
+ hotword_str_list.append('<s>')
+ logging.info("Initialized hotword list from file: {}, hotword list: {}."
+ .format(hotword_list_or_file, hotword_str_list))
+ else:
+ logging.info("Attempting to parse hotwords as str...")
+ self.hotword_list = []
+ hotword_str_list = []
+ for hw in hotword_list_or_file.strip().split():
+ hotword_str_list.append(hw)
+ self.hotword_list.append(self.converter.tokens2ids([i for i in hw]))
+ self.hotword_list.append([1])
+ hotword_str_list.append('<s>')
+ logging.info("Hotword list: {}.".format(hotword_str_list))
+
+
is_use_lm = lm_weight != 0.0 and lm_file is not None
if (ctc_weight == 0.0 or asr_model.ctc == None) and not is_use_lm:
beam_search = None
@@ -229,8 +263,14 @@
pre_token_length = pre_token_length.round().long()
if torch.max(pre_token_length) < 1:
return []
- decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
- decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+ if not isinstance(self.asr_model, ContextualParaformer):
+ if self.hotword_list:
+ logging.warning("Hotword is given but asr model is not a ContextualParaformer.")
+ decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length)
+ decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
+ else:
+ decoder_outs = self.asr_model.cal_decoder_with_predictor(enc, enc_len, pre_acoustic_embeds, pre_token_length, hw_list=self.hotword_list)
+ decoder_out, ys_pad_lens = decoder_outs[0], decoder_outs[1]
results = []
b, n, d = decoder_out.size()
@@ -416,6 +456,7 @@
ngram_weight=ngram_weight,
penalty=penalty,
nbest=nbest,
+ hotword_list_or_file=hotword_list_or_file,
)
speech2text = Speech2Text(**speech2text_kwargs)
@@ -551,7 +592,12 @@
default=1,
help="The number of workers used for DataLoader",
)
-
+ parser.add_argument(
+ "--hotword",
+ type=str_or_none,
+ default=None,
+ help="hotword file path or hotwords seperated by space"
+ )
group = parser.add_argument_group("Input data related")
group.add_argument(
"--data_path_and_name_and_type",
@@ -679,8 +725,10 @@
print(get_commandline_args(), file=sys.stderr)
parser = get_parser()
args = parser.parse_args(cmd)
+ param_dict = {'hotword': args.hotword}
kwargs = vars(args)
kwargs.pop("config", None)
+ kwargs['param_dict'] = param_dict
inference(**kwargs)
diff --git a/funasr/models/decoder/contextual_decoder.py b/funasr/models/decoder/contextual_decoder.py
new file mode 100644
index 0000000..32f550a
--- /dev/null
+++ b/funasr/models/decoder/contextual_decoder.py
@@ -0,0 +1,776 @@
+from typing import List
+from typing import Tuple
+import logging
+import torch
+import torch.nn as nn
+import numpy as np
+
+from funasr.modules.streaming_utils import utils as myutils
+from funasr.models.decoder.transformer_decoder import BaseTransformerDecoder
+from typeguard import check_argument_types
+
+from funasr.modules.attention import MultiHeadedAttentionSANMDecoder, MultiHeadedAttentionCrossAtt
+from funasr.modules.embedding import PositionalEncoding
+from funasr.modules.layer_norm import LayerNorm
+from funasr.modules.positionwise_feed_forward import PositionwiseFeedForwardDecoderSANM
+from funasr.modules.repeat import repeat
+from funasr.models.decoder.sanm_decoder import DecoderLayerSANM, ParaformerSANMDecoder
+
+
+class ContextualDecoderLayer(nn.Module):
+ def __init__(
+ self,
+ size,
+ self_attn,
+ src_attn,
+ feed_forward,
+ dropout_rate,
+ normalize_before=True,
+ concat_after=False,
+ ):
+ """Construct an DecoderLayer object."""
+ super(ContextualDecoderLayer, self).__init__()
+ self.size = size
+ self.self_attn = self_attn
+ self.src_attn = src_attn
+ self.feed_forward = feed_forward
+ self.norm1 = LayerNorm(size)
+ if self_attn is not None:
+ self.norm2 = LayerNorm(size)
+ if src_attn is not None:
+ self.norm3 = LayerNorm(size)
+ self.dropout = nn.Dropout(dropout_rate)
+ self.normalize_before = normalize_before
+ self.concat_after = concat_after
+ if self.concat_after:
+ self.concat_linear1 = nn.Linear(size + size, size)
+ self.concat_linear2 = nn.Linear(size + size, size)
+
+ def forward(self, tgt, tgt_mask, memory, memory_mask, cache=None,):
+ # tgt = self.dropout(tgt)
+ if isinstance(tgt, Tuple):
+ tgt, _ = tgt
+ residual = tgt
+ if self.normalize_before:
+ tgt = self.norm1(tgt)
+ tgt = self.feed_forward(tgt)
+
+ x = tgt
+ if self.normalize_before:
+ tgt = self.norm2(tgt)
+ if self.training:
+ cache = None
+ x, cache = self.self_attn(tgt, tgt_mask, cache=cache)
+ x = residual + self.dropout(x)
+ x_self_attn = x
+
+ residual = x
+ if self.normalize_before:
+ x = self.norm3(x)
+ x = self.src_attn(x, memory, memory_mask)
+ x_src_attn = x
+
+ x = residual + self.dropout(x)
+ return x, tgt_mask, x_self_attn, x_src_attn
+
+
+class ContexutalBiasDecoder(nn.Module):
+ def __init__(
+ self,
+ size,
+ src_attn,
+ dropout_rate,
+ normalize_before=True,
+ ):
+ """Construct an DecoderLayer object."""
+ super(ContexutalBiasDecoder, self).__init__()
+ self.size = size
+ self.src_attn = src_attn
+ if src_attn is not None:
+ self.norm3 = LayerNorm(size)
+ self.dropout = nn.Dropout(dropout_rate)
+ self.normalize_before = normalize_before
+
+ def forward(self, tgt, tgt_mask, memory, memory_mask=None, cache=None):
+ x = tgt
+ if self.src_attn is not None:
+ if self.normalize_before:
+ x = self.norm3(x)
+ x = self.dropout(self.src_attn(x, memory, memory_mask))
+ return x, tgt_mask, memory, memory_mask, cache
+
+
+class ContextualParaformerDecoder(ParaformerSANMDecoder):
+ """
+ author: Speech Lab, Alibaba Group, China
+ Paraformer: Fast and Accurate Parallel Transformer for Non-autoregressive End-to-End Speech Recognition
+ https://arxiv.org/abs/2006.01713
+ """
+ def __init__(
+ self,
+ vocab_size: int,
+ encoder_output_size: int,
+ attention_heads: int = 4,
+ linear_units: int = 2048,
+ num_blocks: int = 6,
+ dropout_rate: float = 0.1,
+ positional_dropout_rate: float = 0.1,
+ self_attention_dropout_rate: float = 0.0,
+ src_attention_dropout_rate: float = 0.0,
+ input_layer: str = "embed",
+ use_output_layer: bool = True,
+ pos_enc_class=PositionalEncoding,
+ normalize_before: bool = True,
+ concat_after: bool = False,
+ att_layer_num: int = 6,
+ kernel_size: int = 21,
+ sanm_shfit: int = 0,
+ ):
+ assert check_argument_types()
+ super().__init__(
+ vocab_size=vocab_size,
+ encoder_output_size=encoder_output_size,
+ dropout_rate=dropout_rate,
+ positional_dropout_rate=positional_dropout_rate,
+ input_layer=input_layer,
+ use_output_layer=use_output_layer,
+ pos_enc_class=pos_enc_class,
+ normalize_before=normalize_before,
+ )
+
+ attention_dim = encoder_output_size
+ if input_layer == 'none':
+ self.embed = None
+ if input_layer == "embed":
+ self.embed = torch.nn.Sequential(
+ torch.nn.Embedding(vocab_size, attention_dim),
+ # pos_enc_class(attention_dim, positional_dropout_rate),
+ )
+ elif input_layer == "linear":
+ self.embed = torch.nn.Sequential(
+ torch.nn.Linear(vocab_size, attention_dim),
+ torch.nn.LayerNorm(attention_dim),
+ torch.nn.Dropout(dropout_rate),
+ torch.nn.ReLU(),
+ pos_enc_class(attention_dim, positional_dropout_rate),
+ )
+ else:
+ raise ValueError(f"only 'embed' or 'linear' is supported: {input_layer}")
+
+ self.normalize_before = normalize_before
+ if self.normalize_before:
+ self.after_norm = LayerNorm(attention_dim)
+ if use_output_layer:
+ self.output_layer = torch.nn.Linear(attention_dim, vocab_size)
+ else:
+ self.output_layer = None
+
+ self.att_layer_num = att_layer_num
+ self.num_blocks = num_blocks
+ if sanm_shfit is None:
+ sanm_shfit = (kernel_size - 1) // 2
+ self.decoders = repeat(
+ att_layer_num - 1,
+ lambda lnum: DecoderLayerSANM(
+ attention_dim,
+ MultiHeadedAttentionSANMDecoder(
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ ),
+ MultiHeadedAttentionCrossAtt(
+ attention_heads, attention_dim, src_attention_dropout_rate
+ ),
+ PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ ),
+ )
+ self.dropout = nn.Dropout(dropout_rate)
+ self.bias_decoder = ContexutalBiasDecoder(
+ size=attention_dim,
+ src_attn=MultiHeadedAttentionCrossAtt(
+ attention_heads, attention_dim, src_attention_dropout_rate
+ ),
+ dropout_rate=dropout_rate,
+ normalize_before=True,
+ )
+ self.bias_output = torch.nn.Conv1d(attention_dim*2, attention_dim, 1, bias=False)
+ self.last_decoder = ContextualDecoderLayer(
+ attention_dim,
+ MultiHeadedAttentionSANMDecoder(
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=sanm_shfit
+ ),
+ MultiHeadedAttentionCrossAtt(
+ attention_heads, attention_dim, src_attention_dropout_rate
+ ),
+ PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ )
+ if num_blocks - att_layer_num <= 0:
+ self.decoders2 = None
+ else:
+ self.decoders2 = repeat(
+ num_blocks - att_layer_num,
+ lambda lnum: DecoderLayerSANM(
+ attention_dim,
+ MultiHeadedAttentionSANMDecoder(
+ attention_dim, self_attention_dropout_rate, kernel_size, sanm_shfit=0
+ ),
+ None,
+ PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ ),
+ )
+
+ self.decoders3 = repeat(
+ 1,
+ lambda lnum: DecoderLayerSANM(
+ attention_dim,
+ None,
+ None,
+ PositionwiseFeedForwardDecoderSANM(attention_dim, linear_units, dropout_rate),
+ dropout_rate,
+ normalize_before,
+ concat_after,
+ ),
+ )
+
+ def forward(
+ self,
+ hs_pad: torch.Tensor,
+ hlens: torch.Tensor,
+ ys_in_pad: torch.Tensor,
+ ys_in_lens: torch.Tensor,
+ contextual_info: torch.Tensor,
+ return_hidden: bool = False,
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
+ """Forward decoder.
+
+ Args:
+ hs_pad: encoded memory, float32 (batch, maxlen_in, feat)
+ hlens: (batch)
+ ys_in_pad:
+ input token ids, int64 (batch, maxlen_out)
+ if input_layer == "embed"
+ input tensor (batch, maxlen_out, #mels) in the other cases
+ ys_in_lens: (batch)
+ Returns:
+ (tuple): tuple containing:
+
+ x: decoded token score before softmax (batch, maxlen_out, token)
+ if use_output_layer is True,
+ olens: (batch, )
+ """
+ tgt = ys_in_pad
+ tgt_mask = myutils.sequence_mask(ys_in_lens, device=tgt.device)[:, :, None]
+
+ memory = hs_pad
+ memory_mask = myutils.sequence_mask(hlens, device=memory.device)[:, None, :]
+
+ x = tgt
+ x, tgt_mask, memory, memory_mask, _ = self.decoders(
+ x, tgt_mask, memory, memory_mask
+ )
+ _, _, x_self_attn, x_src_attn = self.last_decoder(
+ x, tgt_mask, memory, memory_mask
+ )
+
+ # contextual paraformer related
+ contextual_length = torch.Tensor([contextual_info.shape[1]]).int().repeat(hs_pad.shape[0])
+ contextual_mask = myutils.sequence_mask(contextual_length, device=memory.device)[:, None, :]
+ cx, tgt_mask, _, _, _ = self.bias_decoder(x_self_attn, tgt_mask, contextual_info, memory_mask=contextual_mask)
+
+ if self.bias_output is not None:
+ x = torch.cat([x_src_attn, cx], dim=2)
+ x = self.bias_output(x.transpose(1, 2)).transpose(1, 2) # 2D -> D
+ x = x_self_attn + self.dropout(x)
+
+ if self.decoders2 is not None:
+ x, tgt_mask, memory, memory_mask, _ = self.decoders2(
+ x, tgt_mask, memory, memory_mask
+ )
+
+ x, tgt_mask, memory, memory_mask, _ = self.decoders3(
+ x, tgt_mask, memory, memory_mask
+ )
+ if self.normalize_before:
+ x = self.after_norm(x)
+ olens = tgt_mask.sum(1)
+ if self.output_layer is not None and return_hidden is False:
+ x = self.output_layer(x)
+ return x, olens
+
+ def gen_tf2torch_map_dict(self):
+
+ tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
+ tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
+ map_dict_local = {
+
+ ## decoder
+ # ffn
+ "{}.decoders.layeridx.norm1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.norm1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.feed_forward.w_1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (1024,256),(1,256,1024)
+ "{}.decoders.layeridx.feed_forward.w_1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/conv1d/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders.layeridx.feed_forward.norm.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/LayerNorm_1/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders.layeridx.feed_forward.norm.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/LayerNorm_1/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders.layeridx.feed_forward.w_2.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_ffn/conv1d_1/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,1024),(1,1024,256)
+
+ # fsmn
+ "{}.decoders.layeridx.norm2.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_memory_block/LayerNorm/gamma".format(
+ tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.norm2.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_memory_block/LayerNorm/beta".format(
+ tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.self_attn.fsmn_block.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/decoder_memory_block/depth_conv_w".format(
+ tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 2, 0),
+ }, # (256,1,31),(1,31,256,1)
+ # src att
+ "{}.decoders.layeridx.norm3.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.norm3.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.src_attn.linear_q.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,256),(1,256,256)
+ "{}.decoders.layeridx.src_attn.linear_q.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders.layeridx.src_attn.linear_k_v.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d_1/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (1024,256),(1,256,1024)
+ "{}.decoders.layeridx.src_attn.linear_k_v.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d_1/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders.layeridx.src_attn.linear_out.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d_2/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,256),(1,256,256)
+ "{}.decoders.layeridx.src_attn.linear_out.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_layeridx/multi_head/conv1d_2/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ # dnn
+ "{}.decoders3.layeridx.norm1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders3.layeridx.norm1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.decoders3.layeridx.feed_forward.w_1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (1024,256),(1,256,1024)
+ "{}.decoders3.layeridx.feed_forward.w_1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/conv1d/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders3.layeridx.feed_forward.norm.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/LayerNorm_1/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders3.layeridx.feed_forward.norm.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/LayerNorm_1/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.decoders3.layeridx.feed_forward.w_2.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_dnn_layer_layeridx/conv1d_1/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,1024),(1,1024,256)
+
+ # embed_concat_ffn
+ "{}.embed_concat_ffn.layeridx.norm1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.embed_concat_ffn.layeridx.norm1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.embed_concat_ffn.layeridx.feed_forward.w_1.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (1024,256),(1,256,1024)
+ "{}.embed_concat_ffn.layeridx.feed_forward.w_1.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/conv1d/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.embed_concat_ffn.layeridx.feed_forward.norm.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/LayerNorm_1/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.embed_concat_ffn.layeridx.feed_forward.norm.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/LayerNorm_1/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.embed_concat_ffn.layeridx.feed_forward.w_2.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/cif_concat/conv1d_1/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,1024),(1,1024,256)
+
+ # out norm
+ "{}.after_norm.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.after_norm.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+
+ # in embed
+ "{}.embed.0.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/w_embs".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (4235,256),(4235,256)
+
+ # out layer
+ "{}.output_layer.weight".format(tensor_name_prefix_torch):
+ {"name": ["{}/dense/kernel".format(tensor_name_prefix_tf), "{}/w_embs".format(tensor_name_prefix_tf)],
+ "squeeze": [None, None],
+ "transpose": [(1, 0), None],
+ }, # (4235,256),(256,4235)
+ "{}.output_layer.bias".format(tensor_name_prefix_torch):
+ {"name": ["{}/dense/bias".format(tensor_name_prefix_tf),
+ "seq2seq/2bias" if tensor_name_prefix_tf == "seq2seq/decoder/inputter_1" else "seq2seq/bias"],
+ "squeeze": [None, None],
+ "transpose": [None, None],
+ }, # (4235,),(4235,)
+
+ ## clas decoder
+ # src att
+ "{}.bias_decoder.norm3.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/LayerNorm/gamma".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.bias_decoder.norm3.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/LayerNorm/beta".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.bias_decoder.src_attn.linear_q.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,256),(1,256,256)
+ "{}.bias_decoder.src_attn.linear_q.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ "{}.bias_decoder.src_attn.linear_k_v.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d_1/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (1024,256),(1,256,1024)
+ "{}.bias_decoder.src_attn.linear_k_v.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d_1/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (1024,),(1024,)
+ "{}.bias_decoder.src_attn.linear_out.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d_2/kernel".format(tensor_name_prefix_tf),
+ "squeeze": 0,
+ "transpose": (1, 0),
+ }, # (256,256),(1,256,256)
+ "{}.bias_decoder.src_attn.linear_out.bias".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/multi_head_1/conv1d_2/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ }, # (256,),(256,)
+ # dnn
+ "{}.bias_output.weight".format(tensor_name_prefix_torch):
+ {"name": "{}/decoder_fsmn_layer_15/conv1d/kernel".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": (2, 1, 0),
+ }, # (1024,256),(1,256,1024)
+
+ }
+ return map_dict_local
+
+ def convert_tf2torch(self,
+ var_dict_tf,
+ var_dict_torch,
+ ):
+ map_dict = self.gen_tf2torch_map_dict()
+ var_dict_torch_update = dict()
+ decoder_layeridx_sets = set()
+ for name in sorted(var_dict_torch.keys(), reverse=False):
+ names = name.split('.')
+ if names[0] == self.tf2torch_tensor_name_prefix_torch:
+ if names[1] == "decoders":
+ layeridx = int(names[2])
+ name_q = name.replace(".{}.".format(layeridx), ".layeridx.")
+ layeridx_bias = 0
+ layeridx += layeridx_bias
+ decoder_layeridx_sets.add(layeridx)
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v.replace("layeridx", "{}".format(layeridx))
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+ elif names[1] == "last_decoder":
+ layeridx = 15
+ name_q = name.replace("last_decoder", "decoders.layeridx")
+ layeridx_bias = 0
+ layeridx += layeridx_bias
+ decoder_layeridx_sets.add(layeridx)
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v.replace("layeridx", "{}".format(layeridx))
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+
+
+ elif names[1] == "decoders2":
+ layeridx = int(names[2])
+ name_q = name.replace(".{}.".format(layeridx), ".layeridx.")
+ name_q = name_q.replace("decoders2", "decoders")
+ layeridx_bias = len(decoder_layeridx_sets)
+
+ layeridx += layeridx_bias
+ if "decoders." in name:
+ decoder_layeridx_sets.add(layeridx)
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v.replace("layeridx", "{}".format(layeridx))
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+
+ elif names[1] == "decoders3":
+ layeridx = int(names[2])
+ name_q = name.replace(".{}.".format(layeridx), ".layeridx.")
+
+ layeridx_bias = 0
+ layeridx += layeridx_bias
+ if "decoders." in name:
+ decoder_layeridx_sets.add(layeridx)
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v.replace("layeridx", "{}".format(layeridx))
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+ elif names[1] == "bias_decoder":
+ name_q = name
+
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+
+
+ elif names[1] == "embed" or names[1] == "output_layer" or names[1] == "bias_output":
+ name_tf = map_dict[name]["name"]
+ if isinstance(name_tf, list):
+ idx_list = 0
+ if name_tf[idx_list] in var_dict_tf.keys():
+ pass
+ else:
+ idx_list = 1
+ data_tf = var_dict_tf[name_tf[idx_list]]
+ if map_dict[name]["squeeze"][idx_list] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"][idx_list])
+ if map_dict[name]["transpose"][idx_list] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name]["transpose"][idx_list])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info("torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(),
+ name_tf[idx_list],
+ var_dict_tf[name_tf[
+ idx_list]].shape))
+
+ else:
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"])
+ if map_dict[name]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
+ var_dict_tf[name_tf].shape))
+
+ elif names[1] == "after_norm":
+ name_tf = map_dict[name]["name"]
+ data_tf = var_dict_tf[name_tf]
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
+ var_dict_tf[name_tf].shape))
+
+ elif names[1] == "embed_concat_ffn":
+ layeridx = int(names[2])
+ name_q = name.replace(".{}.".format(layeridx), ".layeridx.")
+
+ layeridx_bias = 0
+ layeridx += layeridx_bias
+ if "decoders." in name:
+ decoder_layeridx_sets.add(layeridx)
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v.replace("layeridx", "{}".format(layeridx))
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+
+ return var_dict_torch_update
diff --git a/funasr/models/e2e_asr_paraformer.py b/funasr/models/e2e_asr_paraformer.py
index f8141cd..5786bc4 100644
--- a/funasr/models/e2e_asr_paraformer.py
+++ b/funasr/models/e2e_asr_paraformer.py
@@ -976,4 +976,486 @@
# force_gatherable: to-device and to-tensor if scalar for DataParallel
loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
- return loss, stats, weight
\ No newline at end of file
+ return loss, stats, weight
+
+class ContextualParaformer(Paraformer):
+ """
+ Paraformer model with contextual hotword
+ """
+
+ def __init__(
+ self,
+ vocab_size: int,
+ token_list: Union[Tuple[str, ...], List[str]],
+ frontend: Optional[AbsFrontend],
+ specaug: Optional[AbsSpecAug],
+ normalize: Optional[AbsNormalize],
+ preencoder: Optional[AbsPreEncoder],
+ encoder: AbsEncoder,
+ postencoder: Optional[AbsPostEncoder],
+ decoder: AbsDecoder,
+ ctc: CTC,
+ ctc_weight: float = 0.5,
+ interctc_weight: float = 0.0,
+ ignore_id: int = -1,
+ blank_id: int = 0,
+ sos: int = 1,
+ eos: int = 2,
+ lsm_weight: float = 0.0,
+ length_normalized_loss: bool = False,
+ report_cer: bool = True,
+ report_wer: bool = True,
+ sym_space: str = "<space>",
+ sym_blank: str = "<blank>",
+ extract_feats_in_collect_stats: bool = True,
+ predictor=None,
+ predictor_weight: float = 0.0,
+ predictor_bias: int = 0,
+ sampling_ratio: float = 0.2,
+ min_hw_length: int = 2,
+ max_hw_length: int = 4,
+ sample_rate: float = 0.6,
+ batch_rate: float = 0.5,
+ double_rate: float = -1.0,
+ target_buffer_length: int = -1,
+ inner_dim: int = 256,
+ bias_encoder_type: str = 'lstm',
+ label_bracket: bool = False,
+ ):
+ assert check_argument_types()
+ assert 0.0 <= ctc_weight <= 1.0, ctc_weight
+ assert 0.0 <= interctc_weight < 1.0, interctc_weight
+
+ super().__init__(
+ vocab_size=vocab_size,
+ token_list=token_list,
+ frontend=frontend,
+ specaug=specaug,
+ normalize=normalize,
+ preencoder=preencoder,
+ encoder=encoder,
+ postencoder=postencoder,
+ decoder=decoder,
+ ctc=ctc,
+ ctc_weight=ctc_weight,
+ interctc_weight=interctc_weight,
+ ignore_id=ignore_id,
+ blank_id=blank_id,
+ sos=sos,
+ eos=eos,
+ lsm_weight=lsm_weight,
+ length_normalized_loss=length_normalized_loss,
+ report_cer=report_cer,
+ report_wer=report_wer,
+ sym_space=sym_space,
+ sym_blank=sym_blank,
+ extract_feats_in_collect_stats=extract_feats_in_collect_stats,
+ predictor=predictor,
+ predictor_weight=predictor_weight,
+ predictor_bias=predictor_bias,
+ sampling_ratio=sampling_ratio,
+ )
+
+ if bias_encoder_type == 'lstm':
+ logging.warning("enable bias encoder sampling and contextual training")
+ self.bias_encoder = torch.nn.LSTM(inner_dim, inner_dim, 1, batch_first=True, dropout=0)
+ self.bias_embed = torch.nn.Embedding(vocab_size, inner_dim)
+ else:
+ logging.error("Unsupport bias encoder type")
+
+ self.min_hw_length = min_hw_length
+ self.max_hw_length = max_hw_length
+ self.sample_rate = sample_rate
+ self.batch_rate = batch_rate
+ self.target_buffer_length = target_buffer_length
+ self.double_rate = double_rate
+
+ if self.target_buffer_length > 0:
+ self.hotword_buffer = None
+ self.length_record = []
+ self.current_buffer_length = 0
+
+ def forward(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ text: torch.Tensor,
+ text_lengths: torch.Tensor,
+ ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+ """Frontend + Encoder + Decoder + Calc loss
+
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ text: (Batch, Length)
+ text_lengths: (Batch,)
+ """
+ assert text_lengths.dim() == 1, text_lengths.shape
+ # Check that batch_size is unified
+ assert (
+ speech.shape[0]
+ == speech_lengths.shape[0]
+ == text.shape[0]
+ == text_lengths.shape[0]
+ ), (speech.shape, speech_lengths.shape, text.shape, text_lengths.shape)
+ batch_size = speech.shape[0]
+ self.step_cur += 1
+ # for data-parallel
+ text = text[:, : text_lengths.max()]
+ speech = speech[:, :speech_lengths.max()]
+
+ # 1. Encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+ intermediate_outs = None
+ if isinstance(encoder_out, tuple):
+ intermediate_outs = encoder_out[1]
+ encoder_out = encoder_out[0]
+
+ loss_att, acc_att, cer_att, wer_att = None, None, None, None
+ loss_ctc, cer_ctc = None, None
+ loss_pre = None
+ stats = dict()
+
+ # 1. CTC branch
+ if self.ctc_weight != 0.0:
+ loss_ctc, cer_ctc = self._calc_ctc_loss(
+ encoder_out, encoder_out_lens, text, text_lengths
+ )
+
+ # Collect CTC branch stats
+ stats["loss_ctc"] = loss_ctc.detach() if loss_ctc is not None else None
+ stats["cer_ctc"] = cer_ctc
+
+ # Intermediate CTC (optional)
+ loss_interctc = 0.0
+ if self.interctc_weight != 0.0 and intermediate_outs is not None:
+ for layer_idx, intermediate_out in intermediate_outs:
+ # we assume intermediate_out has the same length & padding
+ # as those of encoder_out
+ loss_ic, cer_ic = self._calc_ctc_loss(
+ intermediate_out, encoder_out_lens, text, text_lengths
+ )
+ loss_interctc = loss_interctc + loss_ic
+
+ # Collect Intermedaite CTC stats
+ stats["loss_interctc_layer{}".format(layer_idx)] = (
+ loss_ic.detach() if loss_ic is not None else None
+ )
+ stats["cer_interctc_layer{}".format(layer_idx)] = cer_ic
+
+ loss_interctc = loss_interctc / len(intermediate_outs)
+
+ # calculate whole encoder loss
+ loss_ctc = (
+ 1 - self.interctc_weight
+ ) * loss_ctc + self.interctc_weight * loss_interctc
+
+ # 2b. Attention decoder branch
+ if self.ctc_weight != 1.0:
+ loss_att, acc_att, cer_att, wer_att, loss_pre = self._calc_att_loss(
+ encoder_out, encoder_out_lens, text, text_lengths
+ )
+
+ # 3. CTC-Att loss definition
+ if self.ctc_weight == 0.0:
+ loss = loss_att + loss_pre * self.predictor_weight
+ elif self.ctc_weight == 1.0:
+ loss = loss_ctc
+ else:
+ loss = self.ctc_weight * loss_ctc + (1 - self.ctc_weight) * loss_att + loss_pre * self.predictor_weight
+
+ # Collect Attn branch stats
+ stats["loss_att"] = loss_att.detach() if loss_att is not None else None
+ stats["acc"] = acc_att
+ stats["cer"] = cer_att
+ stats["wer"] = wer_att
+ stats["loss_pre"] = loss_pre.detach().cpu() if loss_pre is not None else None
+
+ stats["loss"] = torch.clone(loss.detach())
+
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+ return loss, stats, weight
+
+ def _sample_hot_word(self, ys_pad, ys_pad_lens):
+ hw_list = [torch.Tensor([0]).long().to(ys_pad.device)]
+ hw_lengths = [0] # this length is actually for indice, so -1
+ for i, length in enumerate(ys_pad_lens):
+ if length < 2:
+ continue
+ if length > self.min_hw_length + self.max_hw_length + 2 and random.random() < self.double_rate:
+ # sample double hotword
+ _max_hw_length = min(self.max_hw_length, length // 2)
+ # first hotword
+ start1 = random.randint(0, length // 3)
+ end1 = random.randint(start1 + self.min_hw_length - 1, start1 + _max_hw_length - 1)
+ hw_tokens1 = ys_pad[i][start1:end1 + 1]
+ hw_lengths.append(len(hw_tokens1) - 1)
+ hw_list.append(hw_tokens1)
+ # second hotword
+ start2 = random.randint(end1 + 1, length - self.min_hw_length)
+ end2 = random.randint(min(length - 1, start2 + self.min_hw_length - 1),
+ min(length - 1, start2 + self.max_hw_length - 1))
+ hw_tokens2 = ys_pad[i][start2:end2 + 1]
+ hw_lengths.append(len(hw_tokens2) - 1)
+ hw_list.append(hw_tokens2)
+ continue
+ if random.random() < self.sample_rate:
+ if length == 2:
+ hw_tokens = ys_pad[i][:2]
+ hw_lengths.append(1)
+ hw_list.append(hw_tokens)
+ else:
+ start = random.randint(0, length - self.min_hw_length)
+ end = random.randint(min(length - 1, start + self.min_hw_length - 1),
+ min(length - 1, start + self.max_hw_length - 1)) + 1
+ # print(start, end)
+ hw_tokens = ys_pad[i][start:end]
+ hw_lengths.append(len(hw_tokens) - 1)
+ hw_list.append(hw_tokens)
+ # padding
+ hw_list_pad = pad_list(hw_list, 0)
+ hw_embed = self.decoder.embed(hw_list_pad)
+ hw_embed, (_, _) = self.bias_encoder(hw_embed)
+ _ind = np.arange(0, len(hw_list)).tolist()
+ # update self.hotword_buffer, throw a part if oversize
+ selected = hw_embed[_ind, hw_lengths]
+ if self.target_buffer_length > 0:
+ _b = selected.shape[0]
+ if self.hotword_buffer is None:
+ self.hotword_buffer = selected
+ self.length_record.append(selected.shape[0])
+ self.current_buffer_length = _b
+ elif self.current_buffer_length + _b < self.target_buffer_length:
+ self.hotword_buffer = torch.cat([self.hotword_buffer.detach(), selected], dim=0)
+ self.current_buffer_length += _b
+ selected = self.hotword_buffer
+ else:
+ self.hotword_buffer = torch.cat([self.hotword_buffer.detach(), selected], dim=0)
+ random_throw = random.randint(self.target_buffer_length // 2, self.target_buffer_length) + 10
+ self.hotword_buffer = self.hotword_buffer[-1 * random_throw:]
+ selected = self.hotword_buffer
+ self.current_buffer_length = selected.shape[0]
+ return selected.squeeze(0).repeat(ys_pad.shape[0], 1, 1).to(ys_pad.device)
+
+ def sampler(self, encoder_out, encoder_out_lens, ys_pad, ys_pad_lens, pre_acoustic_embeds, contextual_info):
+
+ tgt_mask = (~make_pad_mask(ys_pad_lens, maxlen=ys_pad_lens.max())[:, :, None]).to(ys_pad.device)
+ ys_pad = ys_pad * tgt_mask[:, :, 0]
+ if self.share_embedding:
+ ys_pad_embed = self.decoder.output_layer.weight[ys_pad]
+ else:
+ ys_pad_embed = self.decoder.embed(ys_pad)
+ with torch.no_grad():
+ decoder_outs = self.decoder(
+ encoder_out, encoder_out_lens, pre_acoustic_embeds, ys_pad_lens, contextual_info=contextual_info
+ )
+ decoder_out, _ = decoder_outs[0], decoder_outs[1]
+ pred_tokens = decoder_out.argmax(-1)
+ nonpad_positions = ys_pad.ne(self.ignore_id)
+ seq_lens = (nonpad_positions).sum(1)
+ same_num = ((pred_tokens == ys_pad) & nonpad_positions).sum(1)
+ input_mask = torch.ones_like(nonpad_positions)
+ bsz, seq_len = ys_pad.size()
+ for li in range(bsz):
+ target_num = (((seq_lens[li] - same_num[li].sum()).float()) * self.sampling_ratio).long()
+ if target_num > 0:
+ input_mask[li].scatter_(dim=0, index=torch.randperm(seq_lens[li])[:target_num].cuda(), value=0)
+ input_mask = input_mask.eq(1)
+ input_mask = input_mask.masked_fill(~nonpad_positions, False)
+ input_mask_expand_dim = input_mask.unsqueeze(2).to(pre_acoustic_embeds.device)
+
+ sematic_embeds = pre_acoustic_embeds.masked_fill(~input_mask_expand_dim, 0) + ys_pad_embed.masked_fill(
+ input_mask_expand_dim, 0)
+ return sematic_embeds * tgt_mask, decoder_out * tgt_mask
+
+ def _calc_att_loss(
+ self,
+ encoder_out: torch.Tensor,
+ encoder_out_lens: torch.Tensor,
+ ys_pad: torch.Tensor,
+ ys_pad_lens: torch.Tensor,
+ ):
+ encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
+ encoder_out.device)
+ if self.predictor_bias == 1:
+ _, ys_pad = add_sos_eos(ys_pad, self.sos, self.eos, self.ignore_id)
+ ys_pad_lens = ys_pad_lens + self.predictor_bias
+ pre_acoustic_embeds, pre_token_length, _, pre_peak_index = self.predictor(encoder_out, ys_pad,
+ encoder_out_mask,
+ ignore_id=self.ignore_id)
+
+ # sample hot word
+ contextual_info = self._sample_hot_word(ys_pad, ys_pad_lens)
+
+ # 0. sampler
+ decoder_out_1st = None
+ if self.sampling_ratio > 0.0:
+ if self.step_cur < 2:
+ logging.info("enable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+ sematic_embeds, decoder_out_1st = self.sampler(encoder_out, encoder_out_lens, ys_pad, ys_pad_lens,
+ pre_acoustic_embeds, contextual_info)
+ else:
+ if self.step_cur < 2:
+ logging.info("disable sampler in paraformer, sampling_ratio: {}".format(self.sampling_ratio))
+ sematic_embeds = pre_acoustic_embeds
+
+ # 1. Forward decoder
+ decoder_outs = self.decoder(
+ encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
+ )
+ decoder_out, _ = decoder_outs[0], decoder_outs[1]
+
+ if decoder_out_1st is None:
+ decoder_out_1st = decoder_out
+ # 2. Compute attention loss
+ loss_att = self.criterion_att(decoder_out, ys_pad)
+ acc_att = th_accuracy(
+ decoder_out_1st.view(-1, self.vocab_size),
+ ys_pad,
+ ignore_label=self.ignore_id,
+ )
+ loss_pre = self.criterion_pre(ys_pad_lens.type_as(pre_token_length), pre_token_length)
+
+ # Compute cer/wer using attention-decoder
+ if self.training or self.error_calculator is None:
+ cer_att, wer_att = None, None
+ else:
+ ys_hat = decoder_out_1st.argmax(dim=-1)
+ cer_att, wer_att = self.error_calculator(ys_hat.cpu(), ys_pad.cpu())
+
+ return loss_att, acc_att, cer_att, wer_att, loss_pre
+
+ def cal_decoder_with_predictor(self, encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, hw_list=None):
+ if hw_list is None:
+ # default hotword list
+ hw_list = [torch.Tensor([self.sos]).long().to(encoder_out.device)] # empty hotword list
+ hw_list_pad = pad_list(hw_list, 0)
+ hw_embed = self.bias_embed(hw_list_pad)
+ _, (h_n, _) = self.bias_encoder(hw_embed)
+ contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
+ else:
+ hw_lengths = [len(i) for i in hw_list]
+ hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)
+ hw_embed = self.bias_embed(hw_list_pad)
+ hw_embed = torch.nn.utils.rnn.pack_padded_sequence(hw_embed, hw_lengths, batch_first=True,
+ enforce_sorted=False)
+ _, (h_n, _) = self.bias_encoder(hw_embed)
+ # hw_embed, _ = torch.nn.utils.rnn.pad_packed_sequence(hw_embed, batch_first=True)
+ contextual_info = h_n.squeeze(0).repeat(encoder_out.shape[0], 1, 1)
+ decoder_outs = self.decoder(
+ encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, contextual_info=contextual_info
+ )
+ decoder_out = decoder_outs[0]
+ decoder_out = torch.log_softmax(decoder_out, dim=-1)
+ return decoder_out, ys_pad_lens
+
+ def gen_clas_tf2torch_map_dict(self):
+ tensor_name_prefix_torch = "bias_encoder"
+ tensor_name_prefix_tf = "seq2seq/clas_charrnn"
+
+ tensor_name_prefix_torch_emb = "bias_embed"
+ tensor_name_prefix_tf_emb = "seq2seq"
+
+ map_dict_local = {
+ # in lstm
+ "{}.weight_ih_l0".format(tensor_name_prefix_torch):
+ {"name": "{}/rnn/lstm_cell/kernel".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": (1, 0),
+ "slice": (0, 512),
+ "unit_k": 512,
+ }, # (1024, 2048),(2048,512)
+ "{}.weight_hh_l0".format(tensor_name_prefix_torch):
+ {"name": "{}/rnn/lstm_cell/kernel".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": (1, 0),
+ "slice": (512, 1024),
+ "unit_k": 512,
+ }, # (1024, 2048),(2048,512)
+ "{}.bias_ih_l0".format(tensor_name_prefix_torch):
+ {"name": "{}/rnn/lstm_cell/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ "scale": 0.5,
+ "unit_b": 512,
+ }, # (2048,),(2048,)
+ "{}.bias_hh_l0".format(tensor_name_prefix_torch):
+ {"name": "{}/rnn/lstm_cell/bias".format(tensor_name_prefix_tf),
+ "squeeze": None,
+ "transpose": None,
+ "scale": 0.5,
+ "unit_b": 512,
+ }, # (2048,),(2048,)
+
+ # in embed
+ "{}.weight".format(tensor_name_prefix_torch_emb):
+ {"name": "{}/contextual_encoder/w_char_embs".format(tensor_name_prefix_tf_emb),
+ "squeeze": None,
+ "transpose": None,
+ }, # (4235,256),(4235,256)
+ }
+ return map_dict_local
+
+ def clas_convert_tf2torch(self,
+ var_dict_tf,
+ var_dict_torch):
+ map_dict = self.gen_clas_tf2torch_map_dict()
+ var_dict_torch_update = dict()
+ for name in sorted(var_dict_torch.keys(), reverse=False):
+ names = name.split('.')
+ if names[0] == "bias_encoder":
+ name_q = name
+ if name_q in map_dict.keys():
+ name_v = map_dict[name_q]["name"]
+ name_tf = name_v
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name_q].get("unit_k") is not None:
+ dim = map_dict[name_q]["unit_k"]
+ i = data_tf[:, 0:dim].copy()
+ f = data_tf[:, dim:2 * dim].copy()
+ o = data_tf[:, 2 * dim:3 * dim].copy()
+ g = data_tf[:, 3 * dim:4 * dim].copy()
+ data_tf = np.concatenate([i, o, f, g], axis=1)
+ if map_dict[name_q].get("unit_b") is not None:
+ dim = map_dict[name_q]["unit_b"]
+ i = data_tf[0:dim].copy()
+ f = data_tf[dim:2 * dim].copy()
+ o = data_tf[2 * dim:3 * dim].copy()
+ g = data_tf[3 * dim:4 * dim].copy()
+ data_tf = np.concatenate([i, o, f, g], axis=0)
+ if map_dict[name_q]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name_q]["squeeze"])
+ if map_dict[name_q].get("slice") is not None:
+ data_tf = data_tf[map_dict[name_q]["slice"][0]:map_dict[name_q]["slice"][1]]
+ if map_dict[name_q].get("scale") is not None:
+ data_tf = data_tf * map_dict[name_q]["scale"]
+ if map_dict[name_q]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name_q]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_v,
+ var_dict_tf[name_tf].shape))
+ elif names[0] == "bias_embed":
+ name_tf = map_dict[name]["name"]
+ data_tf = var_dict_tf[name_tf]
+ if map_dict[name]["squeeze"] is not None:
+ data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"])
+ if map_dict[name]["transpose"] is not None:
+ data_tf = np.transpose(data_tf, map_dict[name]["transpose"])
+ data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
+ assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
+ var_dict_torch[
+ name].size(),
+ data_tf.size())
+ var_dict_torch_update[name] = data_tf
+ logging.info(
+ "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
+ var_dict_tf[name_tf].shape))
+
+ return var_dict_torch_update
\ No newline at end of file
diff --git a/funasr/tasks/asr.py b/funasr/tasks/asr.py
index 1b7f152..e62a748 100644
--- a/funasr/tasks/asr.py
+++ b/funasr/tasks/asr.py
@@ -37,8 +37,9 @@
)
from funasr.models.decoder.transformer_decoder import ParaformerDecoderSAN
from funasr.models.decoder.transformer_decoder import TransformerDecoder
+from funasr.models.decoder.contextual_decoder import ContextualParaformerDecoder
from funasr.models.e2e_asr import ESPnetASRModel
-from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerBert, BiCifParaformer
+from funasr.models.e2e_asr_paraformer import Paraformer, ParaformerBert, BiCifParaformer, ContextualParaformer
from funasr.models.e2e_uni_asr import UniASR
from funasr.models.encoder.abs_encoder import AbsEncoder
from funasr.models.encoder.conformer_encoder import ConformerEncoder
@@ -117,6 +118,7 @@
paraformer=Paraformer,
paraformer_bert=ParaformerBert,
bicif_paraformer=BiCifParaformer,
+ contextual_paraformer=ContextualParaformer,
),
type_check=AbsESPnetModel,
default="asr",
@@ -177,6 +179,7 @@
fsmn_scama_opt=FsmnDecoderSCAMAOpt,
paraformer_decoder_sanm=ParaformerSANMDecoder,
paraformer_decoder_san=ParaformerDecoderSAN,
+ contextual_paraformer_decoder=ContextualParaformerDecoder,
),
type_check=AbsDecoder,
default="rnn",
@@ -1098,5 +1101,8 @@
# decoder
var_dict_torch_update_local = model.decoder.convert_tf2torch(var_dict_tf, var_dict_torch)
var_dict_torch_update.update(var_dict_torch_update_local)
+ # bias_encoder
+ var_dict_torch_update_local = model.clas_convert_tf2torch(var_dict_tf, var_dict_torch)
+ var_dict_torch_update.update(var_dict_torch_update_local)
return var_dict_torch_update
--
Gitblit v1.9.1