From 6dcd960fda8be389af355ede4ecc583b036029d4 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期二, 05 三月 2024 16:45:25 +0800
Subject: [PATCH] update cutsplit step
---
funasr/models/paraformer/cif_predictor.py | 241 +++++++++++++++++++-----------------------------
1 files changed, 96 insertions(+), 145 deletions(-)
diff --git a/funasr/models/paraformer/cif_predictor.py b/funasr/models/paraformer/cif_predictor.py
index 383d9ca..4d9f5d8 100644
--- a/funasr/models/paraformer/cif_predictor.py
+++ b/funasr/models/paraformer/cif_predictor.py
@@ -1,23 +1,25 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+# MIT License (https://opensource.org/licenses/MIT)
+
import torch
-from torch import nn
-from torch import Tensor
import logging
import numpy as np
-from funasr.train_utils.device_funcs import to_device
-from funasr.models.transformer.utils.nets_utils import make_pad_mask
-from funasr.models.scama.utils import sequence_mask
-from typing import Optional, Tuple
from funasr.register import tables
+from funasr.train_utils.device_funcs import to_device
+from funasr.models.transformer.utils.nets_utils import make_pad_mask
+from torch.cuda.amp import autocast
@tables.register("predictor_classes", "CifPredictor")
-class CifPredictor(nn.Module):
+class CifPredictor(torch.nn.Module):
def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1, smooth_factor=1.0, noise_threshold=0, tail_threshold=0.45):
super().__init__()
- self.pad = nn.ConstantPad1d((l_order, r_order), 0)
- self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1, groups=idim)
- self.cif_output = nn.Linear(idim, 1)
+ self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
+ self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1, groups=idim)
+ self.cif_output = torch.nn.Linear(idim, 1)
self.dropout = torch.nn.Dropout(p=dropout)
self.threshold = threshold
self.smooth_factor = smooth_factor
@@ -26,42 +28,44 @@
def forward(self, hidden, target_label=None, mask=None, ignore_id=-1, mask_chunk_predictor=None,
target_label_length=None):
- h = hidden
- context = h.transpose(1, 2)
- queries = self.pad(context)
- memory = self.cif_conv1d(queries)
- output = memory + context
- output = self.dropout(output)
- output = output.transpose(1, 2)
- output = torch.relu(output)
- output = self.cif_output(output)
- alphas = torch.sigmoid(output)
- alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
- if mask is not None:
- mask = mask.transpose(-1, -2).float()
- alphas = alphas * mask
- if mask_chunk_predictor is not None:
- alphas = alphas * mask_chunk_predictor
- alphas = alphas.squeeze(-1)
- mask = mask.squeeze(-1)
- if target_label_length is not None:
- target_length = target_label_length
- elif target_label is not None:
- target_length = (target_label != ignore_id).float().sum(-1)
- else:
- target_length = None
- token_num = alphas.sum(-1)
- if target_length is not None:
- alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
- elif self.tail_threshold > 0.0:
- hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+
+ with autocast(False):
+ h = hidden
+ context = h.transpose(1, 2)
+ queries = self.pad(context)
+ memory = self.cif_conv1d(queries)
+ output = memory + context
+ output = self.dropout(output)
+ output = output.transpose(1, 2)
+ output = torch.relu(output)
+ output = self.cif_output(output)
+ alphas = torch.sigmoid(output)
+ alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+ if mask is not None:
+ mask = mask.transpose(-1, -2).float()
+ alphas = alphas * mask
+ if mask_chunk_predictor is not None:
+ alphas = alphas * mask_chunk_predictor
+ alphas = alphas.squeeze(-1)
+ mask = mask.squeeze(-1)
+ if target_label_length is not None:
+ target_length = target_label_length
+ elif target_label is not None:
+ target_length = (target_label != ignore_id).float().sum(-1)
+ else:
+ target_length = None
+ token_num = alphas.sum(-1)
+ if target_length is not None:
+ alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
+ elif self.tail_threshold > 0.0:
+ hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+
+ acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
- acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
-
- if target_length is None and self.tail_threshold > 0.0:
- token_num_int = torch.max(token_num).type(torch.int32).item()
- acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
-
+ if target_length is None and self.tail_threshold > 0.0:
+ token_num_int = torch.max(token_num).type(torch.int32).item()
+ acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
+
return acoustic_embeds, token_num, alphas, cif_peak
def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
@@ -137,7 +141,7 @@
return predictor_alignments.detach(), predictor_alignments_length.detach()
@tables.register("predictor_classes", "CifPredictorV2")
-class CifPredictorV2(nn.Module):
+class CifPredictorV2(torch.nn.Module):
def __init__(self,
idim,
l_order,
@@ -153,9 +157,9 @@
):
super(CifPredictorV2, self).__init__()
- self.pad = nn.ConstantPad1d((l_order, r_order), 0)
- self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1)
- self.cif_output = nn.Linear(idim, 1)
+ self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
+ self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1)
+ self.cif_output = torch.nn.Linear(idim, 1)
self.dropout = torch.nn.Dropout(p=dropout)
self.threshold = threshold
self.smooth_factor = smooth_factor
@@ -167,45 +171,48 @@
def forward(self, hidden, target_label=None, mask=None, ignore_id=-1, mask_chunk_predictor=None,
target_label_length=None):
- h = hidden
- context = h.transpose(1, 2)
- queries = self.pad(context)
- output = torch.relu(self.cif_conv1d(queries))
- output = output.transpose(1, 2)
-
- output = self.cif_output(output)
- alphas = torch.sigmoid(output)
- alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
- if mask is not None:
- mask = mask.transpose(-1, -2).float()
- alphas = alphas * mask
- if mask_chunk_predictor is not None:
- alphas = alphas * mask_chunk_predictor
- alphas = alphas.squeeze(-1)
- mask = mask.squeeze(-1)
- if target_label_length is not None:
- target_length = target_label_length
- elif target_label is not None:
- target_length = (target_label != ignore_id).float().sum(-1)
- else:
- target_length = None
- token_num = alphas.sum(-1)
- if target_length is not None:
- alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
- elif self.tail_threshold > 0.0:
- if self.tail_mask:
- hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+
+ with autocast(False):
+ h = hidden
+ context = h.transpose(1, 2)
+ queries = self.pad(context)
+ output = torch.relu(self.cif_conv1d(queries))
+ output = output.transpose(1, 2)
+
+ output = self.cif_output(output)
+ alphas = torch.sigmoid(output)
+ alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+ if mask is not None:
+ mask = mask.transpose(-1, -2).float()
+ alphas = alphas * mask
+ if mask_chunk_predictor is not None:
+ alphas = alphas * mask_chunk_predictor
+ alphas = alphas.squeeze(-1)
+ mask = mask.squeeze(-1)
+ if target_label_length is not None:
+ target_length = target_label_length.squeeze(-1)
+ elif target_label is not None:
+ target_length = (target_label != ignore_id).float().sum(-1)
else:
- hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=None)
-
- acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
- if target_length is None and self.tail_threshold > 0.0:
- token_num_int = torch.max(token_num).type(torch.int32).item()
- acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
+ target_length = None
+ token_num = alphas.sum(-1)
+ if target_length is not None:
+ alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
+ elif self.tail_threshold > 0.0:
+ if self.tail_mask:
+ hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+ else:
+ hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=None)
+
+ acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
+ if target_length is None and self.tail_threshold > 0.0:
+ token_num_int = torch.max(token_num).type(torch.int32).item()
+ acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
return acoustic_embeds, token_num, alphas, cif_peak
- def forward_chunk(self, hidden, cache=None):
+ def forward_chunk(self, hidden, cache=None, **kwargs):
+ is_final = kwargs.get("is_final", False)
batch_size, len_time, hidden_size = hidden.shape
h = hidden
context = h.transpose(1, 2)
@@ -226,14 +233,14 @@
if cache is not None and "chunk_size" in cache:
alphas[:, :cache["chunk_size"][0]] = 0.0
- if "is_final" in cache and not cache["is_final"]:
+ if not is_final:
alphas[:, sum(cache["chunk_size"][:2]):] = 0.0
if cache is not None and "cif_alphas" in cache and "cif_hidden" in cache:
cache["cif_hidden"] = to_device(cache["cif_hidden"], device=hidden.device)
cache["cif_alphas"] = to_device(cache["cif_alphas"], device=alphas.device)
hidden = torch.cat((cache["cif_hidden"], hidden), dim=1)
alphas = torch.cat((cache["cif_alphas"], alphas), dim=1)
- if cache is not None and "is_final" in cache and cache["is_final"]:
+ if cache is not None and is_final:
tail_hidden = torch.zeros((batch_size, 1, hidden_size), device=hidden.device)
tail_alphas = torch.tensor([[self.tail_threshold]], device=alphas.device)
tail_alphas = torch.tile(tail_alphas, (batch_size, 1))
@@ -277,7 +284,7 @@
max_token_len = max(token_length)
if max_token_len == 0:
- return hidden, torch.stack(token_length, 0)
+ return hidden, torch.stack(token_length, 0), None, None
list_ls = []
for b in range(batch_size):
pad_frames = torch.zeros((max_token_len - token_length[b], hidden_size), device=alphas.device)
@@ -291,7 +298,7 @@
cache["cif_alphas"] = torch.unsqueeze(cache["cif_alphas"], axis=0)
cache["cif_hidden"] = torch.stack(cache_hiddens, axis=0)
cache["cif_hidden"] = torch.unsqueeze(cache["cif_hidden"], axis=0)
- return torch.stack(list_ls, 0), torch.stack(token_length, 0)
+ return torch.stack(list_ls, 0), torch.stack(token_length, 0), None, None
def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
@@ -368,64 +375,8 @@
predictor_alignments_length = predictor_alignments.sum(-1).type(encoder_sequence_length.dtype)
return predictor_alignments.detach(), predictor_alignments_length.detach()
- def gen_tf2torch_map_dict(self):
-
- tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
- tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
- map_dict_local = {
- ## predictor
- "{}.cif_conv1d.weight".format(tensor_name_prefix_torch):
- {"name": "{}/conv1d/kernel".format(tensor_name_prefix_tf),
- "squeeze": None,
- "transpose": (2, 1, 0),
- }, # (256,256,3),(3,256,256)
- "{}.cif_conv1d.bias".format(tensor_name_prefix_torch):
- {"name": "{}/conv1d/bias".format(tensor_name_prefix_tf),
- "squeeze": None,
- "transpose": None,
- }, # (256,),(256,)
- "{}.cif_output.weight".format(tensor_name_prefix_torch):
- {"name": "{}/conv1d_1/kernel".format(tensor_name_prefix_tf),
- "squeeze": 0,
- "transpose": (1, 0),
- }, # (1,256),(1,256,1)
- "{}.cif_output.bias".format(tensor_name_prefix_torch):
- {"name": "{}/conv1d_1/bias".format(tensor_name_prefix_tf),
- "squeeze": None,
- "transpose": None,
- }, # (1,),(1,)
- }
- return map_dict_local
- def convert_tf2torch(self,
- var_dict_tf,
- var_dict_torch,
- ):
- map_dict = self.gen_tf2torch_map_dict()
- var_dict_torch_update = dict()
- for name in sorted(var_dict_torch.keys(), reverse=False):
- names = name.split('.')
- if names[0] == self.tf2torch_tensor_name_prefix_torch:
- name_tf = map_dict[name]["name"]
- data_tf = var_dict_tf[name_tf]
- if map_dict[name]["squeeze"] is not None:
- data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"])
- if map_dict[name]["transpose"] is not None:
- data_tf = np.transpose(data_tf, map_dict[name]["transpose"])
- data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
- assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
- var_dict_torch[
- name].size(),
- data_tf.size())
- var_dict_torch_update[name] = data_tf
- logging.info(
- "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
- var_dict_tf[name_tf].shape))
-
- return var_dict_torch_update
-
-
-class mae_loss(nn.Module):
+class mae_loss(torch.nn.Module):
def __init__(self, normalize_length=False):
super(mae_loss, self).__init__()
--
Gitblit v1.9.1