From 6dcd960fda8be389af355ede4ecc583b036029d4 Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: Tue, 05 Mar 2024 16:45:25 +0800
Subject: [PATCH] update cutsplit step

---
 funasr/models/paraformer/cif_predictor.py |  241 +++++++++++++++++++-----------------------------
 1 file changed, 96 insertions(+), 145 deletions(-)

diff --git a/funasr/models/paraformer/cif_predictor.py b/funasr/models/paraformer/cif_predictor.py
index 383d9ca..4d9f5d8 100644
--- a/funasr/models/paraformer/cif_predictor.py
+++ b/funasr/models/paraformer/cif_predictor.py
@@ -1,23 +1,25 @@
+#!/usr/bin/env python3
+# -*- encoding: utf-8 -*-
+# Copyright FunASR (https://github.com/alibaba-damo-academy/FunASR). All Rights Reserved.
+#  MIT License  (https://opensource.org/licenses/MIT)
+
 import torch
-from torch import nn
-from torch import Tensor
 import logging
 import numpy as np
-from funasr.train_utils.device_funcs import to_device
-from funasr.models.transformer.utils.nets_utils import make_pad_mask
-from funasr.models.scama.utils import sequence_mask
-from typing import Optional, Tuple
 
 from funasr.register import tables
+from funasr.train_utils.device_funcs import to_device
+from funasr.models.transformer.utils.nets_utils import make_pad_mask
+from torch.cuda.amp import autocast
 
 @tables.register("predictor_classes", "CifPredictor")
-class CifPredictor(nn.Module):
+class CifPredictor(torch.nn.Module):
     def __init__(self, idim, l_order, r_order, threshold=1.0, dropout=0.1, smooth_factor=1.0, noise_threshold=0, tail_threshold=0.45):
         super().__init__()
 
-        self.pad = nn.ConstantPad1d((l_order, r_order), 0)
-        self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1, groups=idim)
-        self.cif_output = nn.Linear(idim, 1)
+        self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
+        self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1, groups=idim)
+        self.cif_output = torch.nn.Linear(idim, 1)
         self.dropout = torch.nn.Dropout(p=dropout)
         self.threshold = threshold
         self.smooth_factor = smooth_factor
@@ -26,42 +28,44 @@
 
     def forward(self, hidden, target_label=None, mask=None, ignore_id=-1, mask_chunk_predictor=None,
                 target_label_length=None):
-        h = hidden
-        context = h.transpose(1, 2)
-        queries = self.pad(context)
-        memory = self.cif_conv1d(queries)
-        output = memory + context
-        output = self.dropout(output)
-        output = output.transpose(1, 2)
-        output = torch.relu(output)
-        output = self.cif_output(output)
-        alphas = torch.sigmoid(output)
-        alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
-        if mask is not None:
-            mask = mask.transpose(-1, -2).float()
-            alphas = alphas * mask
-        if mask_chunk_predictor is not None:
-            alphas = alphas * mask_chunk_predictor
-        alphas = alphas.squeeze(-1)
-        mask = mask.squeeze(-1)
-        if target_label_length is not None:
-            target_length = target_label_length
-        elif target_label is not None:
-            target_length = (target_label != ignore_id).float().sum(-1)
-        else:
-            target_length = None
-        token_num = alphas.sum(-1)
-        if target_length is not None:
-            alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
-        elif self.tail_threshold > 0.0:
-            hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+    
+        with autocast(False):
+            h = hidden
+            context = h.transpose(1, 2)
+            queries = self.pad(context)
+            memory = self.cif_conv1d(queries)
+            output = memory + context
+            output = self.dropout(output)
+            output = output.transpose(1, 2)
+            output = torch.relu(output)
+            output = self.cif_output(output)
+            alphas = torch.sigmoid(output)
+            alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+            if mask is not None:
+                mask = mask.transpose(-1, -2).float()
+                alphas = alphas * mask
+            if mask_chunk_predictor is not None:
+                alphas = alphas * mask_chunk_predictor
+            alphas = alphas.squeeze(-1)
+            mask = mask.squeeze(-1)
+            if target_label_length is not None:
+                target_length = target_label_length
+            elif target_label is not None:
+                target_length = (target_label != ignore_id).float().sum(-1)
+            else:
+                target_length = None
+            token_num = alphas.sum(-1)
+            if target_length is not None:
+                alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
+            elif self.tail_threshold > 0.0:
+                hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+                
+            acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
             
-        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
-        
-        if target_length is None and self.tail_threshold > 0.0:
-            token_num_int = torch.max(token_num).type(torch.int32).item()
-            acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
-            
+            if target_length is None and self.tail_threshold > 0.0:
+                token_num_int = torch.max(token_num).type(torch.int32).item()
+                acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
+                
         return acoustic_embeds, token_num, alphas, cif_peak
 
     def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
@@ -137,7 +141,7 @@
         return predictor_alignments.detach(), predictor_alignments_length.detach()
 
 @tables.register("predictor_classes", "CifPredictorV2")
-class CifPredictorV2(nn.Module):
+class CifPredictorV2(torch.nn.Module):
     def __init__(self,
                  idim,
                  l_order,
@@ -153,9 +157,9 @@
                  ):
         super(CifPredictorV2, self).__init__()
 
-        self.pad = nn.ConstantPad1d((l_order, r_order), 0)
-        self.cif_conv1d = nn.Conv1d(idim, idim, l_order + r_order + 1)
-        self.cif_output = nn.Linear(idim, 1)
+        self.pad = torch.nn.ConstantPad1d((l_order, r_order), 0)
+        self.cif_conv1d = torch.nn.Conv1d(idim, idim, l_order + r_order + 1)
+        self.cif_output = torch.nn.Linear(idim, 1)
         self.dropout = torch.nn.Dropout(p=dropout)
         self.threshold = threshold
         self.smooth_factor = smooth_factor
@@ -167,45 +171,48 @@
 
     def forward(self, hidden, target_label=None, mask=None, ignore_id=-1, mask_chunk_predictor=None,
                 target_label_length=None):
-        h = hidden
-        context = h.transpose(1, 2)
-        queries = self.pad(context)
-        output = torch.relu(self.cif_conv1d(queries))
-        output = output.transpose(1, 2)
-
-        output = self.cif_output(output)
-        alphas = torch.sigmoid(output)
-        alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
-        if mask is not None:
-            mask = mask.transpose(-1, -2).float()
-            alphas = alphas * mask
-        if mask_chunk_predictor is not None:
-            alphas = alphas * mask_chunk_predictor
-        alphas = alphas.squeeze(-1)
-        mask = mask.squeeze(-1)
-        if target_label_length is not None:
-            target_length = target_label_length
-        elif target_label is not None:
-            target_length = (target_label != ignore_id).float().sum(-1)
-        else:
-            target_length = None
-        token_num = alphas.sum(-1)
-        if target_length is not None:
-            alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
-        elif self.tail_threshold > 0.0:
-            if self.tail_mask:
-                hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+        
+        with autocast(False):
+            h = hidden
+            context = h.transpose(1, 2)
+            queries = self.pad(context)
+            output = torch.relu(self.cif_conv1d(queries))
+            output = output.transpose(1, 2)
+    
+            output = self.cif_output(output)
+            alphas = torch.sigmoid(output)
+            alphas = torch.nn.functional.relu(alphas * self.smooth_factor - self.noise_threshold)
+            if mask is not None:
+                mask = mask.transpose(-1, -2).float()
+                alphas = alphas * mask
+            if mask_chunk_predictor is not None:
+                alphas = alphas * mask_chunk_predictor
+            alphas = alphas.squeeze(-1)
+            mask = mask.squeeze(-1)
+            if target_label_length is not None:
+                target_length = target_label_length.squeeze(-1)
+            elif target_label is not None:
+                target_length = (target_label != ignore_id).float().sum(-1)
             else:
-                hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=None)
-
-        acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
-        if target_length is None and self.tail_threshold > 0.0:
-            token_num_int = torch.max(token_num).type(torch.int32).item()
-            acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
+                target_length = None
+            token_num = alphas.sum(-1)
+            if target_length is not None:
+                alphas *= (target_length / token_num)[:, None].repeat(1, alphas.size(1))
+            elif self.tail_threshold > 0.0:
+                if self.tail_mask:
+                    hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=mask)
+                else:
+                    hidden, alphas, token_num = self.tail_process_fn(hidden, alphas, token_num, mask=None)
+    
+            acoustic_embeds, cif_peak = cif(hidden, alphas, self.threshold)
+            if target_length is None and self.tail_threshold > 0.0:
+                token_num_int = torch.max(token_num).type(torch.int32).item()
+                acoustic_embeds = acoustic_embeds[:, :token_num_int, :]
 
         return acoustic_embeds, token_num, alphas, cif_peak
 
-    def forward_chunk(self, hidden, cache=None):
+    def forward_chunk(self, hidden, cache=None, **kwargs):
+        is_final = kwargs.get("is_final", False)
         batch_size, len_time, hidden_size = hidden.shape
         h = hidden
         context = h.transpose(1, 2)
@@ -226,14 +233,14 @@
 
         if cache is not None and "chunk_size" in cache:
             alphas[:, :cache["chunk_size"][0]] = 0.0
-            if "is_final" in cache and not cache["is_final"]:
+            if not is_final:
                 alphas[:, sum(cache["chunk_size"][:2]):] = 0.0
         if cache is not None and "cif_alphas" in cache and "cif_hidden" in cache:
             cache["cif_hidden"] = to_device(cache["cif_hidden"], device=hidden.device)
             cache["cif_alphas"] = to_device(cache["cif_alphas"], device=alphas.device)
             hidden = torch.cat((cache["cif_hidden"], hidden), dim=1)
             alphas = torch.cat((cache["cif_alphas"], alphas), dim=1)
-        if cache is not None and "is_final" in cache and cache["is_final"]:
+        if cache is not None and is_final:
             tail_hidden = torch.zeros((batch_size, 1, hidden_size), device=hidden.device)
             tail_alphas = torch.tensor([[self.tail_threshold]], device=alphas.device)
             tail_alphas = torch.tile(tail_alphas, (batch_size, 1))
@@ -277,7 +284,7 @@
 
         max_token_len = max(token_length)
         if max_token_len == 0:
-             return hidden, torch.stack(token_length, 0)
+             return hidden, torch.stack(token_length, 0), None, None
         list_ls = []
         for b in range(batch_size):
             pad_frames = torch.zeros((max_token_len - token_length[b], hidden_size), device=alphas.device)
@@ -291,7 +298,7 @@
         cache["cif_alphas"] = torch.unsqueeze(cache["cif_alphas"], axis=0)
         cache["cif_hidden"] = torch.stack(cache_hiddens, axis=0)
         cache["cif_hidden"] = torch.unsqueeze(cache["cif_hidden"], axis=0)
-        return torch.stack(list_ls, 0), torch.stack(token_length, 0)
+        return torch.stack(list_ls, 0), torch.stack(token_length, 0), None, None
 
 
     def tail_process_fn(self, hidden, alphas, token_num=None, mask=None):
@@ -368,64 +375,8 @@
         predictor_alignments_length = predictor_alignments.sum(-1).type(encoder_sequence_length.dtype)
         return predictor_alignments.detach(), predictor_alignments_length.detach()
 
-    def gen_tf2torch_map_dict(self):
-    
-        tensor_name_prefix_torch = self.tf2torch_tensor_name_prefix_torch
-        tensor_name_prefix_tf = self.tf2torch_tensor_name_prefix_tf
-        map_dict_local = {
-            ## predictor
-            "{}.cif_conv1d.weight".format(tensor_name_prefix_torch):
-                {"name": "{}/conv1d/kernel".format(tensor_name_prefix_tf),
-                 "squeeze": None,
-                 "transpose": (2, 1, 0),
-                 },  # (256,256,3),(3,256,256)
-            "{}.cif_conv1d.bias".format(tensor_name_prefix_torch):
-                {"name": "{}/conv1d/bias".format(tensor_name_prefix_tf),
-                 "squeeze": None,
-                 "transpose": None,
-                 },  # (256,),(256,)
-            "{}.cif_output.weight".format(tensor_name_prefix_torch):
-                {"name": "{}/conv1d_1/kernel".format(tensor_name_prefix_tf),
-                 "squeeze": 0,
-                 "transpose": (1, 0),
-                 },  # (1,256),(1,256,1)
-            "{}.cif_output.bias".format(tensor_name_prefix_torch):
-                {"name": "{}/conv1d_1/bias".format(tensor_name_prefix_tf),
-                 "squeeze": None,
-                 "transpose": None,
-                 },  # (1,),(1,)
-        }
-        return map_dict_local
 
-    def convert_tf2torch(self,
-                         var_dict_tf,
-                         var_dict_torch,
-                         ):
-        map_dict = self.gen_tf2torch_map_dict()
-        var_dict_torch_update = dict()
-        for name in sorted(var_dict_torch.keys(), reverse=False):
-            names = name.split('.')
-            if names[0] == self.tf2torch_tensor_name_prefix_torch:
-                name_tf = map_dict[name]["name"]
-                data_tf = var_dict_tf[name_tf]
-                if map_dict[name]["squeeze"] is not None:
-                    data_tf = np.squeeze(data_tf, axis=map_dict[name]["squeeze"])
-                if map_dict[name]["transpose"] is not None:
-                    data_tf = np.transpose(data_tf, map_dict[name]["transpose"])
-                data_tf = torch.from_numpy(data_tf).type(torch.float32).to("cpu")
-                assert var_dict_torch[name].size() == data_tf.size(), "{}, {}, {} != {}".format(name, name_tf,
-                                                                                                var_dict_torch[
-                                                                                                    name].size(),
-                                                                                                data_tf.size())
-                var_dict_torch_update[name] = data_tf
-                logging.info(
-                    "torch tensor: {}, {}, loading from tf tensor: {}, {}".format(name, data_tf.size(), name_tf,
-                                                                                  var_dict_tf[name_tf].shape))
-    
-        return var_dict_torch_update
-
-
-class mae_loss(nn.Module):
+class mae_loss(torch.nn.Module):
 
     def __init__(self, normalize_length=False):
         super(mae_loss, self).__init__()

--
Gitblit v1.9.1