From d2c1204d91d7c98be7998e3966bd82e22750293b Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 04 三月 2024 17:50:29 +0800
Subject: [PATCH] Revert "Dev yf" (#1418)
---
funasr/models/seaco_paraformer/model.py | 18 ++++++++----------
1 files changed, 8 insertions(+), 10 deletions(-)
diff --git a/funasr/models/seaco_paraformer/model.py b/funasr/models/seaco_paraformer/model.py
index a8b1f1f..20b0cc8 100644
--- a/funasr/models/seaco_paraformer/model.py
+++ b/funasr/models/seaco_paraformer/model.py
@@ -30,7 +30,7 @@
from funasr.models.transformer.utils.nets_utils import make_pad_mask, pad_list
from funasr.utils.load_utils import load_audio_text_image_video, extract_fbank
-import pdb
+
if LooseVersion(torch.__version__) >= LooseVersion("1.6.0"):
from torch.cuda.amp import autocast
else:
@@ -128,7 +128,7 @@
hotword_pad = kwargs.get("hotword_pad")
hotword_lengths = kwargs.get("hotword_lengths")
dha_pad = kwargs.get("dha_pad")
-
+
batch_size = speech.shape[0]
# for data-parallel
text = text[:, : text_lengths.max()]
@@ -209,20 +209,17 @@
nfilter=50,
seaco_weight=1.0):
# decoder forward
-
decoder_out, decoder_hidden, _ = self.decoder(encoder_out, encoder_out_lens, sematic_embeds, ys_pad_lens, return_hidden=True, return_both=True)
-
decoder_pred = torch.log_softmax(decoder_out, dim=-1)
if hw_list is not None:
hw_lengths = [len(i) for i in hw_list]
hw_list_ = [torch.Tensor(i).long() for i in hw_list]
hw_list_pad = pad_list(hw_list_, 0).to(encoder_out.device)
selected = self._hotword_representation(hw_list_pad, torch.Tensor(hw_lengths).int().to(encoder_out.device))
-
contextual_info = selected.squeeze(0).repeat(encoder_out.shape[0], 1, 1).to(encoder_out.device)
num_hot_word = contextual_info.shape[1]
_contextual_length = torch.Tensor([num_hot_word]).int().repeat(encoder_out.shape[0]).to(encoder_out.device)
-
+
# ASF Core
if nfilter > 0 and nfilter < num_hot_word:
hotword_scores = self.seaco_decoder.forward_asf6(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens)
@@ -242,7 +239,7 @@
cif_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, sematic_embeds, ys_pad_lens)
dec_attended, _ = self.seaco_decoder(contextual_info, _contextual_length, decoder_hidden, ys_pad_lens)
merged = self._merge(cif_attended, dec_attended)
-
+
dha_output = self.hotword_output_layer(merged) # remove the last token in loss calculation
dha_pred = torch.log_softmax(dha_output, dim=-1)
def _merge_res(dec_output, dha_output):
@@ -256,8 +253,8 @@
# logits = dec_output * dha_mask + dha_output[:,:,:-1] * (1-dha_mask)
logits = dec_output * dha_mask + dha_output[:,:,:] * (1-dha_mask)
return logits
-
merged_pred = _merge_res(decoder_pred, dha_pred)
+ # import pdb; pdb.set_trace()
return merged_pred
else:
return decoder_pred
@@ -307,6 +304,7 @@
logging.info("enable beam_search")
self.init_beam_search(**kwargs)
self.nbest = kwargs.get("nbest", 1)
+
meta_data = {}
# extract fbank feats
@@ -332,7 +330,6 @@
if isinstance(encoder_out, tuple):
encoder_out = encoder_out[0]
-
# predictor
predictor_outs = self.calc_predictor(encoder_out, encoder_out_lens)
pre_acoustic_embeds, pre_token_length, _, _ = predictor_outs[0], predictor_outs[1], \
@@ -341,14 +338,15 @@
if torch.max(pre_token_length) < 1:
return []
+
decoder_out = self._seaco_decode_with_ASF(encoder_out, encoder_out_lens,
pre_acoustic_embeds,
pre_token_length,
hw_list=self.hotword_list)
-
# decoder_out, _ = decoder_outs[0], decoder_outs[1]
_, _, us_alphas, us_peaks = self.calc_predictor_timestamp(encoder_out, encoder_out_lens,
pre_token_length)
+
results = []
b, n, d = decoder_out.size()
for i in range(b):
--
Gitblit v1.9.1