From 2acef4bdaea588adee3098a057a395937dff4e6a Mon Sep 17 00:00:00 2001
From: 雾聪 <wucong.lyb@alibaba-inc.com>
Date: 星期一, 08 一月 2024 16:51:42 +0800
Subject: [PATCH] json stamp_sents for websocket-server

---
 funasr/models/e2e_asr_contextual_paraformer.py |    9 ++++++---
 1 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/funasr/models/e2e_asr_contextual_paraformer.py b/funasr/models/e2e_asr_contextual_paraformer.py
index d27fd8d..b474dbc 100644
--- a/funasr/models/e2e_asr_contextual_paraformer.py
+++ b/funasr/models/e2e_asr_contextual_paraformer.py
@@ -125,6 +125,7 @@
         if self.crit_attn_weight > 0:
             self.attn_loss = torch.nn.L1Loss()
         self.crit_attn_smooth = crit_attn_smooth
+        self.length_normalized_loss = length_normalized_loss
 
     def forward(
             self,
@@ -134,7 +135,7 @@
             text_lengths: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
+            dha_pad: torch.Tensor,
     ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
         """Frontend + Encoder + Decoder + Calc loss
 
@@ -207,7 +208,7 @@
         # 2b. Attention decoder branch
         if self.ctc_weight != 1.0:
             loss_att, acc_att, cer_att, wer_att, loss_pre, loss_ideal = self._calc_att_clas_loss(
-                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths, ideal_attn
+                encoder_out, encoder_out_lens, text, text_lengths, hotword_pad, hotword_lengths
             )
 
         # 3. CTC-Att loss definition
@@ -231,6 +232,8 @@
 
         stats["loss"] = torch.clone(loss.detach())
         # force_gatherable: to-device and to-tensor if scalar for DataParallel
+        if self.length_normalized_loss:
+            batch_size = int((text_lengths + self.predictor_bias).sum())
         loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
         return loss, stats, weight
     
@@ -242,7 +245,6 @@
             ys_pad_lens: torch.Tensor,
             hotword_pad: torch.Tensor,
             hotword_lengths: torch.Tensor,
-            ideal_attn: torch.Tensor,
     ):
         encoder_out_mask = (~make_pad_mask(encoder_out_lens, maxlen=encoder_out.size(1))[:, None, :]).to(
             encoder_out.device)
@@ -350,6 +352,7 @@
             else:
                 hw_embed = self.bias_embed(hw_list_pad)
             hw_embed, (h_n, _) = self.bias_encoder(hw_embed)
+            hw_embed = h_n.repeat(encoder_out.shape[0], 1, 1)
         else:
             hw_lengths = [len(i) for i in hw_list]
             hw_list_pad = pad_list([torch.Tensor(i).long() for i in hw_list], 0).to(encoder_out.device)

--
Gitblit v1.9.1