From 23bc5dee4e88ef8b5d0c0d64d2e188c054422e8b Mon Sep 17 00:00:00 2001
From: speech_asr <wangjiaming.wjm@alibaba-inc.com>
Date: Tue, 11 Apr 2023 00:21:45 +0800
Subject: [PATCH] UniASR: inherit from FunASRModel and return speech_raw from encode()

---
 funasr/models/e2e_uni_asr.py |   14 +++++++-------
 1 file changed, 7 insertions(+), 7 deletions(-)

diff --git a/funasr/models/e2e_uni_asr.py b/funasr/models/e2e_uni_asr.py
index 03fbca9..0c26b8e 100644
--- a/funasr/models/e2e_uni_asr.py
+++ b/funasr/models/e2e_uni_asr.py
@@ -23,6 +23,7 @@
 from funasr.models.postencoder.abs_postencoder import AbsPostEncoder
 from funasr.models.preencoder.abs_preencoder import AbsPreEncoder
 from funasr.models.specaug.abs_specaug import AbsSpecAug
+from funasr.models.base_model import FunASRModel
 from funasr.layers.abs_normalize import AbsNormalize
 from funasr.torch_utils.device_funcs import force_gatherable
 from funasr.train.abs_espnet_model import AbsESPnetModel
@@ -38,7 +39,7 @@
         yield
 
 
-class UniASR(AbsESPnetModel):
+class UniASR(FunASRModel):
     """
     Author: Speech Lab, Alibaba Group, China
     """
@@ -198,16 +199,15 @@
 
         # for data-parallel
         text = text[:, : text_lengths.max()]
-        speech = speech[:, :speech_lengths.max(), :]
+        speech = speech[:, :speech_lengths.max()]
 
         ind = self.encoder.overlap_chunk_cls.random_choice(self.training, decoding_ind)
-        speech_raw = speech.clone().to(speech.device)
         # 1. Encoder
         if self.enable_maas_finetune:
             with torch.no_grad():
-                encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
+                speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
         else:
-            encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
+            speech_raw, encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, ind=ind)
 
         intermediate_outs = None
         if isinstance(encoder_out, tuple):
@@ -486,7 +486,7 @@
             # 3. Normalization for feature: e.g. Global-CMVN, Utterance-CMVN
             if self.normalize is not None:
                 feats, feats_lengths = self.normalize(feats, feats_lengths)
-
+        speech_raw = feats.clone().to(feats.device)
         # Pre-encoder, e.g. used for raw input data
         if self.preencoder is not None:
             feats, feats_lengths = self.preencoder(feats, feats_lengths)
@@ -523,7 +523,7 @@
         if intermediate_outs is not None:
             return (encoder_out, intermediate_outs), encoder_out_lens
 
-        return encoder_out, encoder_out_lens
+        return speech_raw, encoder_out, encoder_out_lens
 
     def encode2(
         self,

--
Gitblit v1.9.1