From 4a7a984a5f3e3f894f86ce82e76ddd13d8a42a20 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Mon, 11 Mar 2024 17:56:30 +0800
Subject: [PATCH] Dev gzf (#1465)

---
 funasr/models/llm_asr_nar/model.py |   20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/funasr/models/llm_asr_nar/model.py b/funasr/models/llm_asr_nar/model.py
index 6a4ecce..d83f571 100644
--- a/funasr/models/llm_asr_nar/model.py
+++ b/funasr/models/llm_asr_nar/model.py
@@ -214,10 +214,13 @@
     
     def encode(
         self, speech: torch.Tensor, speech_lengths: torch.Tensor, **kwargs,
-    ) -> Tuple[torch.Tensor, torch.Tensor]:
+    ):
     
         audio_mask = kwargs.get("audio_mask", None)
         audio_token_lengths = audio_mask.sum(-1) if audio_mask is not None else None
+        text_token_int = kwargs.get("text_token_int", None)
+        if audio_token_lengths is None:
+            audio_token_lengths = torch.tensor([len(text_token_int)], dtype=torch.int64)
 
         batch = {"speech": speech, "speech_lengths": speech_lengths}
         enc, enc_lens = self.audio_encoder.encode(**batch)
@@ -259,7 +262,13 @@
             time1 = time.perf_counter()
             audio_sample_list = load_audio_text_image_video(data_in, fs=frontend.fs, audio_fs=kwargs.get("fs", 16000),
                                                             data_type=kwargs.get("data_type", "sound"),
-                                                            tokenizer=tokenizer)
+                                                            tokenizer=None)
+            if len(kwargs.get("data_type")) > 1:
+                audio_sample_list, text_token_int_list = audio_sample_list
+                text_token_int = text_token_int_list[0].replace(" ", "")
+                text_token_int = tokenizer.encode(text_token_int)
+            else:
+                text_token_int = None
             time2 = time.perf_counter()
             meta_data["load_data"] = f"{time2 - time1:0.3f}"
             speech, speech_lengths = extract_fbank(audio_sample_list, data_type=kwargs.get("data_type", "sound"),
@@ -272,7 +281,7 @@
         speech_lengths = speech_lengths.to(device=kwargs["device"])
         
         # Encoder
-        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+        encoder_out, encoder_out_lens = self.encode(speech, speech_lengths, text_token_int=text_token_int)
 
         # adaptor
         encoder_out = self.adaptor(encoder_out)
@@ -315,7 +324,10 @@
         model_outputs = self.llm(inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=None)
         preds = torch.argmax(model_outputs.logits, -1)
         text = tokenizer.batch_decode(preds, add_special_tokens=False, skip_special_tokens=True)
-        text = text[0].split(': \n')[-1]
+
+        text = text[0].split(': ')[-1]
+        text = text.strip()
+        
         # preds = torch.argmax(model_outputs.logits, -1)
         
         ibest_writer = None

--
Gitblit v1.9.1