From cfe577f16fef9fb5b0a48f07d4f9e232799cc9d4 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期三, 08 五月 2024 00:03:52 +0800
Subject: [PATCH] decoding key

---
 funasr/models/sense_voice/model.py |   14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/funasr/models/sense_voice/model.py b/funasr/models/sense_voice/model.py
index 8198706..0230638 100644
--- a/funasr/models/sense_voice/model.py
+++ b/funasr/models/sense_voice/model.py
@@ -803,10 +803,12 @@
                 tokenizer=tokenizer,
             )
 
-            if len(kwargs.get("data_type", [])) > 1:
+            if (
+                isinstance(kwargs.get("data_type", None), (list, tuple))
+                and len(kwargs.get("data_type", [])) > 1
+            ):
                 audio_sample_list, text_token_int_list = audio_sample_list
                 text_token_int = text_token_int_list[0]
-                text_token_int = tokenizer.encode(text_token_int)
             else:
                 text_token_int = None
 
@@ -846,7 +848,7 @@
         )
 
         if text_token_int is not None:
-            i = 1
+            i = 0
             results = []
             ibest_writer = None
             if kwargs.get("output_dir") is not None:
@@ -855,8 +857,10 @@
                 ibest_writer = self.writer[f"1best_recog"]
 
             # 1. Forward decoder
-            ys_pad = torch.tensor(text_token_int, dtype=torch.int64).to(kwargs["device"])[None, :]
-            ys_pad_lens = torch.tensor([len(text_token_int)], dtype=torch.int64).to(
+            ys_pad = torch.tensor(sos_int + text_token_int, dtype=torch.int64).to(kwargs["device"])[
+                None, :
+            ]
+            ys_pad_lens = torch.tensor([len(sos_int + text_token_int)], dtype=torch.int64).to(
                 kwargs["device"]
             )[None, :]
             decoder_out = self.model.decoder(

--
Gitblit v1.9.1