From 6fe10a8dbfbab2bdcc28a411f9c5da85a4a8d002 Mon Sep 17 00:00:00 2001
From: hohaiuhsx <hohaiuhsx@gmail.com>
Date: Mon, 10 Mar 2025 23:16:22 +0800
Subject: [PATCH] 修复 当选用SenseVoice模型处理长音频(同时开启vad和output_timestamp)时的异常 (#2413)
---
funasr/models/llm_asr/model.py | 199 +++++++++++++++++++++++++++++++++++--------------
1 file changed, 143 insertions(+), 56 deletions(-)
diff --git a/funasr/models/llm_asr/model.py b/funasr/models/llm_asr/model.py
index 03a2c08..fa1d2c3 100644
--- a/funasr/models/llm_asr/model.py
+++ b/funasr/models/llm_asr/model.py
@@ -168,8 +168,6 @@
text: (Batch, Length)
text_lengths: (Batch,)
"""
- # import pdb;
- # pdb.set_trace()
if len(text_lengths.size()) > 1:
text_lengths = text_lengths[:, 0]
if len(speech_lengths.size()) > 1:
@@ -814,7 +812,7 @@
ibest_writer = self.writer[f"{0 + 1}best_recog"]
results = []
- response_clean = re.sub("[^\w\s\u3000\u4e00-\u9fff]+", "", response)
+ response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
result_i = {"key": key[0], "text": response, "text_tn": response_clean, "label": label}
if loss is not None:
result_i["loss"] = loss
@@ -990,9 +988,9 @@
text: (Batch, Length)
text_lengths: (Batch,)
"""
- import pdb
-
- pdb.set_trace()
+ # import pdb
+ #
+ # pdb.set_trace()
if len(speech_lengths.size()) > 1:
speech_lengths = speech_lengths[:, 0]
@@ -1013,12 +1011,13 @@
fake_token_len = kwargs.get("fake_token_len")
fake_token_len[fake_token_len < 0] = 0
fbank_beg[fbank_beg < 0] = 0
+
speech_idx = 0
for batch_idx in range(batch_size):
for turn_id in range(fbank_beg.shape[1]):
fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
- if fbank_beg[batch_idx, turn_id] > 0:
+ if fbank_beg_idx > 0:
speech_token_len = fake_token_len[batch_idx, turn_id]
speech_token = encoder_out[speech_idx, :speech_token_len, :]
@@ -1027,12 +1026,15 @@
batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
] = speech_token
except Exception as e:
+ #
logging.error(f"{str(e)}, {traceback.format_exc()}")
logging.info(
- f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens[speech_idx].item()}"
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
)
+ # import pdb;
+ # pdb.set_trace()
speech_token_len = encoder_out_lens[speech_idx].item()
- speech_token = encoder_out[speech_idx, turn_id, :speech_token_len, :]
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
inputs_embeds[
batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
] = speech_token
@@ -1066,6 +1068,12 @@
stats["batch_size_x_tokens"] = token_num * batch_size
stats["batch_size_real_tokens"] = attention_mask.sum().item()
stats["padding_tokens"] = stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+
+ dialog_turns = (fbank_beg > 0).sum(-1)
+ dialog_turns_max = torch.max(dialog_turns).int().item()
+ dialog_turns_avg = dialog_turns.sum().item() / batch_size
+ stats["dialog_turns_max"] = dialog_turns_max
+ stats["dialog_turns_avg"] = dialog_turns_avg
# force_gatherable: to-device and to-tensor if scalar for DataParallel
if self.length_normalized_loss:
@@ -1107,8 +1115,8 @@
user = contents["user"]
assistant = contents["assistant"]
pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
- input_ids, labels, source_ids, target_ids, fbank, fbank_lens, fbank_mask, fbank_beg = (
- [],
+
+ input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
[],
[],
[],
@@ -1117,30 +1125,43 @@
[],
[],
)
-
+ input_source_ids = []
for i, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
+ if i >= kwargs.get("multiturn_num_max", 5):
+ break
+ if len(input_ids) > kwargs.get("max_token_length", 1500):
- source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ break
+
+ if i == 0:
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ else:
+ source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
splits = pattern.split(source_input)
- source_ids_i = []
+ source_ids = []
+ fbank_i = []
fbank_mask_i = []
- fbank_beg_i = []
+ fake_token_len_i = 0
+ fbank_beg_i = -1
fbank_lens_i = []
- # target_ids_i = []
+ speech, speech_lengths = [], []
for k, sub_str in enumerate(splits):
if not sub_str.startswith("<|startofspeech|>"):
sub_token = tokenizer.encode(sub_str)
- source_ids_i += sub_token
+ source_ids += sub_token
fbank_mask_i += [0] * len(sub_token)
else:
sub_str = sub_str.replace("<|startofspeech|>", "").replace(
"<|endofspeech|>", ""
)
if sub_str.startswith("!"):
+ sub_str = sub_str[1:]
+ if sub_str.startswith("!"): # !!bytes
+ sub_str = eval(sub_str[1:])
try:
time1 = time.perf_counter()
- data_src = load_audio_text_image_video(sub_str[1:], fs=frontend.fs)
+ data_src = load_audio_text_image_video(sub_str, fs=frontend.fs)
time2 = time.perf_counter()
meta_data["load_data"] = f"{time2 - time1:0.3f}"
except Exception as e:
@@ -1164,49 +1185,70 @@
if kwargs.get("permute", True):
speech = speech.permute(0, 2, 1)
+ if speech_lengths > kwargs.get("max_source_length", 5500):
+ # logging.info(
+ # f"speech_lengths > max_source_length: {speech_lengths}>{self.max_source_length}, {item}"
+ # )
+ badcase_flag = True
olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
olens = 1 + (olens - 3 + 2 * 1) // 2
- sub_token_len = (olens - 1) // 2 + 1
- sub_token = [0] * sub_token_len
- fbank_beg_i = [len(source_ids_i)]
- source_ids_i += sub_token
- fbank_mask_i += [1] * len(sub_token)
+ fake_token_len_i = (olens - 1) // 2 + 1
+ fake_token = [0] * fake_token_len_i
+ fbank_beg_i = len(source_ids)
+ source_ids += fake_token
+ fbank_mask_i += [1] * len(fake_token)
- source_mask = [-100] * len(source_ids_i)
+ fbank_beg += [fbank_beg_i + len(input_ids)]
+ fake_token_len += [fake_token_len_i]
+ source_mask = [-100] * len(source_ids)
target_out = f"{target_out}<|im_end|>"
target_ids = tokenizer.encode(target_out)
- input_ids += source_ids_i + target_ids
+ input_source_ids = input_ids + source_ids
+ input_ids += source_ids + target_ids
labels += source_mask + target_ids
fbank_mask += fbank_mask_i
- fbank_beg.append(fbank_beg_i)
+ if len(speech) > 0:
+ fbank.append(speech[0, :, :])
+ fbank_lens.append(speech_lengths)
input_ids = torch.tensor(input_ids, dtype=torch.int64) # [: self.max_token_length]
attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length]
- source_ids = torch.tensor(source_ids_i, dtype=torch.int64)
- target_ids = torch.tensor(target_ids, dtype=torch.int64)
- fbank = speech[0, :, :]
- fbank_lens = speech_lengths
+ # fbank = speech[0, :, :]
+ # fbank_lens = torch.tensor(fbank_lens, dtype=torch.int32)
fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+ fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
+ source_ids = torch.tensor(input_source_ids, dtype=torch.int64)
+ target_ids = torch.tensor(target_ids, dtype=torch.int64)
+ if len(fbank) > 0:
+ speech = torch.nn.utils.rnn.pad_sequence(fbank, batch_first=True, padding_value=0.0)
+ speech_lengths = torch.nn.utils.rnn.pad_sequence(
+ fbank_lens, batch_first=True, padding_value=-1
+ )
+ else:
+ speech = []
+ speech_lengths = []
output = {
- "speech": fbank[None, :, :],
- "speech_lengths": fbank_lens[:, None],
+ "speech": speech,
+ "speech_lengths": speech_lengths,
"fbank_mask": fbank_mask[None, :],
"fbank_beg": fbank_beg[None,],
- "input_ids": input_ids[None, :],
- "attention_mask": attention_mask[None, :],
- "labels_ids": labels[None, :],
+ "fake_token_len": fake_token_len[None, :],
+ "input_ids": input_ids[None,],
+ "attention_mask": attention_mask[None,],
+ "labels_ids": labels,
"source_ids": source_ids[None, :],
"target_ids": target_ids[None, :],
}
return output
- def inference(
+
+ def inference_prepare(
self,
data_in,
data_lengths=None,
@@ -1228,34 +1270,79 @@
# audio encoder
speech = batch["speech"]
- speech_lengths = batch["speech_lengths"][:, 0]
- # fp16
- if kwargs.get("fp16", False):
- speech = speech.to(torch.float16)
- elif kwargs.get("bf16", False):
- speech = speech.to(torch.bfloat16)
- # audio encoder
- encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+ if len(speech) > 0:
+ speech_lengths = batch["speech_lengths"][:, 0]
+ # fp16
+ if kwargs.get("fp16", False):
+ speech = speech.to(torch.float16)
+ elif kwargs.get("bf16", False):
+ speech = speech.to(torch.bfloat16)
+ # audio encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
- # audio_adaptor
- encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
+ # audio_adaptor
+ encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
input_ids = batch["input_ids"]
source_ids = batch["source_ids"]
+ fbank_beg = batch["fbank_beg"]
+ fake_token_len = batch["fake_token_len"]
+
if not kwargs.get("tearchforing", False):
input_ids = source_ids
+
input_ids[input_ids < 0] = 0
inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
batch_size, token_num, dims = inputs_embeds.shape
- fbank_beg = batch["fbank_beg"]
+
+ fake_token_len[fake_token_len < 0] = 0
+ fbank_beg[fbank_beg < 0] = 0
+
+ speech_idx = 0
for batch_idx in range(batch_size):
- min_len = encoder_out_lens[batch_idx].item()
- fbank_beg_idx = fbank_beg[batch_idx]
- inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + min_len, :] = encoder_out[
- batch_idx, :min_len, :
- ]
+ for turn_id in range(fbank_beg.shape[1]):
+ fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
+ if fbank_beg_idx > 0:
+ speech_token_len = fake_token_len[batch_idx, turn_id]
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
+
+ try:
+ inputs_embeds[
+ batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
+ ] = speech_token
+ except Exception as e:
+ #
+ logging.error(f"{str(e)}, {traceback.format_exc()}")
+ logging.info(
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
+ )
+ # import pdb;
+ # pdb.set_trace()
+ speech_token_len = encoder_out_lens[speech_idx].item()
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
+ inputs_embeds[
+ batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
+ ] = speech_token
+
+ speech_idx += 1
+ return inputs_embeds, contents, batch, source_ids, meta_data
+
+
+ def inference(
+ self,
+ data_in,
+ data_lengths=None,
+ key: list = None,
+ tokenizer=None,
+ frontend=None,
+ **kwargs,
+ ):
+
+ inputs_embeds, contents, batch, source_ids, meta_data = self.inference_prepare(
+ data_in, data_lengths, key, tokenizer, frontend, **kwargs
+ )
llm_dtype = kwargs.get("llm_dtype", "fp32")
if llm_dtype == "fp32":
@@ -1265,7 +1352,7 @@
with torch.cuda.amp.autocast(
enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
):
- label = contents["assistant"][0]
+ label = contents["assistant"][-1]
self.llm = self.llm.to(dtype_map[llm_dtype])
inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
@@ -1308,15 +1395,15 @@
ibest_writer = self.writer[f"{0 + 1}best_recog"]
results = []
- response_clean = re.sub("[^\w\s\u3000\u4e00-\u9fff]+", "", response)
+ response_clean = re.sub(r"[^\w\s\u3000\u4e00-\u9fff]+", "", response)
result_i = {"key": key[0], "text": response, "text_tn": response_clean, "label": label}
if loss is not None:
result_i["loss"] = loss
results.append(result_i)
if ibest_writer is not None:
- ibest_writer["text"][key[0]] = response
- ibest_writer["label"][key[0]] = label
+ ibest_writer["text"][key[0]] = response.replace("\n", " ")
+ ibest_writer["label"][key[0]] = label.replace("\n", " ")
ibest_writer["text_tn"][key[0]] = response_clean
return results, meta_data
--
Gitblit v1.9.1