From e65b1f701abca03bf3a1b5fbb200392aabd38c22 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期四, 20 六月 2024 17:09:33 +0800
Subject: [PATCH] Dev gzf deepspeed (#1833)
---
funasr/models/llm_asr/model.py | 140 +++++++++++++++++++++++++++++++++-------------
1 files changed, 101 insertions(+), 39 deletions(-)
diff --git a/funasr/models/llm_asr/model.py b/funasr/models/llm_asr/model.py
index 738ba92..43c044e 100644
--- a/funasr/models/llm_asr/model.py
+++ b/funasr/models/llm_asr/model.py
@@ -988,9 +988,9 @@
text: (Batch, Length)
text_lengths: (Batch,)
"""
- import pdb
-
- pdb.set_trace()
+ # import pdb
+ #
+ # pdb.set_trace()
if len(speech_lengths.size()) > 1:
speech_lengths = speech_lengths[:, 0]
@@ -1011,6 +1011,7 @@
fake_token_len = kwargs.get("fake_token_len")
fake_token_len[fake_token_len < 0] = 0
fbank_beg[fbank_beg < 0] = 0
+
speech_idx = 0
for batch_idx in range(batch_size):
@@ -1025,12 +1026,15 @@
batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
] = speech_token
except Exception as e:
+ #
logging.error(f"{str(e)}, {traceback.format_exc()}")
logging.info(
- f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens[speech_idx].item()}"
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
)
+ # import pdb;
+ # pdb.set_trace()
speech_token_len = encoder_out_lens[speech_idx].item()
- speech_token = encoder_out[speech_idx, turn_id, :speech_token_len, :]
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
inputs_embeds[
batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
] = speech_token
@@ -1064,6 +1068,12 @@
stats["batch_size_x_tokens"] = token_num * batch_size
stats["batch_size_real_tokens"] = attention_mask.sum().item()
stats["padding_tokens"] = stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+
+ dialog_turns = (fbank_beg > 0).sum(-1)
+ dialog_turns_max = torch.max(dialog_turns).int().item()
+ dialog_turns_avg = dialog_turns.sum().item() / batch_size
+ stats["dialog_turns_max"] = dialog_turns_max
+ stats["dialog_turns_avg"] = dialog_turns_avg
# force_gatherable: to-device and to-tensor if scalar for DataParallel
if self.length_normalized_loss:
@@ -1105,8 +1115,8 @@
user = contents["user"]
assistant = contents["assistant"]
pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
- input_ids, labels, source_ids, target_ids, fbank, fbank_lens, fbank_mask, fbank_beg = (
- [],
+
+ input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
[],
[],
[],
@@ -1115,21 +1125,30 @@
[],
[],
)
-
+ input_source_ids = []
for i, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
+ if i >= kwargs.get("multiturn_num_max", 5):
+ break
+ if len(input_ids) > kwargs.get("max_token_length", 1500):
- source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ break
+
+ if i == 0:
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ else:
+ source_input = f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
splits = pattern.split(source_input)
- source_ids_i = []
+ source_ids = []
+ fbank_i = []
fbank_mask_i = []
- fbank_beg_i = []
+ fake_token_len_i = 0
+ fbank_beg_i = -1
fbank_lens_i = []
- # target_ids_i = []
for k, sub_str in enumerate(splits):
if not sub_str.startswith("<|startofspeech|>"):
sub_token = tokenizer.encode(sub_str)
- source_ids_i += sub_token
+ source_ids += sub_token
fbank_mask_i += [0] * len(sub_token)
else:
sub_str = sub_str.replace("<|startofspeech|>", "").replace(
@@ -1162,42 +1181,57 @@
if kwargs.get("permute", True):
speech = speech.permute(0, 2, 1)
+ if speech_lengths > kwargs.get("max_source_length", 5500):
+ # logging.info(
+ # f"speech_lengths > max_source_length: {speech_lengths}>{self.max_source_length}, {item}"
+ # )
+ badcase_flag = True
olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
olens = 1 + (olens - 3 + 2 * 1) // 2
- sub_token_len = (olens - 1) // 2 + 1
- sub_token = [0] * sub_token_len
- fbank_beg_i = [len(source_ids_i)]
- source_ids_i += sub_token
- fbank_mask_i += [1] * len(sub_token)
+ fake_token_len_i = (olens - 1) // 2 + 1
+ fake_token = [0] * fake_token_len_i
+ fbank_beg_i = len(source_ids)
+ source_ids += fake_token
+ fbank_mask_i += [1] * len(fake_token)
- source_mask = [-100] * len(source_ids_i)
+ fbank_beg += [fbank_beg_i + len(input_ids)]
+ fake_token_len += [fake_token_len_i]
+ source_mask = [-100] * len(source_ids)
target_out = f"{target_out}<|im_end|>"
target_ids = tokenizer.encode(target_out)
- input_ids += source_ids_i + target_ids
+ input_source_ids = input_ids + source_ids
+ input_ids += source_ids + target_ids
labels += source_mask + target_ids
+ fbank.append(speech[0, :, :])
fbank_mask += fbank_mask_i
- fbank_beg.append(fbank_beg_i)
+ fbank_lens.append(speech_lengths)
input_ids = torch.tensor(input_ids, dtype=torch.int64) # [: self.max_token_length]
attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length]
- source_ids = torch.tensor(source_ids_i, dtype=torch.int64)
- target_ids = torch.tensor(target_ids, dtype=torch.int64)
- fbank = speech[0, :, :]
- fbank_lens = speech_lengths
+ # fbank = speech[0, :, :]
+ # fbank_lens = torch.tensor(fbank_lens, dtype=torch.int32)
fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+ fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
+ source_ids = torch.tensor(input_source_ids, dtype=torch.int64)
+ target_ids = torch.tensor(target_ids, dtype=torch.int64)
+ speech = torch.nn.utils.rnn.pad_sequence(fbank, batch_first=True, padding_value=0.0)
+ speech_lengths = torch.nn.utils.rnn.pad_sequence(
+ fbank_lens, batch_first=True, padding_value=-1
+ )
output = {
- "speech": fbank[None, :, :],
- "speech_lengths": fbank_lens[:, None],
+ "speech": speech,
+ "speech_lengths": speech_lengths,
"fbank_mask": fbank_mask[None, :],
"fbank_beg": fbank_beg[None,],
- "input_ids": input_ids[None, :],
- "attention_mask": attention_mask[None, :],
- "labels_ids": labels[None, :],
+ "fake_token_len": fake_token_len[None, :],
+ "input_ids": input_ids[None,],
+ "attention_mask": attention_mask[None,],
+ "labels_ids": labels,
"source_ids": source_ids[None, :],
"target_ids": target_ids[None, :],
}
@@ -1240,20 +1274,48 @@
input_ids = batch["input_ids"]
source_ids = batch["source_ids"]
+ fbank_beg = batch["fbank_beg"]
+ fake_token_len = batch["fake_token_len"]
+
if not kwargs.get("tearchforing", False):
input_ids = source_ids
+
input_ids[input_ids < 0] = 0
inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
batch_size, token_num, dims = inputs_embeds.shape
- fbank_beg = batch["fbank_beg"]
+
+ fake_token_len[fake_token_len < 0] = 0
+ fbank_beg[fbank_beg < 0] = 0
+
+ speech_idx = 0
for batch_idx in range(batch_size):
- min_len = encoder_out_lens[batch_idx].item()
- fbank_beg_idx = fbank_beg[batch_idx]
- inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + min_len, :] = encoder_out[
- batch_idx, :min_len, :
- ]
+ for turn_id in range(fbank_beg.shape[1]):
+ fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
+ if fbank_beg_idx > 0:
+ speech_token_len = fake_token_len[batch_idx, turn_id]
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
+
+ try:
+ inputs_embeds[
+ batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
+ ] = speech_token
+ except Exception as e:
+ #
+ logging.error(f"{str(e)}, {traceback.format_exc()}")
+ logging.info(
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, speech_token_len: {speech_token_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens}, fake_token_len: {fake_token_len}, speech_lengths: {speech_lengths}"
+ )
+ # import pdb;
+ # pdb.set_trace()
+ speech_token_len = encoder_out_lens[speech_idx].item()
+ speech_token = encoder_out[speech_idx, :speech_token_len, :]
+ inputs_embeds[
+ batch_idx, fbank_beg_idx : fbank_beg_idx + speech_token_len, :
+ ] = speech_token
+
+ speech_idx += 1
llm_dtype = kwargs.get("llm_dtype", "fp32")
if llm_dtype == "fp32":
@@ -1263,7 +1325,7 @@
with torch.cuda.amp.autocast(
enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
):
- label = contents["assistant"][0]
+ label = contents["assistant"][-1]
self.llm = self.llm.to(dtype_map[llm_dtype])
inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
@@ -1313,8 +1375,8 @@
results.append(result_i)
if ibest_writer is not None:
- ibest_writer["text"][key[0]] = response
- ibest_writer["label"][key[0]] = label
+ ibest_writer["text"][key[0]] = response.replace("\n", " ")
+ ibest_writer["label"][key[0]] = label.replace("\n", " ")
ibest_writer["text_tn"][key[0]] = response_clean
return results, meta_data
--
Gitblit v1.9.1