From 0033151b6286e9d17a9b91567ba649ff14a89464 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 17 六月 2024 00:55:42 +0800
Subject: [PATCH] decoding
---
funasr/models/llm_asr/model.py | 498 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
funasr/datasets/openai_datasets/datasets.py | 36 ++-
2 files changed, 518 insertions(+), 16 deletions(-)
diff --git a/funasr/datasets/openai_datasets/datasets.py b/funasr/datasets/openai_datasets/datasets.py
index 3c2a957..ae9f289 100644
--- a/funasr/datasets/openai_datasets/datasets.py
+++ b/funasr/datasets/openai_datasets/datasets.py
@@ -300,8 +300,9 @@
return len(self.index_ds)
def __getitem__(self, index):
- # import pdb;
- # pdb.set_trace()
+ import pdb
+
+ pdb.set_trace()
output = None
@@ -318,7 +319,15 @@
user = item["user"]
assistant = item["assistant"]
- input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg = [], [], [], [], [], []
+ input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg, fake_token_len = (
+ [],
+ [],
+ [],
+ [],
+ [],
+ [],
+ [],
+ )
for i, (system_prompt, user_prompt, target_out) in enumerate(
zip(system, user, assistant)
@@ -336,7 +345,8 @@
source_ids = []
fbank_i = []
fbank_mask_i = []
- fbank_beg_i = []
+ fake_token_len_i = 0
+ fbank_beg_i = -1
fbank_lens_i = []
for k, sub_str in enumerate(splits):
if not sub_str.startswith("<|startofspeech|>"):
@@ -369,14 +379,17 @@
olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
olens = 1 + (olens - 3 + 2 * 1) // 2
- sub_token_len = (olens - 1) // 2 + 1
- sub_token = [0] * sub_token_len
- fbank_beg_i = [len(source_ids)]
- source_ids += sub_token
- fbank_mask_i += [1] * len(sub_token)
+ fake_token_len_i = (olens - 1) // 2 + 1
+ fake_token = [0] * fake_token_len_i
+ fbank_beg_i = len(source_ids)
+ source_ids += fake_token
+ fbank_mask_i += [1] * len(fake_token)
if badcase_flag:
continue
+
+ fbank_beg += [fbank_beg_i + len(input_ids)]
+ fake_token_len += [fake_token_len_i]
source_mask = [-100] * len(source_ids)
target_out = f"{target_out}<|im_end|>"
target_ids = self.tokenizer.encode(target_out)
@@ -384,9 +397,6 @@
labels += source_mask + target_ids
fbank.append(speech[0, :, :])
fbank_mask += fbank_mask_i
- if len(fbank_beg_i) < 1:
- fbank_beg_i = [-1]
- fbank_beg += fbank_beg_i
if len(input_ids) > self.max_token_length:
logging.info(
@@ -403,12 +413,14 @@
fbank_lens = speech_lengths
fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+ fake_token_len = torch.tensor(fake_token_len, dtype=torch.int32)
output = {
"speech": fbank,
"speech_lengths": fbank_lens,
"fbank_mask": fbank_mask,
"fbank_beg": fbank_beg,
+ "fake_token_len": fake_token_len,
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels_ids": labels,
diff --git a/funasr/models/llm_asr/model.py b/funasr/models/llm_asr/model.py
index 1151269..1e3515f 100644
--- a/funasr/models/llm_asr/model.py
+++ b/funasr/models/llm_asr/model.py
@@ -655,12 +655,25 @@
/ 1000
)
- if kwargs.get("permute", True):
+ if hasattr(frontend, "permute") and not frontend.permute:
+ # if kwargs.get("permute", True):
speech = speech.permute(0, 2, 1)
- olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
- olens = 1 + (olens - 3 + 2 * 1) // 2
- sub_token_len = (olens - 1) // 2 + 1
+ if (
+ kwargs.get("dataset_conf", {}).get("audio_encoder_downsample_rate", 1)
+ == 4
+ ):
+ olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
+ olens = 1 + (olens - 3 + 2 * 1) // 2
+ elif (
+ kwargs.get("dataset_conf", {}).get("audio_encoder_downsample_rate", 1)
+ == 1
+ ):
+ olens = speech_lengths[0].item()
+
+ sub_token_len = (olens - 1) // kwargs.get("dataset_conf", {}).get(
+ "audio_adaptor_downsample_rate", 1
+ ) + 1
sub_token = [0] * sub_token_len
fbank_beg_i = [len(source_ids_i)]
source_ids_i += sub_token
@@ -831,3 +844,480 @@
# audio encoder
encoder_out, encoder_out_lens = self.audio_encoder(speech, speech_lengths)
return encoder_out, encoder_out_lens
+
+
+@tables.register("model_classes", "LLMASR4")
+class LLMASR4(nn.Module):
+ """ """
+
+ def __init__(
+ self,
+ specaug: str = None,
+ specaug_conf: dict = None,
+ normalize: str = None,
+ normalize_conf: dict = None,
+ audio_encoder: str = None,
+ audio_encoder_conf: dict = None,
+ audio_adaptor: str = None,
+ audio_adaptor_conf: dict = None,
+ decoder: str = None,
+ decoder_conf: dict = None,
+ ctc: str = None,
+ ctc_conf: dict = None,
+ ctc_weight: float = 0.5,
+ llm: str = None,
+ llm_conf: dict = None,
+ input_size: int = 80,
+ vocab_size: int = -1,
+ ignore_id: int = -1,
+ blank_id: int = 0,
+ sos: int = 1,
+ eos: int = 2,
+ lsm_weight: float = 0.0,
+ length_normalized_loss: bool = False,
+ report_cer: bool = True,
+ report_wer: bool = True,
+ sym_space: str = "<space>",
+ sym_blank: str = "<blank>",
+ # extract_feats_in_collect_stats: bool = True,
+ share_embedding: bool = False,
+ # preencoder: Optional[AbsPreEncoder] = None,
+ # postencoder: Optional[AbsPostEncoder] = None,
+ **kwargs,
+ ):
+
+ super().__init__()
+
+ # audio encoder
+ hub = audio_encoder_conf.get("hub", None)
+ if hub == "ms":
+ from funasr import AutoModel
+
+ model = AutoModel(model=audio_encoder, model_revision="master")
+ # frontend = model.kwargs.get("frontend")
+ audio_encoder_output_size = model.model.encoder_output_size
+
+ audio_encoder = (
+ model.model.model.encoder if hasattr(model.model, "model") else model.model.encoder
+ )
+
+ # self.frontend = frontend
+
+ elif hub == "hf":
+ pass
+ else:
+ encoder_class = tables.encoder_classes.get(audio_encoder)
+ audio_encoder = encoder_class(input_size=input_size, **audio_encoder_conf)
+ audio_encoder_output_size = audio_encoder.output_size()
+ freeze = audio_encoder_conf.get("freeze", True)
+ freeze_layer_num = int(audio_encoder_conf.get("freeze_layer_num", -1))
+ # if freeze_layer_num > 0:
+ # freeze_layer_num = range(freeze_layer_num)
+
+ if freeze:
+ for name, param in audio_encoder.named_parameters():
+ if freeze_layer_num > 0:
+ idx = re.search(r"\.\d+\.", name)
+ if idx is not None:
+ beg, end = idx.regs[0]
+ layer_id = int(name[beg + 1 : end - 1])
+ if layer_id < freeze_layer_num:
+ param.requires_grad = False
+ elif "ln_post." not in name:
+ param.requires_grad = False
+ else:
+ param.requires_grad = False
+
+ audio_encoder.eval()
+
+ self.audio_encoder = audio_encoder
+
+ # llm
+ self.llm = None
+
+ from transformers import AutoModelForCausalLM, AutoTokenizer, AutoConfig
+
+ init_param_path = llm_conf.get("init_param_path", "vicuna-7b-v1.5")
+
+ model = AutoModelForCausalLM.from_pretrained(
+ init_param_path,
+ load_in_8bit=None,
+ device_map=None,
+ use_cache=None,
+ )
+ freeze = llm_conf.get("freeze", True)
+ if freeze:
+ for name, param in model.named_parameters():
+ param.requires_grad = False
+ model.eval()
+ self.llm_dtype = llm_conf.get("llm_dtype", "fp32")
+ self.llm = model.to(dtype_map[self.llm_dtype])
+ llm_dim = model.get_input_embeddings().weight.shape[-1]
+
+ # adaptor
+ adaptor_class = tables.adaptor_classes.get(audio_adaptor)
+ audio_adaptor_conf["encoder_dim"] = audio_encoder_output_size
+ audio_adaptor_conf["llm_dim"] = llm_dim
+ audio_adaptor = adaptor_class(**audio_adaptor_conf)
+ init_param_path = audio_adaptor_conf.get("init_param_path", None)
+ if init_param_path is not None:
+ src_state = torch.load(init_param_path, map_location="cpu")
+ flag = audio_adaptor.load_state_dict(src_state, strict=False)
+ logging.info(f"Loading audio_adaptor ckpt: {init_param_path}, status: {flag}")
+
+ self.audio_adaptor = audio_adaptor
+
+ self.error_calculator = None
+
+ self.length_normalized_loss = length_normalized_loss
+ self.beam_search = None
+
+ def forward(
+ self,
+ speech: torch.Tensor,
+ speech_lengths: torch.Tensor,
+ input_ids: torch.Tensor,
+ attention_mask: torch.Tensor,
+ labels_ids: torch.Tensor,
+ fbank_beg: torch.Tensor,
+ fbank_mask: torch.Tensor,
+ **kwargs,
+ ) -> Tuple[torch.Tensor, Dict[str, torch.Tensor], torch.Tensor]:
+ """Encoder + Decoder + Calc loss
+ Args:
+ speech: (Batch, Length, ...)
+ speech_lengths: (Batch, )
+ text: (Batch, Length)
+ text_lengths: (Batch,)
+ """
+ # import pdb;
+ # pdb.set_trace()
+ if len(speech_lengths.size()) > 1:
+ speech_lengths = speech_lengths[:, 0]
+
+ batch_size, frames, _ = speech.shape
+
+ with torch.cuda.amp.autocast(enabled=False):
+ # audio encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+ # audio_adaptor
+ encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
+
+ input_ids[input_ids < 0] = 0
+ inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
+
+ batch_size, token_num, dims = inputs_embeds.shape
+ fbank_mask[fbank_mask < 0] = 0
+ fbank_fake_lens = fbank_mask.sum(-1).to(torch.int32)
+ # _, l, _ = encoder_out.shape
+ fake_token_len = kwargs.get("fake_token_len")
+ fake_token_len[fake_token_len < 0] = 0
+ fbank_beg[fbank_beg < 0] = 0
+ for batch_idx in range(batch_size):
+
+ for turn_id in range(fbank_beg.shape[1]):
+ fbank_beg_idx = fbank_beg[batch_idx, turn_id].item()
+ if fbank_beg[turn_id] > 0:
+ speech_token_len = fake_token_len[batch_idx, turn_id]
+ speech_token = encoder_out[batch_idx + turn_id, turn_id, :speech_token_len, :]
+
+ fbank_fake_len = fbank_fake_lens[batch_idx].item()
+ fbank_beg_idx = fbank_beg[batch_idx, 0].item()
+ min_len = min(fbank_fake_len, inputs_embeds.shape[1] - fbank_beg_idx)
+
+ try:
+ inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + min_len, :] = encoder_out[
+ batch_idx, :min_len, :
+ ]
+ except Exception as e:
+ logging.error(f"{str(e)}, {traceback.format_exc()}")
+ logging.info(
+ f"batch_idx: {batch_idx}, inputs_embeds: {inputs_embeds.shape}, fbank_beg_idx: {fbank_beg_idx}, min_len: {min_len}, fbank_fake_len: {fbank_fake_len}, encoder_out: {encoder_out.shape}, encoder_out_lens: {encoder_out_lens[batch_idx].item()}"
+ )
+ fbank_fake_len = encoder_out_lens[batch_idx].item()
+ min_len = min(fbank_fake_len, min_len)
+ inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + min_len, :] = encoder_out[
+ batch_idx, :min_len, :
+ ]
+
+ with torch.cuda.amp.autocast(
+ enabled=True if self.llm_dtype != "fp32" else False, dtype=dtype_map[self.llm_dtype]
+ ):
+ labels_ids[labels_ids == -1] = -100
+ attention_mask[attention_mask < 0] = 0
+ model_outputs = self.llm(
+ inputs_embeds=inputs_embeds.to(dtype_map[self.llm_dtype]),
+ attention_mask=attention_mask,
+ labels=labels_ids,
+ )
+ loss = model_outputs.loss
+
+ stats = {}
+ with torch.no_grad():
+ preds = torch.argmax(model_outputs.logits, -1)
+ acc_att = compute_accuracy(preds[:, :-1], labels_ids[:, 1:], ignore_label=-100)
+ stats["acc"] = acc_att
+
+ stats["loss"] = torch.clone(loss.detach())
+ stats["batch_size"] = batch_size
+ stats["batch_size_x_frames"] = frames * batch_size
+ stats["batch_size_real_frames"] = speech_lengths.sum().item()
+ stats["padding_frames"] = stats["batch_size_x_frames"] - stats["batch_size_real_frames"]
+ stats["batch_size_x_tokens"] = token_num * batch_size
+ stats["batch_size_real_tokens"] = attention_mask.sum().item()
+ stats["padding_tokens"] = stats["batch_size_x_tokens"] - stats["batch_size_real_tokens"]
+
+ # force_gatherable: to-device and to-tensor if scalar for DataParallel
+ if self.length_normalized_loss:
+ batch_size = int((labels_ids > 0 + 1).sum())
+ loss, stats, weight = force_gatherable((loss, stats, batch_size), loss.device)
+ return loss, stats, weight
+
+ def encode(self, speech, speech_lengths):
+ # audio encoder
+ encoder_out, encoder_out_lens = self.audio_encoder(speech.permute(0, 2, 1), speech_lengths)
+
+ return encoder_out, encoder_out_lens
+
+ def data_template(self, data):
+ system, user, assistant = [], [], []
+ for i, item in enumerate(data):
+ role = item["role"]
+ content = item["content"]
+ if role == "system":
+ system.append(content)
+ elif role == "user":
+ user.append(content)
+ elif role == "assistant":
+ assistant.append(content)
+
+ system = system * len(user)
+
+ contents = {
+ "system": system,
+ "user": user,
+ "assistant": assistant,
+ }
+
+ return contents
+
+ def data_load_speech(self, contents: dict, tokenizer, frontend, meta_data={}, **kwargs):
+
+ system = contents["system"]
+ user = contents["user"]
+ assistant = contents["assistant"]
+ pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
+ input_ids, labels, source_ids, target_ids, fbank, fbank_lens, fbank_mask, fbank_beg = (
+ [],
+ [],
+ [],
+ [],
+ [],
+ [],
+ [],
+ [],
+ )
+
+ for i, (system_prompt, user_prompt, target_out) in enumerate(zip(system, user, assistant)):
+
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+
+ splits = pattern.split(source_input)
+ source_ids_i = []
+ fbank_mask_i = []
+ fbank_beg_i = []
+ fbank_lens_i = []
+ # target_ids_i = []
+ for k, sub_str in enumerate(splits):
+ if not sub_str.startswith("<|startofspeech|>"):
+ sub_token = tokenizer.encode(sub_str)
+ source_ids_i += sub_token
+ fbank_mask_i += [0] * len(sub_token)
+ else:
+ sub_str = sub_str.replace("<|startofspeech|>", "").replace(
+ "<|endofspeech|>", ""
+ )
+ if sub_str.startswith("!"):
+ try:
+ time1 = time.perf_counter()
+ data_src = load_audio_text_image_video(sub_str[1:], fs=frontend.fs)
+ time2 = time.perf_counter()
+ meta_data["load_data"] = f"{time2 - time1:0.3f}"
+ except Exception as e:
+ logging.error(f"Loading wav failed! {str(e)}, {traceback.format_exc()}")
+
+ speech, speech_lengths = extract_fbank(
+ data_src,
+ data_type=kwargs.get("data_type", "sound"),
+ frontend=frontend,
+ is_final=True,
+ ) # speech: [b, T, d]
+
+ time3 = time.perf_counter()
+ meta_data["extract_feat"] = f"{time3 - time2:0.3f}"
+ meta_data["batch_data_time"] = (
+ speech_lengths.sum().item()
+ * frontend.frame_shift
+ * frontend.lfr_n
+ / 1000
+ )
+
+ if kwargs.get("permute", True):
+ speech = speech.permute(0, 2, 1)
+
+ olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
+ olens = 1 + (olens - 3 + 2 * 1) // 2
+ sub_token_len = (olens - 1) // 2 + 1
+ sub_token = [0] * sub_token_len
+ fbank_beg_i = [len(source_ids_i)]
+ source_ids_i += sub_token
+ fbank_mask_i += [1] * len(sub_token)
+
+ source_mask = [-100] * len(source_ids_i)
+ target_out = f"{target_out}<|im_end|>"
+ target_ids = tokenizer.encode(target_out)
+ input_ids += source_ids_i + target_ids
+ labels += source_mask + target_ids
+ fbank_mask += fbank_mask_i
+ fbank_beg.append(fbank_beg_i)
+
+ input_ids = torch.tensor(input_ids, dtype=torch.int64) # [: self.max_token_length]
+ attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
+ labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length]
+ source_ids = torch.tensor(source_ids_i, dtype=torch.int64)
+ target_ids = torch.tensor(target_ids, dtype=torch.int64)
+
+ fbank = speech[0, :, :]
+ fbank_lens = speech_lengths
+ fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
+ fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+
+ output = {
+ "speech": fbank[None, :, :],
+ "speech_lengths": fbank_lens[:, None],
+ "fbank_mask": fbank_mask[None, :],
+ "fbank_beg": fbank_beg[None,],
+ "input_ids": input_ids[None, :],
+ "attention_mask": attention_mask[None, :],
+ "labels_ids": labels[None, :],
+ "source_ids": source_ids[None, :],
+ "target_ids": target_ids[None, :],
+ }
+
+ return output
+
+ def inference(
+ self,
+ data_in,
+ data_lengths=None,
+ key: list = None,
+ tokenizer=None,
+ frontend=None,
+ **kwargs,
+ ):
+
+ meta_data = {}
+ prompt = kwargs.get("prompt", None)
+
+ if kwargs.get("batch_size", 1) > 1:
+ raise NotImplementedError("batch decoding is not implemented")
+
+ contents = self.data_template(data_in[0])
+ output = self.data_load_speech(contents, tokenizer, frontend, meta_data=meta_data, **kwargs)
+ batch = to_device(output, kwargs["device"])
+
+ # audio encoder
+ speech = batch["speech"]
+ speech_lengths = batch["speech_lengths"][:, 0]
+ # fp16
+ if kwargs.get("fp16", False):
+ speech = speech.to(torch.float16)
+ elif kwargs.get("bf16", False):
+ speech = speech.to(torch.bfloat16)
+ # audio encoder
+ encoder_out, encoder_out_lens = self.encode(speech, speech_lengths)
+
+ # audio_adaptor
+ encoder_out, encoder_out_lens = self.audio_adaptor(encoder_out, encoder_out_lens)
+
+ input_ids = batch["input_ids"]
+ source_ids = batch["source_ids"]
+ if not kwargs.get("tearchforing", False):
+ input_ids = source_ids
+ input_ids[input_ids < 0] = 0
+ inputs_embeds = self.llm.model.get_input_embeddings()(input_ids)
+
+ batch_size, token_num, dims = inputs_embeds.shape
+ fbank_beg = batch["fbank_beg"]
+ for batch_idx in range(batch_size):
+
+ min_len = encoder_out_lens[batch_idx].item()
+ fbank_beg_idx = fbank_beg[batch_idx]
+ inputs_embeds[batch_idx, fbank_beg_idx : fbank_beg_idx + min_len, :] = encoder_out[
+ batch_idx, :min_len, :
+ ]
+
+ llm_dtype = kwargs.get("llm_dtype", "fp32")
+ if llm_dtype == "fp32":
+ llm_dtype = "fp16" if kwargs.get("fp16", False) else llm_dtype
+ llm_dtype = "bf16" if kwargs.get("bf16", False) else llm_dtype
+
+ with torch.cuda.amp.autocast(
+ enabled=True if llm_dtype != "fp32" else False, dtype=dtype_map[llm_dtype]
+ ):
+ label = contents["assistant"][0]
+ self.llm = self.llm.to(dtype_map[llm_dtype])
+ inputs_embeds = inputs_embeds.to(dtype_map[llm_dtype])
+
+ if not kwargs.get("tearchforing", False):
+
+ generated_ids = self.llm.generate(
+ inputs_embeds=inputs_embeds, max_new_tokens=kwargs.get("max_length", 512)
+ )
+ # generated_ids = [
+ # output_ids[len(input_id) :]
+ # for input_id, output_ids in zip(input_ids, generated_ids)
+ # ]
+ response = tokenizer.batch_decode(
+ generated_ids, skip_special_tokens=kwargs.get("skip_special_tokens", True)
+ )[0]
+
+ loss = None
+ else:
+
+ labels_ids = batch["labels_ids"]
+ labels_ids[labels_ids == -1] = -100
+ attention_mask = batch.get("attention_mask", None)
+ # attention_mask = attention_mask.to(dtype_map[llm_dtype])
+ model_outputs = self.llm(
+ inputs_embeds=inputs_embeds, attention_mask=attention_mask, labels=labels_ids
+ )
+
+ preds = torch.argmax(model_outputs.logits, -1)[:, source_ids.shape[1] :]
+ response = tokenizer.batch_decode(
+ preds,
+ add_special_tokens=False,
+ skip_special_tokens=kwargs.get("skip_special_tokens", True),
+ )[0]
+ loss = model_outputs.loss.item()
+
+ ibest_writer = None
+ if kwargs.get("output_dir") is not None:
+ if not hasattr(self, "writer"):
+ self.writer = DatadirWriter(kwargs.get("output_dir"))
+ ibest_writer = self.writer[f"{0 + 1}best_recog"]
+
+ results = []
+ response_clean = re.sub("[^\w\s\u3000\u4e00-\u9fff]+", "", response)
+ result_i = {"key": key[0], "text": response, "text_tn": response_clean, "label": label}
+ if loss is not None:
+ result_i["loss"] = loss
+ results.append(result_i)
+
+ if ibest_writer is not None:
+ ibest_writer["text"][key[0]] = response
+ ibest_writer["label"][key[0]] = label
+ ibest_writer["text_tn"][key[0]] = response_clean
+
+ return results, meta_data
--
Gitblit v1.9.1