From 05f05e7421da12e38109df8ba75be52e90f15092 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Tue, 11 Jun 2024 19:00:55 +0800
Subject: [PATCH] decoding
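
Parallelize the per-dataset decoding loops in demo_speech2text.sh by running each
test set in a background job and waiting for all of them, drop the s2tt test sets
from the script (kept commented out), prefix the per-rank timing logs in
train_ds.py with blank lines so they stand out, and add an OpenAIDatasetMultiTurn
dataset class for multi-turn speech-text training.

A rough sketch of the multi-turn item that OpenAIDatasetMultiTurn.__getitem__
expects; the field names and the <|startofspeech|>!path<|endofspeech|> convention
follow the new code, while the concrete paths and texts are only illustrative:

    # hypothetical item; "system"/"user"/"assistant" are parallel per-turn lists
    item = {
        "system": ["You are a helpful assistant.", "You are a helpful assistant."],
        "user": [
            "<|startofspeech|>!/path/to/turn1.wav<|endofspeech|>",
            "<|startofspeech|>!/path/to/turn2.wav<|endofspeech|>",
        ],
        "assistant": ["transcript of turn 1", "transcript of turn 2"],
    }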
---
examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh | 54 +++++-----
funasr/bin/train_ds.py | 4
funasr/datasets/openai_datasets/datasets.py | 224 ++++++++++++++++++++++++++++++++++++++++++++
3 files changed, 253 insertions(+), 29 deletions(-)
diff --git a/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh b/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh
index d4c409b..8e22cb9 100644
--- a/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh
+++ b/examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh
@@ -12,6 +12,7 @@
out_dir="${ckpt_dir}/inference-${ckpt_id}"
mkdir -p ${out_dir}
for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
+{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
@@ -22,10 +23,12 @@
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
+}&
done
+wait
-
-for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
+for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl"; do
+{
jsonl=${jsonl_dir}/${data_set}
output_dir=${out_dir}/${data_set}
mkdir -p ${output_dir}
@@ -36,30 +39,27 @@
python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
+}&
done
-for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do
- jsonl=${jsonl_dir}/${data_set}
- output_dir=${out_dir}/${data_set}
- mkdir -p ${output_dir}
- pred_file=${output_dir}/1best_recog/text_tn
- ref_file=${output_dir}/1best_recog/label
-
- python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
-
- python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
-
-done
-
-for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do
- jsonl=${jsonl_dir}/${data_set}
- output_dir=${out_dir}/${data_set}
- mkdir -p ${output_dir}
- pred_file=${output_dir}/1best_recog/text_tn
- ref_file=${output_dir}/1best_recog/label
-
- python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
-
- python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
-
-done
\ No newline at end of file
+#for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do
+# jsonl=${jsonl_dir}/${data_set}
+# output_dir=${out_dir}/${data_set}
+# mkdir -p ${output_dir}
+# pred_file=${output_dir}/1best_recog/text_tn
+# ref_file=${output_dir}/1best_recog/label
+#
+# python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
+#
+#done
+#
+#for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do
+# jsonl=${jsonl_dir}/${data_set}
+# output_dir=${out_dir}/${data_set}
+# mkdir -p ${output_dir}
+# pred_file=${output_dir}/1best_recog/text_tn
+# ref_file=${output_dir}/1best_recog/label
+#
+# python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
+#
+#done
\ No newline at end of file
diff --git a/funasr/bin/train_ds.py b/funasr/bin/train_ds.py
index 5b1d4fd..67ed2ba 100644
--- a/funasr/bin/train_ds.py
+++ b/funasr/bin/train_ds.py
@@ -182,7 +182,7 @@
time_escaped = (time.perf_counter() - time_slice_i) / 3600.0
logging.info(
- f"rank: {local_rank}, "
+ f"\n\nrank: {local_rank}, "
f"time_escaped_epoch: {time_escaped:.3f} hours, "
f"estimated to finish {dataloader.data_split_num} data_slices, remaining: {dataloader.data_split_num-data_split_i} slices, {(dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours, "
f"epoch: {trainer.max_epoch - epoch} epochs, {((trainer.max_epoch - epoch - 1)*dataloader.data_split_num + dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours\n"
@@ -199,7 +199,7 @@
time2 = time.perf_counter()
time_escaped = (time2 - time1) / 3600.0
logging.info(
- f"rank: {local_rank}, "
+ f"\n\nrank: {local_rank}, "
f"time_escaped_epoch: {time_escaped:.3f} hours, "
f"estimated to finish {trainer.max_epoch} "
f"epoch: {(trainer.max_epoch - epoch) * time_escaped:.3f} hours\n"
diff --git a/funasr/datasets/openai_datasets/datasets.py b/funasr/datasets/openai_datasets/datasets.py
index 8d243ac..6307930 100644
--- a/funasr/datasets/openai_datasets/datasets.py
+++ b/funasr/datasets/openai_datasets/datasets.py
@@ -222,3 +222,227 @@
break
return outputs
+
+
+@tables.register("dataset_classes", "OpenAIDatasetMultiTurn")
+class OpenAIDatasetMultiTurn(torch.utils.data.Dataset):
+ """
+    Multi-turn speech-text dataset in OpenAI chat format (system/user/assistant turns with inline speech segments)
+ """
+
+ def __init__(
+ self,
+ path,
+ index_ds: str = None,
+ frontend=None,
+ tokenizer=None,
+ int_pad_value: int = -1,
+ float_pad_value: float = 0.0,
+ **kwargs,
+ ):
+ super().__init__()
+ index_ds_class = tables.index_ds_classes.get(index_ds)
+ self.index_ds = index_ds_class(path, **kwargs)
+ preprocessor_speech = kwargs.get("preprocessor_speech", None)
+ if preprocessor_speech:
+ preprocessor_speech_class = tables.preprocessor_classes.get(preprocessor_speech)
+ preprocessor_speech = preprocessor_speech_class(
+ **kwargs.get("preprocessor_speech_conf")
+ )
+ self.preprocessor_speech = preprocessor_speech
+ preprocessor_text = kwargs.get("preprocessor_text", None)
+ if preprocessor_text:
+ preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text)
+ preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf"))
+ self.preprocessor_text = preprocessor_text
+
+ self.frontend = frontend
+ self.fs = 16000 if frontend is None else frontend.fs
+ self.data_type = "sound"
+ self.tokenizer = tokenizer
+
+ self.int_pad_value = int_pad_value
+ self.float_pad_value = float_pad_value
+ self.sos = kwargs.get("sos", "<|startoftranscript|>")
+ self.eos = kwargs.get("eos", "<|endoftext|>")
+ self.batch_size = kwargs.get("batch_size")
+ self.batch_type = kwargs.get("batch_type")
+ self.prompt_ids_len = 0
+ self.retry = kwargs.get("retry", 100)
+
+ self.permute = False
+ from funasr.frontends.whisper_frontend import WhisperFrontend
+
+ if isinstance(self.frontend, WhisperFrontend):
+ self.permute = True
+
+ self.pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
+ # self.kwargs = kwargs
+ self.max_token_length = kwargs.get("max_token_length", 1024)
+ self.batch_size_scale_ratio_max = kwargs.get("batch_size_scale_ratio_max", 1.5)
+ self.batch_size_token_max = kwargs.get("batch_size_token_max", 2500)
+ self.multiturn_num_max = kwargs.get("multiturn_num_max", 5)
+
+ def get_source_len(self, index):
+ item = self.index_ds[index]
+ return self.index_ds.get_source_len(item)
+
+ def get_target_len(self, index):
+ item = self.index_ds[index]
+ return self.index_ds.get_target_len(item)
+
+ def __len__(self):
+ return len(self.index_ds)
+
+ def __getitem__(self, index):
+ # import pdb;
+ # pdb.set_trace()
+
+ output = None
+
+ for idx in range(self.retry):
+ badcase_flag = False
+ if idx == 0:
+ index_cur = index
+ else:
+ index_cur = torch.randint(0, len(self.index_ds), ()).item()
+
+ item = self.index_ds[index_cur]
+
+ system = item["system"]
+ user = item["user"]
+ assistant = item["assistant"]
+
+ input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg = [], [], [], [], [], []
+
+ for i, (system_prompt, user_prompt, target_out) in enumerate(
+ zip(system, user, assistant)
+ ):
+ if i >= self.multiturn_num_max:
+ break
+ if i == 0:
+ source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ else:
+ source_input = (
+ f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
+ )
+
+ splits = self.pattern.split(source_input)
+ source_ids = []
+ fbank_mask_i = []
+ fbank_beg_i = []
+ fbank_lens_i = []
+ for k, sub_str in enumerate(splits):
+ if not sub_str.startswith("<|startofspeech|>"):
+ sub_token = self.tokenizer.encode(sub_str)
+ source_ids += sub_token
+ fbank_mask_i += [0] * len(sub_token)
+ else:
+ sub_str = sub_str.replace("<|startofspeech|>", "").replace(
+ "<|endofspeech|>", ""
+ )
+ if sub_str.startswith("!"):
+ try:
+ data_src = load_audio_text_image_video(sub_str[1:], fs=self.fs)
+ except Exception as e:
+ logging.error(
+ f"Loading wav failed! {str(e)}, {traceback.format_exc()}"
+ )
+ badcase_flag = True
+ continue
+ speech, speech_lengths = extract_fbank(
+ data_src,
+ data_type=self.data_type,
+ frontend=self.frontend,
+ is_final=True,
+ ) # speech: [b, T, d]
+ if self.permute:
+ speech = speech.permute(0, 2, 1)
+ # if speech_lengths > self.batch_size:
+ # continue
+
+                            olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2  # output length of a stride-2 layer (kernel 3, pad 1)
+                            olens = 1 + (olens - 3 + 2 * 1) // 2  # applied a second time
+                            sub_token_len = (olens - 1) // 2 + 1  # one more 2x reduction -> number of speech placeholder tokens
+                            sub_token = [0] * sub_token_len  # placeholder ids, marked as speech via fbank_mask
+ fbank_beg_i = [len(source_ids)]
+ source_ids += sub_token
+ fbank_mask_i += [1] * len(sub_token)
+
+ if badcase_flag:
+ continue
+                source_mask = [-100] * len(source_ids)  # -100 marks prompt positions to be ignored in the loss
+ target_out = f"{target_out}<|im_end|>"
+ target_ids = self.tokenizer.encode(target_out)
+ input_ids += source_ids + target_ids
+ labels += source_mask + target_ids
+ fbank_mask += fbank_mask_i
+ fbank_beg.append(fbank_beg_i)
+
+ if len(input_ids) > self.max_token_length:
+ logging.info(
+ f"input_ids > max_token_length: {len(input_ids)}>{self.max_token_length}, {item}"
+ )
+ badcase_flag = True
+ if badcase_flag:
+ continue
+ input_ids = torch.tensor(input_ids, dtype=torch.int64) # [: self.max_token_length]
+ attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
+ labels = torch.tensor(labels, dtype=torch.int64) # [: self.max_token_length]
+
+ fbank = speech[0, :, :]
+ fbank_lens = speech_lengths
+ fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
+ fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
+
+ output = {
+ "speech": fbank,
+ "speech_lengths": fbank_lens,
+ "fbank_mask": fbank_mask,
+ "fbank_beg": fbank_beg,
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "labels_ids": labels,
+ }
+ break
+
+ return output
+
+ def collator(self, samples: list = None):
+
+ for idx in range(self.retry):
+ badcase_flag = False
+
+ outputs = {}
+ for sample in samples:
+ if sample is None:
+ continue
+ for key in sample.keys():
+ if key not in outputs:
+ outputs[key] = []
+ outputs[key].append(sample[key])
+
+ for key, data_list in outputs.items():
+ if isinstance(data_list[0], torch.Tensor):
+ if data_list[0].dtype == torch.int64 or data_list[0].dtype == torch.int32:
+
+ pad_value = self.int_pad_value
+ else:
+ pad_value = self.float_pad_value
+
+ outputs[key] = torch.nn.utils.rnn.pad_sequence(
+ data_list, batch_first=True, padding_value=pad_value
+ )
+
+ if self.batch_type != "example":
+ b, t = outputs["input_ids"].shape
+ if b > 1 and b * t > self.batch_size_token_max:
+ logging.info(
+                        f"Warning, {idx}th retry, b*t: {b}*{t}={b * t} > batch_size_token_max: {self.batch_size_token_max}, dropping the last sample"
+ )
+ samples = samples[:-1]
+ continue
+
+ break
+
+ return outputs
--
Gitblit v1.9.1