游雁
2024-06-11 05f05e7421da12e38109df8ba75be52e90f15092
decoding
3 files changed
282 ■■■■ lines changed
examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh 54 ●●●●
funasr/bin/train_ds.py 4 ●●●●
funasr/datasets/openai_datasets/datasets.py 224 ●●●●●
examples/industrial_data_pretraining/llm_asr/demo_speech2text.sh
@@ -12,6 +12,7 @@
out_dir="${ckpt_dir}/inference-${ckpt_id}"
mkdir -p ${out_dir}
for data_set in "librispeech_test_clean_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
+{
    jsonl=${jsonl_dir}/${data_set}
    output_dir=${out_dir}/${data_set}
    mkdir -p ${output_dir}
@@ -22,10 +23,12 @@
    python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
+}&
done
wait
for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl" "librispeech_test_other_speech2text.jsonl"; do
for data_set in "aishell1_test_speech2text.jsonl" "aishell2_ios_test_speech2text.jsonl"; do
{
    jsonl=${jsonl_dir}/${data_set}
    output_dir=${out_dir}/${data_set}
    mkdir -p ${output_dir}
@@ -36,30 +39,27 @@
    python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
}&
done
for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do
    jsonl=${jsonl_dir}/${data_set}
    output_dir=${out_dir}/${data_set}
    mkdir -p ${output_dir}
    pred_file=${output_dir}/1best_recog/text_tn
    ref_file=${output_dir}/1best_recog/label
    python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
    python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=true
done
for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do
    jsonl=${jsonl_dir}/${data_set}
    output_dir=${out_dir}/${data_set}
    mkdir -p ${output_dir}
    pred_file=${output_dir}/1best_recog/text_tn
    ref_file=${output_dir}/1best_recog/label
    python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
    python /mnt/workspace/zhifu.gzf/codebase/FunASR/funasr/metrics/wer.py ++ref_file=${ref_file} ++hyp_file=${pred_file} ++cer_file=${pred_file}.cer ++cn_postprocess=false
done
#for data_set in "s2tt_en2zh.v20240605.test.jsonl"; do
#    jsonl=${jsonl_dir}/${data_set}
#    output_dir=${out_dir}/${data_set}
#    mkdir -p ${output_dir}
#    pred_file=${output_dir}/1best_recog/text_tn
#    ref_file=${output_dir}/1best_recog/label
#
#    python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
#
#done
#
#for data_set in "s2tt_zh2en.v20240605.test.jsonl"; do
#    jsonl=${jsonl_dir}/${data_set}
#    output_dir=${out_dir}/${data_set}
#    mkdir -p ${output_dir}
#    pred_file=${output_dir}/1best_recog/text_tn
#    ref_file=${output_dir}/1best_recog/label
#
#    python ./demo_speech2text.py ${ckpt_dir} ${ckpt_id} ${jsonl} ${output_dir} ${device}
#
#done
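
Note: the "{ ... }&" / "wait" idiom used in this script fans the per-dataset evaluations out as background jobs and blocks until all of them finish. A rough Python analogue of the same pattern, with hypothetical paths standing in for the script's variables:

    import subprocess

    # hypothetical stand-ins for ${ckpt_dir}, ${ckpt_id}, ${jsonl_dir}, ${device}
    ckpt_dir, ckpt_id, device = "/path/to/ckpt", "model.pt.ep3", "cuda:0"
    jsonl_dir = "/path/to/jsonl"

    jobs = []
    for data_set in ["aishell1_test_speech2text.jsonl", "aishell2_ios_test_speech2text.jsonl"]:
        jsonl = f"{jsonl_dir}/{data_set}"
        output_dir = f"{ckpt_dir}/inference-{ckpt_id}/{data_set}"
        # equivalent of "{ python ./demo_speech2text.py ... }&"
        jobs.append(subprocess.Popen(
            ["python", "./demo_speech2text.py", ckpt_dir, ckpt_id, jsonl, output_dir, device]
        ))
    # equivalent of "wait"
    for p in jobs:
        p.wait()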
funasr/bin/train_ds.py
@@ -182,7 +182,7 @@
            time_escaped = (time.perf_counter() - time_slice_i) / 3600.0
            logging.info(
                f"rank: {local_rank}, "
                f"\n\nrank: {local_rank}, "
                f"time_escaped_epoch: {time_escaped:.3f} hours, "
                f"estimated to finish {dataloader.data_split_num} data_slices, remaining: {dataloader.data_split_num-data_split_i} slices, {(dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours, "
                f"epoch: {trainer.max_epoch - epoch} epochs, {((trainer.max_epoch - epoch - 1)*dataloader.data_split_num + dataloader.data_split_num-data_split_i)*time_escaped:.3f} hours\n"
@@ -199,7 +199,7 @@
        time2 = time.perf_counter()
        time_escaped = (time2 - time1) / 3600.0
        logging.info(
            f"rank: {local_rank}, "
            f"\n\nrank: {local_rank}, "
            f"time_escaped_epoch: {time_escaped:.3f} hours, "
            f"estimated to finish {trainer.max_epoch} "
            f"epoch: {(trainer.max_epoch - epoch) * time_escaped:.3f} hours\n"
funasr/datasets/openai_datasets/datasets.py
@@ -222,3 +222,227 @@
            break
        return outputs
@tables.register("dataset_classes", "OpenAIDatasetMultiTurn")
class OpenAIDatasetMultiTurn(torch.utils.data.Dataset):
    """
    Multi-turn OpenAI-style (ChatML) speech-to-text dataset.
    """
    def __init__(
        self,
        path,
        index_ds: str = None,
        frontend=None,
        tokenizer=None,
        int_pad_value: int = -1,
        float_pad_value: float = 0.0,
        **kwargs,
    ):
        super().__init__()
        index_ds_class = tables.index_ds_classes.get(index_ds)
        self.index_ds = index_ds_class(path, **kwargs)
        preprocessor_speech = kwargs.get("preprocessor_speech", None)
        if preprocessor_speech:
            preprocessor_speech_class = tables.preprocessor_classes.get(preprocessor_speech)
            preprocessor_speech = preprocessor_speech_class(
                **kwargs.get("preprocessor_speech_conf")
            )
        self.preprocessor_speech = preprocessor_speech
        preprocessor_text = kwargs.get("preprocessor_text", None)
        if preprocessor_text:
            preprocessor_text_class = tables.preprocessor_classes.get(preprocessor_text)
            preprocessor_text = preprocessor_text_class(**kwargs.get("preprocessor_text_conf"))
        self.preprocessor_text = preprocessor_text
        self.frontend = frontend
        self.fs = 16000 if frontend is None else frontend.fs
        self.data_type = "sound"
        self.tokenizer = tokenizer
        self.int_pad_value = int_pad_value
        self.float_pad_value = float_pad_value
        self.sos = kwargs.get("sos", "<|startoftranscript|>")
        self.eos = kwargs.get("eos", "<|endoftext|>")
        self.batch_size = kwargs.get("batch_size")
        self.batch_type = kwargs.get("batch_type")
        self.prompt_ids_len = 0
        self.retry = kwargs.get("retry", 100)
        self.permute = False
        from funasr.frontends.whisper_frontend import WhisperFrontend
        if isinstance(self.frontend, WhisperFrontend):
            self.permute = True
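        # the capturing group makes re.split() keep the <|startofspeech|>...<|endofspeech|>
        # spans in its output, so speech segments survive the split in __getitem__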
        self.pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
        # self.kwargs = kwargs
        self.max_token_length = kwargs.get("max_token_length", 1024)
        self.batch_size_scale_ratio_max = kwargs.get("batch_size_scale_ratio_max", 1.5)
        self.batch_size_token_max = kwargs.get("batch_size_token_max", 2500)
        self.multiturn_num_max = kwargs.get("multiturn_num_max", 5)
    def get_source_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_source_len(item)
    def get_target_len(self, index):
        item = self.index_ds[index]
        return self.index_ds.get_target_len(item)
    def __len__(self):
        return len(self.index_ds)
    def __getitem__(self, index):
        # import pdb;
        # pdb.set_trace()
        output = None
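        # on a bad sample (unreadable audio / over-length), retry up to self.retry
        # times with a uniformly re-drawn index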
        for idx in range(self.retry):
            badcase_flag = False
            if idx == 0:
                index_cur = index
            else:
                index_cur = torch.randint(0, len(self.index_ds), ()).item()
            item = self.index_ds[index_cur]
            system = item["system"]
            user = item["user"]
            assistant = item["assistant"]
            input_ids, labels, fbank, fbank_lens, fbank_mask, fbank_beg = [], [], [], [], [], []
            for i, (system_prompt, user_prompt, target_out) in enumerate(
                zip(system, user, assistant)
            ):
                if i >= self.multiturn_num_max:
                    break
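                # ChatML-style template: the system prompt is emitted only on the first turn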
                if i == 0:
                    source_input = f"<|im_start|>system\n{system_prompt}<|im_end|>\n<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
                else:
                    source_input = (
                        f"<|im_start|>user\n{user_prompt}<|im_end|>\n<|im_start|>assistant\n"
                    )
                splits = self.pattern.split(source_input)
                source_ids = []
                fbank_mask_i = []
                fbank_beg_i = []
                fbank_lens_i = []
                for k, sub_str in enumerate(splits):
                    if not sub_str.startswith("<|startofspeech|>"):
                        sub_token = self.tokenizer.encode(sub_str)
                        source_ids += sub_token
                        fbank_mask_i += [0] * len(sub_token)
                    else:
                        sub_str = sub_str.replace("<|startofspeech|>", "").replace(
                            "<|endofspeech|>", ""
                        )
                        if sub_str.startswith("!"):
                            try:
                                data_src = load_audio_text_image_video(sub_str[1:], fs=self.fs)
                            except Exception as e:
                                logging.error(
                                    f"Loading wav failed! {str(e)}, {traceback.format_exc()}"
                                )
                                badcase_flag = True
                                continue
                            speech, speech_lengths = extract_fbank(
                                data_src,
                                data_type=self.data_type,
                                frontend=self.frontend,
                                is_final=True,
                            )  # speech: [b, T, d]
                            if self.permute:
                                speech = speech.permute(0, 2, 1)
                            # if speech_lengths > self.batch_size:
                            #     continue
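                            # estimated token count for this speech segment, assuming (not
                            # shown in this diff) an encoder front-end of two stride-2 convs
                            # with kernel 3 / padding 1, plus a further 1/2 subsampling:
                            # out_len = (in_len + 2*pad - kernel) // stride + 1 per conv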
                            olens = 1 + (speech_lengths[0].item() - 3 + 2 * 1) // 2
                            olens = 1 + (olens - 3 + 2 * 1) // 2
                            sub_token_len = (olens - 1) // 2 + 1
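                            # zeros are placeholder token ids occupying the speech positions;
                            # fbank_mask marks them, fbank_beg records where the segment starts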
                            sub_token = [0] * sub_token_len
                            fbank_beg_i = [len(source_ids)]
                            source_ids += sub_token
                            fbank_mask_i += [1] * len(sub_token)
                if badcase_flag:
                    continue
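                # -100 is the conventional ignore_index for cross-entropy, so prompt and
                # speech positions contribute nothing to the LM loss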
                source_mask = [-100] * len(source_ids)
                target_out = f"{target_out}<|im_end|>"
                target_ids = self.tokenizer.encode(target_out)
                input_ids += source_ids + target_ids
                labels += source_mask + target_ids
                fbank_mask += fbank_mask_i
                fbank_beg.append(fbank_beg_i)
            if len(input_ids) > self.max_token_length:
                logging.info(
                    f"input_ids > max_token_length: {len(input_ids)}>{self.max_token_length}, {item}"
                )
                badcase_flag = True
            if badcase_flag:
                continue
            input_ids = torch.tensor(input_ids, dtype=torch.int64)  # [: self.max_token_length]
            attention_mask = torch.tensor([1] * len(input_ids), dtype=torch.int32)
            labels = torch.tensor(labels, dtype=torch.int64)  # [: self.max_token_length]
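            # note: speech/speech_lengths hold only the most recently extracted segment,
            # so samples with several <|startofspeech|> spans keep just the last fbank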
            fbank = speech[0, :, :]
            fbank_lens = speech_lengths
            fbank_mask = torch.tensor(fbank_mask, dtype=torch.float32)
            fbank_beg = torch.tensor(fbank_beg, dtype=torch.int32)
            output = {
                "speech": fbank,
                "speech_lengths": fbank_lens,
                "fbank_mask": fbank_mask,
                "fbank_beg": fbank_beg,
                "input_ids": input_ids,
                "attention_mask": attention_mask,
                "labels_ids": labels,
            }
            break
        return output
    def collator(self, samples: list = None):
        for idx in range(self.retry):
            badcase_flag = False
            outputs = {}
            for sample in samples:
                if sample is None:
                    continue
                for key in sample.keys():
                    if key not in outputs:
                        outputs[key] = []
                    outputs[key].append(sample[key])
            for key, data_list in outputs.items():
                if isinstance(data_list[0], torch.Tensor):
                    if data_list[0].dtype == torch.int64 or data_list[0].dtype == torch.int32:
                        pad_value = self.int_pad_value
                    else:
                        pad_value = self.float_pad_value
                    outputs[key] = torch.nn.utils.rnn.pad_sequence(
                        data_list, batch_first=True, padding_value=pad_value
                    )
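            # token-budget guard: if the padded batch exceeds batch_size_token_max,
            # drop the last sample and re-collate on the next retry iteration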
            if self.batch_type != "example":
                b, t = outputs["input_ids"].shape
                if b > 1 and b * t > self.batch_size_token_max:
                    logging.info(
                        f"Warning, {idx}th, b*t: {b}*{t}={b * t} > batch_size_sample_max: {self.batch_size_token_max}, drop last data"
                    )
                    samples = samples[:-1]
                    continue
            break
        return outputs
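
For reference, a standalone demonstration of the prompt splitting used by the dataset classes above (only "re" is needed; the wav path is hypothetical):

    import re

    pattern = re.compile(r"(<\|startofspeech\|>.*?<\|endofspeech\|>)")
    source_input = (
        "<|im_start|>user\n"
        "<|startofspeech|>!/hypothetical/a.wav<|endofspeech|> transcribe it<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
    for k, sub_str in enumerate(pattern.split(source_input)):
        print(k, repr(sub_str))
    # 0 '<|im_start|>user\n'
    # 1 '<|startofspeech|>!/hypothetical/a.wav<|endofspeech|>'
    # 2 ' transcribe it<|im_end|>\n<|im_start|>assistant\n'

Plain-text segments go through the tokenizer; the captured speech span is replaced by placeholder ids sized from the fbank length, exactly as in __getitem__ above.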