| | |
| | | with open(path, encoding="utf-8") as fin: |
| | | file_list_all = fin.readlines() |
| | | |
| | | num_per_slice = (len(file_list_all) - 1) // data_split_num + 1 |
| | | num_per_slice = (len(file_list_all) - 1) // data_split_num + 1 # 16 |
| | | file_list = file_list_all[ |
| | | data_split_i * num_per_slice : (data_split_i + 1) * num_per_slice |
| | | ] |
| | |
| | | or target_len > self.max_target_length |
| | | ): |
| | | continue |
| | | |
| | | |
| | | if (source_len + target_len) > self.max_token_length: |
| | | continue |
| | | |
| | | |
| | | contents_i = { |
| | | "source": source, |
| | | "prompt": prompt, |
| | |
| | | text_language = data.get("text_language", None) |
| | | if text_language is not None: |
| | | contents_i["text_language"] = text_language |
| | | if "emo_target" in data: |
| | | contents_i["emo_target"] = data["emo_target"] |
| | | if "event_target" in data: |
| | | contents_i["event_target"] = data["event_target"] |
| | | if "with_or_wo_itn" in data: |
| | | contents_i["with_or_wo_itn"] = data["with_or_wo_itn"] |
| | | # audio_language = data.get("audio_language", None) |
| | | # if audio_language is not None: |
| | | # contents_i["audio_language"] = audio_language |