游雁
2024-06-09 3eee773814c392e497557bbad501e0add4c8eca9
funasr/datasets/audio_datasets/index_ds.py
@@ -21,6 +21,7 @@
        self.min_source_length = kwargs.get("min_source_length", 0)
        self.max_target_length = kwargs.get("max_target_length", 2048)
        self.min_target_length = kwargs.get("min_target_length", 0)
        self.max_token_length = kwargs.get("max_token_length", 2200)
        is_training = kwargs.get("is_training", True)
        if not (path.endswith(".jsonl") or path.endswith(".json")):
@@ -34,7 +35,7 @@
            with open(path, encoding="utf-8") as fin:
                file_list_all = fin.readlines()
                num_per_slice = (len(file_list_all) - 1) // data_split_num + 1
                num_per_slice = (len(file_list_all) - 1) // data_split_num + 1  # 16
                file_list = file_list_all[
                    data_split_i * num_per_slice : (data_split_i + 1) * num_per_slice
                ]
@@ -103,6 +104,10 @@
                            or target_len > self.max_target_length
                        ):
                            continue
                        if (source_len + target_len) > self.max_token_length:
                            continue
                        contents_i = {
                            "source": source,
                            "prompt": prompt,