liugz18
2024-07-18 d80ac2fd2df4e7fb8a28acfa512bb11472b5cc99
funasr/datasets/openai_datasets/index_ds.py
@@ -15,7 +15,8 @@
    def __init__(self, path: str, **kwargs):
        super().__init__()
        self.max_source_length = kwargs.get("max_source_length", 2048)
        self.max_source_length = kwargs.get("max_source_length", 3000)
        self.min_source_length = kwargs.get("min_source_length", 0)
        self.max_target_length = kwargs.get("max_target_length", 2048)
        self.min_target_length = kwargs.get("min_target_length", 0)
@@ -48,7 +49,19 @@
        for file_json in file_list:
            with open(file_json.strip(), encoding="utf-8") as fin:
                for line in fin:
                    data = json.loads(line.strip())["messages"]
                    data_dict = json.loads(line.strip())
                    data = data_dict["messages"]
                    speech_length = data_dict.get("speech_length", -1) // 8
                    text_length = data_dict.get("text_length", 0)
                    if speech_length > self.max_source_length:
                        logging.info(
                            "speech_length: {speech_length} > {self.max_source_length}, drop it"
                        )
                        continue
                    if text_length > self.max_target_length:
                        continue
                    self.max_target_length = kwargs.get("max_target_length", 2048)
                    system, user, assistant = [], [], []
                    for i, item in enumerate(data):
@@ -63,7 +76,12 @@
                    system = system * len(user)
                    contents_i = {"system": system, "user": user, "assistant": assistant}
                    contents_i = {
                        "system": system,
                        "user": user,
                        "assistant": assistant,
                        "source_len": speech_length + text_length,
                    }
                    contents.append(contents_i)
        self.contents = contents
@@ -80,11 +98,14 @@
        return data
    def get_source_len(self, data_dict):
        return len(data_dict["system"]) + len(data_dict["user"])
        source_len = data_dict.get("source_len", -1)
        if source_len < 0:
            source_len = len(data_dict["system"]) + len(data_dict["user"])
        return source_len
    def get_target_len(self, data_dict):
        """Return the target-side length of one indexed sample.

        As written, this is the number of entries stored under
        ``"assistant"`` in ``data_dict``.
        """
        return len(data_dict["assistant"])
        # NOTE(review): the statement below is unreachable because of the
        # return above. Presumably one of the two returns is a leftover from
        # an edit. The loader stores "source_len" as
        # speech_length + text_length, so returning 0 here may be intended to
        # avoid counting the text twice; confirm and delete the stale return.
        return 0
if __name__ == "__main__":