zhifu gao
2024-06-20 e65b1f701abca03bf3a1b5fbb200392aabd38c22
funasr/datasets/openai_datasets/index_ds.py
@@ -16,6 +16,12 @@
    def __init__(self, path: str, **kwargs):
        super().__init__()
        self.max_source_length = kwargs.get("max_source_length", 3000)
        self.min_source_length = kwargs.get("min_source_length", 0)
        self.max_target_length = kwargs.get("max_target_length", 2048)
        self.min_target_length = kwargs.get("min_target_length", 0)
        self.max_token_length = kwargs.get("max_token_length", 2200)
        is_training = kwargs.get("is_training", True)
        if not (path.endswith(".jsonl") or path.endswith(".json")):
            # jsonl list file
@@ -47,6 +53,15 @@
                    data = data_dict["messages"]
                    speech_length = data_dict.get("speech_length", -1) // 8
                    text_length = data_dict.get("text_length", 0)
                    if speech_length > self.max_source_length:
                        logging.info(
                            "speech_length: {speech_length} > {self.max_source_length}, drop it"
                        )
                        continue
                    if text_length > self.max_target_length:
                        continue
                    self.max_target_length = kwargs.get("max_target_length", 2048)
                    system, user, assistant = [], [], []
                    for i, item in enumerate(data):