| | |
| | | def __init__(self, path: str, **kwargs): |
| | | super().__init__() |
| | | |
| | | self.max_source_length = kwargs.get("max_source_length", 3000) |
| | | self.min_source_length = kwargs.get("min_source_length", 0) |
| | | self.max_target_length = kwargs.get("max_target_length", 2048) |
| | | self.min_target_length = kwargs.get("min_target_length", 0) |
| | | self.max_token_length = kwargs.get("max_token_length", 2200) |
| | | |
| | | is_training = kwargs.get("is_training", True) |
| | | if not (path.endswith(".jsonl") or path.endswith(".json")): |
| | | # jsonl list file |
| | |
| | | data = data_dict["messages"] |
| | | speech_length = data_dict.get("speech_length", -1) // 8 |
| | | text_length = data_dict.get("text_length", 0) |
| | | if speech_length > self.max_source_length: |
| | | logging.info( |
| | | "speech_length: {speech_length} > {self.max_source_length}, drop it" |
| | | ) |
| | | continue |
| | | if text_length > self.max_target_length: |
| | | continue |
| | | |
| | | self.max_target_length = kwargs.get("max_target_length", 2048) |
| | | |
| | | system, user, assistant = [], [], [] |
| | | for i, item in enumerate(data): |