| | |
| | | batch_type="token", |
| | | num_replicas=None, |
| | | rank=None, |
| | | rank_split=False, |
| | | shuffle=True, |
| | | drop_last=False, |
| | | is_training: bool = True, |
| | |
| | | except: |
| | | rank = 0 |
| | | num_replicas = 1 |
| | | |
| | | if rank_split: |
| | | logging.info(f"Warning, rank_split: {rank_split}, batch and shuffle data in local rank") |
| | | rank = 0 |
| | | num_replicas = 1 |
| | | |
| | | self.rank = rank |
| | | self.num_replicas = num_replicas |
| | | self.dataset = dataset |
| | |
| | | self.shuffle = shuffle and is_training |
| | | self.drop_last = drop_last |
| | | |
| | | # Length of the full dataset; divided across replicas below to get num_samples. |
| | | self.total_size = len(self.dataset) |
| | | self.num_samples = int(math.ceil(self.total_size / self.num_replicas)) |
| | | self.epoch = 0 |
| | | self.sort_size = sort_size * num_replicas |
| | |
| | | max_len_in_batch = 0 |
| | | for idx in buffer: |
| | | original_sample_length = self.dataset.get_source_len(idx) |
| | | if original_sample_length > self.max_sample_length: |
| | | if original_sample_length > self.max_token_length: |
| | | continue |
| | | sample_length = 1 if self.batch_type == "example" else original_sample_length |
| | | potential_batch_length = max(max_len_in_batch, sample_length) * (len(batch) + 1) |