# Optional kwargs controlling token-length filtering and length scaling
# for the sampler (defaults match the previous hard-coded behavior).
self.max_token_length = kwargs.get("max_token_length", 2048)
self.min_token_length = kwargs.get("min_token_length", 0)
self.length_scale_source = kwargs.get("length_scale_source", 1.0)
# Resume offset: number of already-consumed steps to skip when iterating.
# NOTE: the original assigned 0 here and immediately overwrote it with
# `start_step`; the dead assignment has been removed.
self.start_step = start_step
if self.start_step > 0:
    # Resuming mid-epoch is unusual enough to surface at WARNING level
    # (the original logged a "Warning," prefixed message at INFO level).
    logging.warning(f"start_step > 0, dataloader start from step: {self.start_step}")
# super().__init__(dataset, num_replicas=num_replicas, rank=rank,
| | |
| | | start_idx = self.rank * batches_per_rank |
| | | end_idx = start_idx + batches_per_rank |
| | | rank_batches = buffer_batches[start_idx + self.start_step : end_idx] |
| | | |
| | | if self.start_step > 0: |
| | | logging.info( |
| | | f"Warning, rank: {self.rank}, dataloader start from step: {self.start_step}, batch_num_before: {end_idx-start_idx}, now: {len(rank_batches)}" |
| | | ) |
| | | # Return an iterator over the batches for the current rank |
| | | return iter(rank_batches) |
| | | |
| | |
# Read the full manifest, then keep only this split's slice of it.
with open(path, encoding="utf-8") as fin:
    file_list_all = fin.readlines()

# Ceiling division: each of the `data_split_num` slices gets
# `num_per_slice` entries, with the last slice possibly shorter.
# (The original computed this identical expression twice in a row;
# the duplicate line has been removed.)
num_per_slice = (len(file_list_all) - 1) // data_split_num + 1
file_list = file_list_all[
    data_split_i * num_per_slice : (data_split_i + 1) * num_per_slice
]
| | |
# Recompute the frame-level source length from the raw waveform:
# sample_num / 16000 gives seconds (16 kHz sample rate is assumed here —
# TODO confirm), then * 1000 / 10 converts to 10 ms frames.
sample_num = len(waveform)
source_len = int(sample_num / 16000 * 1000 / 10)
source_len_old = data["source_len"]
if source_len_old != source_len:
    print(f"wav: {wav_path}, old: {source_len_old}, new: {source_len}")
    # Flag large discrepancies (> 100 frames, i.e. > 1 s) separately.
    # `abs()` replaces the original's equivalent two-sided comparison
    # `(old - new) > 100 or (new - old) > 100`.
    if abs(source_len_old - source_len) > 100:
        print(f"old: {source_len_old}, new: {source_len}, wav: {wav_path}")
# Persist the corrected length back into the record and rewrite the line.
data["source_len"] = source_len
jsonl_line = json.dumps(data, ensure_ascii=False)
lines[i] = jsonl_line