| | |
| | | output_dir (str): The directory where model checkpoints will be saved. Default is './'. |
| | | resume (str, optional): The file path to a checkpoint to resume training from. |
| | | """ |
| | | self.rank = kwargs.get("rank", 0) |
| | | self.rank = rank |
| | | self.local_rank = local_rank |
| | | self.world_size = world_size |
| | | self.use_ddp = use_ddp |
| | |
| | | elif self.use_fsdp: |
| | | pass |
| | | elif self.rank == 0: |
| | | logging.info(f"Save checkpoint: {epoch}, rank: {self.local_rank}\n") |
| | | logging.info( |
| | | f"Save checkpoint: {epoch}, rank: {self.rank}, local_rank: {self.local_rank}\n" |
| | | ) |
| | | # self.step_or_epoch += 1 |
| | | state = { |
| | | "epoch": epoch, |