| | |
| | | output_dir (str): The directory where model checkpoints will be saved. Default is './'. |
| | | resume (str, optional): The file path to a checkpoint to resume training from. |
| | | """ |
| | | self.rank = kwargs.get("rank", 0) |
| | | self.rank = rank |
| | | self.local_rank = local_rank |
| | | self.world_size = world_size |
| | | self.use_ddp = use_ddp |
| | |
| | | elif self.use_fsdp: |
| | | pass |
| | | elif self.rank == 0: |
| | | logging.info(f"Save checkpoint: {epoch}, rank: {self.local_rank}\n") |
| | | logging.info( |
| | | f"Save checkpoint: {epoch}, rank: {self.rank}, local_rank: {self.local_rank}\n" |
| | | ) |
| | | # self.step_or_epoch += 1 |
| | | state = { |
| | | "epoch": epoch, |
| | |
| | | for k_ex in self.effective_save_name_excludes: |
| | | k_tmp = k.replace("module.", "") |
| | | if k.startswith(k_ex): |
| | | logging.info(f"key: {{k}} matching: {k_ex}, not save it") |
| | | logging.info(f"key: {k} matching: {k_ex}, not save it") |
| | | break |
| | | else: |
| | | dst_state_dict[k] = state_dict[k] |