| | |
| | | self._resume_checkpoint(self.output_dir) |
| | | |
| | | for epoch in range(self.start_epoch, self.max_epoch + 1): |
| | | |
| | | time1 = time.perf_counter() |
| | | self._train_epoch(epoch) |
| | | |
| | | |
| | |
| | | |
| | | self.scheduler.step() |
| | | |
| | | time2 = time.perf_counter() |
| | | time_escaped = (time2 - time1)/3600.0 |
| | | print(f"time_escaped_epoch: {time_escaped:.3f} hours, estimated to finish: {(self.max_epoch-epoch)*time_escaped:.3f}") |
| | | |
| | | if self.rank == 0: |
| | | average_checkpoints(self.output_dir, self.avg_nbest_model) |
| | |
| | | ) |
| | | lr = self.scheduler.get_last_lr()[0] |
| | | time_now = datetime.now() |
| | | time_now = now.strftime("%Y-%m-%d %H:%M:%S") |
| | | time_now = time_now.strftime("%Y-%m-%d %H:%M:%S") |
| | | description = ( |
| | | f"{time_now}, " |
| | | f"rank: {self.local_rank}, " |