Fix a few issues found during fine-tuning (#2582)

* Fix the wandb logging calls (`setp` → `step` typo)
* Fix validation loss not being logged: `batch_idx` resets at the start of every epoch, so use the global step counter instead (see the sketch below)
* Update the LR only once per step, not once per step plus once per epoch
* Add early stopping
* Fix bf16 handling: the gradient scaler is only needed for fp16
* Add more logging
---------
Co-authored-by: Tony Mak <tony@Tonys-MacBook-Air-1800.local>
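For context on the validation-loss bullet: `wandb.log(..., step=n)` expects `n` to increase monotonically over the whole run, and wandb warns about and drops rows logged with a step lower than one it has already seen. A `batch_idx` that restarts at 0 each epoch therefore loses everything after the first epoch. A minimal sketch of the global-counter pattern (the `train_one_batch`/`validate` helpers and the project name are hypothetical):

```python
import wandb

# Hypothetical stand-ins for the real training and validation steps.
def train_one_batch(batch):
    return {"train/loss": 0.0}

def validate():
    return {"val/loss": 0.0}

wandb.init(project="finetune-demo")  # project name is illustrative

global_step = 0  # survives across epochs, unlike a per-epoch batch_idx
for epoch in range(3):
    for batch_idx in range(10):
        metrics = train_one_batch(batch_idx)
        global_step += 1
        # wandb expects `step` to increase monotonically across the run;
        # a batch_idx that restarts at 0 each epoch gets later rows dropped.
        wandb.log(metrics, step=global_step)
    # Log validation with the same global counter, not the reset batch_idx.
    wandb.log(validate(), step=global_step)
```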
```diff
 dataloader = dataloader_class(**kwargs)
 # dataloader_tr, dataloader_val = dataloader_class(**kwargs)

-scaler = GradScaler(enabled=True) if trainer.use_fp16 or trainer.use_bf16 else None
+scaler = GradScaler(enabled=True) if trainer.use_fp16 else None
 scaler = ShardedGradScaler(enabled=trainer.use_fp16) if trainer.use_fsdp else scaler

 trainer.resume_checkpoint(
     ...
     scheduler=scheduler,
     scaler=scaler,
 )
```
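The scaler change matches standard PyTorch AMP practice: `GradScaler` exists to counteract gradient underflow in fp16, whose 5-bit exponent flushes small gradients to zero, while bf16 shares fp32's 8-bit exponent range and needs no loss scaling. A minimal sketch of the selection logic, with `use_fp16`/`use_bf16` as stand-in config flags:

```python
import torch
from torch.cuda.amp import GradScaler

use_fp16, use_bf16 = False, True  # stand-in config flags

# fp16's narrow exponent underflows small gradients, so it needs loss scaling;
# bf16 keeps fp32's exponent range and trains without a scaler.
scaler = GradScaler(enabled=True) if use_fp16 else None
dtype = (
    torch.float16 if use_fp16
    else torch.bfloat16 if use_bf16
    else torch.float32
)
```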
```diff
+early_stopping_patience = kwargs.get("train_conf", {}).get("early_stopping_patience", 0)
+best_val_loss = float("inf")
+epochs_no_improve = 0

 dataloader_tr, dataloader_val = None, None
 for epoch in range(trainer.start_epoch, trainer.max_epoch):
     ...
     trainer.start_data_split_i = 0
     trainer.validate_epoch(model=model, dataloader_val=dataloader_val, epoch=epoch + 1)
-    scheduler.step()
+    current_val = trainer.val_loss_avg
+
+    if current_val < best_val_loss:
+        logging.info(f"current_val: {current_val}, best_val_loss: {best_val_loss}")
+        best_val_loss = current_val
+        epochs_no_improve = 0
+    else:
+        epochs_no_improve += 1
+        logging.info(f"No val_loss improvement for {epochs_no_improve}/{early_stopping_patience} epochs")
+        if early_stopping_patience > 0 and epochs_no_improve >= early_stopping_patience:
+            logging.info(f"Early stopping triggered at epoch {epoch+1}")
+            break

     trainer.step_in_epoch = 0
     trainer.save_checkpoint(
         epoch + 1, model=model, optim=optim, scheduler=scheduler, scaler=scaler
```
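The early stopping added here is the standard patience counter. For reference, the same logic can be folded into a small helper; this is a sketch of an equivalent structure under the PR's semantics (patience <= 0 disables stopping), not the PR's own code:

```python
class EarlyStopping:
    """Stop training after `patience` epochs without val-loss improvement."""

    def __init__(self, patience: int = 0):
        self.patience = patience
        self.best = float("inf")
        self.bad_epochs = 0

    def step(self, val_loss: float) -> bool:
        """Record one epoch's validation loss; return True to stop training."""
        if val_loss < self.best:
            self.best = val_loss
            self.bad_epochs = 0
        else:
            self.bad_epochs += 1
        # patience <= 0 disables early stopping, matching the PR's default of 0
        return self.patience > 0 and self.bad_epochs >= self.patience
```

With `stopper = EarlyStopping(patience)`, the loop body above would reduce to `if stopper.step(trainer.val_loss_avg): break`.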
```diff
         if self.use_wandb and wandb is not None:
             wandb.log(
                 description_dict,
-                setp=self.batch_total,
+                step=self.batch_total,
             )

     def close(self, writer=None):
```
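The `wandb is not None` guard pairs with the usual optional-import pattern at module level; the actual import block is not shown in this excerpt, but it presumably looks something like this sketch (`log_metrics` is an illustrative wrapper, not a function from the diff):

```python
# Optional dependency: training still works when wandb is not installed.
try:
    import wandb
except ImportError:
    wandb = None

def log_metrics(description_dict: dict, batch_total: int, use_wandb: bool) -> None:
    # Only touch wandb when it was both requested and successfully imported.
    if use_wandb and wandb is not None:
        wandb.log(description_dict, step=batch_total)
```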
```diff
             yield
         else:
             if dtype == torch.float16 or dtype == torch.bfloat16:
-                yield
-                # with autocast(enabled=True, dtype=dtype):
-                #     yield
+                with autocast(enabled=True, dtype=dtype):
+                    yield
             else:
                 yield
```
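Before this change the fp16/bf16 branch yielded without entering `autocast`, so the configured dtype never affected the forward pass; enabling the context manager is what actually runs compute in reduced precision. A standalone sketch of such a helper (the name `maybe_autocast` is illustrative, and CUDA autocast is assumed):

```python
import contextlib
import torch
from torch.cuda.amp import autocast

@contextlib.contextmanager
def maybe_autocast(dtype=None):
    # Mixed precision only applies to the half-precision dtypes;
    # anything else falls through as a no-op context.
    if dtype in (torch.float16, torch.bfloat16):
        with autocast(enabled=True, dtype=dtype):
            yield
    else:
        yield
```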
```diff
             scaled_loss = model.backward(loss)
         else:
             loss = loss / self.accum_grad
-            if self.use_fp16 or self.use_bf16:
+            if scaler:
                 scaler.scale(loss).backward()
             else:
                 loss.backward()
```
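For readers unfamiliar with the `loss / self.accum_grad` division: with gradient accumulation, `backward()` sums gradients from `accum_grad` micro-batches into `param.grad`, so each micro-batch loss is pre-divided to keep the effective gradient equal to that of one large batch. A self-contained sketch (model, optimizer, and data are placeholders):

```python
import torch

model = torch.nn.Linear(8, 1)                 # placeholder model
optim = torch.optim.SGD(model.parameters(), lr=0.1)
batches = [torch.randn(4, 8) for _ in range(8)]
accum_grad = 4                                # micro-batches per optimizer step

for i, batch in enumerate(batches):
    # Pre-divide so the summed gradients match those of one large batch.
    loss = model(batch).pow(2).mean() / accum_grad
    loss.backward()                           # gradients accumulate in param.grad
    if (i + 1) % accum_grad == 0:
        optim.step()
        optim.zero_grad()
```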
```diff
         # Execute an optimization step (update model parameters)
         if self.use_ddp or self.use_fsdp:
             dist.barrier()
-        if self.use_fp16 or self.use_bf16:
+        if scaler:
             scaler.step(optim)
             scaler.update()
         else:
```
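The fp16 path follows the standard `scale → step → update` sequence: `scale()` multiplies the loss before `backward()`, `step()` unscales the gradients and skips the parameter update if any are inf/NaN, and `update()` adjusts the scale factor for the next iteration. A condensed sketch of one such step (the model wiring is placeholder, and a CUDA device is assumed for real fp16):

```python
import torch
from torch.cuda.amp import GradScaler, autocast

def fp16_step(model, optim, batch, scaler):
    optim.zero_grad()
    with autocast(enabled=True, dtype=torch.float16):
        loss = model(batch).pow(2).mean()
    scaler.scale(loss).backward()  # backward on the scaled loss
    scaler.step(optim)             # unscales grads; skips the step on inf/NaN
    scaler.update()                # adjusts the scale factor for next time

scaler = GradScaler(enabled=True)  # one scaler reused across all steps
```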
```diff
         Args:
             epoch (int): The current epoch number.
         """
         self.val_loss_avg = 0.0
         self.val_acc_avg = 0.0

         if self.use_ddp or self.use_fsdp or self.use_deepspeed:
             dist.barrier()
+        logging.info(f"Validate epoch: {epoch}, rank: {self.rank}\n")
```
```diff
             "data_split_i": kwargs.get("data_split_i", 0),
             "data_split_num": kwargs.get("data_split_num", 1),
             "log_step": batch_idx + kwargs.get("start_step", 0),
-            "batch_total": batch_idx + 1,
+            "batch_total": self.batch_total,
             "step_in_epoch": batch_idx + 1,
             "lr": 0.0,
         }
```
```diff
         if self.use_wandb and wandb is not None:
             wandb.log(
                 description_dict,
-                setp=batch_total,
+                step=batch_total,
             )

     def close(self, writer=None):
```