commit 861147c7308b91068ffa02724fdf74ee623a909e
Author: zhifu gao
Date:   2024-04-24

diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py
--- a/funasr/train_utils/trainer.py
+++ b/funasr/train_utils/trainer.py
@@ -15,6 +15,7 @@
from funasr.train_utils.average_nbest_models import average_checkpoints
from torch.distributed.fsdp.sharded_grad_scaler import ShardedGradScaler
@contextmanager
def maybe_autocast(enabled):
    if enabled:
@@ -22,6 +23,7 @@
            yield
    else:
        yield
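
Note: the hunk boundary elides the body under "if enabled:". For orientation, a minimal
sketch of the pattern this helper implements, assuming torch.cuda.amp.autocast is the
wrapped context (the actual call in the file may differ):

    from contextlib import contextmanager

    import torch

    @contextmanager
    def maybe_autocast(enabled):
        # Enter autocast only when mixed precision is requested, so call
        # sites can use a single `with maybe_autocast(use_fp16):` block.
        if enabled:
            with torch.cuda.amp.autocast():
                yield
        else:
            yield
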
class Trainer:
    """
@@ -39,13 +41,15 @@
        resume (str, optional): Path to a checkpoint to resume training from.
    """
    
-    def __init__(self,
-                 local_rank,
-                 use_ddp: bool = False,
-                 use_fsdp: bool = False,
-                 use_fp16: bool = False,
-                 output_dir: str="./",
-                 **kwargs):
+    def __init__(
+        self,
+        local_rank,
+        use_ddp: bool = False,
+        use_fsdp: bool = False,
+        use_fp16: bool = False,
+        output_dir: str = "./",
+        **kwargs,
+    ):
        """
        Initializes the Trainer class with the model, optimizer, scheduler, dataloaders, and other settings.
@@ -64,13 +68,13 @@
        self.output_dir = output_dir
        if not os.path.exists(self.output_dir):
            os.makedirs(self.output_dir, exist_ok=True)
-        self.resume = kwargs.get('resume', True)
+        self.resume = kwargs.get("resume", True)
        self.start_epoch = 0
-        self.max_epoch = kwargs.get('max_epoch', 100)
+        self.max_epoch = kwargs.get("max_epoch", 100)
        self.local_rank = local_rank
        self.use_ddp = use_ddp
        self.use_fsdp = use_fsdp
-        self.device = kwargs.get('device', "cuda")
+        self.device = kwargs.get("device", "cuda")
        # self.kwargs = kwargs
        self.log_interval = kwargs.get("log_interval", 50)
        self.batch_total = 0
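
Note: the constructor reads most settings from **kwargs with defaults. A hedged usage
sketch, using only argument names visible in this hunk (values are illustrative):

    from funasr.train_utils.trainer import Trainer

    trainer = Trainer(
        local_rank=0,
        use_ddp=False,
        use_fsdp=False,
        use_fp16=False,
        output_dir="./exp",  # created if missing
        resume=True,         # kwargs.get("resume", True)
        max_epoch=100,       # kwargs.get("max_epoch", 100)
        device="cuda",       # kwargs.get("device", "cuda")
        log_interval=50,     # kwargs.get("log_interval", 50)
    )
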
@@ -83,8 +87,6 @@
        self.accum_grad = kwargs.get("accum_grad", 1)
        self.grad_clip = kwargs.get("grad_clip", 10.0)
        self.grad_clip_type = kwargs.get("grad_clip_type", 2.0)
    
        try:
            rank = dist.get_rank()
@@ -106,7 +108,9 @@
        self.val_acc_step_or_eoch = {}
        self.val_loss_step_or_eoch = {}
       
-    def save_checkpoint(self, epoch,
-                        step=None,
-                        model=None,
-                        optim=None,
+    def save_checkpoint(
+        self,
+        epoch,
+        step=None,
+        model=None,
+        optim=None,
@@ -126,10 +130,10 @@
            logging.info(f"Save checkpoint: {epoch}, rank: {self.local_rank}\n")
            # self.step_or_epoch += 1
            state = {
-                'epoch': epoch,
-                'state_dict': model.state_dict(),
-                'optimizer': optim.state_dict(),
-                'scheduler': scheduler.state_dict(),
+                "epoch": epoch,
+                "state_dict": model.state_dict(),
+                "optimizer": optim.state_dict(),
+                "scheduler": scheduler.state_dict(),
                "saved_ckpts": self.saved_ckpts,
                "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
                "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
@@ -144,37 +148,53 @@
            # Create output directory if it does not exist
            os.makedirs(self.output_dir, exist_ok=True)
            if step is None:
-                ckpt_name = f'model.pt.ep{epoch}'
+                ckpt_name = f"model.pt.ep{epoch}"
            else:
-                ckpt_name = f'model.pt.ep{epoch}.{step}'
+                ckpt_name = f"model.pt.ep{epoch}.{step}"
            filename = os.path.join(self.output_dir, ckpt_name)
            torch.save(state, filename)
            
-            logging.info(f'\nCheckpoint saved to {filename}\n')
-            latest = Path(os.path.join(self.output_dir, f'model.pt'))
+            logging.info(f"\nCheckpoint saved to {filename}\n")
+            latest = Path(os.path.join(self.output_dir, f"model.pt"))
            torch.save(state, latest)
            if self.best_step_or_epoch == "":
                self.best_step_or_epoch = ckpt_name
             
            if self.avg_keep_nbest_models_type == "acc":
-                if self.val_acc_step_or_eoch[ckpt_name] >= self.val_acc_step_or_eoch[self.best_step_or_epoch]:
+                if (
+                    self.val_acc_step_or_eoch[ckpt_name]
+                    >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+                ):
                    self.best_step_or_epoch = ckpt_name
-                    best_ckpt = Path(os.path.join(self.output_dir, f'model.pt.best'))
+                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                    torch.save(state, best_ckpt)
                    logging.info(f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}")
                    logging.info(
                        f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                    )
                else:
                    logging.info(f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}")
                    logging.info(
                        f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}"
                    )
            elif self.avg_keep_nbest_models_type == "loss":
-                if self.val_loss_step_or_eoch[ckpt_name] <= self.val_loss_step_or_eoch[self.best_step_or_epoch]:
+                if (
+                    self.val_loss_step_or_eoch[ckpt_name]
+                    <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+                ):
                    self.best_step_or_epoch = ckpt_name
-                    best_ckpt = Path(os.path.join(self.output_dir, f'model.pt.best'))
+                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                    torch.save(state, best_ckpt)
                    logging.info(f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}")
                    logging.info(
                        f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                    )
                else:
                    logging.info(f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}")
                    logging.info(
                        f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}"
                    )
            else:
                print("Undo")
-            self.saved_ckpts[ckpt_name] = getattr(self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch")[ckpt_name]
+            self.saved_ckpts[ckpt_name] = getattr(
+                self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
+            )[ckpt_name]
            if self.keep_nbest_models > 0:
                if len(self.saved_ckpts) > self.keep_nbest_models:
                    if self.avg_keep_nbest_models_type == "acc":
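
Note: the branch bodies are cut off at the hunk edge. A sketch of what keep-nbest
pruning typically does at this point (assumed, not the file's verbatim code): once
more than keep_nbest_models checkpoints are tracked, evict the worst-scoring one.

    import os

    if len(self.saved_ckpts) > self.keep_nbest_models:
        if self.avg_keep_nbest_models_type == "acc":
            worst = min(self.saved_ckpts, key=self.saved_ckpts.get)  # lowest acc
        else:
            worst = max(self.saved_ckpts, key=self.saved_ckpts.get)  # highest loss
        self.saved_ckpts.pop(worst)
        path = os.path.join(self.output_dir, worst)
        if os.path.exists(path):
            os.remove(path)
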
@@ -191,7 +211,8 @@
        if self.use_ddp or self.use_fsdp:
            dist.barrier()
    
-    def resume_checkpoint(self,
-                          model=None,
-                          optim=None,
-                          scheduler=None,
+    def resume_checkpoint(
+        self,
+        model=None,
+        optim=None,
+        scheduler=None,
@@ -208,9 +229,9 @@
            ckpt = os.path.join(self.output_dir, "model.pt")
            if os.path.isfile(ckpt):
                checkpoint = torch.load(ckpt, map_location="cpu")
-                self.start_epoch = checkpoint['epoch'] + 1
+                self.start_epoch = checkpoint["epoch"] + 1
                # self.model.load_state_dict(checkpoint['state_dict'])
-                src_state = checkpoint['state_dict']
+                src_state = checkpoint["state_dict"]
                dst_state = model.state_dict()
                for k in dst_state.keys():
                    if not k.startswith("module.") and "module."+k in src_state.keys():
@@ -225,15 +246,25 @@
                        print(f"Miss key in ckpt: model: {k}, ckpt: {k_ddp}")
    
                model.load_state_dict(dst_state)
-                optim.load_state_dict(checkpoint['optimizer'])
-                scheduler.load_state_dict(checkpoint['scheduler'])
-                if scaler is not None and 'scaler_state' in checkpoint:
-                    scaler.load_state_dict(checkpoint['scaler_state'])
+                optim.load_state_dict(checkpoint["optimizer"])
+                scheduler.load_state_dict(checkpoint["scheduler"])
+                if scaler is not None and "scaler_state" in checkpoint:
+                    scaler.load_state_dict(checkpoint["scaler_state"])
                
                self.saved_ckpts = checkpoint["saved_ckpts"]
-                self.val_acc_step_or_eoch = checkpoint["val_acc_step_or_eoch"] if "val_acc_step_or_eoch" in checkpoint else {}
-                self.val_loss_step_or_eoch = checkpoint["val_loss_step_or_eoch"] if "val_loss_step_or_eoch" in checkpoint else {}
-                self.best_step_or_epoch = checkpoint["best_step_or_epoch"] if "best_step_or_epoch" in checkpoint else ""
+                self.val_acc_step_or_eoch = (
+                    checkpoint["val_acc_step_or_eoch"]
+                    if "val_acc_step_or_eoch" in checkpoint
+                    else {}
+                )
+                self.val_loss_step_or_eoch = (
+                    checkpoint["val_loss_step_or_eoch"]
+                    if "val_loss_step_or_eoch" in checkpoint
+                    else {}
+                )
+                self.best_step_or_epoch = (
+                    checkpoint["best_step_or_epoch"] if "best_step_or_epoch" in checkpoint else ""
+                )
                model.to(self.device)
                print(f"Checkpoint loaded successfully from '{ckpt}'")
            else:
@@ -242,8 +273,8 @@
        if self.use_ddp or self.use_fsdp:
            dist.barrier()
        
-    def train_epoch(self,
-                model=None,
-                optim=None,
-                scheduler=None,
+    def train_epoch(
+        self,
+        model=None,
+        optim=None,
+        scheduler=None,
@@ -319,17 +350,24 @@
                time4 = time.perf_counter()
                speed_stats["backward_time"] = f"{time4 - time3:0.3f}"
                
-                self.train_loss_avg = (self.train_loss_avg*batch_idx + loss.detach().cpu().item())/(batch_idx+1)
+                self.train_loss_avg = (
+                    self.train_loss_avg * batch_idx + loss.detach().cpu().item()
+                ) / (batch_idx + 1)
                if "acc" in stats:
-                    self.train_acc_avg = (self.train_acc_avg * batch_idx + stats["acc"].detach().cpu().item()) / (batch_idx + 1)
+                    self.train_acc_avg = (
+                        self.train_acc_avg * batch_idx + stats["acc"].detach().cpu().item()
+                    ) / (batch_idx + 1)
                if self.use_ddp or self.use_fsdp:
-                    train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(self.device)
-                    train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(self.device)
+                    train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(
+                        self.device
+                    )
+                    train_acc_avg = torch.tensor(self.train_acc_avg, dtype=torch.float32).to(
+                        self.device
+                    )
                    dist.all_reduce(train_loss_avg, op=dist.ReduceOp.SUM)
                    dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
                    self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
                    self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size
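
Note: the two averages above follow the usual incremental-mean and cross-rank-mean
patterns; the same arithmetic as self-contained helpers (names are mine):

    import torch
    import torch.distributed as dist

    def update_running_mean(avg: float, value: float, n_seen: int) -> float:
        # Mean over n_seen + 1 items: (avg * n_seen + value) / (n_seen + 1).
        return (avg * n_seen + value) / (n_seen + 1)

    def mean_across_ranks(value: float, device: str, world_size: int) -> float:
        # SUM-reduce a scalar over all ranks, then divide by world size.
        t = torch.tensor(value, dtype=torch.float32, device=device)
        dist.all_reduce(t, op=dist.ReduceOp.SUM)
        return t.detach().cpu().item() / world_size
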
            
            # Perform an optimizer step only after accumulating enough gradients
            if (batch_idx + 1) % accum_grad == 0:
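
Note: the body of this branch falls outside the hunk. For context, a sketch of a
typical accumulated optimizer step consistent with the grad_clip, grad_clip_type,
and scaler settings above (assumed, not the file's verbatim code):

    if (batch_idx + 1) % accum_grad == 0:
        if scaler is not None:
            scaler.unscale_(optim)  # return grads to fp32 scale before clipping
        grad_norm = torch.nn.utils.clip_grad_norm_(
            model.parameters(), max_norm=self.grad_clip, norm_type=self.grad_clip_type
        )
        if scaler is not None:
            scaler.step(optim)  # internally skipped if fp16 grads overflowed
            scaler.update()
        else:
            optim.step()
        scheduler.step()
        optim.zero_grad()
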
@@ -367,7 +405,9 @@
                batch_num_epoch = 1
                if hasattr(dataloader_train, "__len__"):
                    batch_num_epoch = len(dataloader_train)
-                self.log(epoch, batch_idx,
-                         batch_num_epoch=batch_num_epoch,
-                         lr=lr,
-                         loss=loss.detach().cpu().item(),
+                self.log(
+                    epoch,
+                    batch_idx,
+                    batch_num_epoch=batch_num_epoch,
+                    lr=lr,
+                    loss=loss.detach().cpu().item(),
@@ -389,7 +429,14 @@
                )
            if (batch_idx+1) % self.save_checkpoint_interval == 0:
-                self.save_checkpoint(epoch, model=model, optim=optim, scheduler=scheduler, scaler=scaler, step=batch_idx+1)
+                self.save_checkpoint(
+                    epoch,
+                    model=model,
+                    optim=optim,
+                    scheduler=scheduler,
+                    scaler=scaler,
+                    step=batch_idx + 1,
+                )
            time_beg = time.perf_counter()
        else:
@@ -401,9 +448,8 @@
            dist.barrier()
            iterator_stop = torch.tensor(0).to(self.device)
        
-    def validate_epoch(self,
-                       model=None,
-                       dataloader_val=None,
-                       epoch=None,
+    def validate_epoch(
+        self,
+        model=None,
+        dataloader_val=None,
+        epoch=None,
@@ -458,12 +504,20 @@
                loss = loss
                time4 = time.perf_counter()
-                self.val_loss_avg = (self.val_loss_avg*batch_idx + loss.detach().cpu().item())/(batch_idx+1)
+                self.val_loss_avg = (self.val_loss_avg * batch_idx + loss.detach().cpu().item()) / (
+                    batch_idx + 1
+                )
                if "acc" in stats:
-                    self.val_acc_avg = (self.val_acc_avg * batch_idx + stats["acc"].detach().cpu().item()) / (batch_idx + 1)
+                    self.val_acc_avg = (
+                        self.val_acc_avg * batch_idx + stats["acc"].detach().cpu().item()
+                    ) / (batch_idx + 1)
                if self.use_ddp or self.use_fsdp:
-                    val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(self.device)
-                    val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(self.device)
+                    val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(
+                        self.device
+                    )
+                    val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(
+                        self.device
+                    )
                    dist.all_reduce(val_loss_avg, op=dist.ReduceOp.SUM)
                    dist.all_reduce(val_acc_avg, op=dist.ReduceOp.SUM)
                    self.val_loss_avg = val_loss_avg.detach().cpu().item() / self.world_size
@@ -472,7 +526,9 @@
                batch_num_epoch = 1
                if hasattr(dataloader_val, "__len__"):
                    batch_num_epoch = len(dataloader_val)
-                self.log(epoch, batch_idx,
-                         batch_num_epoch=batch_num_epoch,
-                         lr=0.0,
-                         loss=loss.detach().cpu().item(),
+                self.log(
+                    epoch,
+                    batch_idx,
+                    batch_num_epoch=batch_num_epoch,
+                    lr=0.0,
+                    loss=loss.detach().cpu().item(),
@@ -488,7 +544,7 @@
                    dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
        if kwargs.get("step", None) is None:
-            ckpt_name = f'model.pt.ep{epoch}'
+            ckpt_name = f"model.pt.ep{epoch}"
        else:
            ckpt_name = f'model.pt.ep{epoch}.{kwargs.get("step")}'
        self.val_acc_step_or_eoch[ckpt_name] = self.val_acc_avg
@@ -499,8 +555,8 @@
            dist.barrier()
            iterator_stop = torch.tensor(0).to(self.device)
        
-    def log(self,
-            epoch=0,
-            batch_idx=0,
-            batch_num_epoch=-1,
+    def log(
+        self,
+        epoch=0,
+        batch_idx=0,
+        batch_num_epoch=-1,
@@ -517,13 +573,16 @@
        
        if (batch_idx + 1) % self.log_interval == 0:
            
            gpu_info = "GPU, memory: usage: {:.3f} GB, " \
                       "peak: {:.3f} GB, " \
                       "cache: {:.3f} GB, " \
                       "cache_peak: {:.3f} GB".format(torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
            gpu_info = (
                "GPU, memory: usage: {:.3f} GB, "
                "peak: {:.3f} GB, "
                "cache: {:.3f} GB, "
                "cache_peak: {:.3f} GB".format(
                    torch.cuda.memory_allocated() / 1024 / 1024 / 1024,
                                          torch.cuda.max_memory_allocated() / 1024 / 1024 / 1024,
                                          torch.cuda.memory_reserved() / 1024 / 1024 / 1024,
                                          torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024,
                )
                                          )
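
Note: the GPU memory probe above converts bytes to GB with three /1024 divisions;
the same queries as a small helper, for reference (a sketch):

    import torch

    def cuda_memory_gb() -> dict:
        gb = 1024**3
        return {
            "usage": torch.cuda.memory_allocated() / gb,
            "peak": torch.cuda.max_memory_allocated() / gb,
            "cache": torch.cuda.memory_reserved() / gb,
            "cache_peak": torch.cuda.max_memory_reserved() / gb,
        }
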
            
            loss_avg_epoch = getattr(self, f"{tag}_loss_avg")
@@ -546,13 +605,17 @@
            logging.info(description)
            
            if writer is not None:
-                writer.add_scalar(f'rank{self.local_rank}_loss/{tag}', loss, self.batch_total)
-                writer.add_scalar(f'rank{self.local_rank}_lr/{tag}', lr, self.batch_total)
-                writer.add_scalar(f'rank{self.local_rank}_lr/{tag}', lr, self.batch_total)
+                writer.add_scalar(f"rank{self.local_rank}_loss/{tag}", loss, self.batch_total)
+                writer.add_scalar(f"rank{self.local_rank}_lr/{tag}", lr, self.batch_total)
+                writer.add_scalar(f"rank{self.local_rank}_lr/{tag}", lr, self.batch_total)
                for key, var in stats.items():
-                    writer.add_scalar(f'stats_rank{self.local_rank}_{key}/{tag}', var.item(), self.batch_total)
+                    writer.add_scalar(
+                        f"stats_rank{self.local_rank}_{key}/{tag}", var.item(), self.batch_total
+                    )
                for key, var in speed_stats.items():
-                    writer.add_scalar(f'stats_rank{self.local_rank}_{key}/{tag}', eval(var), self.batch_total)
+                    writer.add_scalar(
+                        f"stats_rank{self.local_rank}_{key}/{tag}", eval(var), self.batch_total
+                    )
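
Note: speed_stats values are stored as formatted strings (e.g. f"{time4 - time3:0.3f}"
earlier in this diff), hence the eval(var) to recover a number; float(var) is the safer
equivalent for plain numeric strings:

    writer.add_scalar(
        f"stats_rank{self.local_rank}_{key}/{tag}", float(var), self.batch_total
    )
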
        
    def close(self, writer=None):