游雁
2025-01-10 d4f13c2e444f972b272273bce76b05f52f5164aa
funasr/train_utils/trainer.py
@@ -85,7 +85,12 @@
        self.batch_total = 0
        self.use_fp16 = use_fp16
        self.save_checkpoint_interval = kwargs.get("save_checkpoint_interval", 5000)
        self.validate_interval = kwargs.get("validate_interval", 5000)
        self.validate_interval = kwargs.get("validate_interval", -1)
        if self.validate_interval < 0:
            self.validate_interval = self.save_checkpoint_interval
        assert (
            self.save_checkpoint_interval == self.validate_interval
        ), f"save_checkpoint_interval must equal to validate_interval"
        self.keep_nbest_models = kwargs.get("keep_nbest_models", 500)
        self.avg_keep_nbest_models_type = kwargs.get("avg_keep_nbest_models_type", "acc")
        self.avg_nbest_model = kwargs.get("avg_nbest_model", 10)
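The new default of -1 lets validate_interval simply follow save_checkpoint_interval, so the assert only rejects an explicit, conflicting override. A minimal sketch of that resolution rule outside the trainer (resolve_intervals is a hypothetical helper, not FunASR code):

def resolve_intervals(kwargs: dict) -> tuple[int, int]:
    # validate_interval falls back to the checkpoint interval when left unset (-1)
    save_ckpt = kwargs.get("save_checkpoint_interval", 5000)
    validate = kwargs.get("validate_interval", -1)
    if validate < 0:
        validate = save_ckpt
    assert save_ckpt == validate, "save_checkpoint_interval must equal validate_interval"
    return save_ckpt, validate

print(resolve_intervals({}))                                   # (5000, 5000)
print(resolve_intervals({"save_checkpoint_interval": 2000}))   # (2000, 2000)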
@@ -110,8 +115,8 @@
        self.saved_ckpts = {}
        self.step_or_epoch = -1
        self.best_step_or_epoch = ""
-        self.val_acc_step_or_eoch = {}
-        self.val_loss_step_or_eoch = {}
+        self.val_acc_step_or_epoch = {}
+        self.val_loss_step_or_epoch = {}
        self.reset_gpu_cache = kwargs.get("reset_gpu_cache", False)
        self.start_data_split_i = 0
@@ -156,15 +161,16 @@
            # self.step_or_epoch += 1
            state = {
                "epoch": epoch,
                "step": step,
                "total_step": self.batch_total,
                "state_dict": model.state_dict(),
                "optimizer": optim.state_dict(),
                "scheduler": scheduler.state_dict(),
                "saved_ckpts": self.saved_ckpts,
                "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
                "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
                "val_acc_step_or_epoch": self.val_acc_step_or_epoch,
                "val_loss_step_or_epoch": self.val_loss_step_or_epoch,
                "best_step_or_epoch": self.best_step_or_epoch,
                "avg_keep_nbest_models_type": self.avg_keep_nbest_models_type,
                "step": step,
                "step_in_epoch": step_in_epoch,
                "data_split_i": kwargs.get("data_split_i", 0),
                "data_split_num": kwargs.get("data_split_num", 1),
@@ -178,6 +184,7 @@
            if scaler:
                state["scaler_state"] = scaler.state_dict()
            # Create output directory if it does not exist
            os.makedirs(self.output_dir, exist_ok=True)
            if step is None:
@@ -186,47 +193,48 @@
                ckpt_name = f"model.pt.ep{epoch}.{step}"
            filename = os.path.join(self.output_dir, ckpt_name)
            torch.save(state, filename)
            logging.info(f"Checkpoint saved to {filename}")
            logging.info(f"\nCheckpoint saved to {filename}\n")
            latest = Path(os.path.join(self.output_dir, f"model.pt"))
            torch.save(state, latest)
            if self.best_step_or_epoch == "":
                self.best_step_or_epoch = ckpt_name
            if self.avg_keep_nbest_models_type == "acc":
                if (
-                    self.val_acc_step_or_eoch[ckpt_name]
-                    >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+                    self.val_acc_step_or_epoch[ckpt_name]
+                    >= self.val_acc_step_or_epoch[self.best_step_or_epoch]
                ):
                    self.best_step_or_epoch = ckpt_name
                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                    torch.save(state, best_ckpt)
                    logging.info(
                        f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                        f"Update best acc: {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                    )
                else:
                    logging.info(
                        f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                        f"No improvement in acc: {self.val_acc_step_or_epoch[ckpt_name]:.4f} < {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                    )
            elif self.avg_keep_nbest_models_type == "loss":
                if (
-                    self.val_loss_step_or_eoch[ckpt_name]
-                    <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+                    self.val_loss_step_or_epoch[ckpt_name]
+                    <= self.val_loss_step_or_epoch[self.best_step_or_epoch]
                ):
                    self.best_step_or_epoch = ckpt_name
                    best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                    torch.save(state, best_ckpt)
                    logging.info(
                        f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                        f"Update best loss: {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                    )
                else:
                    logging.info(
                        f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                        f"No improvement in loss: {self.val_loss_step_or_epoch[ckpt_name]:.4f} > {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                    )
            else:
                print("Undo")
            self.saved_ckpts[ckpt_name] = getattr(
                self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
                self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
            )[ckpt_name]
            if self.keep_nbest_models > 0:
                if len(self.saved_ckpts) > self.keep_nbest_models:
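The branch above keeps model.pt.best pointing at the checkpoint with the highest validation accuracy (or lowest validation loss), with ties resolved in favour of the newer checkpoint via >= and <=. A small sketch of just that selection rule (is_new_best and the metric dict are hypothetical, no file I/O):

def is_new_best(metrics: dict, candidate: str, best: str, metric_type: str) -> bool:
    # higher is better for "acc", lower is better for "loss"; ties prefer the candidate
    if metric_type == "acc":
        return metrics[candidate] >= metrics[best]
    if metric_type == "loss":
        return metrics[candidate] <= metrics[best]
    raise ValueError(f"unsupported metric type: {metric_type}")

val_acc = {"model.pt.ep1": 0.81, "model.pt.ep2": 0.84}
print(is_new_best(val_acc, "model.pt.ep2", "model.pt.ep1", "acc"))  # True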
@@ -273,6 +281,7 @@
                        k_ddp = k.replace("module.", "", 1)
                    else:
                        k_ddp = k
                    if k_ddp in src_state.keys():
                        dst_state[k] = src_state[k_ddp]
                    else:
@@ -285,14 +294,14 @@
                    scaler.load_state_dict(checkpoint["scaler_state"])
                self.saved_ckpts = checkpoint["saved_ckpts"]
-                self.val_acc_step_or_eoch = (
-                    checkpoint["val_acc_step_or_eoch"]
-                    if "val_acc_step_or_eoch" in checkpoint
+                self.val_acc_step_or_epoch = (
+                    checkpoint["val_acc_step_or_epoch"]
+                    if "val_acc_step_or_epoch" in checkpoint
                    else {}
                )
-                self.val_loss_step_or_eoch = (
-                    checkpoint["val_loss_step_or_eoch"]
-                    if "val_loss_step_or_eoch" in checkpoint
+                self.val_loss_step_or_epoch = (
+                    checkpoint["val_loss_step_or_epoch"]
+                    if "val_loss_step_or_epoch" in checkpoint
                    else {}
                )
                self.best_step_or_epoch = (
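The renamed keys are read defensively in this hunk, so a checkpoint written before the rename, or one without validation metrics at all, still loads with empty dicts. A tiny sketch of the same fallback using dict.get, which is equivalent to the `in`-check above (the checkpoint contents are hypothetical):

checkpoint = {"saved_ckpts": {}}  # e.g. an older checkpoint without the renamed keys
val_acc_step_or_epoch = checkpoint.get("val_acc_step_or_epoch", {})
val_loss_step_or_epoch = checkpoint.get("val_loss_step_or_epoch", {})
print(val_acc_step_or_epoch, val_loss_step_or_epoch)  # {} {}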
@@ -357,10 +366,10 @@
        time_beg = time.perf_counter()
        time5 = time_beg
        for batch_idx, batch in enumerate(dataloader_train):
-            if self.use_ddp or self.use_fsdp:
-                dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
-                if iterator_stop > 0:
-                    break
+            # if self.use_ddp or self.use_fsdp:
+            #     dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
+            #     if iterator_stop > 0:
+            #         break
            self.batch_total += 1
            self.step_in_epoch += 1
            time1 = time.perf_counter()
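The lines commented out above implemented a cooperative stop for distributed training: every rank all-reduces an iterator_stop flag each step so that all ranks leave the loop together when any rank runs out of data. A sketch of that pattern, guarded so it also runs as a single process without an initialized process group:

import torch
import torch.distributed as dist

device = "cpu"
iterator_stop = torch.tensor(0, device=device)

for batch_idx in range(100):
    if dist.is_available() and dist.is_initialized():
        # sum across ranks: becomes > 0 as soon as any rank has signalled
        dist.all_reduce(iterator_stop, op=dist.ReduceOp.SUM)
    if iterator_stop > 0:
        break  # some rank ran out of data, stop together
    # ... forward/backward for this batch ...
else:
    # local iterator exhausted: tell the other ranks to stop as well
    if dist.is_available() and dist.is_initialized():
        iterator_stop.fill_(1)
        dist.all_reduce(iterator_stop, op=dist.ReduceOp.SUM)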
@@ -376,27 +385,27 @@
                with maybe_autocast(self.use_fp16):
                    retval = model(**batch)
-                    if (
-                        self.reset_gpu_cache
-                        and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
-                    ):
-                        torch.cuda.empty_cache()
+                    # if (
+                    #     self.reset_gpu_cache
+                    #     and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
+                    # ):
+                    #     torch.cuda.empty_cache()
                loss, stats, weight = retval
                stats = {k: v for k, v in stats.items() if v is not None}
-                # if self.use_ddp or self.use_fsdp:
-                #     # Apply weighted averaging for loss and stats
-                #     loss = (loss * weight.type(loss.dtype)).sum()
-                #     # if distributed, this method can also apply all_reduce()
-                #     # stats, weight = recursive_average(stats, weight, distributed=True)
-                #     if self.use_ddp or self.use_fsdp:
-                #         dist.all_reduce(weight, op=dist.ReduceOp.SUM)
-                #     # Now weight is summation over all workers
-                #     loss /= weight.sum()  # shape:[1] -> shape:[]
-                #     # Multiply world_size because DistributedDataParallel
-                #     # automatically normalizes the gradient by world_size.
-                #     loss *= self.world_size
-                loss *= self.world_size
+                if self.use_ddp or self.use_fsdp:
+                    # Apply weighted averaging for loss and stats
+                    loss = (loss * weight.type(loss.dtype)).sum()
+                    # if distributed, this method can also apply all_reduce()
+                    # stats, weight = recursive_average(stats, weight, distributed=True)
+                    if self.use_ddp or self.use_fsdp:
+                        dist.all_reduce(weight, op=dist.ReduceOp.SUM)
+                    # Now weight is summation over all workers
+                    loss /= weight.sum()  # shape:[1] -> shape:[]
+                    # Multiply world_size because DistributedDataParallel
+                    # automatically normalizes the gradient by world_size.
+                    loss *= self.world_size
+                # loss *= self.world_size
                # Scale the loss since we're not updating for every mini-batch
                loss = loss / accum_grad
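The block re-enabled above converts each rank's mean loss back into a sample-weighted sum, divides by the all-reduced total weight, and multiplies by world_size to cancel DDP's implicit gradient averaging. A single-process numeric sketch of that arithmetic, with the all_reduce replaced by an explicit sum over two hypothetical ranks:

import torch

world_size = 2
# per-rank (mean loss, number of samples)
rank_loss = [torch.tensor(2.0), torch.tensor(4.0)]
rank_weight = [torch.tensor([3.0]), torch.tensor([1.0])]

per_rank = []
for loss, weight in zip(rank_loss, rank_weight):
    loss = (loss * weight.type(loss.dtype)).sum()      # back to a summed loss
    total_weight = sum(w.sum() for w in rank_weight)   # what all_reduce(SUM) would yield
    loss = loss / total_weight                          # global sample-weighted mean
    loss = loss * world_size                            # undo DDP's 1/world_size grad scaling
    per_rank.append(loss)

# After DDP averages across ranks, the effective value is the true weighted mean:
# (2*3 + 4*1) / 4 = 2.5
print(sum(per_rank).item() / world_size)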
@@ -410,13 +419,14 @@
                speed_stats["backward_and_AllReaduce_time"] = f"{time4 - time3:0.3f}"
                self.train_loss_avg = (
-                    self.train_loss_avg * (self.step_in_epoch - 1) + loss.detach().cpu().item()
-                ) / self.step_in_epoch
+                    self.train_loss_avg * (batch_idx + kwargs.get("start_step", 0))
+                    + loss.detach().cpu().item()
+                ) / (batch_idx + kwargs.get("start_step", 0) + 1)
                if "acc" in stats:
                    self.train_acc_avg = (
-                        self.train_acc_avg * (self.step_in_epoch - 1)
+                        self.train_acc_avg * (batch_idx + kwargs.get("start_step", 0))
                        + stats["acc"].detach().cpu().item()
-                    ) / self.step_in_epoch
+                    ) / (batch_idx + kwargs.get("start_step", 0) + 1)
            # Perform an optimizer step only after accumulating enough gradients
            if (batch_idx + 1) % accum_grad == 0:
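Switching the running averages from step_in_epoch to batch_idx + start_step ties the denominator to the number of batches actually folded into the average, which matters when an epoch is resumed partway through. A minimal sketch of the incremental-mean arithmetic (restoring the partial average at resume time is assumed purely for illustration):

start_step = 2          # hypothetical: 2 batches were consumed before resuming
losses = [0.9, 0.8, 0.7, 0.6, 0.5]

avg = sum(losses[:start_step]) / start_step   # running average restored at resume
for batch_idx, loss in enumerate(losses[start_step:]):
    n = batch_idx + start_step                # items already in the average
    avg = (avg * n + loss) / (n + 1)          # fold in one more value

print(avg, sum(losses) / len(losses))  # both 0.7: incremental mean == plain mean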
@@ -445,8 +455,6 @@
                scheduler.step()
                # Clear gradients for the next accumulation stage
                optim.zero_grad(set_to_none=True)
                total_time = f"{(time.perf_counter() - time5)/accum_grad:0.3f}"
                time5 = time.perf_counter()
                if self.use_ddp or self.use_fsdp:
                    train_loss_avg = torch.tensor(self.train_loss_avg, dtype=torch.float32).to(
@@ -459,6 +467,9 @@
                    dist.all_reduce(train_acc_avg, op=dist.ReduceOp.SUM)
                    self.train_loss_avg = train_loss_avg.detach().cpu().item() / self.world_size
                    self.train_acc_avg = train_acc_avg.detach().cpu().item() / self.world_size
                total_time = f"{(time.perf_counter() - time5)/accum_grad:0.3f}"
                time5 = time.perf_counter()
                speed_stats["optim_time"] = f"{time5 - time4:0.3f}"
@@ -474,7 +485,7 @@
                    step_in_epoch=self.step_in_epoch,
                    batch_num_epoch=batch_num_epoch,
                    lr=lr,
-                    loss=loss.detach().cpu().item(),
+                    loss=accum_grad * loss.detach().cpu().item(),
                    speed_stats=speed_stats,
                    stats=stats,
                    writer=writer,
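The loss handed to the logger is multiplied back by accum_grad because the value used for backward was divided by accum_grad a few lines earlier; the product is the per-batch loss again:

accum_grad = 4
batch_loss = 2.0
scaled_for_backward = batch_loss / accum_grad   # 0.5 is what actually gets backpropagated
print(accum_grad * scaled_for_backward)          # 2.0 is what gets reported in the log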
@@ -509,14 +520,14 @@
                )
            time_beg = time.perf_counter()
-        else:
-            if self.use_ddp or self.use_fsdp:
-                iterator_stop.fill_(1)
-                dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
+        # else:
+        #     if self.use_ddp or self.use_fsdp:
+        #         iterator_stop.fill_(1)
+        #         dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)
        if self.use_ddp or self.use_fsdp:
            dist.barrier()
-            iterator_stop = torch.tensor(0).to(self.device)
+            # iterator_stop = torch.tensor(0).to(self.device)
    def validate_epoch(
        self,
@@ -552,12 +563,14 @@
                time1 = time.perf_counter()
                speed_stats["data_load"] = f"{time1 - time5:0.3f}"
                batch = to_device(batch, self.device)
                time2 = time.perf_counter()
                retval = model(**batch)
                time3 = time.perf_counter()
                speed_stats["forward_time"] = f"{time3 - time2:0.3f}"
                loss, stats, weight = retval
                stats = {k: v for k, v in stats.items() if v is not None}
                if self.use_ddp or self.use_fsdp:
                    # Apply weighted averaging for loss and stats
                    loss = (loss * weight.type(loss.dtype)).sum()
@@ -570,28 +583,33 @@
                    # Multiply world_size because DistributedDataParallel
                    # automatically normalizes the gradient by world_size.
                    loss *= self.world_size
                # Scale the loss since we're not updating for every mini-batch
                loss = loss
                time4 = time.perf_counter()
-                self.val_loss_avg = (self.val_loss_avg * batch_idx + loss.detach().cpu().item()) / (
-                    batch_idx + 1
-                )
-                if "acc" in stats:
-                    self.val_acc_avg = (
-                        self.val_acc_avg * batch_idx + stats["acc"].detach().cpu().item()
+                if torch.isfinite(loss):
+                    self.val_loss_avg = (
+                        self.val_loss_avg * batch_idx + loss.detach().cpu().item()
+                    ) / (batch_idx + 1)
-                if self.use_ddp or self.use_fsdp:
-                    val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(
-                        self.device
-                    )
-                    val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(
-                        self.device
-                    )
-                    dist.all_reduce(val_loss_avg, op=dist.ReduceOp.SUM)
-                    dist.all_reduce(val_acc_avg, op=dist.ReduceOp.SUM)
-                    self.val_loss_avg = val_loss_avg.detach().cpu().item() / self.world_size
-                    self.val_acc_avg = val_acc_avg.detach().cpu().item() / self.world_size
+                    if "acc" in stats:
+                        self.val_acc_avg = (
+                            self.val_acc_avg * batch_idx + stats["acc"].detach().cpu().item()
+                        ) / (batch_idx + 1)
+                    if self.use_ddp or self.use_fsdp:
+                        val_loss_avg = torch.tensor(self.val_loss_avg, dtype=torch.float32).to(
+                            self.device
+                        )
+                        val_acc_avg = torch.tensor(self.val_acc_avg, dtype=torch.float32).to(
+                            self.device
+                        )
+                        dist.all_reduce(val_loss_avg, op=dist.ReduceOp.SUM)
+                        dist.all_reduce(val_acc_avg, op=dist.ReduceOp.SUM)
+                        self.val_loss_avg = val_loss_avg.detach().cpu().item() / self.world_size
+                        self.val_acc_avg = val_acc_avg.detach().cpu().item() / self.world_size
                time5 = time.perf_counter()
                batch_num_epoch = 1
                if hasattr(dataloader_val, "__len__"):
@@ -617,8 +635,8 @@
            ckpt_name = f"model.pt.ep{epoch}"
        else:
            ckpt_name = f'model.pt.ep{epoch}.{kwargs.get("step_in_epoch")}'
-        self.val_acc_step_or_eoch[ckpt_name] = self.val_acc_avg
-        self.val_loss_step_or_eoch[ckpt_name] = self.val_loss_avg
+        self.val_acc_step_or_epoch[ckpt_name] = self.val_acc_avg
+        self.val_loss_step_or_epoch[ckpt_name] = self.val_loss_avg
        model.train()
        if self.use_ddp or self.use_fsdp:
@@ -666,9 +684,9 @@
                f"data_slice: {data_split_i}/{data_split_num}, "
                f"step_in_slice: {batch_idx + 1}/{batch_num_epoch}, step_in_epoch: {step_in_epoch}, total step: {self.batch_total}, "
                f"(loss_avg_rank: {loss:.3f}), "
                f"(loss_avg_epoch: {loss_avg_epoch:.3f}), "
                f"(ppl_avg_epoch: {math.exp(loss_avg_epoch):.3e}), "
                f"(acc_avg_epoch: {acc_avg_epoch:.3f}), "
                f"(loss_avg_slice: {loss_avg_epoch:.3f}), "
                f"(ppl_avg_slice: {math.exp(loss_avg_epoch):.3e}), "
                f"(acc_avg_slice: {acc_avg_epoch:.3f}), "
                f"(lr: {lr:.3e}), "
                f"{[(k, round(v.detach().cpu().item(), 3)) for k, v in stats.items()]}, "
                f"{speed_stats}, "