From e6fe602db3eb1209543e55f1aafa2932dfda3310 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: 星期五, 10 一月 2025 10:14:30 +0800
Subject: [PATCH] step_or_epoch bugfix

---
 funasr/train_utils/trainer_ds.py |   99 ++++++++++++++++++++++++++-----------------------
 1 files changed, 53 insertions(+), 46 deletions(-)

diff --git a/funasr/train_utils/trainer_ds.py b/funasr/train_utils/trainer_ds.py
index ba8dd16..0b104da 100644
--- a/funasr/train_utils/trainer_ds.py
+++ b/funasr/train_utils/trainer_ds.py
@@ -29,9 +29,10 @@
         with torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False):
             yield
     else:
-        if dtype == torch.float16:
-            with autocast(enabled=True):
-                yield
+        if dtype == torch.float16 or dtype == torch.bfloat16:
+            yield
+            # with autocast(enabled=True, dtype=dtype):
+            #     yield
         else:
             yield
 
@@ -60,6 +61,7 @@
         use_ddp: bool = False,
         use_fsdp: bool = False,
         use_fp16: bool = False,
+        use_bf16: bool = False,
         use_deepspeed: bool = False,
         output_dir: str = "./",
         **kwargs,
@@ -78,7 +80,7 @@
                       output_dir (str): The directory where model checkpoints will be saved. Default is './'.
                       resume (str, optional): The file path to a checkpoint to resume training from.
         """
-        self.rank = kwargs.get("rank", 0)
+        self.rank = rank
         self.local_rank = local_rank
         self.world_size = world_size
         self.use_ddp = use_ddp
@@ -98,8 +100,11 @@
         self.batch_total = 0
         self.dtype = torch.float32
         self.use_fp16 = use_fp16
+        self.use_bf16 = use_bf16
         if self.use_fp16:
             self.dtype = torch.float16
+        if self.use_bf16:
+            self.dtype = torch.bfloat16
         self.save_checkpoint_interval = kwargs.get("save_checkpoint_interval", 5000)
         self.validate_interval = kwargs.get("validate_interval", 5000)
         self.keep_nbest_models = kwargs.get("keep_nbest_models", 500)
@@ -117,8 +122,8 @@
         self.saved_ckpts = {}
         self.step_or_epoch = -1
         self.best_step_or_epoch = ""
-        self.val_acc_step_or_eoch = {}
-        self.val_loss_step_or_eoch = {}
+        self.val_acc_step_or_epoch = {}
+        self.val_loss_step_or_epoch = {}
 
         self.reset_gpu_cache = kwargs.get("reset_gpu_cache", False)
         self.start_data_split_i = 0
@@ -190,8 +195,8 @@
                 # "optimizer": optim.state_dict(),
                 # "scheduler": scheduler.state_dict(),
                 "saved_ckpts": self.saved_ckpts,
-                "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
-                "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
+                "val_acc_step_or_epoch": self.val_acc_step_or_epoch,
+                "val_loss_step_or_epoch": self.val_loss_step_or_epoch,
                 "best_step_or_epoch": self.best_step_or_epoch,
                 "avg_keep_nbest_models_type": self.avg_keep_nbest_models_type,
                 "step": step,
@@ -229,8 +234,8 @@
 
             if self.avg_keep_nbest_models_type == "acc":
                 if (
-                    self.val_acc_step_or_eoch[ckpt_name]
-                    >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+                    self.val_acc_step_or_epoch[ckpt_name]
+                    >= self.val_acc_step_or_epoch[self.best_step_or_epoch]
                 ):
                     self.best_step_or_epoch = ckpt_name
                     best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
@@ -240,16 +245,16 @@
                             save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
                         )
                     logging.info(
-                        f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+                        f"Update best acc: {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                     )
                 else:
                     logging.info(
-                        f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+                        f"No improvement in acc: {self.val_acc_step_or_epoch[ckpt_name]:.4f} < {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                     )
             elif self.avg_keep_nbest_models_type == "loss":
                 if (
-                    self.val_loss_step_or_eoch[ckpt_name]
-                    <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+                    self.val_loss_step_or_epoch[ckpt_name]
+                    <= self.val_loss_step_or_epoch[self.best_step_or_epoch]
                 ):
                     self.best_step_or_epoch = ckpt_name
                     best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
@@ -259,16 +264,16 @@
                             save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
                         )
                     logging.info(
-                        f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+                        f"Update best loss: {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                     )
                 else:
                     logging.info(
-                        f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+                        f"No improvement in loss: {self.val_loss_step_or_epoch[ckpt_name]:.4f} > {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                     )
             else:
                 print("Undo")
             self.saved_ckpts[ckpt_name] = getattr(
-                self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
+                self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
             )[ckpt_name]
             if self.keep_nbest_models > 0:
                 if len(self.saved_ckpts) > self.keep_nbest_models:
@@ -287,15 +292,17 @@
         elif self.use_fsdp:
             pass
         elif self.rank == 0:
-            logging.info(f"Save checkpoint: {epoch}, rank: {self.local_rank}\n")
+            logging.info(
+                f"Save checkpoint: {epoch}, rank: {self.rank}, local_rank: {self.local_rank}\n"
+            )
             # self.step_or_epoch += 1
             state = {
                 "epoch": epoch,
                 "optimizer": optim.state_dict(),
                 "scheduler": scheduler.state_dict(),
                 "saved_ckpts": self.saved_ckpts,
-                "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
-                "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
+                "val_acc_step_or_epoch": self.val_acc_step_or_epoch,
+                "val_loss_step_or_epoch": self.val_loss_step_or_epoch,
                 "best_step_or_epoch": self.best_step_or_epoch,
                 "avg_keep_nbest_models_type": self.avg_keep_nbest_models_type,
                 "step": step,
@@ -346,38 +353,38 @@
 
             if self.avg_keep_nbest_models_type == "acc":
                 if (
-                    self.val_acc_step_or_eoch[ckpt_name]
-                    >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+                    self.val_acc_step_or_epoch[ckpt_name]
+                    >= self.val_acc_step_or_epoch[self.best_step_or_epoch]
                 ):
                     self.best_step_or_epoch = ckpt_name
                     best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                     torch.save(state, best_ckpt)
                     logging.info(
-                        f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+                        f"Update best acc: {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                     )
                 else:
                     logging.info(
-                        f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+                        f"No improvement in acc: {self.val_acc_step_or_epoch[ckpt_name]:.4f} < {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                     )
             elif self.avg_keep_nbest_models_type == "loss":
                 if (
-                    self.val_loss_step_or_eoch[ckpt_name]
-                    <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+                    self.val_loss_step_or_epoch[ckpt_name]
+                    <= self.val_loss_step_or_epoch[self.best_step_or_epoch]
                 ):
                     self.best_step_or_epoch = ckpt_name
                     best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
                     torch.save(state, best_ckpt)
                     logging.info(
-                        f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+                        f"Update best loss: {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
                     )
                 else:
                     logging.info(
-                        f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+                        f"No improvement in loss: {self.val_loss_step_or_epoch[ckpt_name]:.4f} > {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
                     )
             else:
                 print("Undo")
             self.saved_ckpts[ckpt_name] = getattr(
-                self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
+                self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
             )[ckpt_name]
             if self.keep_nbest_models > 0:
                 if len(self.saved_ckpts) > self.keep_nbest_models:
@@ -418,14 +425,14 @@
                     _, checkpoint = model.load_checkpoint(self.output_dir, "model.pt")
                     self.start_epoch = checkpoint["epoch"]
                     self.saved_ckpts = checkpoint["saved_ckpts"]
-                    self.val_acc_step_or_eoch = (
-                        checkpoint["val_acc_step_or_eoch"]
-                        if "val_acc_step_or_eoch" in checkpoint
+                    self.val_acc_step_or_epoch = (
+                        checkpoint["val_acc_step_or_epoch"]
+                        if "val_acc_step_or_epoch" in checkpoint
                         else {}
                     )
-                    self.val_loss_step_or_eoch = (
-                        checkpoint["val_loss_step_or_eoch"]
-                        if "val_loss_step_or_eoch" in checkpoint
+                    self.val_loss_step_or_epoch = (
+                        checkpoint["val_loss_step_or_epoch"]
+                        if "val_loss_step_or_epoch" in checkpoint
                         else {}
                     )
                     self.best_step_or_epoch = (
@@ -471,7 +478,7 @@
                             for k_ex in self.excludes:
                                 k_tmp = k.replace("module.", "")
                                 if k_tmp.startswith(k_ex):
-                                    logging.info(f"key: {{k}} matching: {k_ex}, excluded")
+                                    logging.info(f"key: {k} matching: {k_ex}, excluded")
                                     excludes_flag = True
                                     break
                         if excludes_flag:
@@ -494,14 +501,14 @@
                         scaler.load_state_dict(checkpoint["scaler_state"])
 
                     self.saved_ckpts = checkpoint["saved_ckpts"]
-                    self.val_acc_step_or_eoch = (
-                        checkpoint["val_acc_step_or_eoch"]
-                        if "val_acc_step_or_eoch" in checkpoint
+                    self.val_acc_step_or_epoch = (
+                        checkpoint["val_acc_step_or_epoch"]
+                        if "val_acc_step_or_epoch" in checkpoint
                         else {}
                     )
-                    self.val_loss_step_or_eoch = (
-                        checkpoint["val_loss_step_or_eoch"]
-                        if "val_loss_step_or_eoch" in checkpoint
+                    self.val_loss_step_or_epoch = (
+                        checkpoint["val_loss_step_or_epoch"]
+                        if "val_loss_step_or_epoch" in checkpoint
                         else {}
                     )
                     self.best_step_or_epoch = (
@@ -676,7 +683,7 @@
             scaled_loss = model.backward(loss)
         else:
             loss = loss / self.accum_grad
-            if self.use_fp16:
+            if self.use_fp16 or self.use_bf16:
                 scaler.scale(loss).backward()
             else:
                 loss.backward()
@@ -704,7 +711,7 @@
                 # Execute an optimization step (update model parameters)
                 if self.use_ddp or self.use_fsdp:
                     dist.barrier()
-                if self.use_fp16:
+                if self.use_fp16 or self.use_bf16:
                     scaler.step(optim)
                     scaler.update()
                 else:
@@ -796,8 +803,8 @@
             ckpt_name = f"model.pt.ep{epoch}"
         else:
             ckpt_name = f'model.pt.ep{epoch}.{kwargs.get("step_in_epoch")}'
-        self.val_acc_step_or_eoch[ckpt_name] = self.val_acc_avg
-        self.val_loss_step_or_eoch[ckpt_name] = self.val_loss_avg
+        self.val_acc_step_or_epoch[ckpt_name] = self.val_acc_avg
+        self.val_loss_step_or_epoch[ckpt_name] = self.val_loss_avg
 
         if self.use_ddp or self.use_fsdp or self.use_deepspeed:
             dist.barrier()

--
Gitblit v1.9.1