From 28ccfbfc51068a663a80764e14074df5edf2b5ba Mon Sep 17 00:00:00 2001
From: kongdeqiang <kongdeqiang960204@163.com>
Date: 星期五, 13 三月 2026 17:41:41 +0800
Subject: [PATCH] 提交
---
funasr/train_utils/trainer_ds.py | 178 +++++++++++++++++++++++++++++++++++-----------------------
1 files changed, 107 insertions(+), 71 deletions(-)
diff --git a/funasr/train_utils/trainer_ds.py b/funasr/train_utils/trainer_ds.py
index c973728..ce8809c 100644
--- a/funasr/train_utils/trainer_ds.py
+++ b/funasr/train_utils/trainer_ds.py
@@ -29,8 +29,8 @@
with torch.cuda.amp.autocast(enabled=True, dtype=dtype, cache_enabled=False):
yield
else:
- if dtype == torch.float16:
- with autocast(enabled=True):
+ if dtype == torch.float16 or dtype == torch.bfloat16:
+ with autocast(enabled=True, dtype=dtype):
yield
else:
yield
@@ -60,6 +60,7 @@
use_ddp: bool = False,
use_fsdp: bool = False,
use_fp16: bool = False,
+ use_bf16: bool = False,
use_deepspeed: bool = False,
output_dir: str = "./",
**kwargs,
@@ -78,7 +79,7 @@
output_dir (str): The directory where model checkpoints will be saved. Default is './'.
resume (str, optional): The file path to a checkpoint to resume training from.
"""
- self.rank = kwargs.get("rank", 0)
+ self.rank = rank
self.local_rank = local_rank
self.world_size = world_size
self.use_ddp = use_ddp
@@ -98,8 +99,11 @@
self.batch_total = 0
self.dtype = torch.float32
self.use_fp16 = use_fp16
+ self.use_bf16 = use_bf16
if self.use_fp16:
self.dtype = torch.float16
+ if self.use_bf16:
+ self.dtype = torch.bfloat16
self.save_checkpoint_interval = kwargs.get("save_checkpoint_interval", 5000)
self.validate_interval = kwargs.get("validate_interval", 5000)
self.keep_nbest_models = kwargs.get("keep_nbest_models", 500)
@@ -117,8 +121,8 @@
self.saved_ckpts = {}
self.step_or_epoch = -1
self.best_step_or_epoch = ""
- self.val_acc_step_or_eoch = {}
- self.val_loss_step_or_eoch = {}
+ self.val_acc_step_or_epoch = {}
+ self.val_loss_step_or_epoch = {}
self.reset_gpu_cache = kwargs.get("reset_gpu_cache", False)
self.start_data_split_i = 0
@@ -147,10 +151,16 @@
self.use_deepspeed = use_deepspeed
self.deepspeed_config = kwargs.get("deepspeed_config", "")
- self.excludes = kwargs.get("excludes", None)
- if self.excludes is not None:
- if isinstance(self.excludes, str):
- self.excludes = self.excludes.split(",")
+ excludes = kwargs.get("excludes", None)
+ if excludes is not None:
+ if isinstance(excludes, str):
+ excludes = excludes.split(",")
+ self.excludes = excludes
+ effective_save_name_excludes = kwargs.get("effective_save_name_excludes", None)
+ if effective_save_name_excludes is not None:
+ if isinstance(effective_save_name_excludes, str):
+ effective_save_name_excludes = effective_save_name_excludes.split(",")
+ self.effective_save_name_excludes = effective_save_name_excludes
def save_checkpoint(
self,
@@ -184,8 +194,8 @@
# "optimizer": optim.state_dict(),
# "scheduler": scheduler.state_dict(),
"saved_ckpts": self.saved_ckpts,
- "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
- "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
+ "val_acc_step_or_epoch": self.val_acc_step_or_epoch,
+ "val_loss_step_or_epoch": self.val_loss_step_or_epoch,
"best_step_or_epoch": self.best_step_or_epoch,
"avg_keep_nbest_models_type": self.avg_keep_nbest_models_type,
"step": step,
@@ -223,8 +233,8 @@
if self.avg_keep_nbest_models_type == "acc":
if (
- self.val_acc_step_or_eoch[ckpt_name]
- >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+ self.val_acc_step_or_epoch[ckpt_name]
+ >= self.val_acc_step_or_epoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
@@ -234,16 +244,16 @@
save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
)
logging.info(
- f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+ f"Update best acc: {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
)
else:
logging.info(
- f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+ f"No improvement in acc: {self.val_acc_step_or_epoch[ckpt_name]:.4f} < {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
)
elif self.avg_keep_nbest_models_type == "loss":
if (
- self.val_loss_step_or_eoch[ckpt_name]
- <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+ self.val_loss_step_or_epoch[ckpt_name]
+ <= self.val_loss_step_or_epoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
@@ -253,44 +263,46 @@
save_dir=self.output_dir, tag=f"model.pt.best", client_state=state
)
logging.info(
- f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+ f"Update best loss: {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
)
else:
logging.info(
- f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+ f"No improvement in loss: {self.val_loss_step_or_epoch[ckpt_name]:.4f} > {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
)
else:
print("Undo")
- self.saved_ckpts[ckpt_name] = getattr(
- self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
- )[ckpt_name]
- if self.keep_nbest_models > 0:
- if len(self.saved_ckpts) > self.keep_nbest_models:
- if self.avg_keep_nbest_models_type == "acc":
- key = min(self.saved_ckpts, key=self.saved_ckpts.get)
- else:
- key = max(self.saved_ckpts, key=self.saved_ckpts.get)
- if key in self.saved_ckpts:
- del self.saved_ckpts[key]
- filename = os.path.join(self.output_dir, key)
- logging.info(f"Delete: {filename}")
- if os.path.exists(filename):
- # os.remove(filename)
- misc_utils.smart_remove(filename)
+ if self.rank == 0:
+ self.saved_ckpts[ckpt_name] = getattr(
+ self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
+ )[ckpt_name]
+ if self.keep_nbest_models > 0:
+ if len(self.saved_ckpts) > self.keep_nbest_models:
+ if self.avg_keep_nbest_models_type == "acc":
+ key = min(self.saved_ckpts, key=self.saved_ckpts.get)
+ else:
+ key = max(self.saved_ckpts, key=self.saved_ckpts.get)
+ if key in self.saved_ckpts:
+ del self.saved_ckpts[key]
+ filename = os.path.join(self.output_dir, key)
+ logging.info(f"Delete: {filename}")
+ if os.path.exists(filename):
+ # os.remove(filename)
+ misc_utils.smart_remove(filename)
elif self.use_fsdp:
pass
elif self.rank == 0:
- logging.info(f"Save checkpoint: {epoch}, rank: {self.local_rank}\n")
+ logging.info(
+ f"Save checkpoint: {epoch}, rank: {self.rank}, local_rank: {self.local_rank}\n"
+ )
# self.step_or_epoch += 1
state = {
"epoch": epoch,
- "state_dict": model.state_dict(),
"optimizer": optim.state_dict(),
"scheduler": scheduler.state_dict(),
"saved_ckpts": self.saved_ckpts,
- "val_acc_step_or_eoch": self.val_acc_step_or_eoch,
- "val_loss_step_or_eoch": self.val_loss_step_or_eoch,
+ "val_acc_step_or_epoch": self.val_acc_step_or_epoch,
+ "val_loss_step_or_epoch": self.val_loss_step_or_epoch,
"best_step_or_epoch": self.best_step_or_epoch,
"avg_keep_nbest_models_type": self.avg_keep_nbest_models_type,
"step": step,
@@ -303,7 +315,24 @@
}
step = step_in_epoch
if hasattr(model, "module"):
- state["state_dict"] = model.module.state_dict()
+ state_dict = model.module.state_dict()
+ else:
+ state_dict = model.state_dict()
+
+ if self.effective_save_name_excludes is not None:
+ logging.info(f"effective_save_name_excludes: {self.effective_save_name_excludes}")
+ dst_state_dict = {}
+ for k in state_dict.keys():
+ for k_ex in self.effective_save_name_excludes:
+ k_tmp = k.replace("module.", "")
+ if k.startswith(k_ex):
+ logging.info(f"key: {k} matching: {k_ex}, not save it")
+ break
+ else:
+ dst_state_dict[k] = state_dict[k]
+ state["state_dict"] = dst_state_dict
+ else:
+ state["state_dict"] = state_dict
if scaler:
state["scaler_state"] = scaler.state_dict()
@@ -324,38 +353,38 @@
if self.avg_keep_nbest_models_type == "acc":
if (
- self.val_acc_step_or_eoch[ckpt_name]
- >= self.val_acc_step_or_eoch[self.best_step_or_epoch]
+ self.val_acc_step_or_epoch[ckpt_name]
+ >= self.val_acc_step_or_epoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
torch.save(state, best_ckpt)
logging.info(
- f"Update best acc: {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+ f"Update best acc: {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
)
else:
logging.info(
- f"No improvement in acc: {self.val_acc_step_or_eoch[ckpt_name]:.4f} < {self.val_acc_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+ f"No improvement in acc: {self.val_acc_step_or_epoch[ckpt_name]:.4f} < {self.val_acc_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
)
elif self.avg_keep_nbest_models_type == "loss":
if (
- self.val_loss_step_or_eoch[ckpt_name]
- <= self.val_loss_step_or_eoch[self.best_step_or_epoch]
+ self.val_loss_step_or_epoch[ckpt_name]
+ <= self.val_loss_step_or_epoch[self.best_step_or_epoch]
):
self.best_step_or_epoch = ckpt_name
best_ckpt = Path(os.path.join(self.output_dir, f"model.pt.best"))
torch.save(state, best_ckpt)
logging.info(
- f"Update best loss: {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
+ f"Update best loss: {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {best_ckpt}"
)
else:
logging.info(
- f"No improvement in loss: {self.val_loss_step_or_eoch[ckpt_name]:.4f} > {self.val_loss_step_or_eoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
+ f"No improvement in loss: {self.val_loss_step_or_epoch[ckpt_name]:.4f} > {self.val_loss_step_or_epoch[self.best_step_or_epoch]:.4f}, {os.path.join(self.output_dir, self.best_step_or_epoch)}"
)
else:
print("Undo")
self.saved_ckpts[ckpt_name] = getattr(
- self, f"val_{self.avg_keep_nbest_models_type}_step_or_eoch"
+ self, f"val_{self.avg_keep_nbest_models_type}_step_or_epoch"
)[ckpt_name]
if self.keep_nbest_models > 0:
if len(self.saved_ckpts) > self.keep_nbest_models:
@@ -396,14 +425,14 @@
_, checkpoint = model.load_checkpoint(self.output_dir, "model.pt")
self.start_epoch = checkpoint["epoch"]
self.saved_ckpts = checkpoint["saved_ckpts"]
- self.val_acc_step_or_eoch = (
- checkpoint["val_acc_step_or_eoch"]
- if "val_acc_step_or_eoch" in checkpoint
+ self.val_acc_step_or_epoch = (
+ checkpoint["val_acc_step_or_epoch"]
+ if "val_acc_step_or_epoch" in checkpoint
else {}
)
- self.val_loss_step_or_eoch = (
- checkpoint["val_loss_step_or_eoch"]
- if "val_loss_step_or_eoch" in checkpoint
+ self.val_loss_step_or_epoch = (
+ checkpoint["val_loss_step_or_epoch"]
+ if "val_loss_step_or_epoch" in checkpoint
else {}
)
self.best_step_or_epoch = (
@@ -444,12 +473,16 @@
src_state = checkpoint["state_dict"]
dst_state = model.state_dict()
for k in dst_state.keys():
- if excludes is not None:
- for k_ex in excludes:
+ excludes_flag = False
+ if self.excludes is not None:
+ for k_ex in self.excludes:
k_tmp = k.replace("module.", "")
if k_tmp.startswith(k_ex):
- logging.info(f"key: {{k}} matching: {k_ex}, excluded")
- continue
+ logging.info(f"key: {k} matching: {k_ex}, excluded")
+ excludes_flag = True
+ break
+ if excludes_flag:
+ continue
if not k.startswith("module.") and "module." + k in src_state.keys():
k_ddp = "module." + k
elif k.startswith("module.") and "module." + k not in src_state.keys():
@@ -468,14 +501,14 @@
scaler.load_state_dict(checkpoint["scaler_state"])
self.saved_ckpts = checkpoint["saved_ckpts"]
- self.val_acc_step_or_eoch = (
- checkpoint["val_acc_step_or_eoch"]
- if "val_acc_step_or_eoch" in checkpoint
+ self.val_acc_step_or_epoch = (
+ checkpoint["val_acc_step_or_epoch"]
+ if "val_acc_step_or_epoch" in checkpoint
else {}
)
- self.val_loss_step_or_eoch = (
- checkpoint["val_loss_step_or_eoch"]
- if "val_loss_step_or_eoch" in checkpoint
+ self.val_loss_step_or_epoch = (
+ checkpoint["val_loss_step_or_epoch"]
+ if "val_loss_step_or_epoch" in checkpoint
else {}
)
self.best_step_or_epoch = (
@@ -650,7 +683,7 @@
scaled_loss = model.backward(loss)
else:
loss = loss / self.accum_grad
- if self.use_fp16:
+ if scaler:
scaler.scale(loss).backward()
else:
loss.backward()
@@ -678,7 +711,7 @@
# Execute an optimization step (update model parameters)
if self.use_ddp or self.use_fsdp:
dist.barrier()
- if self.use_fp16:
+ if scaler:
scaler.step(optim)
scaler.update()
else:
@@ -702,6 +735,9 @@
Args:
epoch (int): The current epoch number.
"""
+ self.val_loss_avg = 0.0
+ self.val_acc_avg = 0.0
+
if self.use_ddp or self.use_fsdp or self.use_deepspeed:
dist.barrier()
logging.info(f"Validate epoch: {epoch}, rank: {self.rank}\n")
@@ -723,7 +759,7 @@
"data_split_i": kwargs.get("data_split_i", 0),
"data_split_num": kwargs.get("data_split_num", 1),
"log_step": batch_idx + kwargs.get("start_step", 0),
- "batch_total": batch_idx + 1,
+ "batch_total": self.batch_total,
"step_in_epoch": batch_idx + 1,
"lr": 0.0,
}
@@ -770,8 +806,8 @@
ckpt_name = f"model.pt.ep{epoch}"
else:
ckpt_name = f'model.pt.ep{epoch}.{kwargs.get("step_in_epoch")}'
- self.val_acc_step_or_eoch[ckpt_name] = self.val_acc_avg
- self.val_loss_step_or_eoch[ckpt_name] = self.val_loss_avg
+ self.val_acc_step_or_epoch[ckpt_name] = self.val_acc_avg
+ self.val_loss_step_or_epoch[ckpt_name] = self.val_loss_avg
if self.use_ddp or self.use_fsdp or self.use_deepspeed:
dist.barrier()
@@ -849,7 +885,7 @@
if self.use_wandb and wandb is not None:
wandb.log(
description_dict,
- setp=batch_total,
+ step=batch_total,
)
def close(self, writer=None):
--
Gitblit v1.9.1