From b76af7be8cd7428f19ec0ba9a7fd811148fbc358 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Sun, 28 Apr 2024 21:18:45 +0800
Subject: [PATCH] Merge branch 'dev_gzf_exp' of github.com:alibaba-damo-academy/FunASR into dev_gzf_exp merge

---
 funasr/bin/train.py |   15 +++++++++------
 1 file changed, 9 insertions(+), 6 deletions(-)

diff --git a/funasr/bin/train.py b/funasr/bin/train.py
index 448e464..97516eb 100644
--- a/funasr/bin/train.py
+++ b/funasr/bin/train.py
@@ -13,7 +13,7 @@
 
 from contextlib import nullcontext
 import torch.distributed as dist
-from collections.abc import Sequence
+
 from omegaconf import DictConfig, OmegaConf
 from torch.cuda.amp import autocast, GradScaler
 from torch.nn.parallel import DistributedDataParallel as DDP
@@ -99,7 +99,7 @@
     if freeze_param is not None:
         if "," in freeze_param:
             freeze_param = eval(freeze_param)
-        if not isinstance(freeze_param, Sequence):
+        if not isinstance(freeze_param, (list, tuple)):
             freeze_param = (freeze_param,)
         logging.info("freeze_param is not None: %s", freeze_param)
         for t in freeze_param:
@@ -193,7 +193,7 @@
     try:
         from tensorboardX import SummaryWriter
 
-        writer = SummaryWriter(tensorboard_dir) if trainer.rank == 0 else None
+        writer = SummaryWriter(tensorboard_dir)  # if trainer.rank == 0 else None
     except:
         writer = None
 
@@ -206,6 +206,7 @@
                 epoch, data_split_i=data_split_i, start_step=trainer.start_step
             )
             trainer.start_step = 0
+
             trainer.train_epoch(
                 model=model,
                 optim=optim,
@@ -222,11 +223,13 @@
             torch.cuda.empty_cache()
 
         trainer.validate_epoch(
-            model=model, dataloader_val=dataloader_val, epoch=epoch, writer=writer
+            model=model, dataloader_val=dataloader_val, epoch=epoch + 1, writer=writer
         )
         scheduler.step()
-
-        trainer.save_checkpoint(epoch, model=model, optim=optim, scheduler=scheduler, scaler=scaler)
+        trainer.step_in_epoch = 0
+        trainer.save_checkpoint(
+            epoch + 1, model=model, optim=optim, scheduler=scheduler, scaler=scaler
+        )
 
         time2 = time.perf_counter()
         time_escaped = (time2 - time1) / 3600.0

--
Gitblit v1.9.1