From 2175736ab0e2752837db102ffc27277339f19b5b Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Tue, 11 Jun 2024 14:02:18 +0800
Subject: [PATCH] Merge branch 'dev_gzf_deepspeed' into main

---
 funasr/train_utils/trainer.py |    9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py
index 50f99f0..afc632d 100644
--- a/funasr/train_utils/trainer.py
+++ b/funasr/train_utils/trainer.py
@@ -85,7 +85,12 @@
         self.batch_total = 0
         self.use_fp16 = use_fp16
         self.save_checkpoint_interval = kwargs.get("save_checkpoint_interval", 5000)
-        self.validate_interval = kwargs.get("validate_interval", 5000)
+        self.validate_interval = kwargs.get("validate_interval", -1)
+        if self.validate_interval < 0:
+            self.validate_interval = self.save_checkpoint_interval
+        assert (
+            self.save_checkpoint_interval == self.validate_interval
+        ), f"save_checkpoint_interval must equal to validate_interval"
         self.keep_nbest_models = kwargs.get("keep_nbest_models", 500)
         self.avg_keep_nbest_models_type = kwargs.get("avg_keep_nbest_models_type", "acc")
         self.avg_nbest_model = kwargs.get("avg_nbest_model", 10)
@@ -476,7 +481,7 @@
                     step_in_epoch=self.step_in_epoch,
                     batch_num_epoch=batch_num_epoch,
                     lr=lr,
-                    loss=loss.detach().cpu().item(),
+                    loss=accum_grad * loss.detach().cpu().item(),
                     speed_stats=speed_stats,
                     stats=stats,
                     writer=writer,

--
Gitblit v1.9.1