From a70f5b3edf22ac889724aa9a06cefbb316374b28 Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Sun, 24 Mar 2024 01:44:18 +0800
Subject: [PATCH] finetune

---
 funasr/train_utils/trainer.py |    2 +-
 funasr/bin/train.py           |   21 +++++++++++----------
 2 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/funasr/bin/train.py b/funasr/bin/train.py
index 5cf54da..e446e54 100644
--- a/funasr/bin/train.py
+++ b/funasr/bin/train.py
@@ -173,10 +173,10 @@
     except:
         writer = None
 
-    # if use_ddp or use_fsdp:
-    #     context = Join([model])
-    # else:
-    context = nullcontext()
+    if use_ddp or use_fsdp:
+        context = Join([model])
+    else:
+        context = nullcontext()
 
     for epoch in range(trainer.start_epoch, trainer.max_epoch + 1):
         time1 = time.perf_counter()
@@ -192,13 +192,14 @@
                                 epoch=epoch,
                                 writer=writer
                                 )
+        with context:
+            trainer.validate_epoch(
+                model=model,
+                dataloader_val=dataloader_val,
+                epoch=epoch,
+                writer=writer
+            )
         scheduler.step()
-        trainer.validate_epoch(
-            model=model,
-            dataloader_val=dataloader_val,
-            epoch=epoch,
-            writer=writer
-        )
 
         
         trainer.save_checkpoint(epoch, model=model, optim=optim, scheduler=scheduler, scaler=scaler)
diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py
index cf23483..e554aca 100644
--- a/funasr/train_utils/trainer.py
+++ b/funasr/train_utils/trainer.py
@@ -398,7 +398,7 @@
             speed_stats = {}
             time5 = time.perf_counter()
             # iterator_stop = torch.tensor(0).to(self.device)
-
+            dataloader_val.batch_sampler.set_epoch(epoch)
             for batch_idx, batch in enumerate(dataloader_val):
                 # if self.use_ddp or self.use_fsdp:
                 #     dist.all_reduce(iterator_stop, dist.ReduceOp.SUM)

--
Gitblit v1.9.1