From 80bd14e6bbb7bb282ff3832194648dc4a16157ca Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Thu, 25 Apr 2024 10:41:14 +0800
Subject: [PATCH] Dev gzf exp (#1657)

---
 funasr/train_utils/trainer.py |    9 +++++++++
 1 file changed, 9 insertions(+), 0 deletions(-)

diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py
index 713e171..8f20ba4 100644
--- a/funasr/train_utils/trainer.py
+++ b/funasr/train_utils/trainer.py
@@ -107,6 +107,9 @@
         self.best_step_or_epoch = ""
         self.val_acc_step_or_eoch = {}
         self.val_loss_step_or_eoch = {}
+        
+        self.reset_gpu_cache = kwargs.get("reset_gpu_cache", False)
+
 
     def save_checkpoint(
         self,
@@ -324,6 +327,12 @@
                 time2 = time.perf_counter()
                 with maybe_autocast(self.use_fp16):
                     retval = model(**batch)
+                    
+                    if (
+                        self.reset_gpu_cache
+                        and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
+                    ):
+                        torch.cuda.empty_cache()
 
                 time3 = time.perf_counter()
                 speed_stats["forward_time"] = f"{time3 - time2:0.3f}"

--
Gitblit v1.9.1