From 80bd14e6bbb7bb282ff3832194648dc4a16157ca Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: Thu, 25 Apr 2024 10:41:14 +0800
Subject: [PATCH] Dev gzf exp (#1657)
---
funasr/train_utils/trainer.py | 9 +++++++++
1 file changed, 9 insertions(+), 0 deletions(-)
diff --git a/funasr/train_utils/trainer.py b/funasr/train_utils/trainer.py
index 713e171..8f20ba4 100644
--- a/funasr/train_utils/trainer.py
+++ b/funasr/train_utils/trainer.py
@@ -107,6 +107,9 @@
self.best_step_or_epoch = ""
self.val_acc_step_or_eoch = {}
self.val_loss_step_or_eoch = {}
+
+ self.reset_gpu_cache = kwargs.get("reset_gpu_cache", False)
+
def save_checkpoint(
self,
@@ -324,6 +327,12 @@
time2 = time.perf_counter()
with maybe_autocast(self.use_fp16):
retval = model(**batch)
+
+ if (
+ self.reset_gpu_cache
+ and (torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024) > 70
+ ):
+ torch.cuda.empty_cache()
time3 = time.perf_counter()
speed_stats["forward_time"] = f"{time3 - time2:0.3f}"
--
Gitblit v1.9.1