From 963ba1a7717c785d6e20ccb0d3cee9b59d5365e3 Mon Sep 17 00:00:00 2001
From: zhifu gao <zhifu.gzf@alibaba-inc.com>
Date: 星期一, 20 五月 2024 17:11:41 +0800
Subject: [PATCH] Dev gzf deepspeed (#1737)

---
 funasr/train_utils/average_nbest_models.py |    9 ++++++++-
 1 files changed, 8 insertions(+), 1 deletions(-)

diff --git a/funasr/train_utils/average_nbest_models.py b/funasr/train_utils/average_nbest_models.py
index 20da130..67f1e55 100644
--- a/funasr/train_utils/average_nbest_models.py
+++ b/funasr/train_utils/average_nbest_models.py
@@ -22,7 +22,13 @@
     in the output directory.
     """
     try:
-        checkpoint = torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu")
+        if not use_deepspeed:
+            checkpoint = torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu")
+        else:
+            checkpoint = torch.load(
+                os.path.join(output_dir, "model.pt", "mp_rank_00_model_states.pt"),
+                map_location="cpu",
+            )
         avg_keep_nbest_models_type = checkpoint["avg_keep_nbest_models_type"]
         val_step_or_eoch = checkpoint[f"val_{avg_keep_nbest_models_type}_step_or_eoch"]
         sorted_items = sorted(val_step_or_eoch.items(), key=lambda x: x[1], reverse=True)
@@ -35,6 +41,7 @@
                 ckpt = os.path.join(output_dir, key)
             else:
                 ckpt = os.path.join(output_dir, key, "mp_rank_00_model_states.pt")
+            checkpoint_paths.append(ckpt)
 
     except:
         print(f"{checkpoint} does not exist, avg the lastet checkpoint.")

--
Gitblit v1.9.1