From f68ae892be3e59c20a033f18d5f61db7f633801f Mon Sep 17 00:00:00 2001
From: 游雁 <zhifu.gzf@alibaba-inc.com>
Date: Fri, 17 May 2024 10:32:34 +0800
Subject: [PATCH] deepspeed

---
 funasr/bin/train_ds.py |   38 ++------------------------------------
 1 file changed, 2 insertions(+), 36 deletions(-)

diff --git a/funasr/bin/train_ds.py b/funasr/bin/train_ds.py
index e4db533..032a0cf 100644
--- a/funasr/bin/train_ds.py
+++ b/funasr/bin/train_ds.py
@@ -133,29 +133,7 @@
     kwargs["device"] = next(model.parameters()).device
     trainer.device = kwargs["device"]
 
-    # optim
-    logging.info("Build optim")
-    optim = kwargs.get("optim", "adam")
-    assert optim in optim_classes
-    optim_class = optim_classes.get(optim)
-    optim = optim_class(model.parameters(), **kwargs.get("optim_conf"))
-
-    # scheduler
-    logging.info("Build scheduler")
-    scheduler = kwargs.get("scheduler", "warmuplr")
-    assert scheduler in scheduler_classes
-    scheduler_class = scheduler_classes.get(scheduler)
-    scheduler = scheduler_class(optim, **kwargs.get("scheduler_conf"))
-
-    if use_deepspeed:
-        args = OmegaConf.create({"deepspeed_config": kwargs.get("deepspeed_config", "")})
-        model, optimizer, _, scheduler = deepspeed.initialize(
-            args=args,
-            model=model,
-            optimizer=optim,
-            lr_scheduler=scheduler,
-            model_parameters=model.parameters(),
-        )
+    model, optim, scheduler = trainer.warp_optim_scheduler(model, kwargs)
 
     # dataset
     logging.info("Build dataloader")
@@ -175,15 +153,6 @@
         scaler=scaler,
     )
 
-    tensorboard_dir = os.path.join(kwargs.get("output_dir"), "tensorboard")
-    os.makedirs(tensorboard_dir, exist_ok=True)
-    try:
-        from tensorboardX import SummaryWriter
-
-        writer = SummaryWriter(tensorboard_dir)  # if trainer.rank == 0 else None
-    except:
-        writer = None
-
     dataloader_tr, dataloader_val = None, None
     for epoch in range(trainer.start_epoch, trainer.max_epoch):
         time1 = time.perf_counter()
@@ -201,7 +170,6 @@
                 dataloader_train=dataloader_tr,
                 dataloader_val=dataloader_val,
                 epoch=epoch,
-                writer=writer,
                 data_split_i=data_split_i,
                 data_split_num=dataloader.data_split_num,
                 start_step=trainer.start_step,
@@ -211,9 +179,7 @@
             torch.cuda.empty_cache()
 
         trainer.start_data_split_i = 0
-        trainer.validate_epoch(
-            model=model, dataloader_val=dataloader_val, epoch=epoch + 1, writer=writer
-        )
+        trainer.validate_epoch(model=model, dataloader_val=dataloader_val, epoch=epoch + 1)
         scheduler.step()
         trainer.step_in_epoch = 0
         trainer.save_checkpoint(

--
Gitblit v1.9.1