kongdeqiang
5 天以前 28ccfbfc51068a663a80764e14074df5edf2b5ba
funasr/bin/train_ds.py
@@ -81,7 +81,10 @@
        deepspeed.init_distributed(dist_backend=kwargs.get("backend", "nccl"))
    elif use_ddp or use_fsdp:
        logging.info(f"use_ddp: {use_ddp}, use_fsdp: {use_fsdp}")
        dist.init_process_group(backend=kwargs.get("backend", "nccl"), init_method="env://")
        dist.init_process_group(
            backend=kwargs.get("backend", "nccl"),
            init_method="env://",
        )
        torch.cuda.set_device(local_rank)
    # rank = dist.get_rank()
@@ -131,7 +134,7 @@
        **kwargs.get("train_conf"),
    )
    model = trainer.warp_model(model)
    model = trainer.warp_model(model, **kwargs)
    kwargs["device"] = int(os.environ.get("LOCAL_RANK", 0))
    trainer.device = int(os.environ.get("LOCAL_RANK", 0))
@@ -146,7 +149,7 @@
    dataloader = dataloader_class(**kwargs)
    # dataloader_tr, dataloader_val = dataloader_class(**kwargs)
    scaler = GradScaler(enabled=True) if trainer.use_fp16 or trainer.use_bf16 else None
    scaler = GradScaler(enabled=True) if trainer.use_fp16 else None
    scaler = ShardedGradScaler(enabled=trainer.use_fp16) if trainer.use_fsdp else scaler
    trainer.resume_checkpoint(
@@ -155,6 +158,10 @@
        scheduler=scheduler,
        scaler=scaler,
    )
    early_stopping_patience = kwargs.get("train_conf", {}).get("early_stopping_patience", 0)
    best_val_loss = float("inf")
    epochs_no_improve = 0
    dataloader_tr, dataloader_val = None, None
    for epoch in range(trainer.start_epoch, trainer.max_epoch):
@@ -181,7 +188,10 @@
            )
            trainer.start_step = 0
            torch.cuda.empty_cache()
            device = next(model.parameters()).device
            if device.type == "cuda":
                with torch.cuda.device(device):
                    torch.cuda.empty_cache()
            time_escaped = (time.perf_counter() - time_slice_i) / 3600.0
            logging.info(
@@ -193,7 +203,19 @@
        trainer.start_data_split_i = 0
        trainer.validate_epoch(model=model, dataloader_val=dataloader_val, epoch=epoch + 1)
        scheduler.step()
        current_val = trainer.val_loss_avg
        if current_val < best_val_loss:
            logging.info(f"current_val: {current_val}, best_val_loss: {best_val_loss}")
            best_val_loss = current_val
            epochs_no_improve = 0
        else:
            epochs_no_improve += 1
            logging.info(f"No val_loss improvement for {epochs_no_improve}/{early_stopping_patience} epochs")
        if early_stopping_patience > 0 and epochs_no_improve >= early_stopping_patience:
            logging.info(f"Early stopping triggered at epoch {epoch+1}")
            break
        trainer.step_in_epoch = 0
        trainer.save_checkpoint(
            epoch + 1, model=model, optim=optim, scheduler=scheduler, scaler=scaler