haoneng.lhn
2023-12-08 acb9a0fec8d8a4dabeedcbb8e08c26f66d7083f0
funasr/train/trainer.py
@@ -26,7 +26,6 @@
import torch
import torch.nn
import torch.optim
from typeguard import check_argument_types
from funasr.iterators.abs_iter_factory import AbsIterFactory
from funasr.main_funcs.average_nbest_models import average_nbest_models
@@ -44,6 +43,7 @@
from funasr.train.reporter import Reporter
from funasr.train.reporter import SubReporter
from funasr.utils.build_dataclass import build_dataclass
from funasr.utils.kwargs2args import kwargs2args
if torch.distributed.is_available():
    from torch.distributed import ReduceOp
@@ -126,7 +126,6 @@
    @classmethod
    def build_options(cls, args: argparse.Namespace) -> TrainerOptions:
        """Build options consumed by train(), eval()"""
        assert check_argument_types()
        return build_dataclass(TrainerOptions, args)
    @classmethod
@@ -187,7 +186,6 @@
        distributed_option: DistributedOption,
    ) -> None:
        """Perform training. This method performs the main process of training."""
        assert check_argument_types()
        # NOTE(kamo): Don't check the type more strictly as far as trainer_options is concerned
        assert is_dataclass(trainer_options), type(trainer_options)
        assert len(optimizers) == len(schedulers), (len(optimizers), len(schedulers))
@@ -280,14 +278,11 @@
        for iepoch in range(start_epoch, trainer_options.max_epoch + 1):
            if iepoch != start_epoch:
                logging.info(
                    "{}/{}epoch started. Estimated time to finish: {}".format(
                    "{}/{}epoch started. Estimated time to finish: {} hours".format(
                        iepoch,
                        trainer_options.max_epoch,
                        humanfriendly.format_timespan(
                            (time.perf_counter() - start_time)
                            / (iepoch - start_epoch)
                            * (trainer_options.max_epoch - iepoch + 1)
                        ),
                        (time.perf_counter() - start_time) / 3600.0 / (iepoch - start_epoch) * (
                                trainer_options.max_epoch - iepoch + 1),
                    )
                )
            else:
@@ -371,7 +366,7 @@
                            ],
                            "scaler": scaler.state_dict() if scaler is not None else None,
                            "ema_model": model.encoder.ema.model.state_dict()
                            if hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
                            if hasattr(model, "encoder") and hasattr(model.encoder, "ema") and model.encoder.ema is not None else None,
                        },
                        buffer,
                    )
@@ -550,7 +545,6 @@
        options: TrainerOptions,
        distributed_option: DistributedOption,
    ) -> Tuple[bool, bool]:
        assert check_argument_types()
        grad_noise = options.grad_noise
        accum_grad = options.accum_grad
@@ -619,6 +613,24 @@
            if no_forward_run:
                all_steps_are_invalid = False
                continue
            if iiter == 1 and summary_writer is not None:
                try:
                    args = kwargs2args(model.forward, batch)
                except (ValueError, TypeError):
                    logging.warning(
                        "inpect.signature() is failed for the model. "
                        "The graph can't be added for tensorboard."
                    )
                else:
                    try:
                        summary_writer.add_graph(model, args, use_strict_trace=False)
                    except Exception:
                        logging.warning(
                            "summary_writer.add_graph() is failed for the model. "
                            "The graph can't be added for tensorboard."
                        )
                    del args
            with autocast(scaler is not None):
                with reporter.measure_time("forward_time"):
@@ -826,7 +838,6 @@
        options: TrainerOptions,
        distributed_option: DistributedOption,
    ) -> None:
        assert check_argument_types()
        ngpu = options.ngpu
        no_forward_run = options.no_forward_run
        distributed = distributed_option.distributed