zhifu gao
2024-05-20 961ec280afb02f2464ce4f7b2fd7c821dd24044b
funasr/train_utils/average_nbest_models.py
@@ -16,51 +16,60 @@
from functools import cmp_to_key
def _get_checkpoint_paths(output_dir: str, last_n: int = 5, use_deepspeed=False, **kwargs):
    """
    Get the paths of the 'last_n' checkpoints that should be averaged.

    The primary source is the index stored in ``<output_dir>/model.pt``: its
    ``avg_keep_nbest_models_type`` entry names the metric, and
    ``val_<metric>_step_or_eoch`` maps checkpoint names to metric values.
    Entries are ranked by value in descending order; for ``"acc"`` the first
    'last_n' (highest values) are kept, for any other metric the last
    'last_n' (lowest values) are kept.

    If that index cannot be loaded or lacks the expected keys, fall back to
    listing ``output_dir`` for files starting with ``model.pt.e`` and take
    the 'last_n' with the largest number embedded in the name.

    Args:
        output_dir: Directory containing ``model.pt`` and the checkpoints.
        last_n: Maximum number of checkpoint paths to return.
        use_deepspeed: When true, each index entry is treated as a directory
            and ``mp_rank_00_model_states.pt`` inside it is returned.
            Only affects the index-driven path, not the fallback listing.
        **kwargs: Accepted and ignored so callers can forward extra options.

    Returns:
        A list of checkpoint paths (possibly empty).
    """
    index_path = os.path.join(output_dir, "model.pt")
    try:
        checkpoint = torch.load(index_path, map_location="cpu")
        avg_keep_nbest_models_type = checkpoint["avg_keep_nbest_models_type"]
        val_step_or_eoch = checkpoint[f"val_{avg_keep_nbest_models_type}_step_or_eoch"]
        sorted_items = sorted(val_step_or_eoch.items(), key=lambda x: x[1], reverse=True)
        sorted_items = (
            sorted_items[:last_n] if avg_keep_nbest_models_type == "acc" else sorted_items[-last_n:]
        )
        checkpoint_paths = []
        # The extra [:last_n] keeps last_n == 0 empty: [-0:] would otherwise
        # select the whole list for non-"acc" metrics.
        for key, _ in sorted_items[:last_n]:
            if not use_deepspeed:
                ckpt = os.path.join(output_dir, key)
            else:
                ckpt = os.path.join(output_dir, key, "mp_rank_00_model_states.pt")
            checkpoint_paths.append(ckpt)
    except Exception as err:
        # Best-effort fallback. Report the path rather than the loaded object,
        # which is unbound when torch.load itself is what failed.
        print(f"{index_path} does not exist ({err!r}), avg the lastet checkpoint.")
        # Collect checkpoint files together with the first number in their name;
        # names without any digits cannot be ranked and are skipped.
        numbered_files = []
        for fname in os.listdir(output_dir):
            if not fname.startswith("model.pt.e"):
                continue
            match = re.search(r"(\d+)", fname)
            if match is None:
                continue
            numbered_files.append((int(match.group()), fname))
        # Sort by that number in descending order and keep the first 'last_n'
        numbered_files.sort(key=lambda item: item[0], reverse=True)
        checkpoint_paths = [os.path.join(output_dir, fname) for _, fname in numbered_files[:last_n]]
    return checkpoint_paths
@torch.no_grad()
def average_checkpoints(output_dir: str, last_n: int=5, **kwargs):
def average_checkpoints(output_dir: str, last_n: int = 5, **kwargs):
    """
    Average the last 'last_n' checkpoints' model state_dicts.
    If a tensor is of type torch.int, perform sum instead of average.
    """
    checkpoint_paths = _get_checkpoint_paths(output_dir, last_n)
    checkpoint_paths = _get_checkpoint_paths(output_dir, last_n, **kwargs)
    print(f"average_checkpoints: {checkpoint_paths}")
    state_dicts = []
    # Load state_dicts from checkpoints
    for path in checkpoint_paths:
        if os.path.isfile(path):
            state_dicts.append(torch.load(path, map_location='cpu')['state_dict'])
            state_dicts.append(torch.load(path, map_location="cpu")["state_dict"])
        else:
            print(f"Checkpoint file {path} not found.")
            continue
    # Check if we have any state_dicts to average
    if not state_dicts:
        raise RuntimeError("No checkpoints found for averaging.")
    if len(state_dicts) < 1:
        print("No checkpoints found for averaging.")
        return
    # Average or sum weights
    avg_state_dict = OrderedDict()
@@ -75,6 +84,6 @@
            # Perform average for other types of tensors
            stacked_tensors = torch.stack(tensors)
            avg_state_dict[key] = torch.mean(stacked_tensors, dim=0)
    torch.save({'state_dict': avg_state_dict}, os.path.join(output_dir, f"model.pt.avg{last_n}"))
    return avg_state_dict
    checkpoint_outpath = os.path.join(output_dir, f"model.pt.avg{last_n}")
    torch.save({"state_dict": avg_state_dict}, checkpoint_outpath)
    return checkpoint_outpath