| | |
| | | from functools import cmp_to_key |
| | | |
| | | |
| | | def _get_checkpoint_paths(output_dir: str, last_n: int = 5): |
| | | def _get_checkpoint_paths(output_dir: str, last_n: int = 5, use_deepspeed=False, **kwargs): |
| | | """ |
| | | Get the paths of the last 'last_n' checkpoints by parsing filenames |
| | | in the output directory. |
| | | """ |
| | | try: |
| | | checkpoint = torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu") |
| | | if not use_deepspeed: |
| | | checkpoint = torch.load(os.path.join(output_dir, "model.pt"), map_location="cpu") |
| | | else: |
| | | checkpoint = torch.load( |
| | | os.path.join(output_dir, "model.pt", "mp_rank_00_model_states.pt"), |
| | | map_location="cpu", |
| | | ) |
| | | avg_keep_nbest_models_type = checkpoint["avg_keep_nbest_models_type"] |
| | | val_step_or_eoch = checkpoint[f"val_{avg_keep_nbest_models_type}_step_or_eoch"] |
| | | sorted_items = sorted(val_step_or_eoch.items(), key=lambda x: x[1], reverse=True) |
| | | val_step_or_epoch = checkpoint[f"val_{avg_keep_nbest_models_type}_step_or_epoch"] |
| | | sorted_items = sorted(val_step_or_epoch.items(), key=lambda x: x[1], reverse=True) |
| | | sorted_items = ( |
| | | sorted_items[:last_n] if avg_keep_nbest_models_type == "acc" else sorted_items[-last_n:] |
| | | ) |
| | | checkpoint_paths = [os.path.join(output_dir, key) for key, value in sorted_items[:last_n]] |
| | | checkpoint_paths = [] |
| | | for key, value in sorted_items[:last_n]: |
| | | if not use_deepspeed: |
| | | ckpt = os.path.join(output_dir, key) |
| | | else: |
| | | ckpt = os.path.join(output_dir, key, "mp_rank_00_model_states.pt") |
| | | checkpoint_paths.append(ckpt) |
| | | |
| | | except: |
| | | print(f"{checkpoint} does not exist, avg the lastet checkpoint.") |
| | | # List all files in the output directory |
| | |
| | | checkpoint_files.sort(key=lambda x: int(re.search(r"(\d+)", x).group()), reverse=True) |
| | | # Get the last 'last_n' checkpoint paths |
| | | checkpoint_paths = [os.path.join(output_dir, f) for f in checkpoint_files[:last_n]] |
| | | |
| | | return checkpoint_paths |
| | | |
| | | |
| | |
| | | Average the last 'last_n' checkpoints' model state_dicts. |
| | | If a tensor is of type torch.int, perform sum instead of average. |
| | | """ |
| | | checkpoint_paths = _get_checkpoint_paths(output_dir, last_n) |
| | | checkpoint_paths = _get_checkpoint_paths(output_dir, last_n, **kwargs) |
| | | print(f"average_checkpoints: {checkpoint_paths}") |
| | | state_dicts = [] |
| | | |
| | |
| | | |
| | | # Check if we have any state_dicts to average |
| | | if len(state_dicts) < 1: |
| | | raise RuntimeError("No checkpoints found for averaging.") |
| | | print("No checkpoints found for averaging.") |
| | | return |
| | | |
| | | # Average or sum weights |
| | | avg_state_dict = OrderedDict() |