Skip to content

Commit

Permalink
change default ckpt name (#11277)
Browse files Browse the repository at this point in the history
Signed-off-by: Maanu Grover <[email protected]>
  • Loading branch information
maanug-nv authored Nov 14, 2024
1 parent af91d28 commit 8b0c311
Show file tree
Hide file tree
Showing 4 changed files with 4 additions and 4 deletions.
2 changes: 1 addition & 1 deletion nemo/lightning/nemo_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -220,7 +220,7 @@ def _setup_trainer_model_checkpoint(self, trainer, log_dir, ckpt=None):
if callback.dirpath is None:
callback.dirpath = Path(log_dir / "checkpoints")
if callback.filename is None:
- callback.filename = f"{self.name}--{{{callback.monitor}:.4f}}-{{epoch}}"
+ callback.filename = f"{self.name}--{{{callback.monitor}:.4f}}-{{epoch}}-{{consumed_samples}}"
ModelCheckpoint.CHECKPOINT_NAME_LAST = callback.filename + "-last"

def _handle_task_config(self, task_config, log_dir):
Expand Down
2 changes: 1 addition & 1 deletion tests/collections/llm/bitexact/mixtral/run.sh
Original file line number Diff line number Diff line change
Expand Up @@ -43,4 +43,4 @@ python3 /workspace/tests/collections/llm/bitexact/mixtral/pretrain_mini_mixtral.

# Compare outputs
python3 /workspace/tests/collections/llm/bitexact/mixtral/compare_ckpts.py \
- "$NEMO_OUTPUT_PATH/checkpoints/--None=0.0000-epoch=0/weights" "$MCORE_OUTPUT_PATH/iter_0000010/"
+ "$NEMO_OUTPUT_PATH/checkpoints/--None=0.0000-epoch=0-consumed_samples=20.0/weights" "$MCORE_OUTPUT_PATH/iter_0000010/"
2 changes: 1 addition & 1 deletion tests/collections/llm/megatron_mixtral_pretraining.py
Original file line number Diff line number Diff line change
Expand Up @@ -158,7 +158,7 @@ def main(args):
)

# Confirm checkpoint directory structure
- output_path = Path(args.experiment_dir) / "checkpoints/--None=0.0000-epoch=0/weights"
+ output_path = Path(args.experiment_dir) / "checkpoints/--None=0.0000-epoch=0-consumed_samples=8.0/weights"
assert output_path.exists(), f"Expected {output_path} to exist"
assert output_path.is_dir(), f"Expected {output_path} to be a directory"
output_files = ['__0_0.distcp', '__0_1.distcp', 'common.pt', 'metadata.json', '.metadata']
Expand Down
2 changes: 1 addition & 1 deletion tests/lightning/test_state_restoration.py
Original file line number Diff line number Diff line change
Expand Up @@ -239,7 +239,7 @@ def run_resume_train(mbs, gbs, num_dev):
resume=AutoResume(
resume_if_exists=True,
resume_ignore_no_checkpoint=False,
- resume_from_path=f'{EXP_DIR}default/v1/checkpoints/default--None=0.0000-epoch=0/',
+ resume_from_path=f'{EXP_DIR}default/v1/checkpoints/default--None=0.0000-epoch=0-consumed_samples=20.0/',
),
)
trainer._teardown()
Expand Down

0 comments on commit 8b0c311

Please sign in to comment.