From d809a60aae985597432d0ea309c1126d44033589 Mon Sep 17 00:00:00 2001 From: Pablo Garay Date: Tue, 9 Jul 2024 08:24:01 -0700 Subject: [PATCH 1/8] add some more NeMoLogger tests and fix explicit_log_dir in 2.0 Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 59 ++++++++++------ tests/lightning/test_nemo_logger.py | 104 +++++++++++++++++++++++++++- 2 files changed, 139 insertions(+), 24 deletions(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 5ed783fdbefe..9a5c37c00e01 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -73,30 +73,45 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = logging.rank = self.global_rank if self.explicit_log_dir and isinstance(trainer, pl.Trainer): # If explicit log_dir was passed, short circuit - return check_explicit_log_dir(trainer, self.explicit_log_dir, self.dir, self.name, self.version) - - # Default dir to ./nemo_experiments if None was passed - _dir = self.dir - if self.dir is None: - _dir = str(Path.cwd() / 'nemo_experiments') - - if not self.name: - self.name = "default" - - version = self.version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None) - if is_global_rank_zero(): - if self.use_datetime_version: - version = time.strftime('%Y-%m-%d_%H-%M-%S') - if resume_if_exists: - logging.warning( - "No version folders would be created under the log folder as 'resume_if_exists' is enabled." - ) - version = None - if version: + if trainer.logger is not None and not self.update_logger_directory: + logging.warning( + f"nemo logger received explicit_log_dir: {self.explicit_log_dir} and the pytorch lightning trainer " + f"that was passed to nemo_logger container a logger, but update_logger_directory is False. This means " + f"that the trainer's logger directory may not match with the explicit_log_dir." + ) + if self.dir or self.version: + logging.error( + f"nemo logger received explicit_log_dir: {self.explicit_log_dir} and at least one of dir: {self.dir}, " + f"or version: {self.version}. Please note that dir, name, and version will be ignored." + ) + if is_global_rank_zero() and Path(self.explicit_log_dir).exists(): + logging.warning(f"Exp_manager is logging to {self.explicit_log_dir}, but it already exists.") + log_dir, _dir, self.name, version = Path(self.explicit_log_dir), str(self.explicit_log_dir), "", "" + + else: + # Default dir to ./nemo_experiments if None was passed + _dir = self.dir + if self.dir is None: + _dir = str(Path.cwd() / 'nemo_experiments') + + if not self.name: + self.name = "default" + + version = self.version or os.environ.get(NEMO_ENV_VARNAME_VERSION, None) if is_global_rank_zero(): - os.environ[NEMO_ENV_VARNAME_VERSION] = version + if self.use_datetime_version: + version = time.strftime('%Y-%m-%d_%H-%M-%S') + if resume_if_exists: + logging.warning( + "No version folders would be created under the log folder as 'resume_if_exists' is enabled." + ) + version = None + if version: + if is_global_rank_zero(): + os.environ[NEMO_ENV_VARNAME_VERSION] = version + + log_dir = Path(_dir) / Path(str(self.name)) / Path("" if version is None else str(version)) - log_dir = Path(_dir) / Path(str(self.name)) / Path("" if version is None else str(version)) # update app_state with log_dir, exp_dir, etc app_state = AppState() app_state.log_dir = log_dir diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 0dd49838d9e4..029b34b1c63c 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,11 +1,13 @@ +from pathlib import Path from unittest.mock import patch - +import os import pytest from pytorch_lightning.callbacks import ModelCheckpoint as PTLModelCheckpoint from pytorch_lightning.loggers import WandbLogger from nemo import lightning as nl - +from nemo.constants import NEMO_ENV_VARNAME_VERSION +from nemo.utils.exp_manager import NotFoundError class TestNeMoLogger: @pytest.fixture @@ -35,6 +37,14 @@ def test_explicit_log_dir(self, trainer): logger.setup(trainer) mock_check.assert_called_once_with(trainer, explicit_dir, None, "test", None) + def test_default_log_dir(self, trainer): + + if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: + del os.environ[NEMO_ENV_VARNAME_VERSION] + logger = nl.NeMoLogger(use_datetime_version=False) + app_state = logger.setup(trainer) + assert app_state.log_dir == Path(Path.cwd() / "nemo_experiments" / "default") + def test_custom_version(self, trainer): custom_version = "v1.0" logger = nl.NeMoLogger(name="test", version=custom_version, use_datetime_version=False) @@ -58,3 +68,93 @@ def test_model_checkpoint_setup(self, trainer): ptl_ckpt = next(cb for cb in trainer.callbacks if isinstance(cb, PTLModelCheckpoint)) assert str(ptl_ckpt.dirpath).endswith("test_ckpt") assert ptl_ckpt.filename == "test-{epoch:02d}-{val_loss:.2f}" + + def test_resume(self, trainer, tmp_path): + """Tests the resume capabilities of exp_manager""" + + if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: + del os.environ[NEMO_ENV_VARNAME_VERSION] + + # Error because explicit_log_dir does not exist + ### TODO: make "model" arg optional + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=str(tmp_path / "test_resume"), + resume_if_exists=True, + ).setup(None, trainer) + + # Error because checkpoints folder does not exist + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=str(tmp_path / "test_resume" / "does_not_exist"), + path="does_not_exist", + resume_if_exists=True, + ).setup(None, trainer) + + # No error because we tell autoresume to ignore notfounderror + nl.AutoResume( + dirpath=str(tmp_path / "test_resume" / "does_not_exist"), + resume_if_exists=True, + resume_ignore_no_checkpoint=True, + ).setup(None, trainer) + + + path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True) + # Error because checkpoints do not exist in folder + with pytest.raises(NotFoundError): + nl.AutoResume( + dirpath=path, + resume_if_exists=True, + ).setup(None, trainer) + + + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").mkdir() + # Error because *end.ckpt is in folder indicating that training has already finished + with pytest.raises(ValueError): + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + ).setup(None, trainer) + + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() + # Error because multiple *last.ckpt is in folder. If more than one, don't know which to restore + with pytest.raises(ValueError): + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + ).setup(None, trainer) + + # Finally succeed + logger = nl.NeMoLogger( + name="default", + dir=str(tmp_path) + "/test_resume", + version="version_0", + use_datetime_version=False, + ) + logger.setup(trainer) + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").rmdir() + nl.AutoResume( + resume_if_exists=True, + ).setup(None, trainer) + checkpoint = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last") + assert Path(trainer.ckpt_path).resolve() == checkpoint.resolve() + + trainer = nl.Trainer(accelerator="cpu", logger=False) + # Check that model loads from `dirpath` and not /checkpoints + dirpath_log_dir = Path(tmp_path / "test_resume" / "dirpath_test" / "logs") + dirpath_log_dir.mkdir(parents=True) + dirpath_checkpoint_dir = Path(tmp_path / "test_resume" / "dirpath_test" / "ckpts") + dirpath_checkpoint = Path(dirpath_checkpoint_dir / "mymodel--last") + dirpath_checkpoint.mkdir(parents=True) + logger = nl.NeMoLogger( + name="default", + explicit_log_dir=dirpath_log_dir, + ) + logger.setup(trainer) + nl.AutoResume( + resume_if_exists=True, + dirpath=str(dirpath_checkpoint_dir), + ).setup(None, trainer) + assert Path(trainer.ckpt_path).resolve() == dirpath_checkpoint.resolve() From a0885730552c6fa885a360276f01f17eee16eff0 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 18 Jul 2024 17:58:42 +0000 Subject: [PATCH 2/8] Apply isort and black reformatting Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 029b34b1c63c..9551748d5711 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,6 +1,7 @@ +import os from pathlib import Path from unittest.mock import patch -import os + import pytest from pytorch_lightning.callbacks import ModelCheckpoint as PTLModelCheckpoint from pytorch_lightning.loggers import WandbLogger @@ -9,6 +10,7 @@ from nemo.constants import NEMO_ENV_VARNAME_VERSION from nemo.utils.exp_manager import NotFoundError + class TestNeMoLogger: @pytest.fixture def trainer(self): @@ -38,11 +40,11 @@ def test_explicit_log_dir(self, trainer): mock_check.assert_called_once_with(trainer, explicit_dir, None, "test", None) def test_default_log_dir(self, trainer): - + if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: del os.environ[NEMO_ENV_VARNAME_VERSION] logger = nl.NeMoLogger(use_datetime_version=False) - app_state = logger.setup(trainer) + app_state = logger.setup(trainer) assert app_state.log_dir == Path(Path.cwd() / "nemo_experiments" / "default") def test_custom_version(self, trainer): @@ -98,7 +100,6 @@ def test_resume(self, trainer, tmp_path): resume_ignore_no_checkpoint=True, ).setup(None, trainer) - path = Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints").mkdir(parents=True) # Error because checkpoints do not exist in folder with pytest.raises(NotFoundError): @@ -107,7 +108,6 @@ def test_resume(self, trainer, tmp_path): resume_if_exists=True, ).setup(None, trainer) - Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").mkdir() # Error because *end.ckpt is in folder indicating that training has already finished with pytest.raises(ValueError): From c1b6f2f8f0653ace1b0b0a7a38343a8e6cb31df4 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 18 Jul 2024 11:16:28 -0700 Subject: [PATCH 3/8] update explicit_log_dir test Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 9551748d5711..34bdc89c5e58 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -35,9 +35,10 @@ def test_explicit_log_dir(self, trainer): explicit_dir = "explicit_test_dir" logger = nl.NeMoLogger(name="test", explicit_log_dir=explicit_dir) - with patch("nemo.utils.exp_manager.check_explicit_log_dir") as mock_check: - logger.setup(trainer) - mock_check.assert_called_once_with(trainer, explicit_dir, None, "test", None) + app_state = logger.setup(trainer) + assert str(app_state.log_dir) == "explicit_test_dir" + assert app_state.name == "" ## name should be ignored when explicit_log_dir is passed in + assert app_state.version == "" def test_default_log_dir(self, trainer): From b2ee4e9cfdcf07ec30e53bc585efe64ab831dfa9 Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 18 Jul 2024 18:17:24 +0000 Subject: [PATCH 4/8] Apply isort and black reformatting Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 34bdc89c5e58..15ab08b95e55 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -37,7 +37,7 @@ def test_explicit_log_dir(self, trainer): app_state = logger.setup(trainer) assert str(app_state.log_dir) == "explicit_test_dir" - assert app_state.name == "" ## name should be ignored when explicit_log_dir is passed in + assert app_state.name == "" ## name should be ignored when explicit_log_dir is passed in assert app_state.version == "" def test_default_log_dir(self, trainer): From 26324cd24495d343881a59b5188ce4fa1081f49e Mon Sep 17 00:00:00 2001 From: ashors1 Date: Thu, 25 Jul 2024 20:52:14 -0700 Subject: [PATCH 5/8] remove reference to exp_manager Signed-off-by: ashors1 --- nemo/lightning/nemo_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/nemo/lightning/nemo_logger.py b/nemo/lightning/nemo_logger.py index 9a5c37c00e01..8c0508676f14 100644 --- a/nemo/lightning/nemo_logger.py +++ b/nemo/lightning/nemo_logger.py @@ -85,7 +85,7 @@ def setup(self, trainer: Union[pl.Trainer, fl.Fabric], resume_if_exists: bool = f"or version: {self.version}. Please note that dir, name, and version will be ignored." ) if is_global_rank_zero() and Path(self.explicit_log_dir).exists(): - logging.warning(f"Exp_manager is logging to {self.explicit_log_dir}, but it already exists.") + logging.warning(f"NeMoLogger is logging to {self.explicit_log_dir}, but it already exists.") log_dir, _dir, self.name, version = Path(self.explicit_log_dir), str(self.explicit_log_dir), "", "" else: From 3062e8643018613c2355d3eff7c81b5e24d1f7ca Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 26 Jul 2024 07:31:03 -0700 Subject: [PATCH 6/8] update checkpoint restore test Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 15ab08b95e55..0d7699f15357 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,6 +1,7 @@ import os from pathlib import Path from unittest.mock import patch +import time import pytest from pytorch_lightning.callbacks import ModelCheckpoint as PTLModelCheckpoint @@ -79,12 +80,11 @@ def test_resume(self, trainer, tmp_path): del os.environ[NEMO_ENV_VARNAME_VERSION] # Error because explicit_log_dir does not exist - ### TODO: make "model" arg optional with pytest.raises(NotFoundError): nl.AutoResume( dirpath=str(tmp_path / "test_resume"), resume_if_exists=True, - ).setup(None, trainer) + ).setup(model=None, trainer=trainer) # Error because checkpoints folder does not exist with pytest.raises(NotFoundError): @@ -117,15 +117,16 @@ def test_resume(self, trainer, tmp_path): resume_if_exists=True, ).setup(None, trainer) + ## if there are multiple "-last" checkpoints, choose the most recent one Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() + time.sleep(1) ## sleep for a second so the checkpoints are created at different times Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() - # Error because multiple *last.ckpt is in folder. If more than one, don't know which to restore - with pytest.raises(ValueError): - nl.AutoResume( - dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), - resume_if_exists=True, - ).setup(None, trainer) + nl.AutoResume( + dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), + resume_if_exists=True, + ).setup(None, trainer) + assert str(trainer.ckpt_path) == str(Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last")) # Finally succeed logger = nl.NeMoLogger( From 263a9572dea05ea4129d8772030458b0d900fada Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 26 Jul 2024 14:32:01 +0000 Subject: [PATCH 7/8] Apply isort and black reformatting Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 0d7699f15357..65871e040ce1 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -1,7 +1,7 @@ import os +import time from pathlib import Path from unittest.mock import patch -import time import pytest from pytorch_lightning.callbacks import ModelCheckpoint as PTLModelCheckpoint @@ -120,13 +120,15 @@ def test_resume(self, trainer, tmp_path): ## if there are multiple "-last" checkpoints, choose the most recent one Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--end").rmdir() Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel--last").mkdir() - time.sleep(1) ## sleep for a second so the checkpoints are created at different times + time.sleep(1) ## sleep for a second so the checkpoints are created at different times Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last").mkdir() nl.AutoResume( dirpath=Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints"), resume_if_exists=True, ).setup(None, trainer) - assert str(trainer.ckpt_path) == str(Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last")) + assert str(trainer.ckpt_path) == str( + Path(tmp_path / "test_resume" / "default" / "version_0" / "checkpoints" / "mymodel2--last") + ) # Finally succeed logger = nl.NeMoLogger( From 694c3ea04b2a95df384a39c8eb2c9882d895d60f Mon Sep 17 00:00:00 2001 From: ashors1 Date: Fri, 26 Jul 2024 07:34:15 -0700 Subject: [PATCH 8/8] update comment Signed-off-by: ashors1 --- tests/lightning/test_nemo_logger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/lightning/test_nemo_logger.py b/tests/lightning/test_nemo_logger.py index 0d7699f15357..7fdbafcf1684 100644 --- a/tests/lightning/test_nemo_logger.py +++ b/tests/lightning/test_nemo_logger.py @@ -74,7 +74,7 @@ def test_model_checkpoint_setup(self, trainer): assert ptl_ckpt.filename == "test-{epoch:02d}-{val_loss:.2f}" def test_resume(self, trainer, tmp_path): - """Tests the resume capabilities of exp_manager""" + """Tests the resume capabilities of NeMoLogger + AutoResume""" if os.environ.get(NEMO_ENV_VARNAME_VERSION, None) is not None: del os.environ[NEMO_ENV_VARNAME_VERSION]