Skip to content
This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

Enable Bring-your-own-Lightning-model #417

Merged
merged 128 commits into from
Apr 19, 2021
Merged
Changes from 1 commit
Commits
Show all changes
128 commits
Select commit Hold shift + click to select a range
d13e689
todo
ant0nsc Feb 8, 2021
a98bca0
dead code
ant0nsc Feb 8, 2021
45ac2e6
more notes
ant0nsc Feb 9, 2021
5fac2c0
comments
ant0nsc Feb 9, 2021
6357feb
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Feb 10, 2021
96f6019
updates
ant0nsc Feb 10, 2021
9a24671
changes to inference structure
ant0nsc Feb 23, 2021
d1b4e81
report
ant0nsc Feb 23, 2021
cd17286
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 12, 2021
87c92bd
update notes
ant0nsc Mar 12, 2021
92004a1
remove overrides from loader
ant0nsc Mar 12, 2021
8d720a5
Remove buildinformation.json
ant0nsc Mar 12, 2021
4b06bc0
formatting
ant0nsc Mar 12, 2021
50e17f9
first loader test
ant0nsc Mar 12, 2021
6dbb0ff
first loader test
ant0nsc Mar 12, 2021
34b246b
notes
ant0nsc Mar 12, 2021
a8ed8a9
Loader tested and working
ant0nsc Mar 15, 2021
283afa5
docu
ant0nsc Mar 15, 2021
4983d7c
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 15, 2021
a939112
more tests
ant0nsc Mar 15, 2021
10c4b0a
refactor loader
ant0nsc Mar 15, 2021
06bacbf
better tests
ant0nsc Mar 15, 2021
8928462
work in progress
ant0nsc Mar 15, 2021
2ba11fd
fixed and tested mount_or_download
ant0nsc Mar 15, 2021
7158013
Training works
ant0nsc Mar 15, 2021
3f459b7
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 15, 2021
c2f666c
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 16, 2021
5b1081f
updates to latest main
ant0nsc Mar 16, 2021
4a70b55
fix import problems
ant0nsc Mar 16, 2021
76be9e6
fix import problems
ant0nsc Mar 16, 2021
56fc73b
fix import problems
ant0nsc Mar 16, 2021
def1145
removing generics from loader
ant0nsc Mar 16, 2021
25268e2
DRY
ant0nsc Mar 16, 2021
81e565a
Inference is running
ant0nsc Mar 18, 2021
6ce4fb6
refactoring
ant0nsc Mar 19, 2021
3f97299
Inference finished and tested
ant0nsc Mar 19, 2021
e686d6b
fixing broken tests
ant0nsc Mar 19, 2021
2e07e48
param refactoring
ant0nsc Mar 22, 2021
81f8383
Many test fixes
ant0nsc Mar 22, 2021
0bbb342
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 23, 2021
180d3c9
More test fixes
ant0nsc Mar 23, 2021
bb8543d
More test fixes
ant0nsc Mar 23, 2021
0e44e3f
test fixes
ant0nsc Mar 23, 2021
292655b
getting varnet module to start training
ant0nsc Mar 23, 2021
47b4670
docu
ant0nsc Mar 24, 2021
a1e88e0
Refactoring
ant0nsc Mar 25, 2021
a698b4d
test fixes
ant0nsc Mar 25, 2021
ed6351f
test fixes
ant0nsc Mar 26, 2021
f4ec27f
flake8
ant0nsc Mar 26, 2021
9d13e86
exclude
ant0nsc Mar 26, 2021
3e58360
mypy
ant0nsc Mar 26, 2021
016c737
avoid storinglogger
ant0nsc Mar 26, 2021
bddd724
changelog
ant0nsc Mar 29, 2021
9479c97
torch.count_nonzero
ant0nsc Mar 29, 2021
0928133
No longer need ModelTrainingResults
ant0nsc Mar 29, 2021
aa02a76
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Mar 29, 2021
20289f9
flake
ant0nsc Mar 29, 2021
acab458
test fixes
ant0nsc Mar 29, 2021
4407f6e
fix spawn test failure
ant0nsc Mar 29, 2021
bace860
fix for unset local dataset path in Azure
ant0nsc Mar 30, 2021
11382cd
checkout submodules
ant0nsc Mar 30, 2021
a88850c
cleanup
ant0nsc Mar 30, 2021
e8a87a1
cleanup
ant0nsc Mar 30, 2021
fa2dd8d
import error
ant0nsc Mar 30, 2021
7a4f009
remove fastmri
ant0nsc Mar 31, 2021
4f58b1c
fastMRI models runs up to inference
ant0nsc Mar 31, 2021
56d6c87
test fixes
ant0nsc Mar 31, 2021
2349fe8
tests
ant0nsc Mar 31, 2021
7eea299
test fixes
ant0nsc Mar 31, 2021
d521ec2
cleanup
ant0nsc Apr 1, 2021
27da1ca
print argv path
ant0nsc Apr 1, 2021
cfab6b0
fix argv problem
ant0nsc Apr 1, 2021
8b01564
update
ant0nsc Apr 1, 2021
1e26232
update sub
ant0nsc Apr 1, 2021
5a3a385
final model
ant0nsc Apr 1, 2021
8d2bfdb
flake
ant0nsc Apr 1, 2021
0765238
update to fastmri master
ant0nsc Apr 7, 2021
15fe1ab
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 7, 2021
d564119
test fixes
ant0nsc Apr 7, 2021
87267f9
import fixes
ant0nsc Apr 7, 2021
2063383
test fixes
ant0nsc Apr 9, 2021
58a6dc6
fix import problems
ant0nsc Apr 9, 2021
193e9c9
fix checkpoint test
ant0nsc Apr 9, 2021
3231e45
test fix
ant0nsc Apr 12, 2021
88e29fa
Fix test_recover_training_mean_teacher_model
Shruthi42 Apr 12, 2021
448550d
Flake8
Shruthi42 Apr 12, 2021
b6a3575
test fix
ant0nsc Apr 12, 2021
101d42d
test fix
ant0nsc Apr 12, 2021
3cd1b17
Merge branch 'antonsc/byol' of https://github.com/microsoft/InnerEye-…
ant0nsc Apr 12, 2021
ede2829
Remove main() from patch_sampling.py
Shruthi42 Apr 12, 2021
987e83d
mypy
Shruthi42 Apr 12, 2021
9fad817
Merge branch 'antonsc/byol' of https://github.com/microsoft/InnerEye-…
Shruthi42 Apr 12, 2021
76e5f02
docu
ant0nsc Apr 12, 2021
5dae881
ensure correct seeding in tests
ant0nsc Apr 12, 2021
b744abb
fix test_invalid_trainer_args
ant0nsc Apr 13, 2021
60f5e32
fix downloading problems in AzureML
ant0nsc Apr 13, 2021
2ae170a
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 13, 2021
dc01b61
adding reports for container models
ant0nsc Apr 13, 2021
59e2ca1
fixing test loss values
ant0nsc Apr 13, 2021
8758973
fix bug with running the unit tests in AzureML
ant0nsc Apr 13, 2021
c514ca0
docu, flake
ant0nsc Apr 13, 2021
ab9b4c2
mypy and flake
ant0nsc Apr 13, 2021
7988af9
downgrading mypy
ant0nsc Apr 13, 2021
4e5b7f6
test fixes
ant0nsc Apr 14, 2021
f70884f
test fixes, reduce logging noise
ant0nsc Apr 14, 2021
069ca6c
mypy and flake
ant0nsc Apr 14, 2021
731483f
simplify mypy runner
ant0nsc Apr 14, 2021
1f5afc0
remove comments
ant0nsc Apr 14, 2021
a362e70
Update InnerEye/ML/model_training.py
ant0nsc Apr 14, 2021
7f17e83
Update docs/bring_your_own_model.md
ant0nsc Apr 14, 2021
a8024d1
PR comments
ant0nsc Apr 14, 2021
ed94083
Merge branch 'antonsc/byol' of https://github.com/microsoft/InnerEye-…
ant0nsc Apr 14, 2021
b6a7d01
fixes
ant0nsc Apr 15, 2021
de15468
updated docu and design as per PR feedback.
ant0nsc Apr 15, 2021
5656489
docu and mypy
ant0nsc Apr 15, 2021
efd9231
update doc, add report test
ant0nsc Apr 16, 2021
2f4c36a
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 16, 2021
268b265
HelloContainer model
ant0nsc Apr 16, 2021
8649681
HelloContainer data
ant0nsc Apr 16, 2021
422c790
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 16, 2021
7e1e516
HelloWorld running
ant0nsc Apr 16, 2021
be0948b
docu
ant0nsc Apr 16, 2021
79795e0
fixes
ant0nsc Apr 16, 2021
3cde0ee
fixes
ant0nsc Apr 16, 2021
6f9e37c
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 16, 2021
9df5025
updated main, PR comments
ant0nsc Apr 16, 2021
00bb084
PR comments
ant0nsc Apr 16, 2021
181302f
Merge remote-tracking branch 'origin/main' into antonsc/byol
ant0nsc Apr 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
updates to latest main
ant0nsc committed Mar 16, 2021

Verified

This commit was created on GitHub.com and signed with GitHub’s verified signature.
commit 5b1081f2cb764b7eb00349a4c8d2b1ece3dcd2de
4 changes: 2 additions & 2 deletions InnerEye/ML/lightning_base.py
Original file line number Diff line number Diff line change
@@ -34,7 +34,7 @@ class TrainingAndValidationDataLightning(LightningDataModule):
A class that wraps training and validation data from an InnerEye model configuration to a Lightning data module.
"""

def _init__(self, config: ModelConfigBase) -> None:
def __init__(self, config: ModelConfigBase) -> None:
super().__init__()
self.config = config
self.data_loaders: Dict[ModelExecutionMode, DataLoader] = {}
@@ -68,7 +68,7 @@ def setup(self) -> None:
# loaded (typically only during tests)
if self.config.dataset_data_frame is None:
assert self.config.local_dataset is not None
validate_dataset_paths(self.config.local_dataset)
validate_dataset_paths(self.config.local_dataset, self.config.dataset_csv)
self.config.read_dataset_if_needed()

def get_training_data_module(self, crossval_index: int, crossval_count: int) -> LightningDataModule:
3 changes: 3 additions & 0 deletions InnerEye/ML/lightning_container.py
Original file line number Diff line number Diff line change
@@ -158,6 +158,9 @@ def val_diagnostics(self) -> Any:
"""
return None

def trainer_hook(self, trainer) -> None:
pass


class LightningContainer:

29 changes: 13 additions & 16 deletions InnerEye/ML/model_training.py
Original file line number Diff line number Diff line change
@@ -6,7 +6,7 @@
import os
import sys
from pathlib import Path
from typing import Optional, Tuple, TypeVar
from typing import Any, Dict, Optional, Tuple, TypeVar

import torch
from pytorch_lightning import Trainer, seed_everything
@@ -65,14 +65,17 @@ def upload_output_file_as_temp(file_path: Path, outputs_folder: Path) -> None:

def create_lightning_trainer(config: DeepLearningConfig,
resume_from_checkpoint: Optional[Path] = None,
num_nodes: int = 1) -> Tuple[Trainer, StoringLogger]:
num_nodes: int = 1,
**kwargs: Dict[str, Any]) -> \
Tuple[Trainer, StoringLogger]:
"""
Creates a Pytorch Lightning Trainer object for the given model configuration. It creates checkpoint handlers
and loggers. That includes a diagnostic logger for use in unit tests, that is also returned as the second
return value.
:param config: The model configuration.
:param resume_from_checkpoint: If provided, training resumes from this checkpoint point.
:param num_nodes: The number of nodes to use in distributed training.
:param kwargs: Any additional keyword arguments will be passed to the constructor of Trainer.
:return: A tuple [Trainer object, diagnostic logger]
"""
# For now, stick with the legacy behaviour of always saving only the last epoch checkpoint. For large segmentation
@@ -106,14 +109,6 @@ def create_lightning_trainer(config: DeepLearningConfig,
storing_logger = StoringLogger()
tensorboard_logger = TensorBoardLogger(save_dir=str(config.logs_folder), name="Lightning", version="")
loggers = [storing_logger, tensorboard_logger, AzureMLLogger()]
# This leads to problems with run termination.
# if not is_offline_run_context(RUN_CONTEXT):
# mlflow_logger = MLFlowLogger(experiment_name=RUN_CONTEXT.experiment.name,
# tracking_uri=RUN_CONTEXT.experiment.workspace.get_mlflow_tracking_uri())
# # The MLFlow logger needs to get its ID from the AzureML run context, otherwise there will be two sets of
# # results for each run, one from native AzureML and one from the MLFlow logger.
# mlflow_logger._run_id = RUN_CONTEXT.id
# loggers.append(mlflow_logger)
# Use 32bit precision when running on CPU. Otherwise, make it depend on use_mixed_precision flag.
precision = 32 if num_gpus == 0 else 16 if config.use_mixed_precision else 32
# The next two flags control the settings in torch.backends.cudnn.deterministic and torch.backends.cudnn.benchmark
@@ -142,8 +137,8 @@ def create_lightning_trainer(config: DeepLearningConfig,
precision=precision,
sync_batchnorm=True,
terminate_on_nan=config.detect_anomaly,
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None
)
resume_from_checkpoint=str(resume_from_checkpoint) if resume_from_checkpoint else None,
**kwargs)
return trainer, storing_logger


@@ -183,11 +178,11 @@ def model_train(config: DeepLearningConfig,
assert isinstance(config, ModelConfigBase), "When using a built-in InnerEye model, the configuration should " \
"be an instance of ModelConfigBase"
lightning_container = InnerEyeContainer(config)
# When trying to store the config object in the constructor, it does not appear to get stored at all, later
# reference of the object simply fail. Hence, have to set explicitly here.
lightning_container.setup()
if is_rank_zero():
# Save the dataset files for later use in cross validation analysis
# TODO antonsc: Should we move that into TrainAndValidationDataLightning? The .prepare method
# of a data module is called only on rank zero
config.write_dataset_files()
lightning_model = create_lightning_model(config)
else:
@@ -200,7 +195,10 @@ def model_train(config: DeepLearningConfig,
# training in the unit tests.
old_environ = dict(os.environ)
seed_everything(config.get_effective_random_seed())
trainer, storing_logger = create_lightning_trainer(config, checkpoint_path, num_nodes=num_nodes)
trainer, storing_logger = create_lightning_trainer(config,
checkpoint_path,
num_nodes=num_nodes,
**lightning_container.get_trainer_arguments())

logging.info(f"GLOBAL_RANK: {os.getenv('GLOBAL_RANK')}, LOCAL_RANK {os.getenv('LOCAL_RANK')}. "
f"trainer.global_rank: {trainer.global_rank}")
@@ -240,7 +238,6 @@ def model_train(config: DeepLearningConfig,
trainer.logger.close() # type: ignore
lightning_model.close_all_loggers()
world_size = getattr(trainer, "world_size", 0)
# TODO antonsc
is_azureml_run = not config.is_offline_run
# Per-subject model outputs for regression models are written per rank, and need to be aggregated here.
# Each thread per rank will come here, and upload its files to the run outputs. Rank 0 will later download them.
44 changes: 21 additions & 23 deletions Tests/ML/configs/lightning_test_containers.py
Original file line number Diff line number Diff line change
@@ -14,39 +14,35 @@
from InnerEye.ML.lightning_container import LightningContainer, LightningWithInference


class DummyContainerWithAzureDataset(LightningContainer):
def __init__(self):
class DummyContainerWithDatasets(LightningContainer):
def __init__(self, has_local_dataset: bool = False, has_azure_dataset: bool = False):
super().__init__()
self.has_local_dataset = has_local_dataset
self.has_azure_dataset = has_azure_dataset

def create_lightning_module(self) -> LightningWithInference:
local_dataset = full_ml_test_data_path("lightning_module_data")
return LightningWithInference(azure_dataset_id="azure_dataset", local_dataset=local_dataset)
local_dataset = full_ml_test_data_path("lightning_module_data") if self.has_local_dataset else None
azure_dataset = "azure_dataset" if self.has_local_dataset else ""
return LightningWithInference(azure_dataset_id=azure_dataset, local_dataset=local_dataset)


class DummyContainerWithoutDataset(LightningContainer):
class DummyContainerWithAzureDataset(DummyContainerWithDatasets):
def __init__(self):
super().__init__()
super().__init__(has_azure_dataset=True)

def create_lightning_module(self) -> LightningWithInference:
return LightningWithInference()

class DummyContainerWithoutDataset(DummyContainerWithDatasets):
pass

class DummyContainerWithLocalDataset(LightningContainer):
def __init__(self):
super().__init__()

def create_lightning_module(self) -> LightningWithInference:
local_dataset = full_ml_test_data_path("lightning_module_data")
return LightningWithInference(local_dataset=local_dataset)
class DummyContainerWithLocalDataset(DummyContainerWithDatasets):
def __init__(self):
super().__init__(has_local_dataset=True)


class DummyContainerWithAzureAndLocalDataset(LightningContainer):
class DummyContainerWithAzureAndLocalDataset(DummyContainerWithDatasets):
def __init__(self):
super().__init__()

def create_lightning_module(self) -> LightningWithInference:
local_dataset = full_ml_test_data_path("lightning_module_data")
return LightningWithInference(azure_dataset_id="azure_dataset", local_dataset=local_dataset)
super().__init__(has_local_dataset=True, has_azure_dataset=True)


class DummyRegression(LightningWithInference):
@@ -102,11 +98,13 @@ def test_dataloader(self, *args, **kwargs) -> DataLoader:


class DummyContainerWithModel(LightningContainer):
def __init__(self):
super().__init__()

def create_lightning_module(self) -> LightningWithInference:
return DummyRegression()

def get_training_data_module(self, crossval_index: int, crossval_count: int) -> LightningDataModule:
return FixedRegressionData()


class DummyContainerWithInvalidTrainerArguments(DummyContainerWithModel):
def get_trainer_arguments(self):
return {"no_such_argument": 1}
19 changes: 16 additions & 3 deletions Tests/ML/models/test_instantiate_models.py
Original file line number Diff line number Diff line change
@@ -11,16 +11,18 @@

from InnerEye.Common import fixed_paths
from InnerEye.Common.common_util import logging_to_stdout, namespace_to_path
from InnerEye.Common.output_directories import OutputFolderForTests
from InnerEye.ML.config import SegmentationModelBase
from InnerEye.ML.deep_learning_config import DeepLearningConfig
from InnerEye.ML.lightning_container import LightningContainer
from InnerEye.ML.model_training import generate_and_print_model_summary
from InnerEye.ML.model_training import generate_and_print_model_summary, model_train
from InnerEye.ML.runner import Runner
from InnerEye.ML.utils.config_loader import ModelConfigLoader
from InnerEye.ML.utils.model_util import create_model_with_temperature_scaling
from Tests.ML.configs.DummyModel import DummyModel
from Tests.ML.configs.lightning_test_containers import DummyContainerWithModel
from Tests.ML.util import get_model_loader
from Tests.ML.configs.lightning_test_containers import DummyContainerWithInvalidTrainerArguments, \
DummyContainerWithModel
from Tests.ML.util import get_default_checkpoint_handler, get_model_loader


def find_models() -> List[str]:
@@ -159,3 +161,14 @@ def test_run_container_in_situ() -> None:
loaded_config, actual_run = runner.run()
assert actual_run is None
assert isinstance(runner.lightning_container, DummyContainerWithModel)


def test_run_model_with_invalid_trainer_arguments(test_output_dirs: OutputFolderForTests) -> None:
container = DummyContainerWithInvalidTrainerArguments()
config = container.create_lightning_module()
container.lightning_module = config
checkpoint_handler = get_default_checkpoint_handler(model_config=config,
project_root=test_output_dirs.root_dir)
with pytest.raises(Exception) as ex:
model_train(container.lightning_module, checkpoint_handler=checkpoint_handler, lightning_container=container)
assert "no_such_argument" in str(ex)