Adding Distributed Data Parallel #261
Does this need to match node_count?
Each device will call this, so setting num_workers to 0 will create one process on each device (preventing too many processes from being spawned per device, which was leading to CUDA memory errors). However, this really slows down data loading, so another option is to use
int((config.num_dataload_workers + n_gpus_per_node - 1) / n_gpus_per_node)
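For illustration, a minimal sketch of how that per-process split could look when building the data loader; config.num_dataload_workers and n_gpus_per_node are taken from this comment, while the batch size field and pin_memory are placeholder assumptions:

```python
from torch.utils.data import DataLoader

def make_loader(dataset, config, n_gpus_per_node: int) -> DataLoader:
    # Split the requested workers across the processes on one node,
    # rounding up so every process keeps at least one worker.
    workers_per_process = int(
        (config.num_dataload_workers + n_gpus_per_node - 1) / n_gpus_per_node
    )
    return DataLoader(
        dataset,
        batch_size=config.train_batch_size,  # assumed config field
        num_workers=workers_per_process,
        pin_memory=True,
    )
```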
I'd spell out distributed_data_parallel
Just from the names, it is not clear what the difference between use_data_parallel and use_ddp is.
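To make the distinction concrete, a rough sketch (not this PR's actual code) of what each flag implies in PyTorch; the function and argument names here are illustrative:

```python
import torch
from torch.nn.parallel import DataParallel, DistributedDataParallel

def wrap_model(model: torch.nn.Module, use_ddp: bool, local_rank: int):
    if use_ddp:
        # DistributedDataParallel: one process per GPU; requires
        # torch.distributed.init_process_group to have been called.
        model = model.cuda(local_rank)
        return DistributedDataParallel(model, device_ids=[local_rank])
    # DataParallel: a single process that replicates the model across
    # all visible GPUs on each forward pass.
    return DataParallel(model.cuda())
```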
This is almost the same code as in use_data_parallel; DRY?
There is little logic in get_global_size. Normally I'd always go for "put things into functions", but here it could be clearer to handle the world size for offline/AzureML runs right here (expand get_global_size inline, and pass the result as an argument into train()).
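A hedged sketch of what that inlining could look like; config.node_count, is_azureml_run and the train callable are assumptions for illustration, not the PR's actual signatures:

```python
import torch

def launch_training(config, is_azureml_run: bool, train_fn) -> None:
    # World-size logic handled inline, as suggested, rather than hidden
    # inside a separate get_global_size helper.
    n_gpus_per_node = torch.cuda.device_count()
    if is_azureml_run:
        # Multi-node AzureML run: one process per GPU on every node.
        world_size = config.node_count * n_gpus_per_node
    else:
        # Offline run: only the GPUs on the local machine take part.
        world_size = n_gpus_per_node
    train_fn(config, world_size=world_size)
```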
I'm confused. Shouldn't offline runs (outside AzureML) be going through the same codepath as "do not use ddp"?
No, this is for cases such as a machine with multiple GPUs on it.
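For context, a minimal single-machine sketch of that case: one process is spawned per local GPU, each joins the process group and wraps its model in DDP. The address, port and toy model are placeholders, not taken from this PR:

```python
import os
import torch
import torch.distributed as dist
import torch.multiprocessing as mp
from torch.nn.parallel import DistributedDataParallel

def worker(local_rank: int, world_size: int) -> None:
    # Rendezvous settings for a single-machine run (placeholder values).
    os.environ.setdefault("MASTER_ADDR", "localhost")
    os.environ.setdefault("MASTER_PORT", "29500")
    dist.init_process_group("nccl", rank=local_rank, world_size=world_size)
    model = torch.nn.Linear(8, 2).cuda(local_rank)  # toy model
    model = DistributedDataParallel(model, device_ids=[local_rank])
    # ... training loop over this process's shard of the data ...
    dist.destroy_process_group()

if __name__ == "__main__":
    n_gpus = torch.cuda.device_count()
    mp.spawn(worker, args=(n_gpus,), nprocs=n_gpus)
```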