From 9c08fc6f5e5a05ae8c04aa0c76befbe88571270b Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Fri, 21 May 2021 13:22:13 -0700 Subject: [PATCH 01/45] Added initial unit test to evaluate model predictions. Creates an 'InferencePipeline.Result' object using pre-defined volumes, stores results and evaluates metrics. --- Tests/ML/pipelines/test_inference.py | 74 +++++++++++++++++++++++++++- 1 file changed, 73 insertions(+), 1 deletion(-) diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 566597d8a..02507fac6 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -5,6 +5,7 @@ from typing import Any, List import numpy as np +import pandas as pd import pytest import torch from torch.nn import Parameter @@ -15,9 +16,14 @@ from InnerEye.ML.config import SegmentationModelBase from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel from InnerEye.ML.pipelines.ensemble import EnsemblePipeline -from InnerEye.ML.pipelines.inference import InferencePipeline +from InnerEye.ML.pipelines.inference import InferencePipeline, FullImageInferencePipelineBase from InnerEye.ML.utils import image_util from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint +from Tests.ML.configs.DummyModel import DummyModel +from InnerEye.ML.utils.split_dataset import DatasetSplits +from InnerEye.ML.dataset.sample import Sample +from InnerEye.ML.common import ModelExecutionMode +from InnerEye.ML.model_testing import store_inference_results, evaluate_model_predictions @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @@ -198,3 +204,69 @@ def shrink_dim(i: int) -> int: def get_all_child_layers(self) -> List[torch.nn.Module]: return list() + + +def test_evaluate_model_predictions() -> None: + """ + Creates an 'InferencePipeline.Result' object using pre-defined volumes, stores results and evaluates metrics. 
+ """ + + # Full dataset -- no missing channels + input_list = [ + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"], + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2", "1"], + ["1", "train_and_test_data/id1_mask.nii.gz", "mask", "1"], + ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"], + ["2", "train_and_test_data/id2_mask.nii.gz", "mask", "2"], + ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"], + ["3", "train_and_test_data/id2_mask.nii.gz", "mask", "3"], + ["3", "train_and_test_data/id2_region.nii.gz", "region", "3"]] + + # Overwrite get_model_train_test_dataset_splits method for subjects 1,2,3 + class MyDummyModel(DummyModel): + def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: + return DatasetSplits(train=dataset_df[dataset_df.subject.isin(['1'])], + test=dataset_df[dataset_df.subject.isin(['3'])], + val=dataset_df[dataset_df.subject.isin(['2'])]) + + config = MyDummyModel() + df = pd.DataFrame(input_list, columns=['subject', 'filePath', 'channel', 'institutionId']) + config._dataset_data_frame = df + ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) + + results_folder = config.outputs_folder + if not results_folder.is_dir(): + results_folder.mkdir() + + for sample_index, sample in enumerate(ds, 1): + sample = Sample.from_dict(sample=sample) + posteriors = np.zeros((3,) + sample.mask.shape, 'float32') + posteriors[0][:] = 0.2 + posteriors[1][:] = 0.6 + posteriors[2][:] = 0.2 + + inference_result = InferencePipeline.Result( + patient_id=sample.patient_id, + posteriors=posteriors, + segmentation=sample.mask, + voxel_spacing_mm=config.dataset_expected_spacing_xyz + ) + store_inference_results(inference_result=inference_result, + config=config, + results_folder=results_folder, + image_header=sample.metadata.image_header) + + metadata, metrics_per_class = evaluate_model_predictions( + sample_index-1, + config=config, + dataset=ds, + results_folder=results_folder) + + metrics_str_output = metrics_per_class.to_string() + assert 'Dice' in metrics_str_output + assert 'HausdorffDistance_millimeters' in metrics_str_output + assert 'MeanSurfaceDistance_millimeters' in metrics_str_output From f867c9fcc8448d162a6e9effb414bb5604567ee3 Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Mon, 24 May 2021 16:58:26 -0700 Subject: [PATCH 02/45] Updated log PR #465. --- CHANGELOG.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 36f8c94e8..adf38140d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -12,7 +12,8 @@ created. ## Upcoming ### Added - +- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Added ability to run segmentation inference + module in the test data without or partial ground truth files. - ([#454](https://github.com/microsoft/InnerEye-DeepLearning/pull/454)) Checking that labels are mutually exclusive. - ([#447](https://github.com/microsoft/InnerEye-DeepLearning/pull/447/)) Added a sanity check to ensure there are no missing channels, nor missing files. 
If missing channels in the csv file or filenames associated with channels are From 8507ae70b1ab8cdbdc21feaef9230b903baf368b Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Mon, 24 May 2021 17:56:55 -0700 Subject: [PATCH 03/45] Added unit test for segmentation inference in test data and no ground truth file. --- InnerEye/ML/config.py | 3 +- InnerEye/ML/dataset/full_image_dataset.py | 34 ++++++--- InnerEye/ML/dataset/sample.py | 7 +- InnerEye/ML/model_testing.py | 5 ++ InnerEye/ML/utils/io_util.py | 5 ++ Tests/ML/pipelines/test_inference.py | 93 ++++++++++++++++++----- 6 files changed, 113 insertions(+), 34 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index 79a8a4cb0..0edec67ab 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -768,7 +768,8 @@ def create_and_set_torch_datasets(self, for_training: bool = True, for_inference mode: FullImageDataset( self, dataset_splits[mode], - full_image_sample_transforms=full_image_transforms.test) # type: ignore + full_image_sample_transforms=full_image_transforms.test, # type: ignore + for_inference=for_inference) for mode in ModelExecutionMode if len(dataset_splits[mode]) > 0 } diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index aa3bf2dff..c8baddc53 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -213,10 +213,11 @@ class FullImageDataset(GeneralDataset): """ def __init__(self, args: SegmentationModelBase, data_frame: pd.DataFrame, - full_image_sample_transforms: Optional[Compose3D[Sample]] = None): + full_image_sample_transforms: Optional[Compose3D[Sample]] = None, + for_inference: bool = False): super().__init__(args, data_frame) self.full_image_sample_transforms = full_image_sample_transforms - + self.for_inference = for_inference # Check base_path assert self.args.local_dataset is not None if not self.args.local_dataset.is_dir(): @@ -250,7 +251,8 @@ def _extension_from_df_file_paths(file_paths: List[str]) -> str: def get_samples_at_index(self, index: int) -> List[Sample]: # load the channels into memory ds = self.dataset_sources[self.dataset_indices[index]] - samples = [io_util.load_images_from_dataset_source(dataset_source=ds, check_exclusive=self.args.check_exclusive)] # type: ignore + samples = [io_util.load_images_from_dataset_source(dataset_source=ds, + check_exclusive=self.args.check_exclusive)] # type: ignore return [Compose3D.apply(self.full_image_sample_transforms, x) for x in samples] def _load_dataset_sources(self) -> Dict[str, PatientDatasetSource]: @@ -259,18 +261,21 @@ def _load_dataset_sources(self) -> Dict[str, PatientDatasetSource]: local_dataset_root_folder=self.args.local_dataset, image_channels=self.args.image_channels, ground_truth_channels=self.args.ground_truth_ids, - mask_channel=self.args.mask_id + mask_channel=self.args.mask_id, + for_inference=self.for_inference ) def convert_channels_to_file_paths(channels: List[str], rows: pd.DataFrame, local_dataset_root_folder: Path, - patient_id: str) -> Tuple[List[Path], str]: + patient_id: str, + for_inference: bool = False) -> Tuple[List[Path], str]: """ Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. 
+ :param for_inference: :param channels: channel type defined in the configuration file :param rows: Input Pandas dataframe object containing subjectIds, path of local dataset, channel information :param local_dataset_root_folder: Root directory which points to the local dataset @@ -281,12 +286,12 @@ def convert_channels_to_file_paths(channels: List[str], for channel_id in channels: row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id] - if len(row) == 0: + if len(row) == 0 and not for_inference: failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ os.linesep - else: + elif len(row) == 1: image_path = local_dataset_root_folder / row[CSV_PATH_HEADER].values[0] if not image_path.is_file(): failed_channel_info += f"Patient {patient_id}, file {image_path} does not exist" + os.linesep @@ -300,7 +305,8 @@ def load_dataset_sources(dataframe: pd.DataFrame, local_dataset_root_folder: Path, image_channels: List[str], ground_truth_channels: List[str], - mask_channel: Optional[str]) -> Dict[str, PatientDatasetSource]: + mask_channel: Optional[str], + for_inference: bool = False) -> Dict[str, PatientDatasetSource]: """ Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file. The dataframe contains per-patient per-channel image information, relative to a root directory. @@ -311,6 +317,7 @@ def load_dataset_sources(dataframe: pd.DataFrame, :param image_channels: The names of the image channels that should be used in the result. :param ground_truth_channels: The names of the ground truth channels that should be used in the result. :param mask_channel: The name of the mask channel that should be used in the result. This can be None. + :param for_inference: Boolean variable to indicate if executing for inference. :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource. 
""" expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER} @@ -328,8 +335,11 @@ def load_dataset_sources(dataframe: pd.DataFrame, def get_mask_channel_or_default() -> Optional[Path]: if mask_channel is None: return None + paths = get_paths_for_channel_ids(channels=[mask_channel]) + if len(paths) == 0: + return None else: - return get_paths_for_channel_ids(channels=[mask_channel])[0] + return paths[0] def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: if len(set(channels)) < len(channels): @@ -337,7 +347,7 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] # converts channels to paths and makes second sanity check for channel data paths, failed_channel_info = convert_channels_to_file_paths(channels, rows, local_dataset_root_folder, - patient_id) + patient_id, for_inference) if failed_channel_info: raise ValueError(failed_channel_info) @@ -351,7 +361,7 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: metadata=metadata, image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore mask_channel=get_mask_channel_or_default(), - ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels) # type: ignore - ) + ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore + for_inference=for_inference) return dataset_sources diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index b556c215e..24a42c1cc 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -132,6 +132,7 @@ class PatientDatasetSource(SampleBase): ground_truth_channels: List[PathOrString] mask_channel: Optional[PathOrString] metadata: PatientMetadata + for_inference: bool def __post_init__(self) -> None: # make sure all properties are populated @@ -139,7 +140,7 @@ def __post_init__(self) -> None: if not self.image_channels: raise ValueError("image_channels cannot be empty") - if not self.ground_truth_channels: + if not self.ground_truth_channels and not self.for_inference: raise ValueError("ground_truth_channels cannot be empty") @@ -164,8 +165,8 @@ def __post_init__(self) -> None: ml_util.check_size_matches(arg1=self.image, arg2=self.mask, matching_dimensions=self._get_matching_dimensions()) - ml_util.check_size_matches(arg1=self.image, arg2=self.labels, - matching_dimensions=self._get_matching_dimensions()) + # ml_util.check_size_matches(arg1=self.image, arg2=self.labels, + # matching_dimensions=self._get_matching_dimensions()) @property def patient_id(self) -> int: diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index f9f709321..7360d1db8 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -231,6 +231,11 @@ def evaluate_model_predictions(process_id: int, """ sample = dataset.get_samples_at_index(index=process_id)[0] logging.info(f"Evaluating predictions for patient {sample.patient_id}") + + if sample.labels is None: + logging.info(f"No ground truth provided for patient {sample.patient_id}, skipping metrics evaluation.") + return sample.metadata, MetricsDict(hues=config.ground_truth_ids) + patient_results_folder = get_patient_results_folder(results_folder, sample.patient_id) segmentation = load_nifti_image(patient_results_folder / DEFAULT_RESULT_IMAGE_NAME).image metrics_per_class = metrics.calculate_metrics_per_class(segmentation, diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 
8229aa823..c3207e986 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -421,6 +421,10 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) :return: A label sample object containing ground-truth information. """ + + if not dataset_source.ground_truth_channels: + return None + labels = np.stack( [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) @@ -502,6 +506,7 @@ def load_images_from_dataset_source(dataset_source: PatientDatasetSource, check_ metadata = copy(dataset_source.metadata) metadata.image_header = images[0].header labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive) + return Sample(image=image, labels=labels, mask=mask, diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 02507fac6..ca3f89ad7 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -10,6 +10,7 @@ import torch from torch.nn import Parameter +from InnerEye.Common.common_util import METRICS_AGGREGATES_FILE, SUBJECT_METRICS_FILE_NAME from InnerEye.Common import common_util from InnerEye.Common.output_directories import OutputFolderForTests from InnerEye.Common.type_annotations import TupleInt3 @@ -18,12 +19,14 @@ from InnerEye.ML.pipelines.ensemble import EnsemblePipeline from InnerEye.ML.pipelines.inference import InferencePipeline, FullImageInferencePipelineBase from InnerEye.ML.utils import image_util +from InnerEye.ML.utils.metrics_util import MetricsPerPatientWriter from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint from Tests.ML.configs.DummyModel import DummyModel from InnerEye.ML.utils.split_dataset import DatasetSplits from InnerEye.ML.dataset.sample import Sample from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.model_testing import store_inference_results, evaluate_model_predictions +from InnerEye.Common.metrics_constants import MetricType @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @@ -206,11 +209,28 @@ def get_all_child_layers(self) -> List[torch.nn.Module]: return list() +def create_config_from_dataset(input_list: List, train: List, val: List, test: List) -> DummyModel: + """ + Creates an "DummyModel(SegmentationModelBase)" object given patient list + and training, validation and test subjects id. + """ + + class MyDummyModel(DummyModel): + def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: + return DatasetSplits(train=dataset_df[dataset_df.subject.isin(train)], + test=dataset_df[dataset_df.subject.isin(test)], + val=dataset_df[dataset_df.subject.isin(val)]) + + config = MyDummyModel() + df = pd.DataFrame(input_list, columns=['subject', 'filePath', 'channel', 'institutionId']) + config._dataset_data_frame = df + return config + + def test_evaluate_model_predictions() -> None: """ Creates an 'InferencePipeline.Result' object using pre-defined volumes, stores results and evaluates metrics. 
""" - # Full dataset -- no missing channels input_list = [ ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"], @@ -226,18 +246,8 @@ def test_evaluate_model_predictions() -> None: ["3", "train_and_test_data/id2_mask.nii.gz", "mask", "3"], ["3", "train_and_test_data/id2_region.nii.gz", "region", "3"]] - # Overwrite get_model_train_test_dataset_splits method for subjects 1,2,3 - class MyDummyModel(DummyModel): - def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: - return DatasetSplits(train=dataset_df[dataset_df.subject.isin(['1'])], - test=dataset_df[dataset_df.subject.isin(['3'])], - val=dataset_df[dataset_df.subject.isin(['2'])]) - - config = MyDummyModel() - df = pd.DataFrame(input_list, columns=['subject', 'filePath', 'channel', 'institutionId']) - config._dataset_data_frame = df + config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3']) ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) - results_folder = config.outputs_folder if not results_folder.is_dir(): results_folder.mkdir() @@ -252,7 +262,7 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas inference_result = InferencePipeline.Result( patient_id=sample.patient_id, posteriors=posteriors, - segmentation=sample.mask, + segmentation=np.argmax(posteriors, 0), voxel_spacing_mm=config.dataset_expected_spacing_xyz ) store_inference_results(inference_result=inference_result, @@ -261,12 +271,59 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas image_header=sample.metadata.image_header) metadata, metrics_per_class = evaluate_model_predictions( - sample_index-1, + sample_index - 1, config=config, dataset=ds, results_folder=results_folder) - metrics_str_output = metrics_per_class.to_string() - assert 'Dice' in metrics_str_output - assert 'HausdorffDistance_millimeters' in metrics_str_output - assert 'MeanSurfaceDistance_millimeters' in metrics_str_output + hue_name = metrics_per_class.get_hue_names()[0] + assert 'Dice' in metrics_per_class.values(hue_name).keys() + assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() + assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() + + # Dataset -- subject 3 missing ground truth and mask + input_list = [ + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"], + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2", "1"], + ["1", "train_and_test_data/id1_mask.nii.gz", "mask", "1"], + ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"], + ["2", "train_and_test_data/id2_mask.nii.gz", "mask", "2"], + ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"]] + + config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3']) + ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) + results_folder = config.outputs_folder + if not results_folder.is_dir(): + results_folder.mkdir() + + average_dice = list() + metrics_writer = MetricsPerPatientWriter() + + for sample_index, sample in enumerate(ds, 1): + sample = Sample.from_dict(sample=sample) + posteriors = np.zeros((3,) + sample.mask.shape, 'float32') + posteriors[0][:] = 0.2 + posteriors[1][:] = 
0.6 + posteriors[2][:] = 0.2 + + assert config.dataset_expected_spacing_xyz is not None + inference_result = InferencePipeline.Result( + patient_id=sample.patient_id, + posteriors=posteriors, + segmentation=np.argmax(posteriors, 0), + voxel_spacing_mm=config.dataset_expected_spacing_xyz + ) + store_inference_results(inference_result=inference_result, + config=config, + results_folder=results_folder, + image_header=sample.metadata.image_header) + + metadata, metrics_per_class = evaluate_model_predictions( + sample_index - 1, + config=config, + dataset=ds, + results_folder=results_folder) From 6137efb4e4443bed5d382c98bd3f69011a85a8af Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Tue, 25 May 2021 15:22:58 -0700 Subject: [PATCH 04/45] Renamed variable from: "for_inference" to "allow_incomplete_labels". --- InnerEye/ML/config.py | 6 +++--- InnerEye/ML/dataset/full_image_dataset.py | 20 ++++++++++---------- InnerEye/ML/dataset/sample.py | 4 ++-- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index 0edec67ab..57f894879 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -741,7 +741,7 @@ def class_and_index_with_background(self) -> Dict[str, int]: classes.update({x: i + 1 for i, x in enumerate(self.ground_truth_ids)}) return classes - def create_and_set_torch_datasets(self, for_training: bool = True, for_inference: bool = True) -> None: + def create_and_set_torch_datasets(self, for_training: bool = True, allow_incomplete_labels: bool = True) -> None: """ Creates torch datasets for all model execution modes, and stores them in the object. """ @@ -763,13 +763,13 @@ def create_and_set_torch_datasets(self, for_training: bool = True, for_inference cropped_sample_transforms=crop_transforms.val, # type: ignore full_image_sample_transforms=full_image_transforms.val), # type: ignore } - if for_inference: + if allow_incomplete_labels: self._datasets_for_inference = { mode: FullImageDataset( self, dataset_splits[mode], full_image_sample_transforms=full_image_transforms.test, # type: ignore - for_inference=for_inference) + allow_incomplete_labels=allow_incomplete_labels) for mode in ModelExecutionMode if len(dataset_splits[mode]) > 0 } diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index c8baddc53..e4e0451a6 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -214,10 +214,10 @@ class FullImageDataset(GeneralDataset): def __init__(self, args: SegmentationModelBase, data_frame: pd.DataFrame, full_image_sample_transforms: Optional[Compose3D[Sample]] = None, - for_inference: bool = False): + allow_incomplete_labels: bool = False): super().__init__(args, data_frame) self.full_image_sample_transforms = full_image_sample_transforms - self.for_inference = for_inference + self.allow_incomplete_labels = allow_incomplete_labels # Check base_path assert self.args.local_dataset is not None if not self.args.local_dataset.is_dir(): @@ -262,7 +262,7 @@ def _load_dataset_sources(self) -> Dict[str, PatientDatasetSource]: image_channels=self.args.image_channels, ground_truth_channels=self.args.ground_truth_ids, mask_channel=self.args.mask_id, - for_inference=self.for_inference + allow_incomplete_labels=self.allow_incomplete_labels ) @@ -270,12 +270,12 @@ def convert_channels_to_file_paths(channels: List[str], rows: pd.DataFrame, local_dataset_root_folder: Path, patient_id: str, - for_inference: bool = False) -> Tuple[List[Path], 
str]: + allow_incomplete_labels: bool = False) -> Tuple[List[Path], str]: """ Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. - :param for_inference: + :param allow_incomplete_labels: :param channels: channel type defined in the configuration file :param rows: Input Pandas dataframe object containing subjectIds, path of local dataset, channel information :param local_dataset_root_folder: Root directory which points to the local dataset @@ -286,7 +286,7 @@ def convert_channels_to_file_paths(channels: List[str], for channel_id in channels: row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id] - if len(row) == 0 and not for_inference: + if len(row) == 0 and not allow_incomplete_labels: failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ @@ -306,7 +306,7 @@ def load_dataset_sources(dataframe: pd.DataFrame, image_channels: List[str], ground_truth_channels: List[str], mask_channel: Optional[str], - for_inference: bool = False) -> Dict[str, PatientDatasetSource]: + allow_incomplete_labels: bool = False) -> Dict[str, PatientDatasetSource]: """ Prepares a patient-to-images mapping from a dataframe read directly from a dataset CSV file. The dataframe contains per-patient per-channel image information, relative to a root directory. @@ -317,7 +317,7 @@ def load_dataset_sources(dataframe: pd.DataFrame, :param image_channels: The names of the image channels that should be used in the result. :param ground_truth_channels: The names of the ground truth channels that should be used in the result. :param mask_channel: The name of the mask channel that should be used in the result. This can be None. - :param for_inference: Boolean variable to indicate if executing for inference. + :param allow_incomplete_labels: Boolean variable to indicate if executing for inference. :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource. 
""" expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER} @@ -347,7 +347,7 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] # converts channels to paths and makes second sanity check for channel data paths, failed_channel_info = convert_channels_to_file_paths(channels, rows, local_dataset_root_folder, - patient_id, for_inference) + patient_id, allow_incomplete_labels) if failed_channel_info: raise ValueError(failed_channel_info) @@ -362,6 +362,6 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore mask_channel=get_mask_channel_or_default(), ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore - for_inference=for_inference) + allow_incomplete_labels=allow_incomplete_labels) return dataset_sources diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index 24a42c1cc..4606007fd 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -132,7 +132,7 @@ class PatientDatasetSource(SampleBase): ground_truth_channels: List[PathOrString] mask_channel: Optional[PathOrString] metadata: PatientMetadata - for_inference: bool + allow_incomplete_labels: bool def __post_init__(self) -> None: # make sure all properties are populated @@ -140,7 +140,7 @@ def __post_init__(self) -> None: if not self.image_channels: raise ValueError("image_channels cannot be empty") - if not self.ground_truth_channels and not self.for_inference: + if not self.ground_truth_channels and not self.allow_incomplete_labels: raise ValueError("ground_truth_channels cannot be empty") From 2e8f173e16bdfd4f6c01baf592537b22efd45104 Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Tue, 25 May 2021 16:05:00 -0700 Subject: [PATCH 05/45] Completed type annotation. --- Tests/ML/pipelines/test_inference.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index ca3f89ad7..768609def 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -209,7 +209,8 @@ def get_all_child_layers(self) -> List[torch.nn.Module]: return list() -def create_config_from_dataset(input_list: List, train: List, val: List, test: List) -> DummyModel: +def create_config_from_dataset(input_list: List[List[str]], train: List[str], val: List[str], test: List[str]) \ + -> DummyModel: """ Creates an "DummyModel(SegmentationModelBase)" object given patient list and training, validation and test subjects id. From a50066388a3341f7a796c00a935dae0037ed55fd Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang Date: Wed, 26 May 2021 09:55:34 -0700 Subject: [PATCH 06/45] Extended unit test "test_evaluate_model_predictions" to account for multiple ground truth images missing For inference, given in a test dataset with all, partial, and no segmentation channels, inference will be executed and metrics will be estimated in when segmentation files exist. In overall, the changes involve 1) Setting to true the Boolean flag: "allow_incomplete_labels". 2) Keeping track of missing channels in "convert_channels_to_file_paths". * We need to resolve which ground truth file is missing to be able to map back it correctly to the segmentation result from the inference. 
3) A label volume is initialize to NaN if the corresponding ground truth file is missing * The order is preserved so can be mapped back to corresponding segmentation in the inference. 4) Only label images that are different that NaN are evaluated in the expected order as implemented in the method "metrics.calculate_metrics_per_class". --- InnerEye/ML/config.py | 6 +- InnerEye/ML/dataset/full_image_dataset.py | 9 +- InnerEye/ML/dataset/sample.py | 4 +- InnerEye/ML/metrics.py | 15 ++- InnerEye/ML/model_testing.py | 4 - InnerEye/ML/utils/io_util.py | 37 ++++-- Tests/ML/pipelines/test_inference.py | 139 +++++++++++----------- Tests/ML/util.py | 3 +- Tests/ML/utils/test_io_util.py | 12 +- 9 files changed, 128 insertions(+), 101 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index 57f894879..053883a84 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -741,7 +741,7 @@ def class_and_index_with_background(self) -> Dict[str, int]: classes.update({x: i + 1 for i, x in enumerate(self.ground_truth_ids)}) return classes - def create_and_set_torch_datasets(self, for_training: bool = True, allow_incomplete_labels: bool = True) -> None: + def create_and_set_torch_datasets(self, for_training: bool = True, for_inference: bool = True) -> None: """ Creates torch datasets for all model execution modes, and stores them in the object. """ @@ -763,13 +763,13 @@ def create_and_set_torch_datasets(self, for_training: bool = True, allow_incompl cropped_sample_transforms=crop_transforms.val, # type: ignore full_image_sample_transforms=full_image_transforms.val), # type: ignore } - if allow_incomplete_labels: + if for_inference: self._datasets_for_inference = { mode: FullImageDataset( self, dataset_splits[mode], full_image_sample_transforms=full_image_transforms.test, # type: ignore - allow_incomplete_labels=allow_incomplete_labels) + allow_incomplete_labels=True) for mode in ModelExecutionMode if len(dataset_splits[mode]) > 0 } diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index e4e0451a6..6679e9dd2 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -275,7 +275,7 @@ def convert_channels_to_file_paths(channels: List[str], Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. - :param allow_incomplete_labels: + :param allow_incomplete_labels: flag to enforce all ground truth labels :param channels: channel type defined in the configuration file :param rows: Input Pandas dataframe object containing subjectIds, path of local dataset, channel information :param local_dataset_root_folder: Root directory which points to the local dataset @@ -288,6 +288,9 @@ def convert_channels_to_file_paths(channels: List[str], row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id] if len(row) == 0 and not allow_incomplete_labels: failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep + elif len(row) == 0 and allow_incomplete_labels: + # Keeps track of missing channels order + paths.append(Path('')) elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ os.linesep @@ -317,7 +320,7 @@ def load_dataset_sources(dataframe: pd.DataFrame, :param image_channels: The names of the image channels that should be used in the result. 
:param ground_truth_channels: The names of the ground truth channels that should be used in the result. :param mask_channel: The name of the mask channel that should be used in the result. This can be None. - :param allow_incomplete_labels: Boolean variable to indicate if executing for inference. + :param allow_incomplete_labels: Boolean variable to allow missing ground truth files. :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource. """ expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER} @@ -361,7 +364,7 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: metadata=metadata, image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore mask_channel=get_mask_channel_or_default(), - ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore + ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore allow_incomplete_labels=allow_incomplete_labels) return dataset_sources diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index 4606007fd..d01246f55 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -165,8 +165,8 @@ def __post_init__(self) -> None: ml_util.check_size_matches(arg1=self.image, arg2=self.mask, matching_dimensions=self._get_matching_dimensions()) - # ml_util.check_size_matches(arg1=self.image, arg2=self.labels, - # matching_dimensions=self._get_matching_dimensions()) + ml_util.check_size_matches(arg1=self.image, arg2=self.labels, + matching_dimensions=self._get_matching_dimensions()) @property def patient_id(self) -> int: diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index 3c7471274..f220aa338 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -242,14 +242,23 @@ def calculate_metrics_per_class(segmentation: np.ndarray, f"the label tensor indicates that there are {number_of_classes - 1} classes.") binaries = binaries_from_multi_label_array(segmentation, number_of_classes) - all_classes_are_binary = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] - if not np.all(all_classes_are_binary): + binary_classes = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] + + # If ground truth image is nan, then will not be used for metrics computation. 
+ nan_images = [np.isnan(np.sum(ground_truth[label_id])) for label_id in range(ground_truth.shape[0])] + + # Validates if not binary then nan + assert np.all(np.array(binary_classes) == ~np.array(nan_images)) + + # Validates that all binary images should be 0 or 1 + if not np.all(np.array(binary_classes)[~np.array(nan_images)]): raise ValueError("Ground truth values should be 0 or 1") overlap_measures_filter = sitk.LabelOverlapMeasuresImageFilter() hausdorff_distance_filter = sitk.HausdorffDistanceImageFilter() metrics = MetricsDict(hues=ground_truth_ids) for i, prediction in enumerate(binaries): - if i == 0: + # Skips if background image or nan_image + if i == 0 or nan_images[i]: continue check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth") if not is_binary_array(prediction): diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 7360d1db8..d91253c17 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -232,10 +232,6 @@ def evaluate_model_predictions(process_id: int, sample = dataset.get_samples_at_index(index=process_id)[0] logging.info(f"Evaluating predictions for patient {sample.patient_id}") - if sample.labels is None: - logging.info(f"No ground truth provided for patient {sample.patient_id}, skipping metrics evaluation.") - return sample.metadata, MetricsDict(hues=config.ground_truth_ids) - patient_results_folder = get_patient_results_folder(results_folder, sample.patient_id) segmentation = load_nifti_image(patient_results_folder / DEFAULT_RESULT_IMAGE_NAME).image metrics_per_class = metrics.calculate_metrics_per_class(segmentation, diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index c3207e986..4b2c3b859 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -412,28 +412,43 @@ def load_image_in_known_formats(file: Path, raise ValueError(f"Unsupported image file type for path {file}") -def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True) -> np.ndarray: +def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True, + mask_size: Optional[Tuple[int]] = None) -> np.ndarray: """ Load labels containing segmentation binary labels in one-hot-encoding. In the future, this function will be used to load global class and non-imaging information as well. + :type mask_size: Image size, tuple if integers. :param dataset_source: The dataset source for which channels are to be loaded into memory. :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) :return: A label sample object containing ground-truth information. """ - if not dataset_source.ground_truth_channels: - return None - - labels = np.stack( - [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) - - if check_exclusive and (sum(labels) > 1.).any(): # type: ignore + if not dataset_source.allow_incomplete_labels: + labels = np.stack( + [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) + else: + assert mask_size is not None + label_list = [] + for gt in dataset_source.ground_truth_channels: + if str(gt) == '.': + label_list.append(np.full(mask_size, np.NAN, ImageDataType)) + else: + label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) + labels = np.stack(label_list) + + # If ground truth image is nan, then will not be used to check check_exclusive. 
+ not_nan_label_images = [labels[label_id] for label_id in range(labels.shape[0]) + if not np.isnan(np.sum(labels[label_id]))] + + if check_exclusive and (sum(np.array(not_nan_label_images)) > 1.).any(): # type: ignore raise ValueError(f'The labels for patient {dataset_source.metadata.patient_id} are not mutually exclusive. ' - 'Some loss functions (e.g. SoftDice) may produce results on overlapping labels, while others (e.g. FocalLoss) will fail. ' + 'Some loss functions (e.g. SoftDice) may produce results on overlapping labels, while others ' + '(e.g. FocalLoss) will fail. ' 'If you are sure that you want to use mutually exclusive labels, ' 'then re-run with the check_exclusive flag set to false in the settings file. ' - 'Note that this is the first error encountered, other samples/patients may also have overlapping labels.') + 'Note that this is the first error encountered, other samples/patients may also have ' + 'overlapping labels.') # Add the background binary map background = np.ones_like(labels[0]) @@ -505,7 +520,7 @@ def load_images_from_dataset_source(dataset_source: PatientDatasetSource, check_ # create raw sample to return metadata = copy(dataset_source.metadata) metadata.image_header = images[0].header - labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive) + labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive, mask_size=mask.shape) return Sample(image=image, labels=labels, diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 768609def..14282d1b4 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -10,23 +10,20 @@ import torch from torch.nn import Parameter -from InnerEye.Common.common_util import METRICS_AGGREGATES_FILE, SUBJECT_METRICS_FILE_NAME from InnerEye.Common import common_util from InnerEye.Common.output_directories import OutputFolderForTests from InnerEye.Common.type_annotations import TupleInt3 from InnerEye.ML.config import SegmentationModelBase from InnerEye.ML.models.architectures.base_model import BaseSegmentationModel from InnerEye.ML.pipelines.ensemble import EnsemblePipeline -from InnerEye.ML.pipelines.inference import InferencePipeline, FullImageInferencePipelineBase +from InnerEye.ML.pipelines.inference import InferencePipeline from InnerEye.ML.utils import image_util -from InnerEye.ML.utils.metrics_util import MetricsPerPatientWriter from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint from Tests.ML.configs.DummyModel import DummyModel from InnerEye.ML.utils.split_dataset import DatasetSplits from InnerEye.ML.dataset.sample import Sample from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.model_testing import store_inference_results, evaluate_model_predictions -from InnerEye.Common.metrics_constants import MetricType @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @@ -223,7 +220,15 @@ def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> Datas val=dataset_df[dataset_df.subject.isin(val)]) config = MyDummyModel() - df = pd.DataFrame(input_list, columns=['subject', 'filePath', 'channel', 'institutionId']) + # Sets two regions for ground truth + config.fg_ids = ["region", "region_1"] + config.ground_truth_ids = config.fg_ids + config.ground_truth_ids_display_names = config.fg_ids + config.colours = [(255, 255, 255)] * len(config.fg_ids) + config.fill_holes = [False] * len(config.fg_ids) + 
config.roi_interpreted_types = ["Organ"] * len(config.fg_ids) + config.check_exclusive = False + df = pd.DataFrame(input_list, columns=['subject', 'filePath', 'channel']) config._dataset_data_frame = df return config @@ -232,22 +237,38 @@ def test_evaluate_model_predictions() -> None: """ Creates an 'InferencePipeline.Result' object using pre-defined volumes, stores results and evaluates metrics. """ - # Full dataset -- no missing channels + # Patient 3,4,5 are in test dataset such that: + # Patient 3 has one missing ground truth channel: "region" + # Patient 4 has all missing ground truth channels: "region", "region_1" + # Patient 5 has no missing ground truth channels. input_list = [ - ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"], - ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2", "1"], - ["1", "train_and_test_data/id1_mask.nii.gz", "mask", "1"], - ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"], - ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"], - ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"], - ["2", "train_and_test_data/id2_mask.nii.gz", "mask", "2"], - ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"], - ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"], - ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"], - ["3", "train_and_test_data/id2_mask.nii.gz", "mask", "3"], - ["3", "train_and_test_data/id2_region.nii.gz", "region", "3"]] - - config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3']) + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1"], + ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2"], + ["1", "train_and_test_data/id1_mask.nii.gz", "mask"], + ["1", "train_and_test_data/id1_region.nii.gz", "region"], + ["1", "train_and_test_data/id1_region.nii.gz", "region_1"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1"], + ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2"], + ["2", "train_and_test_data/id2_mask.nii.gz", "mask"], + ["2", "train_and_test_data/id2_region.nii.gz", "region"], + ["2", "train_and_test_data/id2_region.nii.gz", "region_1"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1"], + ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2"], + ["3", "train_and_test_data/id2_mask.nii.gz", "mask"], + # ["3", "train_and_test_data/id2_region.nii.gz", "region"], # commented on purpose + ["3", "train_and_test_data/id2_region.nii.gz", "region_1"], + ["4", "train_and_test_data/id2_channel1.nii.gz", "channel1"], + ["4", "train_and_test_data/id2_channel1.nii.gz", "channel2"], + ["4", "train_and_test_data/id2_mask.nii.gz", "mask"], + # ["4", "train_and_test_data/id2_region.nii.gz", "region"], # commented on purpose + # ["4", "train_and_test_data/id2_region.nii.gz", "region_1"], # commented on purpose + ["5", "train_and_test_data/id2_channel1.nii.gz", "channel1"], + ["5", "train_and_test_data/id2_channel1.nii.gz", "channel2"], + ["5", "train_and_test_data/id2_mask.nii.gz", "mask"], + ["5", "train_and_test_data/id2_region.nii.gz", "region"], + ["5", "train_and_test_data/id2_region.nii.gz", "region_1"]] + + config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3', '4', '5']) ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) results_folder = config.outputs_folder if not results_folder.is_dir(): @@ -260,6 +281,8 @@ def test_evaluate_model_predictions() -> None: posteriors[1][:] = 0.6 posteriors[2][:] = 0.2 + assert 
config.dataset_expected_spacing_xyz is not None + inference_result = InferencePipeline.Result( patient_id=sample.patient_id, posteriors=posteriors, @@ -277,54 +300,30 @@ def test_evaluate_model_predictions() -> None: dataset=ds, results_folder=results_folder) - hue_name = metrics_per_class.get_hue_names()[0] - assert 'Dice' in metrics_per_class.values(hue_name).keys() - assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() - assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() - - # Dataset -- subject 3 missing ground truth and mask - input_list = [ - ["1", "train_and_test_data/id1_channel1.nii.gz", "channel1", "1"], - ["1", "train_and_test_data/id1_channel1.nii.gz", "channel2", "1"], - ["1", "train_and_test_data/id1_mask.nii.gz", "mask", "1"], - ["1", "train_and_test_data/id1_region.nii.gz", "region", "1"], - ["2", "train_and_test_data/id2_channel1.nii.gz", "channel1", "2"], - ["2", "train_and_test_data/id2_channel1.nii.gz", "channel2", "2"], - ["2", "train_and_test_data/id2_mask.nii.gz", "mask", "2"], - ["2", "train_and_test_data/id2_region.nii.gz", "region", "2"], - ["3", "train_and_test_data/id2_channel1.nii.gz", "channel1", "3"], - ["3", "train_and_test_data/id2_channel1.nii.gz", "channel2", "3"]] - - config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3']) - ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) - results_folder = config.outputs_folder - if not results_folder.is_dir(): - results_folder.mkdir() - - average_dice = list() - metrics_writer = MetricsPerPatientWriter() + # Patient 3 has one missing ground truth channel: "region" + if sample.metadata.patient_id == '3': + assert 'Dice' in metrics_per_class.values('region_1').keys() + assert 'HausdorffDistance_millimeters' in metrics_per_class.values('region_1').keys() + assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values('region_1').keys() + for hue_name in ['region', 'Default']: + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + + # Patient 4 has all missing ground truth channels: "region", "region_1" + if sample.metadata.patient_id == '4': + for hue_name in ['region_1', 'region', 'Default']: + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + + # Patient 5 has no missing ground truth channels + if sample.metadata.patient_id == '5': + assert len(metrics_per_class.values('Default').keys()) == 0 + assert len(metrics_per_class.values('Default').keys()) == 0 + assert len(metrics_per_class.values('Default').keys()) == 0 + for hue_name in ['region_1', 'region']: + assert 'Dice' in metrics_per_class.values('region_1').keys() + assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() + assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() - for sample_index, sample in enumerate(ds, 1): - sample = Sample.from_dict(sample=sample) - posteriors = np.zeros((3,) + sample.mask.shape, 'float32') - posteriors[0][:] = 0.2 - posteriors[1][:] = 0.6 - posteriors[2][:] = 0.2 - - assert config.dataset_expected_spacing_xyz is not None - inference_result = InferencePipeline.Result( - patient_id=sample.patient_id, - posteriors=posteriors, - segmentation=np.argmax(posteriors, 0), - 
voxel_spacing_mm=config.dataset_expected_spacing_xyz - ) - store_inference_results(inference_result=inference_result, - config=config, - results_folder=results_folder, - image_header=sample.metadata.image_header) - - metadata, metrics_per_class = evaluate_model_predictions( - sample_index - 1, - config=config, - dataset=ds, - results_folder=results_folder) diff --git a/Tests/ML/util.py b/Tests/ML/util.py index 34b21d8b2..fb49c116d 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -73,7 +73,8 @@ def load_train_and_test_data_channels(patient_ids: List[int], metadata=PatientMetadata(patient_id=z), image_channels=[file_name(z, c) for c in TEST_CHANNEL_IDS], mask_channel=file_name(z, TEST_MASK_ID), - ground_truth_channels=[file_name(z, TEST_GT_ID)] + ground_truth_channels=[file_name(z, TEST_GT_ID)], + allow_incomplete_labels=False )) samples = [] diff --git a/Tests/ML/utils/test_io_util.py b/Tests/ML/utils/test_io_util.py index 141c997b8..4a57972b4 100644 --- a/Tests/ML/utils/test_io_util.py +++ b/Tests/ML/utils/test_io_util.py @@ -85,14 +85,17 @@ def test_load_images_from_dataset_source( # metadata, image and GT channels must be present. Mask is optional if None in [metadata, image_channel, ground_truth_channel]: with pytest.raises(Exception): - _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, check_exclusive) + _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, + check_exclusive) else: if check_exclusive: with pytest.raises(ValueError) as mutually_exclusive_labels_error: - _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, check_exclusive) + _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, + check_exclusive) assert 'not mutually exclusive' in str(mutually_exclusive_labels_error.value) else: - _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, check_exclusive) + _test_load_images_from_channels(metadata, image_channel, ground_truth_channel, mask_channel, + check_exclusive) def _test_load_images_from_channels( @@ -109,7 +112,8 @@ def _test_load_images_from_channels( metadata=metadata, image_channels=[image_channel] * 2, ground_truth_channels=[ground_truth_channel] * 4, - mask_channel=mask_channel + mask_channel=mask_channel, + allow_incomplete_labels=False ), check_exclusive=check_exclusive ) From e9723e571d6d9d72be7b42617d3e6d835857a817 Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang <82240512+asantamariapang@users.noreply.github.com> Date: Sun, 30 May 2021 06:34:51 -0700 Subject: [PATCH 07/45] Improved inference support for missing labels. Summary: * For inference, the following cases are implemented * No channels associated with mask or ground_truth_ids * Some channels associated with mask or ground_truth_ids * All channels associated with mask or ground_truth_ids * Changes above, do not apply for tor training and validation. 
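
A minimal sketch of the behaviour summarised above (this is not code from this patch; the
helper names below are invented for illustration): a ground truth channel that is absent at
inference time is materialised as a NaN-filled label volume, and metric computation later
considers only the channels that contain real ground truth. The actual implementation lives
in load_labels_from_dataset_source and calculate_metrics_per_class in the diffs that follow.

# Illustrative sketch only -- hypothetical helpers, not part of this patch.
from typing import List, Optional
import numpy as np

def stack_labels(volumes: List[Optional[np.ndarray]], shape: tuple) -> np.ndarray:
    """Stack per-class label volumes; a missing channel becomes a NaN-filled volume."""
    return np.stack([v if v is not None else np.full(shape, np.nan, dtype=np.float32)
                     for v in volumes])

def channels_with_ground_truth(labels: np.ndarray) -> List[int]:
    """Indices of label channels that actually contain ground truth (no NaNs)."""
    return [i for i in range(labels.shape[0]) if not np.isnan(labels[i]).any()]

if __name__ == "__main__":
    shape = (2, 2, 2)
    region = np.ones(shape, dtype=np.float32)     # "region" is provided
    labels = stack_labels([region, None], shape)  # "region_1" is missing
    print(channels_with_ground_truth(labels))     # -> [0]
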
Description: Major changes involve 1) Adding an optional boolean 'allow_incomplete_labels' flag in class 'FullImageDataset' 1.1) For training and validation, default value of 'allow_incomplete_labels' is False 1.1.1) If ground truth channels are missing, pipelines exits and reports errors 1.2) For Inference 'allow_incomplete_labels' is set to true 1.1.2) If ground truth channels are missing, Method 'get_paths_for_channel_ids' returns a list containing corresponding paths (if provided) and None if no ground truth class files files were provided 2) Class 'Sample', main changes: 2.1) 'labels' object is optional, to support full/partial number of provided ground class files 2.1.1) # (Batches if from data loader) x Classes x Z X Y x X, where the first class is background labels: Optional[Union[np.ndarray, torch.Tensor]] 2.2) Added 'missing_labels' list of booleans to indicate if any channel is missing 2.2.1) missing_labels: List[bool] 2.3.2) 'missing_labels' keeps the order of corresponding provided or not provided ground truth classes so that they can be correctly mapped back when for report generation 2.2.2) The length of 'missing_labels' list must be the same as the number of groundtruth classes (list 'ground_truth_ids') 3) 'load_labels_from_dataset_source', changes: 3.1) A label sample object containing ground-truth information if channel is provided If no ground-truth channels provided, label is None If some ground-truth channels provided are provided, then they are loaded Background is loaded if at least one ground-truth channel is provided and is the first element of the tensor 3.2 ) A list of booleans indicating if ground-truth channel is missing 4) 'evaluate_model_predictions' 4.1) If no ground-truth channels provided, then returns: sample.metadata, MetricsDict(hues=config.ground_truth_ids) 4.2) If some or all ground-truth channel provided, then returns correponsing metrics 4.3) If all ground-truth classes are provided, then runs 'plot_contours_for_all_classes' 5) 'calculate_metrics_per_class' 5.1) From boolean list 'missing_labels' resolves which ground-truth channels corresponds to the segmentation provided by the inference and estimates corresponding metrics 6) Integrated/fix testing in the following files: 6.1) test_augmentation.py 6.2) test_plotting.py 6.3) test_metrics.py 6.4) test_inference.py 6.5) test_dataset.py 7) Integrated changes in the following dependencies: 7.1) util.py 7.2) io_util.py 7.3) dataset_util.py 7.4) augmentation.py 7.5) plotting.py 8) Added files to account the number of gt classes for training set consists of patients 1, 2, 3, and files id3* are required in unit test test_csv_dataset_as_data_loader 8.1) id3_channel2.nii.gz" 8.2) id3_mask.nii.gz" 8.3) id3_region.nii.gz" 8.4) id3_channel1.nii.gz" 9) Test unit that still will be fix: 9.1) 'test_load_images_from_dataset_source' as currently enforces ground-truth channels --- InnerEye/ML/dataset/cropping_dataset.py | 2 + InnerEye/ML/dataset/full_image_dataset.py | 11 ++-- InnerEye/ML/dataset/sample.py | 17 +++-- InnerEye/ML/lightning_models.py | 1 + InnerEye/ML/metrics.py | 57 ++++++++++------- InnerEye/ML/model_testing.py | 19 ++++-- InnerEye/ML/plotting.py | 2 + InnerEye/ML/utils/augmentation.py | 11 +++- InnerEye/ML/utils/dataset_util.py | 9 ++- InnerEye/ML/utils/io_util.py | 59 ++++++++++-------- Tests/ML/datasets/test_dataset.py | 62 ++++++++++++++----- Tests/ML/pipelines/test_inference.py | 8 +-- .../train_and_test_data/id3_channel1.nii.gz | 3 + .../train_and_test_data/id3_channel2.nii.gz | 3 + 
.../train_and_test_data/id3_mask.nii.gz | 3 + .../train_and_test_data/id3_region.nii.gz | 3 + Tests/ML/test_metrics.py | 45 +++++++++----- Tests/ML/test_plotting.py | 16 +++-- Tests/ML/util.py | 6 +- Tests/ML/utils/test_augmentation.py | 35 ++++++----- Tests/ML/utils/test_io_util.py | 4 +- .../ML/visualizers/test_visualize_patches.py | 10 ++- 22 files changed, 247 insertions(+), 139 deletions(-) create mode 100644 Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz create mode 100644 Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz create mode 100644 Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz create mode 100644 Tests/ML/test_data/train_and_test_data/id3_region.nii.gz diff --git a/InnerEye/ML/dataset/cropping_dataset.py b/InnerEye/ML/dataset/cropping_dataset.py index aa2f27c40..5e2265b7d 100644 --- a/InnerEye/ML/dataset/cropping_dataset.py +++ b/InnerEye/ML/dataset/cropping_dataset.py @@ -110,6 +110,7 @@ def create_random_cropped_sample(sample: Sample, mask_center_crop = image_util.get_center_crop(image=sample.mask, crop_shape=center_size) labels_center_crop = np.zeros(shape=[len(sample.labels)] + list(center_size), # type: ignore dtype=ImageDataType.SEGMENTATION.value) + assert sample.labels is not None for c in range(len(sample.labels)): # type: ignore labels_center_crop[c] = image_util.get_center_crop( image=sample.labels[c], @@ -120,6 +121,7 @@ def create_random_cropped_sample(sample: Sample, image=sample.image, mask=sample.mask, labels=sample.labels, + missing_labels=sample.missing_labels, mask_center_crop=mask_center_crop, labels_center_crop=labels_center_crop, center_indices=center_point, diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 6679e9dd2..2efd0a9e3 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -270,7 +270,7 @@ def convert_channels_to_file_paths(channels: List[str], rows: pd.DataFrame, local_dataset_root_folder: Path, patient_id: str, - allow_incomplete_labels: bool = False) -> Tuple[List[Path], str]: + allow_incomplete_labels: bool = False) -> Tuple[List[Optional[Path]], str]: """ Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. 
@@ -281,7 +281,7 @@ def convert_channels_to_file_paths(channels: List[str], :param local_dataset_root_folder: Root directory which points to the local dataset :param patient_id: string which contains subject identifier """ - paths: List[Path] = [] + paths: List[Optional[Path]] = [] failed_channel_info: str = '' for channel_id in channels: @@ -290,7 +290,7 @@ def convert_channels_to_file_paths(channels: List[str], failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep elif len(row) == 0 and allow_incomplete_labels: # Keeps track of missing channels order - paths.append(Path('')) + paths.append(None) elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ os.linesep @@ -344,7 +344,7 @@ def get_mask_channel_or_default() -> Optional[Path]: else: return paths[0] - def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: + def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: if len(set(channels)) < len(channels): raise ValueError(f"ids have duplicated entries: {channels}") rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] @@ -364,7 +364,6 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: metadata=metadata, image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore mask_channel=get_mask_channel_or_default(), - ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore - allow_incomplete_labels=allow_incomplete_labels) + ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels)) # type: ignore return dataset_sources diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index d01246f55..26717c97b 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -127,12 +127,12 @@ def get_dict(self) -> Dict[str, Any]: class PatientDatasetSource(SampleBase): """ Dataset source locations for channels associated with a given patient in a particular dataset. + Please note that "ground_truth_channels" is optional. 
""" image_channels: List[PathOrString] - ground_truth_channels: List[PathOrString] + ground_truth_channels: List[Optional[PathOrString]] mask_channel: Optional[PathOrString] metadata: PatientMetadata - allow_incomplete_labels: bool def __post_init__(self) -> None: # make sure all properties are populated @@ -140,8 +140,6 @@ def __post_init__(self) -> None: if not self.image_channels: raise ValueError("image_channels cannot be empty") - if not self.ground_truth_channels and not self.allow_incomplete_labels: - raise ValueError("ground_truth_channels cannot be empty") @dataclass(frozen=True) @@ -154,9 +152,10 @@ class Sample(SampleBase): image: Union[np.ndarray, torch.Tensor] # (Batches if from data loader) x Z x Y x X mask: Union[np.ndarray, torch.Tensor] - # (Batches if from data loader) x Classes x Z X Y x X - labels: Union[np.ndarray, torch.Tensor] + # (Batches if from data loader) x Classes x Z X Y x X, where the first class is background + labels: Optional[Union[np.ndarray, torch.Tensor]] metadata: PatientMetadata + missing_labels: List[bool] def __post_init__(self) -> None: # make sure all properties are populated @@ -164,9 +163,9 @@ def __post_init__(self) -> None: ml_util.check_size_matches(arg1=self.image, arg2=self.mask, matching_dimensions=self._get_matching_dimensions()) - - ml_util.check_size_matches(arg1=self.image, arg2=self.labels, - matching_dimensions=self._get_matching_dimensions()) + if self.labels is not None: + ml_util.check_size_matches(arg1=self.image, arg2=self.labels, + matching_dimensions=self._get_matching_dimensions()) @property def patient_id(self) -> int: diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 490e693ee..9f6b79c57 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -116,6 +116,7 @@ def compute_metrics(self, cropped_sample: CroppedSample, segmentation: torch.Ten ground_truth=cropped_sample.labels_center_crop, allow_multiple_classes_for_each_pixel=True)[:, 1:] # Number of foreground voxels per class, across all crops + assert cropped_sample.labels is not None foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:] # Store Dice and voxel count per sample in the minibatch. We need a custom aggregation logic for Dice # because it can be NaN. Also use custom logging for voxel count because Lightning's batch-size weighted diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index f220aa338..c4af0fc9a 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -223,55 +223,63 @@ def _add_zero_distances(num_segmented_surface_pixels: int, seg2ref_distance_map_ def calculate_metrics_per_class(segmentation: np.ndarray, ground_truth: np.ndarray, + missing_labels: List[bool], ground_truth_ids: List[str], voxel_spacing: TupleFloat3, patient_id: Optional[int] = None) -> MetricsDict: """ - Calculate the dice for all foreground structures (the background class is completely ignored). - Returns a MetricsDict with metrics for each of the foreground + Calculate the dice for provided foreground structures (the background class is completely ignored). + Returns a MetricsDict with metrics values for provided foreground class structures. Metrics are NaN if both ground truth and prediction are all zero for a class. - :param ground_truth_ids: The names of all foreground classes. 
:param segmentation: predictions multi-value array with dimensions: [Z x Y x X] - :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X] + :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X]. Note that the value of the 'C' + dimension is function on the provided ground truth channels. The minimal value for + C is 2: one background channel and one ground truth channel provided + :param missing_labels: list of booleans, if boolean variable is True, indicates that given channel was not provided + and length of list is number of all foreground classes + :param ground_truth_ids: The names of all foreground classes :param voxel_spacing: voxel_spacing in 3D Z x Y x X :param patient_id: for logging """ - number_of_classes = ground_truth.shape[0] - if len(ground_truth_ids) != (number_of_classes - 1): - raise ValueError(f"Received {len(ground_truth_ids)} foreground class names, but " - f"the label tensor indicates that there are {number_of_classes - 1} classes.") - binaries = binaries_from_multi_label_array(segmentation, number_of_classes) - + # For 'ground_truth', the expected C dimension is (Background Channel) + (Provided Ground Truth Channels) + # We can resolve the number of provided channels by subtracting the number of ground truth channels that were + # not provided from the number of classes + assert ground_truth is not None + assert ground_truth.shape[0] >= 2 + num_classes_including_background = len(ground_truth_ids) + 1 + if len(ground_truth_ids) - missing_labels.count(True) != (ground_truth.shape[0] - 1): + raise ValueError(f"Received {len(ground_truth_ids) - missing_labels.count(True)} foreground class names, but " + f"the label tensor indicates that there are {num_classes_including_background - 1} classes.") + binaries = binaries_from_multi_label_array(segmentation, num_classes_including_background) + + # Note that: i) binary_classes >= 2 since we count background class and at least one ground truth image class, + # ii) binary_classes <= num_classes_including_background-1 binary_classes = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] - # If ground truth image is nan, then will not be used for metrics computation. - nan_images = [np.isnan(np.sum(ground_truth[label_id])) for label_id in range(ground_truth.shape[0])] - - # Validates if not binary then nan - assert np.all(np.array(binary_classes) == ~np.array(nan_images)) - # Validates that all binary images should be 0 or 1 - if not np.all(np.array(binary_classes)[~np.array(nan_images)]): + if not np.all(binary_classes): raise ValueError("Ground truth values should be 0 or 1") overlap_measures_filter = sitk.LabelOverlapMeasuresImageFilter() hausdorff_distance_filter = sitk.HausdorffDistanceImageFilter() metrics = MetricsDict(hues=ground_truth_ids) + + ground_truth_index_counter = 1 for i, prediction in enumerate(binaries): # Skips if background image or nan_image - if i == 0 or nan_images[i]: + if i == 0: + continue + # Skips if ground truth channel was not provided + if missing_labels[i-1]: continue - check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth") - if not is_binary_array(prediction): - raise ValueError("Predictions values should be 0 or 1") - # simpleitk returns a Dice score of 0 if both ground truth and prediction are all zeros. # We want to be able to fish out those cases, and treat them specially later. 
prediction_zero = np.all(prediction == 0) - gt_zero = np.all(ground_truth[i] == 0) + gt_zero = np.all(ground_truth[ground_truth_index_counter] == 0) dice = mean_surface_distance = hausdorff_distance = math.nan if not (prediction_zero and gt_zero): prediction_image = sitk.GetImageFromArray(prediction.astype(np.uint8)) prediction_image.SetSpacing(sitk.VectorDouble(reverse_tuple_float3(voxel_spacing))) - ground_truth_image = sitk.GetImageFromArray(ground_truth[i].astype(np.uint8)) + # Use 'ground_truth_index_counter' to index the 'C' dimension + ground_truth_image = sitk.GetImageFromArray(ground_truth[ground_truth_index_counter].astype(np.uint8)) ground_truth_image.SetSpacing(sitk.VectorDouble(reverse_tuple_float3(voxel_spacing))) overlap_measures_filter.Execute(prediction_image, ground_truth_image) dice = overlap_measures_filter.GetDiceCoefficient() @@ -296,6 +304,7 @@ def add_metric(metric_type: MetricType, value: float) -> None: add_metric(MetricType.DICE, dice) add_metric(MetricType.HAUSDORFF_mm, hausdorff_distance) add_metric(MetricType.MEAN_SURFACE_DIST_mm, mean_surface_distance) + ground_truth_index_counter += 1 return metrics diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index d91253c17..6c1079d3b 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -229,23 +229,32 @@ def evaluate_model_predictions(process_id: int, :param results_folder: Path to results folder :returns [PatientMetadata, list[list]]: Patient metadata and list of computed metrics for each image. """ + sample = dataset.get_samples_at_index(index=process_id)[0] + assert sample.missing_labels is not None + if sample.labels is None: + logging.info(f"Ground truth label were not provided for patient {sample.patient_id}, skipping evaluation from " + f"predictions") + return sample.metadata, MetricsDict(hues=config.ground_truth_ids) + logging.info(f"Evaluating predictions for patient {sample.patient_id}") patient_results_folder = get_patient_results_folder(results_folder, sample.patient_id) segmentation = load_nifti_image(patient_results_folder / DEFAULT_RESULT_IMAGE_NAME).image metrics_per_class = metrics.calculate_metrics_per_class(segmentation, sample.labels, + sample.missing_labels, ground_truth_ids=config.ground_truth_ids, voxel_spacing=sample.image_spacing, patient_id=sample.patient_id) thumbnails_folder = results_folder / THUMBNAILS_FOLDER thumbnails_folder.mkdir(exist_ok=True) - plotting.plot_contours_for_all_classes(sample, - segmentation=segmentation, - foreground_class_names=config.ground_truth_ids, - result_folder=thumbnails_folder, - image_range=config.output_range) + if sample.missing_labels.count(True) == 0: + plotting.plot_contours_for_all_classes(sample, + segmentation=segmentation, + foreground_class_names=config.ground_truth_ids, + result_folder=thumbnails_folder, + image_range=config.output_range) return sample.metadata, metrics_per_class diff --git a/InnerEye/ML/plotting.py b/InnerEye/ML/plotting.py index d0412849f..b94dbdea2 100644 --- a/InnerEye/ML/plotting.py +++ b/InnerEye/ML/plotting.py @@ -247,6 +247,7 @@ def plot_normalization_result(loaded_images: Sample, """ # Labels are encoded with background and a single foreground class. We need the # slice with largest number of foreground voxels + assert loaded_images.labels is not None ground_truth = loaded_images.labels[class_index, ...] largest_gt_slice = get_largest_z_slice(ground_truth) first_channel = loaded_images.image[channel_index, ...] 
@@ -289,6 +290,7 @@ def plot_contours_for_all_classes(sample: Sample, :param channel_index: The index of the image channel that should be plotted. :return: The paths to all generated PNG files. """ + assert sample.labels is not None check_size_matches(sample.labels[0], segmentation) num_classes = sample.labels.shape[0] if len(foreground_class_names) != num_classes - 1: diff --git a/InnerEye/ML/utils/augmentation.py b/InnerEye/ML/utils/augmentation.py index b89b44666..60dd34506 100644 --- a/InnerEye/ML/utils/augmentation.py +++ b/InnerEye/ML/utils/augmentation.py @@ -28,8 +28,8 @@ class among the available classes then samples a center point among the pixels o voxel belongs to (must sum to 1), uniform distribution assumed if none provided. :return numpy int array (3x1) containing patch center spatial coordinates """ + assert sample.labels is not None num_classes = sample.labels.shape[0] - if class_weights is not None: if len(class_weights) != num_classes: raise Exception("A weight must be provided for each class, found weights:{}, expected:{}" @@ -111,7 +111,8 @@ def random_crop(sample: Sample, """ Randomly crops images, mask, and labels arrays according to the crop_size argument. The selection of the center is dependant on background probability. - By default it does not center on background. + By default it does not center on background + All class labels must be provided. :param sample: A set of Image channels, ground truth labels and mask to randomly crop. :param crop_size: The size of the crop expressed as a list of 3 ints, one per spatial dimension. @@ -122,12 +123,16 @@ def random_crop(sample: Sample, crop. :raises ValueError: If there are shape mismatches among the arguments or if the crop size is larger than the image. """ + assert sample.labels is not None + # Ensures no missing class labels + assert sample.missing_labels.count(True) == 0 slicers, center = slicers_for_random_crop(sample, crop_size, class_weights) sample = Sample( image=sample.image[:, slicers[0], slicers[1], slicers[2]], labels=sample.labels[:, slicers[0], slicers[1], slicers[2]], mask=sample.mask[slicers[0], slicers[1], slicers[2]], - metadata=sample.metadata + metadata=sample.metadata, + missing_labels=sample.missing_labels ) return sample, center diff --git a/InnerEye/ML/utils/dataset_util.py b/InnerEye/ML/utils/dataset_util.py index b2275f4a0..14500bb23 100644 --- a/InnerEye/ML/utils/dataset_util.py +++ b/InnerEye/ML/utils/dataset_util.py @@ -197,11 +197,14 @@ def add_label_stats_to_dataframe(input_dataframe: pd.DataFrame, # Iterate over subjects and check overlapping labels for subject_id in [*dataset_sources.keys()]: - labels = io_util.load_labels_from_dataset_source(dataset_sources[subject_id]) + labels, __ = io_util.load_labels_from_dataset_source(dataset_sources[subject_id]) + assert labels is not None overlap_stats = metrics_util.get_label_overlap_stats(labels=labels[1:, ...], label_names=target_label_names) - - header = io_util.load_nifti_image(dataset_sources[subject_id].ground_truth_channels[0]).header + # Creates "ground_truth_channel" variable and asserts is not None to comply with mypy + ground_truth_channel = dataset_sources[subject_id].ground_truth_channels[0] + assert ground_truth_channel is not None + header = io_util.load_nifti_image(ground_truth_channel).header volume_stats = metrics_util.get_label_volume(labels=labels[1:, ...], label_names=target_label_names, label_spacing=header.spacing) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 4b2c3b859..db64c530a 
100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -412,36 +412,32 @@ def load_image_in_known_formats(file: Path, raise ValueError(f"Unsupported image file type for path {file}") -def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True, - mask_size: Optional[Tuple[int]] = None) -> np.ndarray: +def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True) -> \ + Tuple[Optional[np.ndarray], List[bool]]: """ Load labels containing segmentation binary labels in one-hot-encoding. In the future, this function will be used to load global class and non-imaging information as well. - :type mask_size: Image size, tuple if integers. :param dataset_source: The dataset source for which channels are to be loaded into memory. :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) - :return: A label sample object containing ground-truth information. - """ - - if not dataset_source.allow_incomplete_labels: - labels = np.stack( - [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) - else: - assert mask_size is not None - label_list = [] - for gt in dataset_source.ground_truth_channels: - if str(gt) == '.': - label_list.append(np.full(mask_size, np.NAN, ImageDataType)) - else: - label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) - labels = np.stack(label_list) - - # If ground truth image is nan, then will not be used to check check_exclusive. - not_nan_label_images = [labels[label_id] for label_id in range(labels.shape[0]) - if not np.isnan(np.sum(labels[label_id]))] - - if check_exclusive and (sum(np.array(not_nan_label_images)) > 1.).any(): # type: ignore + :return A label sample object containing ground-truth information if channel is provided + If no ground-truth channels provided, label is None + If some ground-truth channels provided are provided, then they are loaded + Background is loaded if at least one ground-truth channel is provided and is the first element of the tensor + :return A list of booleans indicating if ground-truth channel is missing + """ + + label_list = [] + missing_labels = [True] * len(dataset_source.ground_truth_channels) + for i, gt in enumerate(dataset_source.ground_truth_channels): + if gt is not None: + label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) + missing_labels[i] = False + + if len(label_list) == 0: + return None, missing_labels + labels = np.stack(label_list) + if check_exclusive and (sum(np.array(label_list)) > 1.).any(): # type: ignore raise ValueError(f'The labels for patient {dataset_source.metadata.patient_id} are not mutually exclusive. ' 'Some loss functions (e.g. SoftDice) may produce results on overlapping labels, while others ' '(e.g. FocalLoss) will fail. ' @@ -455,7 +451,7 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ for c in range(len(labels)): background[labels[c] == 1] = 0 background = background[np.newaxis, ...] 
- return np.vstack((background, labels)) + return np.vstack((background, labels)), missing_labels def load_image(path: PathOrString, image_type: Optional[Type] = float) -> ImageWithHeader: @@ -520,12 +516,21 @@ def load_images_from_dataset_source(dataset_source: PatientDatasetSource, check_ # create raw sample to return metadata = copy(dataset_source.metadata) metadata.image_header = images[0].header - labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive, mask_size=mask.shape) + labels, missing_labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive) + + # All class labels are counted, including missing or no missing ground truth class labels + assert len(missing_labels) == len(dataset_source.ground_truth_channels) + # Number of missing labels corresponds to the number of None objects in 'dataset_source.ground_truth_channels' + assert missing_labels.count(True) == dataset_source.ground_truth_channels.count(None) + # Number of not missing labels corresponds the number of provided ground truth (file name) channels. + assert missing_labels.count(False) == len(dataset_source.ground_truth_channels) - \ + dataset_source.ground_truth_channels.count(None) return Sample(image=image, labels=labels, mask=mask, - metadata=metadata) + metadata=metadata, + missing_labels=missing_labels) def store_image_as_short_nifti(image: np.ndarray, diff --git a/Tests/ML/datasets/test_dataset.py b/Tests/ML/datasets/test_dataset.py index 6fc446f2d..7609d9121 100644 --- a/Tests/ML/datasets/test_dataset.py +++ b/Tests/ML/datasets/test_dataset.py @@ -28,6 +28,7 @@ crop_size = [55, 55, 55] + @pytest.fixture def num_dataload_workers() -> int: """PyTorch support for multiple dataloader workers is flaky on Windows (so return 0)""" @@ -142,21 +143,26 @@ def normalize_fn(default_config: SegmentationModelBase) -> PhotometricNormalizat def test_dataset_content(default_config: ModelConfigBase, gt_image: np.ndarray, cropping_dataset: CroppingDataset, full_image_dataset: FullImageDataset) -> None: - # check number of patients - assert len(full_image_dataset) == len(cropping_dataset) == 2 + # Content is compared with the split training set, since it was use as argument + # for 'full_image_dataset' and 'cropping_dataset' + assert len(full_image_dataset) == len(cropping_dataset) == \ + len(set(default_config.get_dataset_splits().train.subject)) assert len(np.unique(gt_image)) == default_config.number_of_classes -def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, random_patient_id: Any) -> None: +def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, random_patient_id: Any, + default_config: ModelConfigBase) -> None: """ Tests that after creating and extracting a sample we obtain the same result :return: """ + missing_labels_list = [False] * default_config.number_of_classes metadata = PatientMetadata(patient_id='42', institution="foo") sample = Sample(image=random_image_crop, mask=random_mask_crop, labels=random_label_crop, - metadata=metadata) + metadata=metadata, + missing_labels=missing_labels_list) patched_sample = CroppedSample(image=random_image_crop, mask=random_mask_crop, @@ -164,6 +170,7 @@ def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop mask_center_crop=random_mask_crop, labels_center_crop=random_label_crop, metadata=metadata, + missing_labels=missing_labels_list, center_indices=np.zeros((1, 3))) extracted_sample = sample.get_dict() @@ -183,8 +190,10 @@ def 
test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop assert extracted_sample["metadata"] == extracted_patched_sample["metadata"] == metadata -def test_cropping_dataset_as_data_loader(cropping_dataset: CroppingDataset, num_dataload_workers: int) -> None: - batch_size = 2 +def test_cropping_dataset_as_data_loader(cropping_dataset: CroppingDataset, num_dataload_workers: int, + default_config: ModelConfigBase) -> None: + # Set batch size number of training classes 'default_config' + batch_size = len(set(default_config.get_dataset_splits().train.subject)) loader = cropping_dataset.as_data_loader(shuffle=True, batch_size=batch_size, num_dataload_workers=num_dataload_workers) for i, item in enumerate(loader): @@ -262,11 +271,17 @@ def test_cropping_dataset_has_reproducible_randomness(cropping_dataset: Cropping def test_csv_dataset_as_data_loader(normalize_fn: Any, - full_image_dataset: FullImageDataset, num_dataload_workers: int) -> None: - batch_size = 2 + full_image_dataset: FullImageDataset, num_dataload_workers: int, + default_config: ModelConfigBase) -> None: + + # Set batch size number of training classes 'default_config' + batch_size = len(set(default_config.get_dataset_splits().train.subject)) # load the original images separately for comparison + # expected number of patients is 3 expected_samples = load_train_and_test_data_channels(patient_ids=list(range(1, batch_size + 1)), normalization_fn=normalize_fn) + # expected number of patients is 3, since we use the training set derived 'default_config' which is derived from + # class 'DummyModel' csv_dataset_loader = full_image_dataset.as_data_loader(batch_size=batch_size, shuffle=True, num_dataload_workers=num_dataload_workers) for i, batch in enumerate(csv_dataset_loader): @@ -290,24 +305,30 @@ def test_full_image_dataset_no_mask(full_image_dataset_no_mask: FullImageDataset @pytest.mark.parametrize("crop_size", [(4, 4, 4), (8, 6, 4)]) def test_create_possibly_padded_sample_for_cropping(crop_size: Any) -> None: + number_gt_classes = 2 + missing_label_list = [False] * number_gt_classes image_size = [4] * 3 image = np.random.uniform(size=[1] + image_size) - labels = np.zeros(shape=[2] + image_size) + labels = np.zeros(shape=[number_gt_classes] + image_size) mask = np.zeros(shape=image_size, dtype=ImageDataType.MASK.value) cropped_sample = CroppingDataset.create_possibly_padded_sample_for_cropping( - sample=Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata), + sample=Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata, + missing_labels=missing_label_list), crop_size=crop_size, padding_mode=PaddingMode.Zero ) assert cropped_sample.image.shape[-3:] == crop_size + assert cropped_sample.labels is not None assert cropped_sample.labels.shape[-3:] == crop_size assert cropped_sample.mask.shape[-3:] == crop_size @pytest.mark.parametrize("use_mask", [False, True]) def test_cropped_sample(use_mask: bool) -> None: + number_of_gt_classes = 2 + missing_label_list = [False] * number_of_gt_classes ml_util.set_random_seed(1) image_size = [4] * 3 crop_size = (2, 2, 2) @@ -315,7 +336,7 @@ def test_cropped_sample(use_mask: bool) -> None: # create small image sample for random cropping image = np.random.uniform(size=[1] + image_size) - labels = np.zeros(shape=[2] + image_size) + labels = np.zeros(shape=[number_of_gt_classes] + image_size) # Two foreground points in the corners at (0, 0, 0) and (3, 3, 3) labels[0] = 1 labels[0, 0, 0, 0] = 0 @@ -340,7 +361,8 @@ def test_cropped_sample(use_mask: 
bool) -> None: image=image, labels=labels, mask=mask, - metadata=DummyPatientMetadata + metadata=DummyPatientMetadata, + missing_labels=missing_label_list ) for _ in range(0, 100): @@ -354,6 +376,7 @@ def test_cropped_sample(use_mask: bool) -> None: if expected_center is not None: assert list(cropped_sample.center_indices) == expected_center # type: ignore assert np.array_equal(cropped_sample.image, sample.image[:, crop_slicer, crop_slicer, crop_slicer]) + assert sample.labels is not None assert np.array_equal(cropped_sample.labels, sample.labels[:, crop_slicer, crop_slicer, crop_slicer]) assert np.array_equal(cropped_sample.mask, sample.mask[crop_slicer, crop_slicer, crop_slicer]) else: @@ -489,6 +512,8 @@ def test_sample_metadata_field() -> None: Test that the string constant we use to identify the metadata field is really matching the field name in SampleWithMetadata """ + number_of_classes = 2 + missing_labels_list = [False] * number_of_classes batch_size = 5 xyz = (6, 7, 8) shape = (batch_size,) + xyz @@ -496,9 +521,11 @@ def test_sample_metadata_field() -> None: s = Sample(metadata=DummyPatientMetadata, image=zero, mask=zero, - labels=torch.zeros((batch_size,) + (2,) + xyz)) + labels=torch.zeros((batch_size,) + (number_of_classes,) + xyz), + missing_labels=missing_labels_list) fields = vars(s) - assert len(fields) == 4 + # Assert fields for: 1) metadata, 2) image, 3) mask, 4) labels, 5) missing_labels + assert len(fields) == 5 assert SAMPLE_METADATA_FIELD in fields # Lightning attempts to determine the batch size by trying to find a tensor field in the sample. # This only works if any field other than Metadata is first. @@ -522,12 +549,15 @@ def test_custom_collate() -> None: assert result[foo].tolist() == [1, 2] -def test_sample_construct_copy(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any) -> None: +def test_sample_construct_copy(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, + default_config: SegmentationModelBase) -> None: + missing_labels_list = [False] * default_config.number_of_classes sample = Sample( image=random_image_crop, mask=random_mask_crop, labels=random_label_crop, - metadata=PatientMetadata(patient_id='1') + metadata=PatientMetadata(patient_id='1'), + missing_labels=missing_labels_list ) sample_clone = sample.clone_with_overrides() diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 14282d1b4..e1003cf7e 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -307,23 +307,17 @@ def test_evaluate_model_predictions() -> None: assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values('region_1').keys() for hue_name in ['region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 4 has all missing ground truth channels: "region", "region_1" if sample.metadata.patient_id == '4': for hue_name in ['region_1', 'region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 5 has no missing ground truth channels if sample.metadata.patient_id == '5': - assert len(metrics_per_class.values('Default').keys()) == 0 - assert len(metrics_per_class.values('Default').keys()) == 0 assert len(metrics_per_class.values('Default').keys()) == 0 for hue_name 
in ['region_1', 'region']: - assert 'Dice' in metrics_per_class.values('region_1').keys() + assert 'Dice' in metrics_per_class.values(hue_name).keys() assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() diff --git a/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz new file mode 100644 index 000000000..004e23b48 --- /dev/null +++ b/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09abc0d1a4c5d18de6da26fec2b813c5e5e545577cf9a2e7e4b27dcf2d052e5 +size 683626 diff --git a/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz new file mode 100644 index 000000000..004e23b48 --- /dev/null +++ b/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d09abc0d1a4c5d18de6da26fec2b813c5e5e545577cf9a2e7e4b27dcf2d052e5 +size 683626 diff --git a/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz new file mode 100644 index 000000000..62d10d47c --- /dev/null +++ b/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0c2c7f0fe10f3df48fb86edce291c340895488645a7cba030d1d3d67a1a0584 +size 913 diff --git a/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz new file mode 100644 index 000000000..be4022fb1 --- /dev/null +++ b/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c077c518c0aca796332ae13b26d7f2eac5de7f93ba44eb497ff777e6c24abf8 +size 83093 diff --git a/Tests/ML/test_metrics.py b/Tests/ML/test_metrics.py index b30a754fb..49bc7eea5 100644 --- a/Tests/ML/test_metrics.py +++ b/Tests/ML/test_metrics.py @@ -33,21 +33,23 @@ def test_calculate_dice1() -> None: g1 = "g1" zero = np.zeros((3, 3, 3)) one = np.ones((3, 3, 3)) + missing_labels = [False] # ground truth is expected in one-hot encoding, but the segmentation is a map with class indices in each voxel - def assert_metrics(segmentation: np.ndarray, ground_truth: np.ndarray, expected_dice: float) -> None: - a = metrics.calculate_metrics_per_class(segmentation, ground_truth, + def assert_metrics(segmentation: np.ndarray, ground_truth: np.ndarray, missing_label: List[bool], + expected_dice: float) -> None: + a = metrics.calculate_metrics_per_class(segmentation, ground_truth, missing_label, voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) assert a.get_hue_names(include_default=False) == [g1] assert equal_respecting_nan(a.get_single_metric(MetricType.DICE, hue=g1), expected_dice) # Case 1: Ground truth says everything is class 1, and segmentation says the same - assert_metrics(one, np.stack([zero, one]), expected_dice=1.0) + assert_metrics(one, np.stack([zero, one]), missing_labels, expected_dice=1.0) # Case 2: Ground truth says everything is class 0, but segmentation says it's class 1 - assert_metrics(one, np.stack([one, zero]), expected_dice=0.0) + assert_metrics(one, np.stack([one, zero]), missing_labels, expected_dice=0.0) # Case 3: Ground truth says everything is class 0, and segmentation says the same: This means that class 1 # is correctly predicted, but empty ground truth and 
empty prediction are indicated by Dice NaN - assert_metrics(zero, np.stack([one, zero]), expected_dice=math.nan) + assert_metrics(zero, np.stack([one, zero]), missing_labels, expected_dice=math.nan) def equal_respecting_nan(v1: float, v2: float) -> bool: @@ -72,7 +74,10 @@ def expand(a: List[float]) -> np.ndarray: ground_truth_values = expand([0, 0, 1]) ground_truth = np.stack([1 - ground_truth_values, ground_truth_values]) prediction = expand(prediction_list) - m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) + # Since there is only label, 'missing_labels' list to have only element + missing_labels_list = [False] + m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels_list, + voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) assert m.get_single_metric(MetricType.DICE, hue=g1) == expected_dice @@ -83,10 +88,15 @@ def test_calculate_hd() -> None: prediction1 = np.ones_like(prediction0) gt_all_zero = np.stack([prediction1, prediction0]) gt_all_one = np.stack([prediction0, prediction1]) + # Since there is only label, 'missing_labels' list to have only element + missing_labels_list = [False] - def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, expected: Optional[float], + def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, + missing_labels: List[bool], + expected: Optional[float], voxel_spacing: TupleFloat3 = (1, 1, 1)) -> float: - m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=voxel_spacing, + m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels, + voxel_spacing=voxel_spacing, ground_truth_ids=[g1]) actual = m.get_single_metric(MetricType.HAUSDORFF_mm, hue=g1) if expected is not None: @@ -94,8 +104,8 @@ def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, expected: O return actual # check an infinity value if either the prediction or gt have no foreground - assert_metrics(prediction0, gt_all_one, math.inf) - assert_metrics(prediction1, gt_all_zero, math.inf) + assert_metrics(prediction0, gt_all_one, missing_labels_list, math.inf) + assert_metrics(prediction1, gt_all_zero, missing_labels_list, math.inf) def generate_random_prediction() -> np.ndarray: result = np.round(np.random.uniform(size=prediction0.shape)) @@ -106,12 +116,16 @@ def generate_random_prediction() -> np.ndarray: random_prediction = generate_random_prediction() matching_gt = np.stack([1 - random_prediction, random_prediction]) - assert_metrics(random_prediction, matching_gt, 0.0) + # Since there is only label, 'missing_labels' list to have only element + missing_labels_list = [False] + assert_metrics(random_prediction, matching_gt, missing_labels_list, 0.0) # check voxel spacing is being used as expected random_prediction2 = generate_random_prediction() non_matching_gt = np.stack([1 - random_prediction2, random_prediction2]) - without_spacing = assert_metrics(random_prediction, non_matching_gt, voxel_spacing=(1, 1, 1), expected=None) - with_spacing = assert_metrics(random_prediction, non_matching_gt, voxel_spacing=(2.0, 2.0, 2.0), expected=None) + without_spacing = assert_metrics(random_prediction, non_matching_gt, missing_labels_list, + voxel_spacing=(1, 1, 1), expected=None) + with_spacing = assert_metrics(random_prediction, non_matching_gt, missing_labels_list, + voxel_spacing=(2.0, 2.0, 2.0), expected=None) assert without_spacing != with_spacing @@ -125,7 +139,10 @@ def test_calculate_hd_exact() -> None: ground_truth = 
np.stack(np.stack([1 - ground_truth, ground_truth])) g1 = "g1" - m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=(1, 2, 3), ground_truth_ids=[g1]) + # Since there is only label, 'missing_labels' list to have only element + missing_labels_list = [False] + m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels_list, + voxel_spacing=(1, 2, 3), ground_truth_ids=[g1]) assert m.get_single_metric(MetricType.HAUSDORFF_mm, hue=g1) == 6 assert m.get_single_metric(MetricType.MEAN_SURFACE_DIST_mm, hue=g1) == 6 diff --git a/Tests/ML/test_plotting.py b/Tests/ML/test_plotting.py index 5070f51e0..4d618270e 100644 --- a/Tests/ML/test_plotting.py +++ b/Tests/ML/test_plotting.py @@ -147,17 +147,21 @@ def test_plot_normalization_result(test_output_dirs: OutputFolderForTests) -> No Tests plotting of before/after histograms in photometric normalization. :return: """ + number_of_gt_classes = 2 size = (3, 3, 3) image = np.zeros((1,) + size) for i, (z, y, x) in enumerate(itertools.product(range(size[0]), range(size[1]), range(size[2]))): image[0, z, y, x] = i - labels = np.zeros((2,) + size) + labels = np.zeros((number_of_gt_classes,) + size) + # Initializes 'missing_labels_list' to 'False' for the given number of gt classes + missing_labels_list = [False] * number_of_gt_classes labels[1, 1, 1, 1] = 1 sample = Sample( image=image, labels=labels, mask=np.ones(size), - metadata=DummyPatientMetadata + metadata=DummyPatientMetadata, + missing_labels=missing_labels_list ) config = SegmentationModelBase(norm_method=PhotometricNormalizationMethod.CtWindow, window=4, level=13, should_validate=False) @@ -169,13 +173,16 @@ def test_plot_normalization_result(test_output_dirs: OutputFolderForTests) -> No def test_plot_contours_for_all_classes(test_output_dirs: OutputFolderForTests) -> None: + number_of_gt_classes = 3 size = (3, 3, 3) image = np.zeros((1,) + size) for i, (z, y, x) in enumerate(itertools.product(range(size[0]), range(size[1]), range(size[2]))): image[0, z, y, x] = i # Create a fake label array: For each class, there is exactly 1 pixel foreground, at the z slice that is # equal to the class index - labels = np.zeros((3,) + size) + labels = np.zeros((number_of_gt_classes,) + size) + # Initializes 'missing_labels_list' to 'False' for the given number of gt classes + missing_labels_list = [False] * number_of_gt_classes labels[0, 0, 1, 1] = 1 labels[1, 1, 1, 1] = 1 labels[2, 2, 1, 1] = 1 @@ -190,7 +197,8 @@ def test_plot_contours_for_all_classes(test_output_dirs: OutputFolderForTests) - image=image, labels=labels, mask=np.ones(size), - metadata=DummyPatientMetadata + metadata=DummyPatientMetadata, + missing_labels=missing_labels_list ) plots = plotting.plot_contours_for_all_classes(sample, segmentation, diff --git a/Tests/ML/util.py b/Tests/ML/util.py index fb49c116d..1fbbb8dea 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -73,8 +73,7 @@ def load_train_and_test_data_channels(patient_ids: List[int], metadata=PatientMetadata(patient_id=z), image_channels=[file_name(z, c) for c in TEST_CHANNEL_IDS], mask_channel=file_name(z, TEST_MASK_ID), - ground_truth_channels=[file_name(z, TEST_GT_ID)], - allow_incomplete_labels=False + ground_truth_channels=[file_name(z, TEST_GT_ID)] )) samples = [] @@ -83,7 +82,8 @@ def load_train_and_test_data_channels(patient_ids: List[int], sample = Sample(image=normalization_fn.transform(sample.image, sample.mask), mask=sample.mask, labels=sample.labels, - metadata=sample.metadata) + metadata=sample.metadata, + 
missing_labels=sample.missing_labels) samples.append(sample) return samples diff --git a/Tests/ML/utils/test_augmentation.py b/Tests/ML/utils/test_augmentation.py index 43bbcf4f9..d087437d6 100644 --- a/Tests/ML/utils/test_augmentation.py +++ b/Tests/ML/utils/test_augmentation.py @@ -25,6 +25,9 @@ valid_labels = np.zeros((number_of_classes,) + image_size) for c in range(number_of_classes): valid_labels[c, class_assignments == c] = 1 +# Since we have 5 classes and all ground truth class labels are provided, initialize 'missing_labels_list' to +# to 'False' with length 5 +missing_labels_list = [False] * number_of_classes valid_crop_size = (2, 2, 2) valid_full_crop_size = image_size valid_class_weights = [0.5] + [0.5 / (number_of_classes - 1)] * (number_of_classes - 1) @@ -37,7 +40,8 @@ def test_valid_full_crop() -> None: sample, _ = augmentation.random_crop(sample=Sample(image=valid_image_4d, labels=valid_labels, mask=valid_mask, - metadata=metadata), + metadata=metadata, + missing_labels=missing_labels_list), crop_size=valid_full_crop_size, class_weights=valid_class_weights) @@ -60,36 +64,37 @@ def test_invalid_arrays(image: Any, labels: Any, mask: Any, class_weights: Any) if not (np.array_equal(image, valid_image_4d) and np.array_equal(labels, valid_labels) and np.array_equal(mask, valid_mask) and class_weights == valid_class_weights): with pytest.raises(Exception): - augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=image, labels=labels, mask=mask), - valid_crop_size, class_weights) + augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=image, labels=labels, mask=mask, + missing_labels=missing_labels_list), valid_crop_size, class_weights) @pytest.mark.parametrize("crop_size", [None, ["a"], 5]) def test_invalid_crop_arg(crop_size: Any) -> None: with pytest.raises(Exception): augmentation.random_crop( - Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask), - crop_size, valid_class_weights) + Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask, + missing_labels=missing_labels_list), crop_size, valid_class_weights) @pytest.mark.parametrize("crop_size", [[2, 2], [2, 2, 2, 2], [10, 10, 10]]) def test_invalid_crop_size(crop_size: Any) -> None: with pytest.raises(Exception): augmentation.random_crop( - Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask), - crop_size, valid_class_weights) + Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask, + missing_labels=missing_labels_list), crop_size, valid_class_weights) def test_random_crop_no_fg() -> None: with pytest.raises(Exception): augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, - mask=np.zeros_like(valid_mask)), - valid_crop_size, valid_class_weights) + mask=np.zeros_like(valid_mask), missing_labels=missing_labels_list), + valid_crop_size, valid_class_weights) with pytest.raises(Exception): augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=valid_image_4d, - labels=np.zeros_like(valid_labels), mask=valid_mask), - valid_crop_size, valid_class_weights) + labels=np.zeros_like(valid_labels), mask=valid_mask, + missing_labels=missing_labels_list), + valid_crop_size, valid_class_weights) @pytest.mark.parametrize("crop_size", [valid_crop_size]) @@ -103,13 +108,15 @@ def test_random_crop(crop_size: Any) -> None: image=valid_image_4d, labels=valid_labels, 
mask=valid_mask, - metadata=DummyPatientMetadata - ), crop_size, valid_class_weights) + metadata=DummyPatientMetadata, + missing_labels=missing_labels_list), + crop_size, valid_class_weights) expected_img_crop_size = (valid_image_4d.shape[0], *crop_size) expected_labels_crop_size = (valid_labels.shape[0], *crop_size) assert sample.image.shape == expected_img_crop_size + assert sample.labels is not None assert sample.labels.shape == expected_labels_crop_size assert sample.mask.shape == tuple(crop_size) @@ -133,7 +140,7 @@ def test_valid_class_weights(class_weights: List[float]) -> None: labels[class2][3, 2, 3] = 1 mask = np.ones_like(valid_mask) - sample = Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata) + sample = Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata, missing_labels=missing_labels_list) crop_size = (1, 1, 1) total_crops = 200 diff --git a/Tests/ML/utils/test_io_util.py b/Tests/ML/utils/test_io_util.py index 4a57972b4..2982ff4e8 100644 --- a/Tests/ML/utils/test_io_util.py +++ b/Tests/ML/utils/test_io_util.py @@ -112,8 +112,7 @@ def _test_load_images_from_channels( metadata=metadata, image_channels=[image_channel] * 2, ground_truth_channels=[ground_truth_channel] * 4, - mask_channel=mask_channel, - allow_incomplete_labels=False + mask_channel=mask_channel ), check_exclusive=check_exclusive ) @@ -124,6 +123,7 @@ def _test_load_images_from_channels( if mask_channel: assert np.array_equal(sample.mask, image_with_header.image) if ground_truth_channel: + assert sample.labels is not None assert list(sample.labels.shape) == [5] + list(image_with_header.image.shape) assert np.all(sample.labels[0] == 0) and np.all(sample.labels[1:] == 1) diff --git a/Tests/ML/visualizers/test_visualize_patches.py b/Tests/ML/visualizers/test_visualize_patches.py index ef27fdcdf..556b93109 100644 --- a/Tests/ML/visualizers/test_visualize_patches.py +++ b/Tests/ML/visualizers/test_visualize_patches.py @@ -37,6 +37,8 @@ def test_visualize_patch_sampling(test_output_dirs: OutputFolderForTests, shape = (10, 30, 30) foreground_classes = ["fg"] class_weights = equally_weighted_classes(foreground_classes) + # Initializes 'missing_labels_list' to 'False' + missing_labels_list = [False] * len(foreground_classes) config = SegmentationModelBase(should_validate=False, crop_size=(2, 10, 10), class_weights=class_weights) @@ -56,7 +58,8 @@ def test_visualize_patch_sampling(test_output_dirs: OutputFolderForTests, mask=mask, labels=labels, metadata=PatientMetadata(patient_id='123', - image_header=image_header)) + image_header=image_header), + missing_labels=missing_labels_list) expected_folder = full_ml_test_data_path("patch_sampling") heatmap = visualize_random_crops(sample, config, output_folder=output_folder) expected_heatmap = expected_folder / ("sampled_to_boundary.npy" if labels_to_boundary else "sampled_center.npy") @@ -101,6 +104,8 @@ def test_visualize_patch_sampling_2d(test_output_dirs: OutputFolderForTests) -> set_random_seed(0) shape = (1, 20, 30) foreground_classes = ["fg"] + # Initializes 'missing_labels_list' to 'False' + missing_labels_list = [False] * len(foreground_classes) class_weights = equally_weighted_classes(foreground_classes) config = SegmentationModelBase(should_validate=False, crop_size=(1, 5, 10), @@ -116,7 +121,8 @@ def test_visualize_patch_sampling_2d(test_output_dirs: OutputFolderForTests) -> mask=mask, labels=labels, metadata=PatientMetadata(patient_id='123', - image_header=image_header)) + image_header=image_header), + 
missing_labels=missing_labels_list) heatmap = visualize_random_crops(sample, config, output_folder=output_folder) expected_folder = full_ml_test_data_path("patch_sampling") expected_heatmap = expected_folder / "sampling_2d.npy" From 73955f67d97050f04348ddabbfc19f2f83da17a9 Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang <82240512+asantamariapang@users.noreply.github.com> Date: Tue, 1 Jun 2021 10:00:13 -0700 Subject: [PATCH 08/45] Revert "Improved inference support for missing labels." This reverts commit e9723e571d6d9d72be7b42617d3e6d835857a817. --- InnerEye/ML/dataset/cropping_dataset.py | 2 - InnerEye/ML/dataset/full_image_dataset.py | 11 ++-- InnerEye/ML/dataset/sample.py | 17 ++--- InnerEye/ML/lightning_models.py | 1 - InnerEye/ML/metrics.py | 57 +++++++---------- InnerEye/ML/model_testing.py | 19 ++---- InnerEye/ML/plotting.py | 2 - InnerEye/ML/utils/augmentation.py | 11 +--- InnerEye/ML/utils/dataset_util.py | 9 +-- InnerEye/ML/utils/io_util.py | 59 ++++++++---------- Tests/ML/datasets/test_dataset.py | 62 +++++-------------- Tests/ML/pipelines/test_inference.py | 8 ++- .../train_and_test_data/id3_channel1.nii.gz | 3 - .../train_and_test_data/id3_channel2.nii.gz | 3 - .../train_and_test_data/id3_mask.nii.gz | 3 - .../train_and_test_data/id3_region.nii.gz | 3 - Tests/ML/test_metrics.py | 45 +++++--------- Tests/ML/test_plotting.py | 16 ++--- Tests/ML/util.py | 6 +- Tests/ML/utils/test_augmentation.py | 35 +++++------ Tests/ML/utils/test_io_util.py | 4 +- .../ML/visualizers/test_visualize_patches.py | 10 +-- 22 files changed, 139 insertions(+), 247 deletions(-) delete mode 100644 Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz delete mode 100644 Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz delete mode 100644 Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz delete mode 100644 Tests/ML/test_data/train_and_test_data/id3_region.nii.gz diff --git a/InnerEye/ML/dataset/cropping_dataset.py b/InnerEye/ML/dataset/cropping_dataset.py index 5e2265b7d..aa2f27c40 100644 --- a/InnerEye/ML/dataset/cropping_dataset.py +++ b/InnerEye/ML/dataset/cropping_dataset.py @@ -110,7 +110,6 @@ def create_random_cropped_sample(sample: Sample, mask_center_crop = image_util.get_center_crop(image=sample.mask, crop_shape=center_size) labels_center_crop = np.zeros(shape=[len(sample.labels)] + list(center_size), # type: ignore dtype=ImageDataType.SEGMENTATION.value) - assert sample.labels is not None for c in range(len(sample.labels)): # type: ignore labels_center_crop[c] = image_util.get_center_crop( image=sample.labels[c], @@ -121,7 +120,6 @@ def create_random_cropped_sample(sample: Sample, image=sample.image, mask=sample.mask, labels=sample.labels, - missing_labels=sample.missing_labels, mask_center_crop=mask_center_crop, labels_center_crop=labels_center_crop, center_indices=center_point, diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 2efd0a9e3..6679e9dd2 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -270,7 +270,7 @@ def convert_channels_to_file_paths(channels: List[str], rows: pd.DataFrame, local_dataset_root_folder: Path, patient_id: str, - allow_incomplete_labels: bool = False) -> Tuple[List[Optional[Path]], str]: + allow_incomplete_labels: bool = False) -> Tuple[List[Path], str]: """ Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. 
@@ -281,7 +281,7 @@ def convert_channels_to_file_paths(channels: List[str], :param local_dataset_root_folder: Root directory which points to the local dataset :param patient_id: string which contains subject identifier """ - paths: List[Optional[Path]] = [] + paths: List[Path] = [] failed_channel_info: str = '' for channel_id in channels: @@ -290,7 +290,7 @@ def convert_channels_to_file_paths(channels: List[str], failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep elif len(row) == 0 and allow_incomplete_labels: # Keeps track of missing channels order - paths.append(None) + paths.append(Path('')) elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ os.linesep @@ -344,7 +344,7 @@ def get_mask_channel_or_default() -> Optional[Path]: else: return paths[0] - def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: + def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: if len(set(channels)) < len(channels): raise ValueError(f"ids have duplicated entries: {channels}") rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] @@ -364,6 +364,7 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: metadata=metadata, image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore mask_channel=get_mask_channel_or_default(), - ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels)) # type: ignore + ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore + allow_incomplete_labels=allow_incomplete_labels) return dataset_sources diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index 26717c97b..d01246f55 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -127,12 +127,12 @@ def get_dict(self) -> Dict[str, Any]: class PatientDatasetSource(SampleBase): """ Dataset source locations for channels associated with a given patient in a particular dataset. - Please note that "ground_truth_channels" is optional. 
""" image_channels: List[PathOrString] - ground_truth_channels: List[Optional[PathOrString]] + ground_truth_channels: List[PathOrString] mask_channel: Optional[PathOrString] metadata: PatientMetadata + allow_incomplete_labels: bool def __post_init__(self) -> None: # make sure all properties are populated @@ -140,6 +140,8 @@ def __post_init__(self) -> None: if not self.image_channels: raise ValueError("image_channels cannot be empty") + if not self.ground_truth_channels and not self.allow_incomplete_labels: + raise ValueError("ground_truth_channels cannot be empty") @dataclass(frozen=True) @@ -152,10 +154,9 @@ class Sample(SampleBase): image: Union[np.ndarray, torch.Tensor] # (Batches if from data loader) x Z x Y x X mask: Union[np.ndarray, torch.Tensor] - # (Batches if from data loader) x Classes x Z X Y x X, where the first class is background - labels: Optional[Union[np.ndarray, torch.Tensor]] + # (Batches if from data loader) x Classes x Z X Y x X + labels: Union[np.ndarray, torch.Tensor] metadata: PatientMetadata - missing_labels: List[bool] def __post_init__(self) -> None: # make sure all properties are populated @@ -163,9 +164,9 @@ def __post_init__(self) -> None: ml_util.check_size_matches(arg1=self.image, arg2=self.mask, matching_dimensions=self._get_matching_dimensions()) - if self.labels is not None: - ml_util.check_size_matches(arg1=self.image, arg2=self.labels, - matching_dimensions=self._get_matching_dimensions()) + + ml_util.check_size_matches(arg1=self.image, arg2=self.labels, + matching_dimensions=self._get_matching_dimensions()) @property def patient_id(self) -> int: diff --git a/InnerEye/ML/lightning_models.py b/InnerEye/ML/lightning_models.py index 9f6b79c57..490e693ee 100644 --- a/InnerEye/ML/lightning_models.py +++ b/InnerEye/ML/lightning_models.py @@ -116,7 +116,6 @@ def compute_metrics(self, cropped_sample: CroppedSample, segmentation: torch.Ten ground_truth=cropped_sample.labels_center_crop, allow_multiple_classes_for_each_pixel=True)[:, 1:] # Number of foreground voxels per class, across all crops - assert cropped_sample.labels is not None foreground_voxels = metrics_util.get_number_of_voxels_per_class(cropped_sample.labels)[:, 1:] # Store Dice and voxel count per sample in the minibatch. We need a custom aggregation logic for Dice # because it can be NaN. Also use custom logging for voxel count because Lightning's batch-size weighted diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index c4af0fc9a..f220aa338 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -223,63 +223,55 @@ def _add_zero_distances(num_segmented_surface_pixels: int, seg2ref_distance_map_ def calculate_metrics_per_class(segmentation: np.ndarray, ground_truth: np.ndarray, - missing_labels: List[bool], ground_truth_ids: List[str], voxel_spacing: TupleFloat3, patient_id: Optional[int] = None) -> MetricsDict: """ - Calculate the dice for provided foreground structures (the background class is completely ignored). - Returns a MetricsDict with metrics values for provided foreground class + Calculate the dice for all foreground structures (the background class is completely ignored). + Returns a MetricsDict with metrics for each of the foreground structures. Metrics are NaN if both ground truth and prediction are all zero for a class. + :param ground_truth_ids: The names of all foreground classes. :param segmentation: predictions multi-value array with dimensions: [Z x Y x X] - :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X]. 
Note that the value of the 'C' - dimension is function on the provided ground truth channels. The minimal value for - C is 2: one background channel and one ground truth channel provided - :param missing_labels: list of booleans, if boolean variable is True, indicates that given channel was not provided - and length of list is number of all foreground classes - :param ground_truth_ids: The names of all foreground classes + :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X] :param voxel_spacing: voxel_spacing in 3D Z x Y x X :param patient_id: for logging """ - # For 'ground_truth', the expected C dimension is (Background Channel) + (Provided Ground Truth Channels) - # We can resolve the number of provided channels by subtracting the number of ground truth channels that were - # not provided from the number of classes - assert ground_truth is not None - assert ground_truth.shape[0] >= 2 - num_classes_including_background = len(ground_truth_ids) + 1 - if len(ground_truth_ids) - missing_labels.count(True) != (ground_truth.shape[0] - 1): - raise ValueError(f"Received {len(ground_truth_ids) - missing_labels.count(True)} foreground class names, but " - f"the label tensor indicates that there are {num_classes_including_background - 1} classes.") - binaries = binaries_from_multi_label_array(segmentation, num_classes_including_background) - - # Note that: i) binary_classes >= 2 since we count background class and at least one ground truth image class, - # ii) binary_classes <= num_classes_including_background-1 + number_of_classes = ground_truth.shape[0] + if len(ground_truth_ids) != (number_of_classes - 1): + raise ValueError(f"Received {len(ground_truth_ids)} foreground class names, but " + f"the label tensor indicates that there are {number_of_classes - 1} classes.") + binaries = binaries_from_multi_label_array(segmentation, number_of_classes) + binary_classes = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] + # If ground truth image is nan, then will not be used for metrics computation. + nan_images = [np.isnan(np.sum(ground_truth[label_id])) for label_id in range(ground_truth.shape[0])] + + # Validates if not binary then nan + assert np.all(np.array(binary_classes) == ~np.array(nan_images)) + # Validates that all binary images should be 0 or 1 - if not np.all(binary_classes): + if not np.all(np.array(binary_classes)[~np.array(nan_images)]): raise ValueError("Ground truth values should be 0 or 1") overlap_measures_filter = sitk.LabelOverlapMeasuresImageFilter() hausdorff_distance_filter = sitk.HausdorffDistanceImageFilter() metrics = MetricsDict(hues=ground_truth_ids) - - ground_truth_index_counter = 1 for i, prediction in enumerate(binaries): # Skips if background image or nan_image - if i == 0: - continue - # Skips if ground truth channel was not provided - if missing_labels[i-1]: + if i == 0 or nan_images[i]: continue + check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth") + if not is_binary_array(prediction): + raise ValueError("Predictions values should be 0 or 1") + # simpleitk returns a Dice score of 0 if both ground truth and prediction are all zeros. # We want to be able to fish out those cases, and treat them specially later. 
prediction_zero = np.all(prediction == 0) - gt_zero = np.all(ground_truth[ground_truth_index_counter] == 0) + gt_zero = np.all(ground_truth[i] == 0) dice = mean_surface_distance = hausdorff_distance = math.nan if not (prediction_zero and gt_zero): prediction_image = sitk.GetImageFromArray(prediction.astype(np.uint8)) prediction_image.SetSpacing(sitk.VectorDouble(reverse_tuple_float3(voxel_spacing))) - # Use 'ground_truth_index_counter' to index the 'C' dimension - ground_truth_image = sitk.GetImageFromArray(ground_truth[ground_truth_index_counter].astype(np.uint8)) + ground_truth_image = sitk.GetImageFromArray(ground_truth[i].astype(np.uint8)) ground_truth_image.SetSpacing(sitk.VectorDouble(reverse_tuple_float3(voxel_spacing))) overlap_measures_filter.Execute(prediction_image, ground_truth_image) dice = overlap_measures_filter.GetDiceCoefficient() @@ -304,7 +296,6 @@ def add_metric(metric_type: MetricType, value: float) -> None: add_metric(MetricType.DICE, dice) add_metric(MetricType.HAUSDORFF_mm, hausdorff_distance) add_metric(MetricType.MEAN_SURFACE_DIST_mm, mean_surface_distance) - ground_truth_index_counter += 1 return metrics diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 6c1079d3b..d91253c17 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -229,32 +229,23 @@ def evaluate_model_predictions(process_id: int, :param results_folder: Path to results folder :returns [PatientMetadata, list[list]]: Patient metadata and list of computed metrics for each image. """ - sample = dataset.get_samples_at_index(index=process_id)[0] - assert sample.missing_labels is not None - if sample.labels is None: - logging.info(f"Ground truth label were not provided for patient {sample.patient_id}, skipping evaluation from " - f"predictions") - return sample.metadata, MetricsDict(hues=config.ground_truth_ids) - logging.info(f"Evaluating predictions for patient {sample.patient_id}") patient_results_folder = get_patient_results_folder(results_folder, sample.patient_id) segmentation = load_nifti_image(patient_results_folder / DEFAULT_RESULT_IMAGE_NAME).image metrics_per_class = metrics.calculate_metrics_per_class(segmentation, sample.labels, - sample.missing_labels, ground_truth_ids=config.ground_truth_ids, voxel_spacing=sample.image_spacing, patient_id=sample.patient_id) thumbnails_folder = results_folder / THUMBNAILS_FOLDER thumbnails_folder.mkdir(exist_ok=True) - if sample.missing_labels.count(True) == 0: - plotting.plot_contours_for_all_classes(sample, - segmentation=segmentation, - foreground_class_names=config.ground_truth_ids, - result_folder=thumbnails_folder, - image_range=config.output_range) + plotting.plot_contours_for_all_classes(sample, + segmentation=segmentation, + foreground_class_names=config.ground_truth_ids, + result_folder=thumbnails_folder, + image_range=config.output_range) return sample.metadata, metrics_per_class diff --git a/InnerEye/ML/plotting.py b/InnerEye/ML/plotting.py index b94dbdea2..d0412849f 100644 --- a/InnerEye/ML/plotting.py +++ b/InnerEye/ML/plotting.py @@ -247,7 +247,6 @@ def plot_normalization_result(loaded_images: Sample, """ # Labels are encoded with background and a single foreground class. We need the # slice with largest number of foreground voxels - assert loaded_images.labels is not None ground_truth = loaded_images.labels[class_index, ...] largest_gt_slice = get_largest_z_slice(ground_truth) first_channel = loaded_images.image[channel_index, ...] 
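The hunk above encodes a missing ground-truth structure as an all-NaN channel: `nan_images` flags those channels, and they are skipped before any SimpleITK filter is executed on them. A minimal, NumPy-only sketch of that skip logic; `naive_dice_per_class` is an illustrative helper, not part of the repository, and the real code uses `LabelOverlapMeasuresImageFilter`, `HausdorffDistanceImageFilter` and `MetricsDict` instead:

```python
import math
from typing import Dict, List

import numpy as np


def naive_dice_per_class(segmentation: np.ndarray,
                         ground_truth: np.ndarray,
                         class_names: List[str]) -> Dict[str, float]:
    """Toy per-structure Dice that mirrors the NaN-channel convention of the patch.

    segmentation: [Z x Y x X] array of class indices, 0 being background.
    ground_truth: [C x Z x Y x X] one-hot array; a channel filled with NaN marks
                  a structure whose ground-truth file was not provided.
    """
    results: Dict[str, float] = {}
    for class_index, name in enumerate(class_names, start=1):
        gt = ground_truth[class_index]
        # The loader fills the whole channel with NaN, so testing one voxel is enough.
        if np.isnan(gt[0, 0, 0]):
            results[name] = math.nan
            continue
        prediction = segmentation == class_index
        intersection = np.logical_and(prediction, gt == 1).sum()
        denominator = prediction.sum() + (gt == 1).sum()
        # Both empty: Dice is undefined, keep NaN as the patch does.
        results[name] = math.nan if denominator == 0 else 2.0 * intersection / denominator
    return results
```

The skip happens before any geometry filter runs, so a missing structure can never raise inside SimpleITK.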
@@ -290,7 +289,6 @@ def plot_contours_for_all_classes(sample: Sample, :param channel_index: The index of the image channel that should be plotted. :return: The paths to all generated PNG files. """ - assert sample.labels is not None check_size_matches(sample.labels[0], segmentation) num_classes = sample.labels.shape[0] if len(foreground_class_names) != num_classes - 1: diff --git a/InnerEye/ML/utils/augmentation.py b/InnerEye/ML/utils/augmentation.py index 60dd34506..b89b44666 100644 --- a/InnerEye/ML/utils/augmentation.py +++ b/InnerEye/ML/utils/augmentation.py @@ -28,8 +28,8 @@ class among the available classes then samples a center point among the pixels o voxel belongs to (must sum to 1), uniform distribution assumed if none provided. :return numpy int array (3x1) containing patch center spatial coordinates """ - assert sample.labels is not None num_classes = sample.labels.shape[0] + if class_weights is not None: if len(class_weights) != num_classes: raise Exception("A weight must be provided for each class, found weights:{}, expected:{}" @@ -111,8 +111,7 @@ def random_crop(sample: Sample, """ Randomly crops images, mask, and labels arrays according to the crop_size argument. The selection of the center is dependant on background probability. - By default it does not center on background - All class labels must be provided. + By default it does not center on background. :param sample: A set of Image channels, ground truth labels and mask to randomly crop. :param crop_size: The size of the crop expressed as a list of 3 ints, one per spatial dimension. @@ -123,16 +122,12 @@ def random_crop(sample: Sample, crop. :raises ValueError: If there are shape mismatches among the arguments or if the crop size is larger than the image. """ - assert sample.labels is not None - # Ensures no missing class labels - assert sample.missing_labels.count(True) == 0 slicers, center = slicers_for_random_crop(sample, crop_size, class_weights) sample = Sample( image=sample.image[:, slicers[0], slicers[1], slicers[2]], labels=sample.labels[:, slicers[0], slicers[1], slicers[2]], mask=sample.mask[slicers[0], slicers[1], slicers[2]], - metadata=sample.metadata, - missing_labels=sample.missing_labels + metadata=sample.metadata ) return sample, center diff --git a/InnerEye/ML/utils/dataset_util.py b/InnerEye/ML/utils/dataset_util.py index 14500bb23..b2275f4a0 100644 --- a/InnerEye/ML/utils/dataset_util.py +++ b/InnerEye/ML/utils/dataset_util.py @@ -197,14 +197,11 @@ def add_label_stats_to_dataframe(input_dataframe: pd.DataFrame, # Iterate over subjects and check overlapping labels for subject_id in [*dataset_sources.keys()]: - labels, __ = io_util.load_labels_from_dataset_source(dataset_sources[subject_id]) - assert labels is not None + labels = io_util.load_labels_from_dataset_source(dataset_sources[subject_id]) overlap_stats = metrics_util.get_label_overlap_stats(labels=labels[1:, ...], label_names=target_label_names) - # Creates "ground_truth_channel" variable and asserts is not None to comply with mypy - ground_truth_channel = dataset_sources[subject_id].ground_truth_channels[0] - assert ground_truth_channel is not None - header = io_util.load_nifti_image(ground_truth_channel).header + + header = io_util.load_nifti_image(dataset_sources[subject_id].ground_truth_channels[0]).header volume_stats = metrics_util.get_label_volume(labels=labels[1:, ...], label_names=target_label_names, label_spacing=header.spacing) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index db64c530a..4b2c3b859 
100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -412,32 +412,36 @@ def load_image_in_known_formats(file: Path, raise ValueError(f"Unsupported image file type for path {file}") -def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True) -> \ - Tuple[Optional[np.ndarray], List[bool]]: +def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True, + mask_size: Optional[Tuple[int]] = None) -> np.ndarray: """ Load labels containing segmentation binary labels in one-hot-encoding. In the future, this function will be used to load global class and non-imaging information as well. + :type mask_size: Image size, tuple if integers. :param dataset_source: The dataset source for which channels are to be loaded into memory. :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) - :return A label sample object containing ground-truth information if channel is provided - If no ground-truth channels provided, label is None - If some ground-truth channels provided are provided, then they are loaded - Background is loaded if at least one ground-truth channel is provided and is the first element of the tensor - :return A list of booleans indicating if ground-truth channel is missing - """ - - label_list = [] - missing_labels = [True] * len(dataset_source.ground_truth_channels) - for i, gt in enumerate(dataset_source.ground_truth_channels): - if gt is not None: - label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) - missing_labels[i] = False - - if len(label_list) == 0: - return None, missing_labels - labels = np.stack(label_list) - if check_exclusive and (sum(np.array(label_list)) > 1.).any(): # type: ignore + :return: A label sample object containing ground-truth information. + """ + + if not dataset_source.allow_incomplete_labels: + labels = np.stack( + [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) + else: + assert mask_size is not None + label_list = [] + for gt in dataset_source.ground_truth_channels: + if str(gt) == '.': + label_list.append(np.full(mask_size, np.NAN, ImageDataType)) + else: + label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) + labels = np.stack(label_list) + + # If ground truth image is nan, then will not be used to check check_exclusive. + not_nan_label_images = [labels[label_id] for label_id in range(labels.shape[0]) + if not np.isnan(np.sum(labels[label_id]))] + + if check_exclusive and (sum(np.array(not_nan_label_images)) > 1.).any(): # type: ignore raise ValueError(f'The labels for patient {dataset_source.metadata.patient_id} are not mutually exclusive. ' 'Some loss functions (e.g. SoftDice) may produce results on overlapping labels, while others ' '(e.g. FocalLoss) will fail. ' @@ -451,7 +455,7 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ for c in range(len(labels)): background[labels[c] == 1] = 0 background = background[np.newaxis, ...] 
- return np.vstack((background, labels)), missing_labels + return np.vstack((background, labels)) def load_image(path: PathOrString, image_type: Optional[Type] = float) -> ImageWithHeader: @@ -516,21 +520,12 @@ def load_images_from_dataset_source(dataset_source: PatientDatasetSource, check_ # create raw sample to return metadata = copy(dataset_source.metadata) metadata.image_header = images[0].header - labels, missing_labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive) - - # All class labels are counted, including missing or no missing ground truth class labels - assert len(missing_labels) == len(dataset_source.ground_truth_channels) - # Number of missing labels corresponds to the number of None objects in 'dataset_source.ground_truth_channels' - assert missing_labels.count(True) == dataset_source.ground_truth_channels.count(None) - # Number of not missing labels corresponds the number of provided ground truth (file name) channels. - assert missing_labels.count(False) == len(dataset_source.ground_truth_channels) - \ - dataset_source.ground_truth_channels.count(None) + labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive, mask_size=mask.shape) return Sample(image=image, labels=labels, mask=mask, - metadata=metadata, - missing_labels=missing_labels) + metadata=metadata) def store_image_as_short_nifti(image: np.ndarray, diff --git a/Tests/ML/datasets/test_dataset.py b/Tests/ML/datasets/test_dataset.py index 7609d9121..6fc446f2d 100644 --- a/Tests/ML/datasets/test_dataset.py +++ b/Tests/ML/datasets/test_dataset.py @@ -28,7 +28,6 @@ crop_size = [55, 55, 55] - @pytest.fixture def num_dataload_workers() -> int: """PyTorch support for multiple dataloader workers is flaky on Windows (so return 0)""" @@ -143,26 +142,21 @@ def normalize_fn(default_config: SegmentationModelBase) -> PhotometricNormalizat def test_dataset_content(default_config: ModelConfigBase, gt_image: np.ndarray, cropping_dataset: CroppingDataset, full_image_dataset: FullImageDataset) -> None: - # Content is compared with the split training set, since it was use as argument - # for 'full_image_dataset' and 'cropping_dataset' - assert len(full_image_dataset) == len(cropping_dataset) == \ - len(set(default_config.get_dataset_splits().train.subject)) + # check number of patients + assert len(full_image_dataset) == len(cropping_dataset) == 2 assert len(np.unique(gt_image)) == default_config.number_of_classes -def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, random_patient_id: Any, - default_config: ModelConfigBase) -> None: +def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, random_patient_id: Any) -> None: """ Tests that after creating and extracting a sample we obtain the same result :return: """ - missing_labels_list = [False] * default_config.number_of_classes metadata = PatientMetadata(patient_id='42', institution="foo") sample = Sample(image=random_image_crop, mask=random_mask_crop, labels=random_label_crop, - metadata=metadata, - missing_labels=missing_labels_list) + metadata=metadata) patched_sample = CroppedSample(image=random_image_crop, mask=random_mask_crop, @@ -170,7 +164,6 @@ def test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop mask_center_crop=random_mask_crop, labels_center_crop=random_label_crop, metadata=metadata, - missing_labels=missing_labels_list, center_indices=np.zeros((1, 3))) extracted_sample = sample.get_dict() @@ -190,10 +183,8 @@ def 
test_sample(random_image_crop: Any, random_mask_crop: Any, random_label_crop assert extracted_sample["metadata"] == extracted_patched_sample["metadata"] == metadata -def test_cropping_dataset_as_data_loader(cropping_dataset: CroppingDataset, num_dataload_workers: int, - default_config: ModelConfigBase) -> None: - # Set batch size number of training classes 'default_config' - batch_size = len(set(default_config.get_dataset_splits().train.subject)) +def test_cropping_dataset_as_data_loader(cropping_dataset: CroppingDataset, num_dataload_workers: int) -> None: + batch_size = 2 loader = cropping_dataset.as_data_loader(shuffle=True, batch_size=batch_size, num_dataload_workers=num_dataload_workers) for i, item in enumerate(loader): @@ -271,17 +262,11 @@ def test_cropping_dataset_has_reproducible_randomness(cropping_dataset: Cropping def test_csv_dataset_as_data_loader(normalize_fn: Any, - full_image_dataset: FullImageDataset, num_dataload_workers: int, - default_config: ModelConfigBase) -> None: - - # Set batch size number of training classes 'default_config' - batch_size = len(set(default_config.get_dataset_splits().train.subject)) + full_image_dataset: FullImageDataset, num_dataload_workers: int) -> None: + batch_size = 2 # load the original images separately for comparison - # expected number of patients is 3 expected_samples = load_train_and_test_data_channels(patient_ids=list(range(1, batch_size + 1)), normalization_fn=normalize_fn) - # expected number of patients is 3, since we use the training set derived 'default_config' which is derived from - # class 'DummyModel' csv_dataset_loader = full_image_dataset.as_data_loader(batch_size=batch_size, shuffle=True, num_dataload_workers=num_dataload_workers) for i, batch in enumerate(csv_dataset_loader): @@ -305,30 +290,24 @@ def test_full_image_dataset_no_mask(full_image_dataset_no_mask: FullImageDataset @pytest.mark.parametrize("crop_size", [(4, 4, 4), (8, 6, 4)]) def test_create_possibly_padded_sample_for_cropping(crop_size: Any) -> None: - number_gt_classes = 2 - missing_label_list = [False] * number_gt_classes image_size = [4] * 3 image = np.random.uniform(size=[1] + image_size) - labels = np.zeros(shape=[number_gt_classes] + image_size) + labels = np.zeros(shape=[2] + image_size) mask = np.zeros(shape=image_size, dtype=ImageDataType.MASK.value) cropped_sample = CroppingDataset.create_possibly_padded_sample_for_cropping( - sample=Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata, - missing_labels=missing_label_list), + sample=Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata), crop_size=crop_size, padding_mode=PaddingMode.Zero ) assert cropped_sample.image.shape[-3:] == crop_size - assert cropped_sample.labels is not None assert cropped_sample.labels.shape[-3:] == crop_size assert cropped_sample.mask.shape[-3:] == crop_size @pytest.mark.parametrize("use_mask", [False, True]) def test_cropped_sample(use_mask: bool) -> None: - number_of_gt_classes = 2 - missing_label_list = [False] * number_of_gt_classes ml_util.set_random_seed(1) image_size = [4] * 3 crop_size = (2, 2, 2) @@ -336,7 +315,7 @@ def test_cropped_sample(use_mask: bool) -> None: # create small image sample for random cropping image = np.random.uniform(size=[1] + image_size) - labels = np.zeros(shape=[number_of_gt_classes] + image_size) + labels = np.zeros(shape=[2] + image_size) # Two foreground points in the corners at (0, 0, 0) and (3, 3, 3) labels[0] = 1 labels[0, 0, 0, 0] = 0 @@ -361,8 +340,7 @@ def test_cropped_sample(use_mask: 
bool) -> None: image=image, labels=labels, mask=mask, - metadata=DummyPatientMetadata, - missing_labels=missing_label_list + metadata=DummyPatientMetadata ) for _ in range(0, 100): @@ -376,7 +354,6 @@ def test_cropped_sample(use_mask: bool) -> None: if expected_center is not None: assert list(cropped_sample.center_indices) == expected_center # type: ignore assert np.array_equal(cropped_sample.image, sample.image[:, crop_slicer, crop_slicer, crop_slicer]) - assert sample.labels is not None assert np.array_equal(cropped_sample.labels, sample.labels[:, crop_slicer, crop_slicer, crop_slicer]) assert np.array_equal(cropped_sample.mask, sample.mask[crop_slicer, crop_slicer, crop_slicer]) else: @@ -512,8 +489,6 @@ def test_sample_metadata_field() -> None: Test that the string constant we use to identify the metadata field is really matching the field name in SampleWithMetadata """ - number_of_classes = 2 - missing_labels_list = [False] * number_of_classes batch_size = 5 xyz = (6, 7, 8) shape = (batch_size,) + xyz @@ -521,11 +496,9 @@ def test_sample_metadata_field() -> None: s = Sample(metadata=DummyPatientMetadata, image=zero, mask=zero, - labels=torch.zeros((batch_size,) + (number_of_classes,) + xyz), - missing_labels=missing_labels_list) + labels=torch.zeros((batch_size,) + (2,) + xyz)) fields = vars(s) - # Assert fields for: 1) metadata, 2) image, 3) mask, 4) labels, 5) missing_labels - assert len(fields) == 5 + assert len(fields) == 4 assert SAMPLE_METADATA_FIELD in fields # Lightning attempts to determine the batch size by trying to find a tensor field in the sample. # This only works if any field other than Metadata is first. @@ -549,15 +522,12 @@ def test_custom_collate() -> None: assert result[foo].tolist() == [1, 2] -def test_sample_construct_copy(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any, - default_config: SegmentationModelBase) -> None: - missing_labels_list = [False] * default_config.number_of_classes +def test_sample_construct_copy(random_image_crop: Any, random_mask_crop: Any, random_label_crop: Any) -> None: sample = Sample( image=random_image_crop, mask=random_mask_crop, labels=random_label_crop, - metadata=PatientMetadata(patient_id='1'), - missing_labels=missing_labels_list + metadata=PatientMetadata(patient_id='1') ) sample_clone = sample.clone_with_overrides() diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index e1003cf7e..14282d1b4 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -307,17 +307,23 @@ def test_evaluate_model_predictions() -> None: assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values('region_1').keys() for hue_name in ['region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 4 has all missing ground truth channels: "region", "region_1" if sample.metadata.patient_id == '4': for hue_name in ['region_1', 'region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 + assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 5 has no missing ground truth channels if sample.metadata.patient_id == '5': + assert len(metrics_per_class.values('Default').keys()) == 0 + assert len(metrics_per_class.values('Default').keys()) == 0 assert len(metrics_per_class.values('Default').keys()) == 0 for hue_name 
in ['region_1', 'region']: - assert 'Dice' in metrics_per_class.values(hue_name).keys() + assert 'Dice' in metrics_per_class.values('region_1').keys() assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() diff --git a/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz deleted file mode 100644 index 004e23b48..000000000 --- a/Tests/ML/test_data/train_and_test_data/id3_channel1.nii.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d09abc0d1a4c5d18de6da26fec2b813c5e5e545577cf9a2e7e4b27dcf2d052e5 -size 683626 diff --git a/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz deleted file mode 100644 index 004e23b48..000000000 --- a/Tests/ML/test_data/train_and_test_data/id3_channel2.nii.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d09abc0d1a4c5d18de6da26fec2b813c5e5e545577cf9a2e7e4b27dcf2d052e5 -size 683626 diff --git a/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz deleted file mode 100644 index 62d10d47c..000000000 --- a/Tests/ML/test_data/train_and_test_data/id3_mask.nii.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0c2c7f0fe10f3df48fb86edce291c340895488645a7cba030d1d3d67a1a0584 -size 913 diff --git a/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz b/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz deleted file mode 100644 index be4022fb1..000000000 --- a/Tests/ML/test_data/train_and_test_data/id3_region.nii.gz +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4c077c518c0aca796332ae13b26d7f2eac5de7f93ba44eb497ff777e6c24abf8 -size 83093 diff --git a/Tests/ML/test_metrics.py b/Tests/ML/test_metrics.py index 49bc7eea5..b30a754fb 100644 --- a/Tests/ML/test_metrics.py +++ b/Tests/ML/test_metrics.py @@ -33,23 +33,21 @@ def test_calculate_dice1() -> None: g1 = "g1" zero = np.zeros((3, 3, 3)) one = np.ones((3, 3, 3)) - missing_labels = [False] # ground truth is expected in one-hot encoding, but the segmentation is a map with class indices in each voxel - def assert_metrics(segmentation: np.ndarray, ground_truth: np.ndarray, missing_label: List[bool], - expected_dice: float) -> None: - a = metrics.calculate_metrics_per_class(segmentation, ground_truth, missing_label, + def assert_metrics(segmentation: np.ndarray, ground_truth: np.ndarray, expected_dice: float) -> None: + a = metrics.calculate_metrics_per_class(segmentation, ground_truth, voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) assert a.get_hue_names(include_default=False) == [g1] assert equal_respecting_nan(a.get_single_metric(MetricType.DICE, hue=g1), expected_dice) # Case 1: Ground truth says everything is class 1, and segmentation says the same - assert_metrics(one, np.stack([zero, one]), missing_labels, expected_dice=1.0) + assert_metrics(one, np.stack([zero, one]), expected_dice=1.0) # Case 2: Ground truth says everything is class 0, but segmentation says it's class 1 - assert_metrics(one, np.stack([one, zero]), missing_labels, expected_dice=0.0) + assert_metrics(one, np.stack([one, zero]), expected_dice=0.0) # Case 3: Ground truth says everything is class 0, and segmentation says the same: This means that class 1 # is correctly predicted, but empty 
ground truth and empty prediction are indicated by Dice NaN - assert_metrics(zero, np.stack([one, zero]), missing_labels, expected_dice=math.nan) + assert_metrics(zero, np.stack([one, zero]), expected_dice=math.nan) def equal_respecting_nan(v1: float, v2: float) -> bool: @@ -74,10 +72,7 @@ def expand(a: List[float]) -> np.ndarray: ground_truth_values = expand([0, 0, 1]) ground_truth = np.stack([1 - ground_truth_values, ground_truth_values]) prediction = expand(prediction_list) - # Since there is only label, 'missing_labels' list to have only element - missing_labels_list = [False] - m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels_list, - voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) + m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=(1, 1, 1), ground_truth_ids=[g1]) assert m.get_single_metric(MetricType.DICE, hue=g1) == expected_dice @@ -88,15 +83,10 @@ def test_calculate_hd() -> None: prediction1 = np.ones_like(prediction0) gt_all_zero = np.stack([prediction1, prediction0]) gt_all_one = np.stack([prediction0, prediction1]) - # Since there is only label, 'missing_labels' list to have only element - missing_labels_list = [False] - def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, - missing_labels: List[bool], - expected: Optional[float], + def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, expected: Optional[float], voxel_spacing: TupleFloat3 = (1, 1, 1)) -> float: - m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels, - voxel_spacing=voxel_spacing, + m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=voxel_spacing, ground_truth_ids=[g1]) actual = m.get_single_metric(MetricType.HAUSDORFF_mm, hue=g1) if expected is not None: @@ -104,8 +94,8 @@ def assert_metrics(prediction: np.ndarray, ground_truth: np.ndarray, return actual # check an infinity value if either the prediction or gt have no foreground - assert_metrics(prediction0, gt_all_one, missing_labels_list, math.inf) - assert_metrics(prediction1, gt_all_zero, missing_labels_list, math.inf) + assert_metrics(prediction0, gt_all_one, math.inf) + assert_metrics(prediction1, gt_all_zero, math.inf) def generate_random_prediction() -> np.ndarray: result = np.round(np.random.uniform(size=prediction0.shape)) @@ -116,16 +106,12 @@ def generate_random_prediction() -> np.ndarray: random_prediction = generate_random_prediction() matching_gt = np.stack([1 - random_prediction, random_prediction]) - # Since there is only label, 'missing_labels' list to have only element - missing_labels_list = [False] - assert_metrics(random_prediction, matching_gt, missing_labels_list, 0.0) + assert_metrics(random_prediction, matching_gt, 0.0) # check voxel spacing is being used as expected random_prediction2 = generate_random_prediction() non_matching_gt = np.stack([1 - random_prediction2, random_prediction2]) - without_spacing = assert_metrics(random_prediction, non_matching_gt, missing_labels_list, - voxel_spacing=(1, 1, 1), expected=None) - with_spacing = assert_metrics(random_prediction, non_matching_gt, missing_labels_list, - voxel_spacing=(2.0, 2.0, 2.0), expected=None) + without_spacing = assert_metrics(random_prediction, non_matching_gt, voxel_spacing=(1, 1, 1), expected=None) + with_spacing = assert_metrics(random_prediction, non_matching_gt, voxel_spacing=(2.0, 2.0, 2.0), expected=None) assert without_spacing != with_spacing @@ -139,10 +125,7 @@ def test_calculate_hd_exact() -> None: ground_truth = 
np.stack(np.stack([1 - ground_truth, ground_truth])) g1 = "g1" - # Since there is only label, 'missing_labels' list to have only element - missing_labels_list = [False] - m = metrics.calculate_metrics_per_class(prediction, ground_truth, missing_labels_list, - voxel_spacing=(1, 2, 3), ground_truth_ids=[g1]) + m = metrics.calculate_metrics_per_class(prediction, ground_truth, voxel_spacing=(1, 2, 3), ground_truth_ids=[g1]) assert m.get_single_metric(MetricType.HAUSDORFF_mm, hue=g1) == 6 assert m.get_single_metric(MetricType.MEAN_SURFACE_DIST_mm, hue=g1) == 6 diff --git a/Tests/ML/test_plotting.py b/Tests/ML/test_plotting.py index 4d618270e..5070f51e0 100644 --- a/Tests/ML/test_plotting.py +++ b/Tests/ML/test_plotting.py @@ -147,21 +147,17 @@ def test_plot_normalization_result(test_output_dirs: OutputFolderForTests) -> No Tests plotting of before/after histograms in photometric normalization. :return: """ - number_of_gt_classes = 2 size = (3, 3, 3) image = np.zeros((1,) + size) for i, (z, y, x) in enumerate(itertools.product(range(size[0]), range(size[1]), range(size[2]))): image[0, z, y, x] = i - labels = np.zeros((number_of_gt_classes,) + size) - # Initializes 'missing_labels_list' to 'False' for the given number of gt classes - missing_labels_list = [False] * number_of_gt_classes + labels = np.zeros((2,) + size) labels[1, 1, 1, 1] = 1 sample = Sample( image=image, labels=labels, mask=np.ones(size), - metadata=DummyPatientMetadata, - missing_labels=missing_labels_list + metadata=DummyPatientMetadata ) config = SegmentationModelBase(norm_method=PhotometricNormalizationMethod.CtWindow, window=4, level=13, should_validate=False) @@ -173,16 +169,13 @@ def test_plot_normalization_result(test_output_dirs: OutputFolderForTests) -> No def test_plot_contours_for_all_classes(test_output_dirs: OutputFolderForTests) -> None: - number_of_gt_classes = 3 size = (3, 3, 3) image = np.zeros((1,) + size) for i, (z, y, x) in enumerate(itertools.product(range(size[0]), range(size[1]), range(size[2]))): image[0, z, y, x] = i # Create a fake label array: For each class, there is exactly 1 pixel foreground, at the z slice that is # equal to the class index - labels = np.zeros((number_of_gt_classes,) + size) - # Initializes 'missing_labels_list' to 'False' for the given number of gt classes - missing_labels_list = [False] * number_of_gt_classes + labels = np.zeros((3,) + size) labels[0, 0, 1, 1] = 1 labels[1, 1, 1, 1] = 1 labels[2, 2, 1, 1] = 1 @@ -197,8 +190,7 @@ def test_plot_contours_for_all_classes(test_output_dirs: OutputFolderForTests) - image=image, labels=labels, mask=np.ones(size), - metadata=DummyPatientMetadata, - missing_labels=missing_labels_list + metadata=DummyPatientMetadata ) plots = plotting.plot_contours_for_all_classes(sample, segmentation, diff --git a/Tests/ML/util.py b/Tests/ML/util.py index 1fbbb8dea..fb49c116d 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -73,7 +73,8 @@ def load_train_and_test_data_channels(patient_ids: List[int], metadata=PatientMetadata(patient_id=z), image_channels=[file_name(z, c) for c in TEST_CHANNEL_IDS], mask_channel=file_name(z, TEST_MASK_ID), - ground_truth_channels=[file_name(z, TEST_GT_ID)] + ground_truth_channels=[file_name(z, TEST_GT_ID)], + allow_incomplete_labels=False )) samples = [] @@ -82,8 +83,7 @@ def load_train_and_test_data_channels(patient_ids: List[int], sample = Sample(image=normalization_fn.transform(sample.image, sample.mask), mask=sample.mask, labels=sample.labels, - metadata=sample.metadata, - missing_labels=sample.missing_labels) + 
metadata=sample.metadata) samples.append(sample) return samples diff --git a/Tests/ML/utils/test_augmentation.py b/Tests/ML/utils/test_augmentation.py index d087437d6..43bbcf4f9 100644 --- a/Tests/ML/utils/test_augmentation.py +++ b/Tests/ML/utils/test_augmentation.py @@ -25,9 +25,6 @@ valid_labels = np.zeros((number_of_classes,) + image_size) for c in range(number_of_classes): valid_labels[c, class_assignments == c] = 1 -# Since we have 5 classes and all ground truth class labels are provided, initialize 'missing_labels_list' to -# to 'False' with length 5 -missing_labels_list = [False] * number_of_classes valid_crop_size = (2, 2, 2) valid_full_crop_size = image_size valid_class_weights = [0.5] + [0.5 / (number_of_classes - 1)] * (number_of_classes - 1) @@ -40,8 +37,7 @@ def test_valid_full_crop() -> None: sample, _ = augmentation.random_crop(sample=Sample(image=valid_image_4d, labels=valid_labels, mask=valid_mask, - metadata=metadata, - missing_labels=missing_labels_list), + metadata=metadata), crop_size=valid_full_crop_size, class_weights=valid_class_weights) @@ -64,37 +60,36 @@ def test_invalid_arrays(image: Any, labels: Any, mask: Any, class_weights: Any) if not (np.array_equal(image, valid_image_4d) and np.array_equal(labels, valid_labels) and np.array_equal(mask, valid_mask) and class_weights == valid_class_weights): with pytest.raises(Exception): - augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=image, labels=labels, mask=mask, - missing_labels=missing_labels_list), valid_crop_size, class_weights) + augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=image, labels=labels, mask=mask), + valid_crop_size, class_weights) @pytest.mark.parametrize("crop_size", [None, ["a"], 5]) def test_invalid_crop_arg(crop_size: Any) -> None: with pytest.raises(Exception): augmentation.random_crop( - Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask, - missing_labels=missing_labels_list), crop_size, valid_class_weights) + Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask), + crop_size, valid_class_weights) @pytest.mark.parametrize("crop_size", [[2, 2], [2, 2, 2, 2], [10, 10, 10]]) def test_invalid_crop_size(crop_size: Any) -> None: with pytest.raises(Exception): augmentation.random_crop( - Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask, - missing_labels=missing_labels_list), crop_size, valid_class_weights) + Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, mask=valid_mask), + crop_size, valid_class_weights) def test_random_crop_no_fg() -> None: with pytest.raises(Exception): augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=valid_image_4d, labels=valid_labels, - mask=np.zeros_like(valid_mask), missing_labels=missing_labels_list), - valid_crop_size, valid_class_weights) + mask=np.zeros_like(valid_mask)), + valid_crop_size, valid_class_weights) with pytest.raises(Exception): augmentation.random_crop(Sample(metadata=DummyPatientMetadata, image=valid_image_4d, - labels=np.zeros_like(valid_labels), mask=valid_mask, - missing_labels=missing_labels_list), - valid_crop_size, valid_class_weights) + labels=np.zeros_like(valid_labels), mask=valid_mask), + valid_crop_size, valid_class_weights) @pytest.mark.parametrize("crop_size", [valid_crop_size]) @@ -108,15 +103,13 @@ def test_random_crop(crop_size: Any) -> None: image=valid_image_4d, labels=valid_labels, mask=valid_mask, - 
metadata=DummyPatientMetadata, - missing_labels=missing_labels_list), - crop_size, valid_class_weights) + metadata=DummyPatientMetadata + ), crop_size, valid_class_weights) expected_img_crop_size = (valid_image_4d.shape[0], *crop_size) expected_labels_crop_size = (valid_labels.shape[0], *crop_size) assert sample.image.shape == expected_img_crop_size - assert sample.labels is not None assert sample.labels.shape == expected_labels_crop_size assert sample.mask.shape == tuple(crop_size) @@ -140,7 +133,7 @@ def test_valid_class_weights(class_weights: List[float]) -> None: labels[class2][3, 2, 3] = 1 mask = np.ones_like(valid_mask) - sample = Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata, missing_labels=missing_labels_list) + sample = Sample(image=image, labels=labels, mask=mask, metadata=DummyPatientMetadata) crop_size = (1, 1, 1) total_crops = 200 diff --git a/Tests/ML/utils/test_io_util.py b/Tests/ML/utils/test_io_util.py index 2982ff4e8..4a57972b4 100644 --- a/Tests/ML/utils/test_io_util.py +++ b/Tests/ML/utils/test_io_util.py @@ -112,7 +112,8 @@ def _test_load_images_from_channels( metadata=metadata, image_channels=[image_channel] * 2, ground_truth_channels=[ground_truth_channel] * 4, - mask_channel=mask_channel + mask_channel=mask_channel, + allow_incomplete_labels=False ), check_exclusive=check_exclusive ) @@ -123,7 +124,6 @@ def _test_load_images_from_channels( if mask_channel: assert np.array_equal(sample.mask, image_with_header.image) if ground_truth_channel: - assert sample.labels is not None assert list(sample.labels.shape) == [5] + list(image_with_header.image.shape) assert np.all(sample.labels[0] == 0) and np.all(sample.labels[1:] == 1) diff --git a/Tests/ML/visualizers/test_visualize_patches.py b/Tests/ML/visualizers/test_visualize_patches.py index 556b93109..ef27fdcdf 100644 --- a/Tests/ML/visualizers/test_visualize_patches.py +++ b/Tests/ML/visualizers/test_visualize_patches.py @@ -37,8 +37,6 @@ def test_visualize_patch_sampling(test_output_dirs: OutputFolderForTests, shape = (10, 30, 30) foreground_classes = ["fg"] class_weights = equally_weighted_classes(foreground_classes) - # Initializes 'missing_labels_list' to 'False' - missing_labels_list = [False] * len(foreground_classes) config = SegmentationModelBase(should_validate=False, crop_size=(2, 10, 10), class_weights=class_weights) @@ -58,8 +56,7 @@ def test_visualize_patch_sampling(test_output_dirs: OutputFolderForTests, mask=mask, labels=labels, metadata=PatientMetadata(patient_id='123', - image_header=image_header), - missing_labels=missing_labels_list) + image_header=image_header)) expected_folder = full_ml_test_data_path("patch_sampling") heatmap = visualize_random_crops(sample, config, output_folder=output_folder) expected_heatmap = expected_folder / ("sampled_to_boundary.npy" if labels_to_boundary else "sampled_center.npy") @@ -104,8 +101,6 @@ def test_visualize_patch_sampling_2d(test_output_dirs: OutputFolderForTests) -> set_random_seed(0) shape = (1, 20, 30) foreground_classes = ["fg"] - # Initializes 'missing_labels_list' to 'False' - missing_labels_list = [False] * len(foreground_classes) class_weights = equally_weighted_classes(foreground_classes) config = SegmentationModelBase(should_validate=False, crop_size=(1, 5, 10), @@ -121,8 +116,7 @@ def test_visualize_patch_sampling_2d(test_output_dirs: OutputFolderForTests) -> mask=mask, labels=labels, metadata=PatientMetadata(patient_id='123', - image_header=image_header), - missing_labels=missing_labels_list) + image_header=image_header)) 
heatmap = visualize_random_crops(sample, config, output_folder=output_folder) expected_folder = full_ml_test_data_path("patch_sampling") expected_heatmap = expected_folder / "sampling_2d.npy" From 5caf24538c181e843f47de2962f4f8132b49aa59 Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang <82240512+asantamariapang@users.noreply.github.com> Date: Wed, 2 Jun 2021 11:49:15 -0700 Subject: [PATCH 09/45] Updated missing ground truth labels and masks. Updated code to allow missing ground truth labels and masks. Missing ground truth labels is allowed for inference. If missing ground truth label, then a channels with NaN is initialized and flagged. --- InnerEye/ML/dataset/full_image_dataset.py | 10 +++---- InnerEye/ML/dataset/sample.py | 10 +++++-- InnerEye/ML/metrics.py | 3 +- InnerEye/ML/plotting.py | 7 +++++ InnerEye/ML/utils/dataset_util.py | 4 ++- InnerEye/ML/utils/io_util.py | 35 +++++++++++------------ Tests/ML/pipelines/test_inference.py | 8 +----- Tests/ML/utils/test_io_util.py | 3 +- 8 files changed, 43 insertions(+), 37 deletions(-) diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 6679e9dd2..ff2aa2bc9 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -270,7 +270,7 @@ def convert_channels_to_file_paths(channels: List[str], rows: pd.DataFrame, local_dataset_root_folder: Path, patient_id: str, - allow_incomplete_labels: bool = False) -> Tuple[List[Path], str]: + allow_incomplete_labels: bool = False) -> Tuple[List[Optional[Path]], str]: """ Returns: 1) The full path for files specified in the training, validation and testing datasets, and 2) Missing channels or missing files. @@ -281,8 +281,8 @@ def convert_channels_to_file_paths(channels: List[str], :param local_dataset_root_folder: Root directory which points to the local dataset :param patient_id: string which contains subject identifier """ - paths: List[Path] = [] - failed_channel_info: str = '' + paths: List[Optional[Path]] = [] + failed_channel_info = '' for channel_id in channels: row = rows.loc[rows[CSV_CHANNEL_HEADER] == channel_id] @@ -290,7 +290,7 @@ def convert_channels_to_file_paths(channels: List[str], failed_channel_info += f"Patient {patient_id} does not have channel '{channel_id}'" + os.linesep elif len(row) == 0 and allow_incomplete_labels: # Keeps track of missing channels order - paths.append(Path('')) + paths.append(None) elif len(row) > 1: failed_channel_info += f"Patient {patient_id} has more than one entry for channel '{channel_id}'" + \ os.linesep @@ -344,7 +344,7 @@ def get_mask_channel_or_default() -> Optional[Path]: else: return paths[0] - def get_paths_for_channel_ids(channels: List[str]) -> List[Path]: + def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: if len(set(channels)) < len(channels): raise ValueError(f"ids have duplicated entries: {channels}") rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] diff --git a/InnerEye/ML/dataset/sample.py b/InnerEye/ML/dataset/sample.py index d01246f55..a233e01ec 100644 --- a/InnerEye/ML/dataset/sample.py +++ b/InnerEye/ML/dataset/sample.py @@ -129,10 +129,10 @@ class PatientDatasetSource(SampleBase): Dataset source locations for channels associated with a given patient in a particular dataset. 
""" image_channels: List[PathOrString] - ground_truth_channels: List[PathOrString] + ground_truth_channels: List[Optional[PathOrString]] mask_channel: Optional[PathOrString] metadata: PatientMetadata - allow_incomplete_labels: bool + allow_incomplete_labels: Optional[bool] = False def __post_init__(self) -> None: # make sure all properties are populated @@ -140,9 +140,13 @@ def __post_init__(self) -> None: if not self.image_channels: raise ValueError("image_channels cannot be empty") - if not self.ground_truth_channels and not self.allow_incomplete_labels: + + if not self.ground_truth_channels: raise ValueError("ground_truth_channels cannot be empty") + if self.ground_truth_channels.count(None) > 0 and not self.allow_incomplete_labels: + raise ValueError("all ground_truth_channels must be provided") + @dataclass(frozen=True) class Sample(SampleBase): diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index f220aa338..45543d2b8 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -230,6 +230,7 @@ def calculate_metrics_per_class(segmentation: np.ndarray, Calculate the dice for all foreground structures (the background class is completely ignored). Returns a MetricsDict with metrics for each of the foreground structures. Metrics are NaN if both ground truth and prediction are all zero for a class. + If first element of a ground truth image channel is NaN, the image is flagged as NaN and not use. :param ground_truth_ids: The names of all foreground classes. :param segmentation: predictions multi-value array with dimensions: [Z x Y x X] :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X] @@ -245,7 +246,7 @@ def calculate_metrics_per_class(segmentation: np.ndarray, binary_classes = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] # If ground truth image is nan, then will not be used for metrics computation. - nan_images = [np.isnan(np.sum(ground_truth[label_id])) for label_id in range(ground_truth.shape[0])] + nan_images = [np.isnan(ground_truth[label_id][0, 0, 0]) for label_id in range(ground_truth.shape[0])] # Validates if not binary then nan assert np.all(np.array(binary_classes) == ~np.array(nan_images)) diff --git a/InnerEye/ML/plotting.py b/InnerEye/ML/plotting.py index d0412849f..d0b4f5487 100644 --- a/InnerEye/ML/plotting.py +++ b/InnerEye/ML/plotting.py @@ -7,6 +7,7 @@ import matplotlib.pyplot as plt import numpy as np +import sys from matplotlib import colors from matplotlib.pyplot import Axes @@ -98,6 +99,8 @@ def resize_and_save(width_inch: int, height_inch: int, filename: PathOrString, d """ fig = plt.gcf() fig.set_size_inches(width_inch, height_inch) + # Workaround for Exception in Tkinter callback + fig.canvas.start_event_loop(sys.float_info.min) plt.savefig(filename, dpi=dpi, bbox_inches='tight', pad_inches=0.1) @@ -303,6 +306,10 @@ def plot_contours_for_all_classes(sample: Sample, if class_index == 0: continue ground_truth = sample.labels[class_index, ...] + + if np.isnan(ground_truth[0, 0, 0]): + continue + largest_gt_slice = get_largest_z_slice(ground_truth) labels_at_largest_gt = ground_truth[largest_gt_slice] segmentation_at_largest_gt = binary[largest_gt_slice, ...] 
diff --git a/InnerEye/ML/utils/dataset_util.py b/InnerEye/ML/utils/dataset_util.py index b2275f4a0..2e9154db0 100644 --- a/InnerEye/ML/utils/dataset_util.py +++ b/InnerEye/ML/utils/dataset_util.py @@ -201,7 +201,9 @@ def add_label_stats_to_dataframe(input_dataframe: pd.DataFrame, overlap_stats = metrics_util.get_label_overlap_stats(labels=labels[1:, ...], label_names=target_label_names) - header = io_util.load_nifti_image(dataset_sources[subject_id].ground_truth_channels[0]).header + ground_truth_channel = dataset_sources[subject_id].ground_truth_channels[0] + assert ground_truth_channel is not None + header = io_util.load_nifti_image(ground_truth_channel).header volume_stats = metrics_util.get_label_volume(labels=labels[1:, ...], label_names=target_label_names, label_spacing=header.spacing) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 4b2c3b859..3fbba56ad 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -413,33 +413,32 @@ def load_image_in_known_formats(file: Path, def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_exclusive: bool = True, - mask_size: Optional[Tuple[int]] = None) -> np.ndarray: + image_size: Optional[Tuple[int]] = None) -> np.ndarray: """ Load labels containing segmentation binary labels in one-hot-encoding. In the future, this function will be used to load global class and non-imaging information as well. - :type mask_size: Image size, tuple if integers. + :type image_size: Image size, tuple if integers. :param dataset_source: The dataset source for which channels are to be loaded into memory. :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) :return: A label sample object containing ground-truth information. """ - if not dataset_source.allow_incomplete_labels: - labels = np.stack( - [load_image(gt, ImageDataType.SEGMENTATION.value).image for gt in dataset_source.ground_truth_channels]) - else: - assert mask_size is not None - label_list = [] - for gt in dataset_source.ground_truth_channels: - if str(gt) == '.': - label_list.append(np.full(mask_size, np.NAN, ImageDataType)) - else: - label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) - labels = np.stack(label_list) - - # If ground truth image is nan, then will not be used to check check_exclusive. + if dataset_source.ground_truth_channels.count(None) > 0: + assert image_size is not None + + label_list = [] + for gt in dataset_source.ground_truth_channels: + if gt is None: + label_list.append(np.full(image_size, np.NAN, ImageDataType)) + else: + label_list.append(load_image(gt, ImageDataType.SEGMENTATION.value).image) + labels = np.stack(label_list) + + # If ground truth image is nan, then will not be used to check check_exclusive + # Image is nan, if voxel at index [0, 0, 0] is NaN not_nan_label_images = [labels[label_id] for label_id in range(labels.shape[0]) - if not np.isnan(np.sum(labels[label_id]))] + if not np.isnan(labels[label_id][0, 0, 0])] if check_exclusive and (sum(np.array(not_nan_label_images)) > 1.).any(): # type: ignore raise ValueError(f'The labels for patient {dataset_source.metadata.patient_id} are not mutually exclusive. 
' @@ -520,7 +519,7 @@ def load_images_from_dataset_source(dataset_source: PatientDatasetSource, check_ # create raw sample to return metadata = copy(dataset_source.metadata) metadata.image_header = images[0].header - labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive, mask_size=mask.shape) + labels = load_labels_from_dataset_source(dataset_source, check_exclusive=check_exclusive, image_size=image[0].shape) return Sample(image=image, labels=labels, diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 14282d1b4..e1003cf7e 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -307,23 +307,17 @@ def test_evaluate_model_predictions() -> None: assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values('region_1').keys() for hue_name in ['region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 4 has all missing ground truth channels: "region", "region_1" if sample.metadata.patient_id == '4': for hue_name in ['region_1', 'region', 'Default']: assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 - assert len(metrics_per_class.values(hue_name).keys()) == 0 # Patient 5 has no missing ground truth channels if sample.metadata.patient_id == '5': - assert len(metrics_per_class.values('Default').keys()) == 0 - assert len(metrics_per_class.values('Default').keys()) == 0 assert len(metrics_per_class.values('Default').keys()) == 0 for hue_name in ['region_1', 'region']: - assert 'Dice' in metrics_per_class.values('region_1').keys() + assert 'Dice' in metrics_per_class.values(hue_name).keys() assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() diff --git a/Tests/ML/utils/test_io_util.py b/Tests/ML/utils/test_io_util.py index 4a57972b4..b494e5588 100644 --- a/Tests/ML/utils/test_io_util.py +++ b/Tests/ML/utils/test_io_util.py @@ -112,8 +112,7 @@ def _test_load_images_from_channels( metadata=metadata, image_channels=[image_channel] * 2, ground_truth_channels=[ground_truth_channel] * 4, - mask_channel=mask_channel, - allow_incomplete_labels=False + mask_channel=mask_channel ), check_exclusive=check_exclusive ) From 77d3244e90448edfa0806e6aec17b7c1ceadc54b Mon Sep 17 00:00:00 2001 From: Alberto Santamaria-Pang <82240512+asantamariapang@users.noreply.github.com> Date: Tue, 8 Jun 2021 12:43:09 -0700 Subject: [PATCH 10/45] Fixed bug and improved documentation. --- InnerEye/ML/dataset/full_image_dataset.py | 22 +++++++++++++--------- InnerEye/ML/metrics.py | 4 ++-- InnerEye/ML/utils/io_util.py | 5 +++-- 3 files changed, 18 insertions(+), 13 deletions(-) diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index ff2aa2bc9..654a82959 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -272,14 +272,15 @@ def convert_channels_to_file_paths(channels: List[str], patient_id: str, allow_incomplete_labels: bool = False) -> Tuple[List[Optional[Path]], str]: """ - Returns: 1) The full path for files specified in the training, validation and testing datasets, and - 2) Missing channels or missing files. 
+ Returns: 1) A list of path file objects specified in the training, validation and testing datasets, and + 2) a string with description of missing channels, files and more than one channel per patient. - :param allow_incomplete_labels: flag to enforce all ground truth labels :param channels: channel type defined in the configuration file :param rows: Input Pandas dataframe object containing subjectIds, path of local dataset, channel information :param local_dataset_root_folder: Root directory which points to the local dataset :param patient_id: string which contains subject identifier + :param allow_incomplete_labels: boolean flag. If false, all ground truth files must be provided. If true, ground + truth files are optional """ paths: List[Optional[Path]] = [] failed_channel_info = '' @@ -320,7 +321,8 @@ def load_dataset_sources(dataframe: pd.DataFrame, :param image_channels: The names of the image channels that should be used in the result. :param ground_truth_channels: The names of the ground truth channels that should be used in the result. :param mask_channel: The name of the mask channel that should be used in the result. This can be None. - :param allow_incomplete_labels: Boolean variable to allow missing ground truth files. + :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided. If true, ground + truth files are optional. Default value is false. :return: A dictionary mapping from an integer subject ID to a PatientDatasetSource. """ expected_headers = {CSV_SUBJECT_HEADER, CSV_PATH_HEADER, CSV_CHANNEL_HEADER} @@ -338,19 +340,19 @@ def load_dataset_sources(dataframe: pd.DataFrame, def get_mask_channel_or_default() -> Optional[Path]: if mask_channel is None: return None - paths = get_paths_for_channel_ids(channels=[mask_channel]) + paths = get_paths_for_channel_ids(channels=[mask_channel], allow_incomplete_labels_flag=allow_incomplete_labels) if len(paths) == 0: return None else: return paths[0] - def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: + def get_paths_for_channel_ids(channels: List[str], allow_incomplete_labels_flag: bool) -> List[Optional[Path]]: if len(set(channels)) < len(channels): raise ValueError(f"ids have duplicated entries: {channels}") rows = dataframe.loc[dataframe[CSV_SUBJECT_HEADER] == patient_id] # converts channels to paths and makes second sanity check for channel data paths, failed_channel_info = convert_channels_to_file_paths(channels, rows, local_dataset_root_folder, - patient_id, allow_incomplete_labels) + patient_id, allow_incomplete_labels_flag) if failed_channel_info: raise ValueError(failed_channel_info) @@ -362,9 +364,11 @@ def get_paths_for_channel_ids(channels: List[str]) -> List[Optional[Path]]: metadata = PatientMetadata.from_dataframe(dataframe, patient_id) dataset_sources[patient_id] = PatientDatasetSource( metadata=metadata, - image_channels=get_paths_for_channel_ids(channels=image_channels), # type: ignore + image_channels=get_paths_for_channel_ids(channels=image_channels, # type: ignore + allow_incomplete_labels_flag=allow_incomplete_labels), mask_channel=get_mask_channel_or_default(), - ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels), # type: ignore + ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels, # type: ignore + allow_incomplete_labels_flag=allow_incomplete_labels), allow_incomplete_labels=allow_incomplete_labels) return dataset_sources diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index 
45543d2b8..6593dc751 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -233,7 +233,7 @@ def calculate_metrics_per_class(segmentation: np.ndarray, If first element of a ground truth image channel is NaN, the image is flagged as NaN and not use. :param ground_truth_ids: The names of all foreground classes. :param segmentation: predictions multi-value array with dimensions: [Z x Y x X] - :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X] + :param ground_truth: ground truth binary array with dimensions: [C x Z x Y x X]. :param voxel_spacing: voxel_spacing in 3D Z x Y x X :param patient_id: for logging """ @@ -248,7 +248,7 @@ def calculate_metrics_per_class(segmentation: np.ndarray, # If ground truth image is nan, then will not be used for metrics computation. nan_images = [np.isnan(ground_truth[label_id][0, 0, 0]) for label_id in range(ground_truth.shape[0])] - # Validates if not binary then nan + # Compares element-wise if not binary then nan and checks all elements are True. assert np.all(np.array(binary_classes) == ~np.array(nan_images)) # Validates that all binary images should be 0 or 1 diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 3fbba56ad..9df51191f 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -418,9 +418,9 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ Load labels containing segmentation binary labels in one-hot-encoding. In the future, this function will be used to load global class and non-imaging information as well. - :type image_size: Image size, tuple if integers. + :type image_size: Image size, tuple of integers. :param dataset_source: The dataset source for which channels are to be loaded into memory. - :param check_exclusive: Check that the labels are mutually exclusive (defaults to True) + :param check_exclusive: Check that the labels are mutually exclusive (defaults to True). :return: A label sample object containing ground-truth information. 
""" @@ -428,6 +428,7 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ assert image_size is not None label_list = [] + # label_list keeps track of missing ground truth channels for gt in dataset_source.ground_truth_channels: if gt is None: label_list.append(np.full(image_size, np.NAN, ImageDataType)) From 652b6aa1f48f79ae5578108b6d6b2949217d5fb4 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Thu, 24 Jun 2021 17:49:51 +0100 Subject: [PATCH 11/45] WiP allowing NaNs in averaging to count them --- InnerEye/ML/metrics.py | 19 +++++++---- InnerEye/ML/model_testing.py | 49 ++++++++++++++++++---------- Tests/ML/pipelines/test_inference.py | 22 +++++++++---- 3 files changed, 60 insertions(+), 30 deletions(-) diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index 6593dc751..8b8cab8d1 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -12,6 +12,7 @@ import SimpleITK as sitk import numpy as np +from numpy.core.numeric import NaN import torch import torch.nn.functional as F from azureml.core import Run @@ -257,9 +258,19 @@ def calculate_metrics_per_class(segmentation: np.ndarray, overlap_measures_filter = sitk.LabelOverlapMeasuresImageFilter() hausdorff_distance_filter = sitk.HausdorffDistanceImageFilter() metrics = MetricsDict(hues=ground_truth_ids) + + def add_metric(metric_type: MetricType, value: float) -> None: + metrics.add_metric(metric_type, value, skip_nan_when_averaging=True, hue=ground_truth_ids[i - 1]) + for i, prediction in enumerate(binaries): - # Skips if background image or nan_image - if i == 0 or nan_images[i]: + # Skip if background image + if i == 0: + continue + # Skip but record if nan_image + elif nan_images[i]: + add_metric(MetricType.DICE, NaN) + add_metric(MetricType.HAUSDORFF_mm, NaN) + add_metric(MetricType.MEAN_SURFACE_DIST_mm, NaN) continue check_size_matches(prediction, ground_truth[i], arg1_name="prediction", arg2_name="ground_truth") if not is_binary_array(prediction): @@ -290,10 +301,6 @@ def calculate_metrics_per_class(segmentation: np.ndarray, except Exception as e: logging.warning(f"Cannot calculate mean distance for structure {i} of patient {patient_id}: {e}") logging.debug(f"Patient {patient_id}, class {i} has Dice score {dice}") - - def add_metric(metric_type: MetricType, value: float) -> None: - metrics.add_metric(metric_type, value, skip_nan_when_averaging=True, hue=ground_truth_ids[i - 1]) - add_metric(MetricType.DICE, dice) add_metric(MetricType.HAUSDORFF_mm, hausdorff_distance) add_metric(MetricType.MEAN_SURFACE_DIST_mm, mean_surface_distance) diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index d91253c17..3fb1f02aa 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -25,7 +25,7 @@ from InnerEye.ML.dataset.sample import PatientMetadata, Sample from InnerEye.ML.metrics import InferenceMetrics, InferenceMetricsForClassification, InferenceMetricsForSegmentation, \ compute_scalar_metrics -from InnerEye.ML.metrics_dict import DataframeLogger, MetricsDict, ScalarMetricsDict, SequenceMetricsDict +from InnerEye.ML.metrics_dict import DataframeLogger, FloatOrInt, MetricsDict, ScalarMetricsDict, SequenceMetricsDict from InnerEye.ML.model_config_base import ModelConfigBase from InnerEye.ML.pipelines.ensemble import EnsemblePipeline from InnerEye.ML.pipelines.inference import FullImageInferencePipelineBase, InferencePipeline, InferencePipelineBase @@ -182,23 +182,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, 
results_folder=results_folder), range(len(ds))) - average_dice = list() - metrics_writer = MetricsPerPatientWriter() - for (patient_metadata, metrics_for_patient) in pool_outputs: - # Add the Dice score for the foreground classes, stored in the default hue - metrics.add_average_foreground_dice(metrics_for_patient) - average_dice.append(metrics_for_patient.get_single_metric(MetricType.DICE)) - # Structure names does not include the background class (index 0) - for structure_name in config.ground_truth_ids: - dice_for_struct = metrics_for_patient.get_single_metric(MetricType.DICE, hue=structure_name) - hd_for_struct = metrics_for_patient.get_single_metric(MetricType.HAUSDORFF_mm, hue=structure_name) - md_for_struct = metrics_for_patient.get_single_metric(MetricType.MEAN_SURFACE_DIST_mm, hue=structure_name) - metrics_writer.add(patient=str(patient_metadata.patient_id), - structure=structure_name, - dice=dice_for_struct, - hausdorff_distance_mm=hd_for_struct, - mean_distance_mm=md_for_struct) - + metrics_writer, average_dice = populate_metrics_writer(pool_outputs, config) metrics_writer.to_csv(results_folder / SUBJECT_METRICS_FILE_NAME) metrics_writer.save_aggregates_to_csv(results_folder / METRICS_AGGREGATES_FILE) if config.is_plotting_enabled: @@ -249,6 +233,35 @@ def evaluate_model_predictions(process_id: int, return sample.metadata, metrics_per_class +def populate_metrics_writer( + model_prediction_evaluations: List[Tuple[PatientMetadata, MetricsDict]], + config: SegmentationModelBase) -> Tuple[MetricsPerPatientWriter, List[FloatOrInt]]: + """ + Populate a MetricsPerPatientWriter with the metrics for each patient + :param model_prediction_evaluations: The list of PatientMetadata/MetricsDict tuples obtained + from evaluate_model_predictions + :param config: The SegmentationModelBase config from which we read the ground_truth_ids + :returns: A new MetricsPerPatientWriter and a list of foreground DICE score averages + """ + average_dice: List[FloatOrInt] = [] + metrics_writer = MetricsPerPatientWriter() + for (patient_metadata, metrics_for_patient) in model_prediction_evaluations: + # Add the Dice score for the foreground classes, stored in the default hue + metrics.add_average_foreground_dice(metrics_for_patient) + average_dice.append(metrics_for_patient.get_single_metric(MetricType.DICE)) + # Structure names does not include the background class (index 0) + for structure_name in config.ground_truth_ids: + dice_for_struct = metrics_for_patient.get_single_metric(MetricType.DICE, hue=structure_name) + hd_for_struct = metrics_for_patient.get_single_metric(MetricType.HAUSDORFF_mm, hue=structure_name) + md_for_struct = metrics_for_patient.get_single_metric(MetricType.MEAN_SURFACE_DIST_mm, hue=structure_name) + metrics_writer.add(patient=str(patient_metadata.patient_id), + structure=structure_name, + dice=dice_for_struct, + hausdorff_distance_mm=hd_for_struct, + mean_distance_mm=md_for_struct) + return metrics_writer, average_dice + + def get_patient_results_folder(results_folder: Path, patient_id: int) -> Path: """ Gets a folder name that will contain all results for a given patient, like root/017 for patient 17. diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index e1003cf7e..690b133f3 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -2,7 +2,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
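Recording NaN for structures without ground truth and then averaging with skip_nan_when_averaging=True behaves, in effect, like numpy's nanmean. A small stand-alone illustration, independent of the InnerEye MetricsDict:

    # Averaging Dice scores while ignoring structures that had no ground truth.
    import numpy as np

    dice_per_structure = [0.8, np.nan, 0.6]        # NaN marks a structure with no ground truth
    assert np.isnan(np.mean(dice_per_structure))   # a plain mean is poisoned by the NaN
    assert np.isclose(np.nanmean(dice_per_structure), 0.7)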
# ------------------------------------------------------------------------------------------ -from typing import Any, List +from InnerEye.ML.metrics_dict import MetricsDict +from typing import Any, List, Tuple import numpy as np import pandas as pd @@ -21,9 +22,9 @@ from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint from Tests.ML.configs.DummyModel import DummyModel from InnerEye.ML.utils.split_dataset import DatasetSplits -from InnerEye.ML.dataset.sample import Sample +from InnerEye.ML.dataset.sample import PatientMetadata, Sample from InnerEye.ML.common import ModelExecutionMode -from InnerEye.ML.model_testing import store_inference_results, evaluate_model_predictions +from InnerEye.ML.model_testing import store_inference_results, evaluate_model_predictions, populate_metrics_writer @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @@ -274,6 +275,8 @@ def test_evaluate_model_predictions() -> None: if not results_folder.is_dir(): results_folder.mkdir() + model_prediction_evaluations: List[Tuple[PatientMetadata, MetricsDict]] = [] + for sample_index, sample in enumerate(ds, 1): sample = Sample.from_dict(sample=sample) posteriors = np.zeros((3,) + sample.mask.shape, 'float32') @@ -300,24 +303,31 @@ def test_evaluate_model_predictions() -> None: dataset=ds, results_folder=results_folder) + model_prediction_evaluations.append((metadata, metrics_per_class)) + # Patient 3 has one missing ground truth channel: "region" if sample.metadata.patient_id == '3': assert 'Dice' in metrics_per_class.values('region_1').keys() assert 'HausdorffDistance_millimeters' in metrics_per_class.values('region_1').keys() assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values('region_1').keys() for hue_name in ['region', 'Default']: - assert len(metrics_per_class.values(hue_name).keys()) == 0 + for metric_type in metrics_per_class.values(hue_name).keys(): + assert np.isnan(metrics_per_class.values(hue_name)[metric_type]).all() # Patient 4 has all missing ground truth channels: "region", "region_1" if sample.metadata.patient_id == '4': for hue_name in ['region_1', 'region', 'Default']: - assert len(metrics_per_class.values(hue_name).keys()) == 0 + for metric_type in metrics_per_class.values(hue_name).keys(): + assert np.isnan(metrics_per_class.values(hue_name)[metric_type]).all() # Patient 5 has no missing ground truth channels if sample.metadata.patient_id == '5': - assert len(metrics_per_class.values('Default').keys()) == 0 + for metric_type in metrics_per_class.values('Default').keys(): + assert np.isnan(metrics_per_class.values('Default')[metric_type]).all() for hue_name in ['region_1', 'region']: assert 'Dice' in metrics_per_class.values(hue_name).keys() assert 'HausdorffDistance_millimeters' in metrics_per_class.values(hue_name).keys() assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() + metrics_writer, average_dice = populate_metrics_writer(model_prediction_evaluations, config) + \ No newline at end of file From 899e64803391dfc2b539eb15fdede1057e6f30c6 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 25 Jun 2021 09:58:54 +0100 Subject: [PATCH 12/45] Deeper testing in test_evaluate_model_predictions To include metrics writer --- InnerEye/ML/utils/metrics_util.py | 2 +- Tests/ML/pipelines/test_inference.py | 26 +++++++++++++++++++++++++- 2 files changed, 26 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index d3bc98a5b..d44835696 100644 --- 
a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -36,7 +36,7 @@ def add(self, hausdorff_distance_mm: float, mean_distance_mm: float) -> None: """ - Adds a Dice score, Mean nad Hausdorff Distances for a patient + structure combination to the present object. + Adds a Dice score, Mean and Hausdorff Distances for a patient + structure combination to the present object. :param patient: The name of the patient. :param structure: The structure that is predicted for. diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 690b133f3..13de292fa 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -2,6 +2,7 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +from numpy.core.numeric import NaN from InnerEye.ML.metrics_dict import MetricsDict from typing import Any, List, Tuple @@ -330,4 +331,27 @@ def test_evaluate_model_predictions() -> None: assert 'MeanSurfaceDistance_millimeters' in metrics_per_class.values(hue_name).keys() metrics_writer, average_dice = populate_metrics_writer(model_prediction_evaluations, config) - \ No newline at end of file + # Patient 3 has only one missing ground truth channel + assert not np.isnan(average_dice[0]) + assert np.isnan(float(metrics_writer.columns["Dice"][0])) + assert not np.isnan(float(metrics_writer.columns["Dice"][1])) + assert np.isnan(float(metrics_writer.columns["HausdorffDistance_mm"][0])) + assert not np.isnan(float(metrics_writer.columns["HausdorffDistance_mm"][1])) + assert np.isnan(float(metrics_writer.columns["MeanDistance_mm"][0])) + assert not np.isnan(float(metrics_writer.columns["MeanDistance_mm"][1])) + # Patient 4 has all missing ground truth channels + assert np.isnan(average_dice[1]) + assert np.isnan(float(metrics_writer.columns["Dice"][2])) + assert np.isnan(float(metrics_writer.columns["Dice"][3])) + assert np.isnan(float(metrics_writer.columns["HausdorffDistance_mm"][2])) + assert np.isnan(float(metrics_writer.columns["HausdorffDistance_mm"][3])) + assert np.isnan(float(metrics_writer.columns["MeanDistance_mm"][2])) + assert np.isnan(float(metrics_writer.columns["MeanDistance_mm"][3])) + # Patient 5 has no missing ground truth channels. + assert average_dice[2] > 0 + assert float(metrics_writer.columns["Dice"][4]) >= 0 + assert float(metrics_writer.columns["Dice"][5]) >= 0 + assert float(metrics_writer.columns["HausdorffDistance_mm"][4]) >= 0 + assert float(metrics_writer.columns["HausdorffDistance_mm"][5]) >= 0 + assert float(metrics_writer.columns["MeanDistance_mm"][4]) >= 0 + assert float(metrics_writer.columns["MeanDistance_mm"][5]) >= 0 From 2b46a44f29fed3e80d3b76488b58e8c2ca8edf74 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 25 Jun 2021 10:08:29 +0100 Subject: [PATCH 13/45] Reverting CHANGELOG for now --- CHANGELOG.md | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 313b7cc65..af057d167 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,13 +9,11 @@ For each Pull Request, the affected code parts should be briefly described and a Once a release is done, the "Upcoming" section becomes the release changelog, and a new empty "Upcoming" should be created. 
+ ## Upcoming ### Added -- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Added ability to run segmentation inference - module in the test data without or partial ground truth files. - ### Changed ### Fixed @@ -29,6 +27,7 @@ any large models anymore because data loaders ran out of memory. ### Deprecated + ## 0.3 (2021-06-01) ### Added From 94b90ab44b1b94554e408c9f764dd8d6e6f409df Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 25 Jun 2021 20:46:32 +0100 Subject: [PATCH 14/45] Unused import --- Tests/ML/pipelines/test_inference.py | 1 - 1 file changed, 1 deletion(-) diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index 13de292fa..e1a45a2f6 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -2,7 +2,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ -from numpy.core.numeric import NaN from InnerEye.ML.metrics_dict import MetricsDict from typing import Any, List, Tuple From 36f39949e45017fb2a71926b86ffa86bcdeadf3a Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 25 Jun 2021 20:47:22 +0100 Subject: [PATCH 15/45] WiP testing partial_ground_truth metrics output --- Tests/ML/test_model_testing.py | 102 ++++++++++++++++----------------- 1 file changed, 49 insertions(+), 53 deletions(-) diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index ebda3d3b8..bd20926d3 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -30,7 +30,8 @@ @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") -def test_model_test(test_output_dirs: OutputFolderForTests) -> None: +@pytest.mark.parametrize("partial_ground_truth", [True, False]) +def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth: bool) -> None: train_and_test_data_dir = full_ml_test_data_path("train_and_test_data") seed_everything(42) config = DummyModel() @@ -40,12 +41,16 @@ def test_model_test(test_output_dirs: OutputFolderForTests) -> None: transform = config.get_full_image_sample_transforms().test df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME)) df = df[df.subject.isin([1, 2])] + if partial_ground_truth: + # Patient 1 has one missing ground truth channel: "region" + config.check_exclusive = False + config.ground_truth_ids = ["region", "region_1"] + df = df[df["subject"].ne(1) & df["channel"].ne("region")] # noinspection PyTypeHints config._datasets_for_inference = \ {ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform)} # type: ignore execution_mode = ModelExecutionMode.TEST - checkpoint_handler = get_default_checkpoint_handler(model_config=config, - project_root=test_output_dirs.root_dir) + checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder. 
create_model_and_store_checkpoint(config, config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX) checkpoint_handler.additional_training_done() @@ -53,60 +58,51 @@ def test_model_test(test_output_dirs: OutputFolderForTests) -> None: data_split=execution_mode, checkpoint_handler=checkpoint_handler) epoch_dir = config.outputs_folder / get_best_epoch_results_path(execution_mode) - assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) - assert config.outputs_folder.is_dir() - assert epoch_dir.is_dir() - patient1 = io_util.load_nifti_image(train_and_test_data_dir / "id1_channel1.nii.gz") - patient2 = io_util.load_nifti_image(train_and_test_data_dir / "id2_channel1.nii.gz") + if partial_ground_truth: + raise NotImplementedError("soon") + else: + assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) + assert config.outputs_folder.is_dir() + assert epoch_dir.is_dir() + patient1 = io_util.load_nifti_image(train_and_test_data_dir / "id1_channel1.nii.gz") + patient2 = io_util.load_nifti_image(train_and_test_data_dir / "id2_channel1.nii.gz") - assert_file_contains_string(epoch_dir / DATASET_ID_FILE, placeholder_dataset_id) - assert_file_contains_string(epoch_dir / GROUND_TRUTH_IDS_FILE, "region") - assert_text_files_match(epoch_dir / model_testing.SUBJECT_METRICS_FILE_NAME, - train_and_test_data_dir / model_testing.SUBJECT_METRICS_FILE_NAME) - assert_text_files_match(epoch_dir / model_testing.METRICS_AGGREGATES_FILE, - train_and_test_data_dir / model_testing.METRICS_AGGREGATES_FILE) - # Plotting results vary between platforms. Can only check if the file is generated, but not its contents. - assert (epoch_dir / model_testing.BOXPLOT_FILE).exists() + assert_file_contains_string(epoch_dir / DATASET_ID_FILE, placeholder_dataset_id) + assert_file_contains_string(epoch_dir / GROUND_TRUTH_IDS_FILE, "region") + assert_text_files_match(epoch_dir / model_testing.SUBJECT_METRICS_FILE_NAME, + train_and_test_data_dir / model_testing.SUBJECT_METRICS_FILE_NAME) + assert_text_files_match(epoch_dir / model_testing.METRICS_AGGREGATES_FILE, + train_and_test_data_dir / model_testing.METRICS_AGGREGATES_FILE) + # Plotting results vary between platforms. Can only check if the file is generated, but not its contents. 
+ assert (epoch_dir / model_testing.BOXPLOT_FILE).exists() - assert_nifti_content(epoch_dir / "001" / "posterior_region.nii.gz", get_image_shape(patient1), - patient1.header, - [137], np.ubyte) - assert_nifti_content(epoch_dir / "002" / "posterior_region.nii.gz", get_image_shape(patient2), - patient2.header, - [137], np.ubyte) - assert_nifti_content(epoch_dir / "001" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient1), - patient1.header, - [1], np.ubyte) - assert_nifti_content(epoch_dir / "002" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient2), - patient2.header, - [1], np.ubyte) - assert_nifti_content(epoch_dir / "001" / "posterior_background.nii.gz", get_image_shape(patient1), - patient1.header, - [117], np.ubyte) - assert_nifti_content(epoch_dir / "002" / "posterior_background.nii.gz", get_image_shape(patient2), - patient2.header, - [117], np.ubyte) - thumbnails_folder = epoch_dir / model_testing.THUMBNAILS_FOLDER - assert thumbnails_folder.is_dir() - png_files = list(thumbnails_folder.glob("*.png")) - overlays = [f for f in png_files if "_region_slice_" in str(f)] - assert len(overlays) == len(df.subject.unique()), "There should be one overlay/contour file per subject" + assert_nifti_content(epoch_dir / "001" / "posterior_region.nii.gz", get_image_shape(patient1), patient1.header, [137], np.ubyte) + assert_nifti_content(epoch_dir / "002" / "posterior_region.nii.gz", get_image_shape(patient2), patient2.header, [137], np.ubyte) + assert_nifti_content(epoch_dir / "001" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient1), patient1.header, [1], np.ubyte) + assert_nifti_content(epoch_dir / "002" / DEFAULT_RESULT_IMAGE_NAME, get_image_shape(patient2), patient2.header, [1], np.ubyte) + assert_nifti_content(epoch_dir / "001" / "posterior_background.nii.gz", get_image_shape(patient1), patient1.header, [117], np.ubyte) + assert_nifti_content(epoch_dir / "002" / "posterior_background.nii.gz", get_image_shape(patient2), patient2.header, [117], np.ubyte) + thumbnails_folder = epoch_dir / model_testing.THUMBNAILS_FOLDER + assert thumbnails_folder.is_dir() + png_files = list(thumbnails_folder.glob("*.png")) + overlays = [f for f in png_files if "_region_slice_" in str(f)] + assert len(overlays) == len(df.subject.unique()), "There should be one overlay/contour file per subject" - # Writing dataset.csv normally happens at the beginning of training, - # but this test reads off a saved checkpoint file. - # Dataset.csv must be present for plot_cross_validation. - config.write_dataset_files() - # Test if the metrics files can be picked up correctly by the cross validation code - config_and_files = get_config_and_results_for_offline_runs(config) - result_files = config_and_files.files - assert len(result_files) == 1 - for file in result_files: - assert file.execution_mode == execution_mode - assert file.dataset_csv_file is not None - assert file.dataset_csv_file.exists() - assert file.metrics_file is not None - assert file.metrics_file.exists() + # Writing dataset.csv normally happens at the beginning of training, + # but this test reads off a saved checkpoint file. + # Dataset.csv must be present for plot_cross_validation. 
+ config.write_dataset_files() + # Test if the metrics files can be picked up correctly by the cross validation code + config_and_files = get_config_and_results_for_offline_runs(config) + result_files = config_and_files.files + assert len(result_files) == 1 + for file in result_files: + assert file.execution_mode == execution_mode + assert file.dataset_csv_file is not None + assert file.dataset_csv_file.exists() + assert file.metrics_file is not None + assert file.metrics_file.exists() @pytest.mark.parametrize("config", [DummyModel(), ClassificationModelForTesting()]) From 03e530c0111d609d2ae9343feaa5e2fc4cd0a442 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sat, 26 Jun 2021 13:54:23 +0100 Subject: [PATCH 16/45] messy WiP with testing ground truths --- InnerEye/Scripts/create_small_test_data.py | 40 ++++++++++++++++++++++ Tests/ML/test_model_testing.py | 4 +++ 2 files changed, 44 insertions(+) create mode 100644 InnerEye/Scripts/create_small_test_data.py diff --git a/InnerEye/Scripts/create_small_test_data.py b/InnerEye/Scripts/create_small_test_data.py new file mode 100644 index 000000000..f6156ae75 --- /dev/null +++ b/InnerEye/Scripts/create_small_test_data.py @@ -0,0 +1,40 @@ +# ------------------------------------------------------------------------------------------ +# Copyright (c) Microsoft Corporation. All rights reserved. +# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. +# ------------------------------------------------------------------------------------------ +r""" +Script to create smaller and broader test data than the images found in +InnerEye\Tests\ML\test_data\train_and_test_data. We provide multiple non-overlapping labels, and +an additional subject. +To run from the repository root: + conda activate InnerEye + export PYTHONPATH=`pwd` + python InnerEye/scripts/create_small_test_data.py +""" +from pathlib import Path +import numpy as np +from InnerEye.ML.utils import io_util +from InnerEye.ML.utils.io_util import ImageHeader + +XY_DIMENSION = 25 +Z_DIMENSION = 10 + +def create_small_train_and_test_data(output_dir: Path) -> None: + """ + """ + for id in ["id1", "id2", "id3"]: + for channel in ["channel1", "channel2"]: + image = np.random.random_sample((Z_DIMENSION, XY_DIMENSION, XY_DIMENSION)) + image = np.array((image + 1) * 255).astype(int) + header = ImageHeader(origin=(1, 1, 1), direction=(1, 0, 0, 0, 1, 0, 0, 0, 1), spacing=(1, 1, 1)) + io_util.store_as_nifti(image, header, (output_dir / (id + channel + ".nii.gz")).absolute, np.ubyte) + + +def main() -> None: + output_dir = Path("train_and_test_data", "small") + output_dir.mkdir(parents=True, exist_ok=True) + create_small_train_and_test_data(output_dir) + + +if __name__ == "__main__": + main() diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index bd20926d3..06e9e4fa1 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -28,6 +28,8 @@ get_default_checkpoint_handler, get_image_shape from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint +@pytest.fixture +def @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @pytest.mark.parametrize("partial_ground_truth", [True, False]) @@ -35,6 +37,8 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth train_and_test_data_dir = full_ml_test_data_path("train_and_test_data") seed_everything(42) config = DummyModel() + config.check_exclusive = False + config.ground_truth_ids = ["region", "region_1"] 
config.set_output_to(test_output_dirs.root_dir) placeholder_dataset_id = "place_holder_dataset_id" config.azure_dataset_id = placeholder_dataset_id From a79b1d074d0389fd9d4a283cebfab6863b88e92c Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sat, 26 Jun 2021 14:23:48 +0100 Subject: [PATCH 17/45] Removing started small test data script --- InnerEye/Scripts/create_small_test_data.py | 40 ---------------------- 1 file changed, 40 deletions(-) delete mode 100644 InnerEye/Scripts/create_small_test_data.py diff --git a/InnerEye/Scripts/create_small_test_data.py b/InnerEye/Scripts/create_small_test_data.py deleted file mode 100644 index f6156ae75..000000000 --- a/InnerEye/Scripts/create_small_test_data.py +++ /dev/null @@ -1,40 +0,0 @@ -# ------------------------------------------------------------------------------------------ -# Copyright (c) Microsoft Corporation. All rights reserved. -# Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. -# ------------------------------------------------------------------------------------------ -r""" -Script to create smaller and broader test data than the images found in -InnerEye\Tests\ML\test_data\train_and_test_data. We provide multiple non-overlapping labels, and -an additional subject. -To run from the repository root: - conda activate InnerEye - export PYTHONPATH=`pwd` - python InnerEye/scripts/create_small_test_data.py -""" -from pathlib import Path -import numpy as np -from InnerEye.ML.utils import io_util -from InnerEye.ML.utils.io_util import ImageHeader - -XY_DIMENSION = 25 -Z_DIMENSION = 10 - -def create_small_train_and_test_data(output_dir: Path) -> None: - """ - """ - for id in ["id1", "id2", "id3"]: - for channel in ["channel1", "channel2"]: - image = np.random.random_sample((Z_DIMENSION, XY_DIMENSION, XY_DIMENSION)) - image = np.array((image + 1) * 255).astype(int) - header = ImageHeader(origin=(1, 1, 1), direction=(1, 0, 0, 0, 1, 0, 0, 0, 1), spacing=(1, 1, 1)) - io_util.store_as_nifti(image, header, (output_dir / (id + channel + ".nii.gz")).absolute, np.ubyte) - - -def main() -> None: - output_dir = Path("train_and_test_data", "small") - output_dir.mkdir(parents=True, exist_ok=True) - create_small_train_and_test_data(output_dir) - - -if __name__ == "__main__": - main() From fcacfcd81e409f5d8868ce2bfa1bbe533d5c7e9d Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 08:45:55 +0100 Subject: [PATCH 18/45] Adding labels for partial test --- Tests/ML/test_model_testing.py | 16 +++++++--------- Tests/ML/util.py | 2 +- 2 files changed, 8 insertions(+), 10 deletions(-) diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 06e9e4fa1..442ce255c 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -2,6 +2,8 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. 
# ------------------------------------------------------------------------------------------ +from typing import Tuple + import numpy as np import pandas as pd import pytest @@ -24,12 +26,10 @@ from InnerEye.ML.visualizers.plot_cross_validation import get_config_and_results_for_offline_runs from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting from Tests.ML.configs.DummyModel import DummyModel -from Tests.ML.util import assert_file_contains_string, assert_nifti_content, assert_text_files_match, \ - get_default_checkpoint_handler, get_image_shape +from Tests.ML.util import (assert_file_contains_string, assert_nifti_content, assert_text_files_match, + get_default_checkpoint_handler, get_image_shape) from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint -@pytest.fixture -def @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @pytest.mark.parametrize("partial_ground_truth", [True, False]) @@ -37,8 +37,6 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth train_and_test_data_dir = full_ml_test_data_path("train_and_test_data") seed_everything(42) config = DummyModel() - config.check_exclusive = False - config.ground_truth_ids = ["region", "region_1"] config.set_output_to(test_output_dirs.root_dir) placeholder_dataset_id = "place_holder_dataset_id" config.azure_dataset_id = placeholder_dataset_id @@ -46,10 +44,10 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME)) df = df[df.subject.isin([1, 2])] if partial_ground_truth: - # Patient 1 has one missing ground truth channel: "region" + # Patient 1 has one missing ground truth channel: "region_1" config.check_exclusive = False - config.ground_truth_ids = ["region", "region_1"] - df = df[df["subject"].ne(1) & df["channel"].ne("region")] + config.ground_truth_ids = ["region", "region_1", "region_2"] + df = df[df["subject"].ne(1) & df["channel"].ne("region_1")] # noinspection PyTypeHints config._datasets_for_inference = \ {ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform)} # type: ignore diff --git a/Tests/ML/util.py b/Tests/ML/util.py index fb49c116d..8faddfaa3 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -99,7 +99,7 @@ def assert_file_contains_string(full_file: Union[str, Path], expected: Any = Non file_path = full_file if isinstance(full_file, Path) else Path(full_file) assert_file_exists(file_path) if expected is not None: - _assert_line(file_path.read_text(), expected) + assert expected.strip() in file_path.read_text() def assert_text_files_match(full_file: Path, expected_file: Path) -> None: From 75b1c370ed52b0cd3b551bd0c7ad52c9a06497b7 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 10:43:17 +0100 Subject: [PATCH 19/45] WiP on partial ground truth unit test of model_test --- Tests/ML/configs/DummyModel.py | 12 ++++++----- Tests/ML/pipelines/test_inference.py | 2 +- Tests/ML/test_model_testing.py | 31 +++++++++++++++++++++++----- 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/Tests/ML/configs/DummyModel.py b/Tests/ML/configs/DummyModel.py index ba03f1819..1979ac85f 100644 --- a/Tests/ML/configs/DummyModel.py +++ b/Tests/ML/configs/DummyModel.py @@ -60,16 +60,18 @@ def __init__(self, **kwargs: Any) -> None: weight_decay=1e-4, class_weights=[0.5, 0.5], detect_anomaly=False, - use_mixed_precision=False, - ) + use_mixed_precision=False) 
self.add_and_validate(kwargs) # Trying to run DDP from the test suite hangs, hence restrict to single GPU. self.max_num_gpus = 1 + train_subject_ids = ['1', '2', '3'] + test_subject_ids = ['4', '7'] + val_subject_ids = ['5', '6'] def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: - return DatasetSplits(train=dataset_df[dataset_df.subject.isin(['1', '2', '3'])], - test=dataset_df[dataset_df.subject.isin(['4', '7'])], - val=dataset_df[dataset_df.subject.isin(['5', '6'])]) + return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)], + test=dataset_df[dataset_df.subject.isin(self.test_subject_ids)], + val=dataset_df[dataset_df.subject.isin(self.val_subject_ids)]) def get_parameter_search_hyperdrive_config(self, run_config: ScriptRunConfig) -> HyperDriveConfig: return super().get_parameter_search_hyperdrive_config(run_config) diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index e1a45a2f6..f49f975a1 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -238,7 +238,7 @@ def test_evaluate_model_predictions() -> None: """ Creates an 'InferencePipeline.Result' object using pre-defined volumes, stores results and evaluates metrics. """ - # Patient 3,4,5 are in test dataset such that: + # Patients 3, 4, and 5 are in test dataset such that: # Patient 3 has one missing ground truth channel: "region" # Patient 4 has all missing ground truth channels: "region", "region_1" # Patient 5 has no missing ground truth channels. diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 442ce255c..f5931ed52 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -42,15 +42,36 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth config.azure_dataset_id = placeholder_dataset_id transform = config.get_full_image_sample_transforms().test df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME)) - df = df[df.subject.isin([1, 2])] if partial_ground_truth: - # Patient 1 has one missing ground truth channel: "region_1" config.check_exclusive = False - config.ground_truth_ids = ["region", "region_1", "region_2"] - df = df[df["subject"].ne(1) & df["channel"].ne("region_1")] + # TO ASK: Why do wwe need the next three when they are (always?) the same? + config.fg_ids = ["region", "region_1"] + config.ground_truth_ids = ["region", "region_1"] + config.ground_truth_ids_display_names = ["region", "region_1"] + # As in Tests.ML.pipelines.test.inference.test_evaluate_model_predictions patients 3, 4, + # and 5 are in the test dataset + df = df[df.subject.isin([3, 4, 5])] + # Patient 3 has one missing ground truth channel: "region" + df = df[df["subject"].ne(3) | df["channel"].ne("region")] + # Patient 4 has all missing ground truth channels: "region", "region_1" + df = df[df["subject"].ne(4) | df["channel"].ne("region")] + df = df[df["subject"].ne(4) | df["channel"].ne("region_1")] + # Patient 5 has no missing ground truth channels. + config.train_subject_ids = ['1', '2'] + config.test_subject_ids = ['3', '4', '5'] + config.val_subject_ids = ['6', '7'] + # TO ASK: Why doesn't the partial_ground_truth = False version of this test need the next + # line? 
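The row filter used above keeps every row where the subject differs from 3 or the channel differs from "region", which by De Morgan's law drops exactly the rows where both match. A toy example, using a made-up frame rather than the real dataset.csv:

    # Toy illustration of df[df["subject"].ne(3) | df["channel"].ne("region")].
    import pandas as pd

    df = pd.DataFrame({"subject": [3, 3, 4], "channel": ["region", "region_1", "region"]})
    kept = df[df["subject"].ne(3) | df["channel"].ne("region")]
    # Only the (subject=3, channel="region") row is dropped.
    assert list(kept.itertuples(index=False, name=None)) == [(3, "region_1"), (4, "region")]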
+ config.dataset_data_frame = df + else: + df = df[df.subject.isin([1, 2])] # noinspection PyTypeHints config._datasets_for_inference = \ - {ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform)} # type: ignore + {ModelExecutionMode.TEST: FullImageDataset( + config, + df, + full_image_sample_transforms=transform, + allow_incomplete_labels=partial_ground_truth)} # type: ignore execution_mode = ModelExecutionMode.TEST checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder. From 6f4bbd853c5c4c4afd1f78ccad25d0b29fb2cde7 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 15:18:39 +0100 Subject: [PATCH 20/45] Unit test of partial ground truth works, but other fails :( --- InnerEye/ML/metrics.py | 4 ++-- InnerEye/ML/model_testing.py | 24 ++++++++++++------------ InnerEye/ML/utils/metrics_util.py | 13 +++++++++++-- Tests/ML/test_model_testing.py | 29 ++++++++++++++++++++--------- Tests/ML/util.py | 24 ++++++++++++++++++++++++ 5 files changed, 69 insertions(+), 25 deletions(-) diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index 8b8cab8d1..a60488197 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -57,7 +57,7 @@ class InferenceMetricsForSegmentation(InferenceMetrics): """ Stores metrics for segmentation models, per execution mode and epoch. """ - data_split: ModelExecutionMode + execution_mode: ModelExecutionMode metrics: float def get_metrics_log_key(self) -> str: @@ -65,7 +65,7 @@ def get_metrics_log_key(self) -> str: Gets a string name for logging the metrics specific to the execution mode (train, val, test) :return: """ - return f"InferenceMetrics_{self.data_split.value}" + return f"InferenceMetrics_{self.execution_mode.value}" def log_metrics(self, run_context: Run = None) -> None: """ diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 3fb1f02aa..087abded5 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -76,14 +76,14 @@ def model_test(config: ModelConfigBase, def segmentation_model_test(config: SegmentationModelBase, - data_split: ModelExecutionMode, + execution_mode: ModelExecutionMode, checkpoint_handler: CheckpointHandler, model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> InferenceMetricsForSegmentation: """ The main testing loop for segmentation models. It loads the model and datasets, then proceeds to test the model for all requested checkpoints. :param config: The arguments object which has a valid random seed attribute. - :param data_split: Indicates which of the 3 sets (training, test, or validation) is being processed. + :param execution_mode: Indicates which of the 3 sets (training, test, or validation) is being processed. :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization :param model_proc: whether we are testing an ensemble or single model :return: InferenceMetric object that contains metrics related for all of the checkpoint epochs. 
@@ -93,12 +93,12 @@ def segmentation_model_test(config: SegmentationModelBase, if not checkpoints_to_test: raise ValueError("There were no checkpoints available for model testing.") - epoch_results_folder = config.outputs_folder / get_best_epoch_results_path(data_split, model_proc) + epoch_results_folder = config.outputs_folder / get_best_epoch_results_path(execution_mode, model_proc) # save the datasets.csv used config.write_dataset_files(root=epoch_results_folder) - epoch_and_split = f"{data_split.value} set" + epoch_and_split = f"{execution_mode.value} set" epoch_dice_per_image = segmentation_model_test_epoch(config=copy.deepcopy(config), - data_split=data_split, + execution_mode=execution_mode, checkpoint_paths=checkpoints_to_test, results_folder=epoch_results_folder, epoch_and_split=epoch_and_split) @@ -110,13 +110,13 @@ def segmentation_model_test(config: SegmentationModelBase, logging.info(f"Mean Dice: {epoch_average_dice:4f}") if model_proc == ModelProcessing.ENSEMBLE_CREATION: # For the upload, we want the path without the "OTHER_RUNS/ENSEMBLE" prefix. - name = str(get_best_epoch_results_path(data_split, ModelProcessing.DEFAULT)) + name = str(get_best_epoch_results_path(execution_mode, ModelProcessing.DEFAULT)) PARENT_RUN_CONTEXT.upload_folder(name=name, path=str(epoch_results_folder)) - return InferenceMetricsForSegmentation(data_split=data_split, metrics=result) + return InferenceMetricsForSegmentation(execution_mode=execution_mode, metrics=result) def segmentation_model_test_epoch(config: SegmentationModelBase, - data_split: ModelExecutionMode, + execution_mode: ModelExecutionMode, checkpoint_paths: List[Path], results_folder: Path, epoch_and_split: str) -> Optional[List[float]]: @@ -126,7 +126,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, where the average is taken across all non-background structures in the image. :param checkpoint_paths: Checkpoint paths to run inference on. :param config: The arguments which specify all required information. - :param data_split: Is the model evaluated on train, test, or validation set? + :param execution_mode: Is the model evaluated on train, test, or validation set? :param results_folder: The folder where to store the results :param epoch_and_split: A string that should uniquely identify the epoch and the data split (train/val/test). :raises TypeError: If the arguments are of the wrong type. 
@@ -136,8 +136,8 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, ml_util.set_random_seed(config.get_effective_random_seed(), "Model testing") results_folder.mkdir(exist_ok=True) - test_dataframe = config.get_dataset_splits()[data_split] - test_csv_path = results_folder / STORED_CSV_FILE_NAMES[data_split] + test_dataframe = config.get_dataset_splits()[execution_mode] + test_csv_path = results_folder / STORED_CSV_FILE_NAMES[execution_mode] test_dataframe.to_csv(path_or_buf=test_csv_path, index=False) logging.info("Results directory: {}".format(results_folder)) logging.info(f"Starting evaluation of model {config.model_name} on {epoch_and_split}") @@ -145,7 +145,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, # Write the dataset id and ground truth ids into the results folder store_run_information(results_folder, config.azure_dataset_id, config.ground_truth_ids, config.image_channels) - ds = config.get_torch_dataset_for_inference(data_split) + ds = config.get_torch_dataset_for_inference(execution_mode) inference_pipeline = create_inference_pipeline(config=config, checkpoint_paths=checkpoint_paths) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index d44835696..65502a437 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -73,12 +73,20 @@ def save_aggregates_to_csv(self, file_path: Path) -> None: stats_columns = ['mean', 'std', 'min', 'max'] # get aggregates for all metrics - aggregates = self.to_data_frame().groupby(MetricsFileColumns.Structure.value).describe() + df = self.to_data_frame() + aggregates = df.groupby(MetricsFileColumns.Structure.value).describe() + num_subjects = len(pd.unique(df[MetricsFileColumns.Patient.value])) + total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() + if not total_num_patients_column_name.endswith("s"): + total_num_patients_column_name += "s" def filter_rename_metric_columns(_metric_column: str, is_count_column: bool = False) -> pd.DataFrame: _columns = ["count"] + stats_columns if is_count_column else stats_columns _df = aggregates[_metric_column][_columns] - _columns_to_rename = [x for x in _df.columns if x != "count"] + if is_count_column: + _df[total_num_patients_column_name] = num_subjects + _df = _df[["count", total_num_patients_column_name] + stats_columns] + _columns_to_rename = [x for x in _df.columns if x != "count" and x != total_num_patients_column_name] return _df.rename(columns={k: f"{_metric_column}_{k}" for k in _columns_to_rename}) def _merge_df(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame: @@ -108,6 +116,7 @@ def to_data_frame(self) -> DataFrame: data=df[MetricsFileColumns.HausdorffDistanceMM.value].apply(float)) df[MetricsFileColumns.MeanDistanceMM.value] = pd.Series( data=df[MetricsFileColumns.MeanDistanceMM.value].apply(float)) + df = df.sort_values(by=[MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value]) return df diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index f5931ed52..4b4eae1a0 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -10,8 +10,9 @@ from pytorch_lightning import seed_everything from InnerEye.Common import common_util -from InnerEye.Common.common_util import get_best_epoch_results_path +from InnerEye.Common.common_util import METRICS_AGGREGATES_FILE, get_best_epoch_results_path from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path +from 
InnerEye.Common.metrics_constants import MetricsFileColumns from InnerEye.Common.output_directories import OutputFolderForTests from InnerEye.ML import model_testing from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode @@ -27,7 +28,7 @@ from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting from Tests.ML.configs.DummyModel import DummyModel from Tests.ML.util import (assert_file_contains_string, assert_nifti_content, assert_text_files_match, - get_default_checkpoint_handler, get_image_shape) + assert_csv_column_contains_value,get_default_checkpoint_handler, get_image_shape) from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint @@ -49,20 +50,20 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth config.ground_truth_ids = ["region", "region_1"] config.ground_truth_ids_display_names = ["region", "region_1"] # As in Tests.ML.pipelines.test.inference.test_evaluate_model_predictions patients 3, 4, - # and 5 are in the test dataset - df = df[df.subject.isin([3, 4, 5])] + # and 5 are in the test dataset with: # Patient 3 has one missing ground truth channel: "region" df = df[df["subject"].ne(3) | df["channel"].ne("region")] # Patient 4 has all missing ground truth channels: "region", "region_1" df = df[df["subject"].ne(4) | df["channel"].ne("region")] df = df[df["subject"].ne(4) | df["channel"].ne("region_1")] # Patient 5 has no missing ground truth channels. - config.train_subject_ids = ['1', '2'] - config.test_subject_ids = ['3', '4', '5'] - config.val_subject_ids = ['6', '7'] # TO ASK: Why doesn't the partial_ground_truth = False version of this test need the next # line? config.dataset_data_frame = df + df = df[df.subject.isin([3, 4, 5])] + config.train_subject_ids = ['1', '2'] + config.test_subject_ids = ['3', '4', '5'] + config.val_subject_ids = ['6', '7'] else: df = df[df.subject.isin([1, 2])] # noinspection PyTypeHints @@ -78,13 +79,23 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth create_model_and_store_checkpoint(config, config.checkpoint_folder / BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX) checkpoint_handler.additional_training_done() inference_results = model_testing.segmentation_model_test(config, - data_split=execution_mode, + execution_mode=execution_mode, checkpoint_handler=checkpoint_handler) epoch_dir = config.outputs_folder / get_best_epoch_results_path(execution_mode) + total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() + if not total_num_patients_column_name.endswith("s"): + total_num_patients_column_name += "s" if partial_ground_truth: - raise NotImplementedError("soon") + num_subjects = len(pd.unique(df["subject"])) + assert_csv_column_contains_value( + csv_file_path=epoch_dir / METRICS_AGGREGATES_FILE, + column_name=total_num_patients_column_name, + value=num_subjects, + contains_only_value=True) else: + aggregates_df = pd.read_csv(epoch_dir / METRICS_AGGREGATES_FILE) + assert total_num_patients_column_name not in aggregates_df.columns assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) assert config.outputs_folder.is_dir() assert epoch_dir.is_dir() diff --git a/Tests/ML/util.py b/Tests/ML/util.py index 8faddfaa3..deb45a65e 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -7,6 +7,7 @@ from typing import Any, List, Optional, Tuple, Union import numpy as np +import pandas as pd import pytest import torch from PIL import Image @@ -189,6 
+190,29 @@ def assert_binary_files_match(actual_file: Path, expected_file: Path) -> None: assert False, f"File contents does not match: len(actual)={len(actual)}, len(expected)={len(expected)}" +def assert_csv_column_contains_value( + csv_file_path: Path, + column_name: str, + value: Any, + contains_only_value: bool = True) -> None: + """ + Checks that the column in the csv file contains the given value (and perhaps only contains that value) + :param csv_file_path: The path to the CSV + :param column_name: The name of the column in which we look for the value + :param value: The value to look for + :param contains_only_value: Check that this is the only value in the column (default True) + """ + if not csv_file_path.exists: + raise ValueError(f"The CSV at {csv_file_path} does not exist.") + df = pd.read_csv(csv_file_path) + if not column_name in df.columns: + raise ValueError(f"The column {column_name} is not in th CSV at {csv_file_path}, which has columns {df.columns}.") + if contains_only_value: + assert int(df[[column_name]].nunique(dropna=True)) == 1 + else: + assert int(df[[column_name]].nunique(dropna=True)) > 0 + + DummyPatientMetadata = PatientMetadata(patient_id='42') From 713c2f4df358fb10f86432ef424da37257250d05 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 19:10:30 +0100 Subject: [PATCH 21/45] mypy fixes --- InnerEye/ML/model_testing.py | 22 +++++++++++++++------- InnerEye/ML/utils/metrics_util.py | 6 ++++-- Tests/ML/configs/DummyModel.py | 7 ++++--- Tests/ML/test_model_testing.py | 28 +++++++++++++++++----------- Tests/ML/util.py | 2 +- 5 files changed, 41 insertions(+), 24 deletions(-) diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 087abded5..608c79230 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -78,14 +78,18 @@ def model_test(config: ModelConfigBase, def segmentation_model_test(config: SegmentationModelBase, execution_mode: ModelExecutionMode, checkpoint_handler: CheckpointHandler, - model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> InferenceMetricsForSegmentation: + model_proc: ModelProcessing = ModelProcessing.DEFAULT, + allow_incomplete_labels: bool = False) -> InferenceMetricsForSegmentation: """ The main testing loop for segmentation models. It loads the model and datasets, then proceeds to test the model for all requested checkpoints. :param config: The arguments object which has a valid random seed attribute. :param execution_mode: Indicates which of the 3 sets (training, test, or validation) is being processed. - :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization - :param model_proc: whether we are testing an ensemble or single model + :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization. + :param model_proc: Whether we are testing an ensemble or single model. + :param patient_id: String which contains subject identifier. + :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided. + If true, ground truth files are optional. (Defaults to False.) :return: InferenceMetric object that contains metrics related for all of the checkpoint epochs. 
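A stand-alone sketch of the kind of check assert_csv_column_contains_value performs, written against plain pandas so it runs on its own; the file name and values are arbitrary examples, not outputs of the test suite.

    # Checking that a CSV column contains only one distinct value.
    from pathlib import Path

    import pandas as pd

    csv_path = Path("metrics_aggregates_example.csv")         # arbitrary example file
    pd.DataFrame({"total_patients": [3, 3], "mean": [0.7, 0.6]}).to_csv(csv_path, index=False)

    df = pd.read_csv(csv_path)
    # nunique(dropna=True) == 1 means every non-NaN entry in the column is the same value.
    assert df["total_patients"].nunique(dropna=True) == 1
    assert df["total_patients"].iloc[0] == 3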
""" checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test() @@ -101,7 +105,8 @@ def segmentation_model_test(config: SegmentationModelBase, execution_mode=execution_mode, checkpoint_paths=checkpoints_to_test, results_folder=epoch_results_folder, - epoch_and_split=epoch_and_split) + epoch_and_split=epoch_and_split, + allow_incomplete_labels=allow_incomplete_labels) if epoch_dice_per_image is None: raise ValueError("There was no single checkpoint file available for model testing.") else: @@ -119,7 +124,8 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, execution_mode: ModelExecutionMode, checkpoint_paths: List[Path], results_folder: Path, - epoch_and_split: str) -> Optional[List[float]]: + epoch_and_split: str, + allow_incomplete_labels: bool = False) -> Optional[List[float]]: """ The main testing loop for a given epoch. It loads the model and datasets, then proceeds to test the model. Returns a list with an entry for each image in the dataset. The entry is the average Dice score, @@ -127,10 +133,12 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, :param checkpoint_paths: Checkpoint paths to run inference on. :param config: The arguments which specify all required information. :param execution_mode: Is the model evaluated on train, test, or validation set? - :param results_folder: The folder where to store the results + :param results_folder: The folder where to store the results. :param epoch_and_split: A string that should uniquely identify the epoch and the data split (train/val/test). :raises TypeError: If the arguments are of the wrong type. :raises ValueError: When there are issues loading the model. + :param allow_incomplete_labels: boolean flag. If false, all ground truth files must be provided. + If true, ground truth files are optional. (Defaults to False.) :return A list with the mean dice score (across all structures apart from background) for each image. """ ml_util.set_random_seed(config.get_effective_random_seed(), "Model testing") @@ -184,7 +192,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, metrics_writer, average_dice = populate_metrics_writer(pool_outputs, config) metrics_writer.to_csv(results_folder / SUBJECT_METRICS_FILE_NAME) - metrics_writer.save_aggregates_to_csv(results_folder / METRICS_AGGREGATES_FILE) + metrics_writer.save_aggregates_to_csv(results_folder / METRICS_AGGREGATES_FILE, allow_incomplete_labels) if config.is_plotting_enabled: plt.figure() boxplot_per_structure(metrics_writer.to_data_frame(), diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index 65502a437..040920be2 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -63,12 +63,14 @@ def to_csv(self, file_name: Path) -> None: del sorted_by_dice[dice_numeric] sorted_by_dice.to_csv(file_name, index=False, float_format=self.float_format) - def save_aggregates_to_csv(self, file_path: Path) -> None: + def save_aggregates_to_csv(self, file_path: Path, allow_incomplete_labels: bool = False) -> None: """ Writes the per-structure aggregate Dice scores (mean, median, and others) to a CSV file. The aggregates are those that are output by the Dataframe 'describe' method. :param file_path: The name of the file to write to. + :param allow_incomplete_labels: boolean flag. If false, all ground truth files must be provided. + If true, ground truth files are optional. (Defaults to False.) 
""" stats_columns = ['mean', 'std', 'min', 'max'] @@ -83,7 +85,7 @@ def save_aggregates_to_csv(self, file_path: Path) -> None: def filter_rename_metric_columns(_metric_column: str, is_count_column: bool = False) -> pd.DataFrame: _columns = ["count"] + stats_columns if is_count_column else stats_columns _df = aggregates[_metric_column][_columns] - if is_count_column: + if is_count_column and allow_incomplete_labels: _df[total_num_patients_column_name] = num_subjects _df = _df[["count", total_num_patients_column_name] + stats_columns] _columns_to_rename = [x for x in _df.columns if x != "count" and x != total_num_patients_column_name] diff --git a/Tests/ML/configs/DummyModel.py b/Tests/ML/configs/DummyModel.py index 1979ac85f..d9ddfed4b 100644 --- a/Tests/ML/configs/DummyModel.py +++ b/Tests/ML/configs/DummyModel.py @@ -16,6 +16,9 @@ class DummyModel(SegmentationModelBase): fg_ids = ["region"] + train_subject_ids = ['1', '2', '3'] + test_subject_ids = ['4', '7'] + val_subject_ids = ['5', '6'] def __init__(self, **kwargs: Any) -> None: super().__init__( @@ -64,9 +67,7 @@ def __init__(self, **kwargs: Any) -> None: self.add_and_validate(kwargs) # Trying to run DDP from the test suite hangs, hence restrict to single GPU. self.max_num_gpus = 1 - train_subject_ids = ['1', '2', '3'] - test_subject_ids = ['4', '7'] - val_subject_ids = ['5', '6'] + def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)], diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 4b4eae1a0..4b633b60e 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -2,7 +2,6 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ -from typing import Tuple import numpy as np import pandas as pd @@ -18,6 +17,7 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, ModelArchitectureConfig from InnerEye.ML.dataset.full_image_dataset import FullImageDataset +from InnerEye.ML.dataset.sample import Sample from InnerEye.ML.model_config_base import ModelConfigBase from InnerEye.ML.model_testing import DEFAULT_RESULT_IMAGE_NAME, create_inference_pipeline from InnerEye.ML.pipelines.ensemble import EnsemblePipeline @@ -43,12 +43,16 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth config.azure_dataset_id = placeholder_dataset_id transform = config.get_full_image_sample_transforms().test df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME)) + if partial_ground_truth: config.check_exclusive = False - # TO ASK: Why do wwe need the next three when they are (always?) the same? + + # TO ASK: Why do the next three all exist, given that they are (usually/always?) the same? + # Do we have expample where their usage differs? 
config.fg_ids = ["region", "region_1"] config.ground_truth_ids = ["region", "region_1"] config.ground_truth_ids_display_names = ["region", "region_1"] + # As in Tests.ML.pipelines.test.inference.test_evaluate_model_predictions patients 3, 4, # and 5 are in the test dataset with: # Patient 3 has one missing ground truth channel: "region" @@ -57,22 +61,23 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth df = df[df["subject"].ne(4) | df["channel"].ne("region")] df = df[df["subject"].ne(4) | df["channel"].ne("region_1")] # Patient 5 has no missing ground truth channels. - # TO ASK: Why doesn't the partial_ground_truth = False version of this test need the next - # line? + + # TO ASK: Why doesn't the partial_ground_truth = False version of this test need this next + # line: config.dataset_data_frame = df + df = df[df.subject.isin([3, 4, 5])] + config.train_subject_ids = ['1', '2'] config.test_subject_ids = ['3', '4', '5'] config.val_subject_ids = ['6', '7'] else: df = df[df.subject.isin([1, 2])] + # noinspection PyTypeHints - config._datasets_for_inference = \ - {ModelExecutionMode.TEST: FullImageDataset( - config, - df, - full_image_sample_transforms=transform, - allow_incomplete_labels=partial_ground_truth)} # type: ignore + config._datasets_for_inference = { + ModelExecutionMode.TEST: \ + FullImageDataset(config, df, full_image_sample_transforms=transform, allow_incomplete_labels=partial_ground_truth)} # type: ignore execution_mode = ModelExecutionMode.TEST checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder. @@ -80,7 +85,8 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth checkpoint_handler.additional_training_done() inference_results = model_testing.segmentation_model_test(config, execution_mode=execution_mode, - checkpoint_handler=checkpoint_handler) + checkpoint_handler=checkpoint_handler, + allow_incomplete_labels=partial_ground_truth) epoch_dir = config.outputs_folder / get_best_epoch_results_path(execution_mode) total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() if not total_num_patients_column_name.endswith("s"): diff --git a/Tests/ML/util.py b/Tests/ML/util.py index deb45a65e..01c3553c6 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -206,7 +206,7 @@ def assert_csv_column_contains_value( raise ValueError(f"The CSV at {csv_file_path} does not exist.") df = pd.read_csv(csv_file_path) if not column_name in df.columns: - raise ValueError(f"The column {column_name} is not in th CSV at {csv_file_path}, which has columns {df.columns}.") + raise ValueError(f"The column {column_name} is not in the CSV at {csv_file_path}, which has columns {df.columns}.") if contains_only_value: assert int(df[[column_name]].nunique(dropna=True)) == 1 else: From 554c6d20fa3f698864e522f171896bd271df2e47 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 19:13:38 +0100 Subject: [PATCH 22/45] tidy --- Tests/ML/test_model_testing.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 4b633b60e..3be1d466e 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -46,12 +46,7 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth if partial_ground_truth: config.check_exclusive = False - - 
# TO ASK: Why do the next three all exist, given that they are (usually/always?) the same? - # Do we have expample where their usage differs? - config.fg_ids = ["region", "region_1"] config.ground_truth_ids = ["region", "region_1"] - config.ground_truth_ids_display_names = ["region", "region_1"] # As in Tests.ML.pipelines.test.inference.test_evaluate_model_predictions patients 3, 4, # and 5 are in the test dataset with: @@ -62,8 +57,6 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth df = df[df["subject"].ne(4) | df["channel"].ne("region_1")] # Patient 5 has no missing ground truth channels. - # TO ASK: Why doesn't the partial_ground_truth = False version of this test need this next - # line: config.dataset_data_frame = df df = df[df.subject.isin([3, 4, 5])] @@ -101,7 +94,8 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth contains_only_value=True) else: aggregates_df = pd.read_csv(epoch_dir / METRICS_AGGREGATES_FILE) - assert total_num_patients_column_name not in aggregates_df.columns + assert total_num_patients_column_name not in aggregates_df.columns # Only added if using partial ground truth + assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) assert config.outputs_folder.is_dir() assert epoch_dir.is_dir() From a7f7a193e05e97e38758e747a69f7eb00e0852e0 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 27 Jun 2021 19:39:22 +0100 Subject: [PATCH 23/45] flake fixes --- Tests/ML/configs/DummyModel.py | 1 - Tests/ML/test_model_testing.py | 7 +++---- Tests/ML/util.py | 2 +- 3 files changed, 4 insertions(+), 6 deletions(-) diff --git a/Tests/ML/configs/DummyModel.py b/Tests/ML/configs/DummyModel.py index d9ddfed4b..5abcf19aa 100644 --- a/Tests/ML/configs/DummyModel.py +++ b/Tests/ML/configs/DummyModel.py @@ -68,7 +68,6 @@ def __init__(self, **kwargs: Any) -> None: # Trying to run DDP from the test suite hangs, hence restrict to single GPU. 
self.max_num_gpus = 1 - def get_model_train_test_dataset_splits(self, dataset_df: pd.DataFrame) -> DatasetSplits: return DatasetSplits(train=dataset_df[dataset_df.subject.isin(self.train_subject_ids)], test=dataset_df[dataset_df.subject.isin(self.test_subject_ids)], diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 3be1d466e..f353aa779 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -17,7 +17,6 @@ from InnerEye.ML.common import BEST_CHECKPOINT_FILE_NAME_WITH_SUFFIX, DATASET_CSV_FILE_NAME, ModelExecutionMode from InnerEye.ML.config import DATASET_ID_FILE, GROUND_TRUTH_IDS_FILE, ModelArchitectureConfig from InnerEye.ML.dataset.full_image_dataset import FullImageDataset -from InnerEye.ML.dataset.sample import Sample from InnerEye.ML.model_config_base import ModelConfigBase from InnerEye.ML.model_testing import DEFAULT_RESULT_IMAGE_NAME, create_inference_pipeline from InnerEye.ML.pipelines.ensemble import EnsemblePipeline @@ -28,7 +27,7 @@ from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting from Tests.ML.configs.DummyModel import DummyModel from Tests.ML.util import (assert_file_contains_string, assert_nifti_content, assert_text_files_match, - assert_csv_column_contains_value,get_default_checkpoint_handler, get_image_shape) + assert_csv_column_contains_value, get_default_checkpoint_handler, get_image_shape) from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint @@ -69,7 +68,7 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth # noinspection PyTypeHints config._datasets_for_inference = { - ModelExecutionMode.TEST: \ + ModelExecutionMode.TEST: FullImageDataset(config, df, full_image_sample_transforms=transform, allow_incomplete_labels=partial_ground_truth)} # type: ignore execution_mode = ModelExecutionMode.TEST checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) @@ -95,7 +94,7 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth else: aggregates_df = pd.read_csv(epoch_dir / METRICS_AGGREGATES_FILE) assert total_num_patients_column_name not in aggregates_df.columns # Only added if using partial ground truth - + assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) assert config.outputs_folder.is_dir() assert epoch_dir.is_dir() diff --git a/Tests/ML/util.py b/Tests/ML/util.py index 01c3553c6..db6916ae0 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -205,7 +205,7 @@ def assert_csv_column_contains_value( if not csv_file_path.exists: raise ValueError(f"The CSV at {csv_file_path} does not exist.") df = pd.read_csv(csv_file_path) - if not column_name in df.columns: + if column_name not in df.columns: raise ValueError(f"The column {column_name} is not in the CSV at {csv_file_path}, which has columns {df.columns}.") if contains_only_value: assert int(df[[column_name]].nunique(dropna=True)) == 1 From a821f9b48211887b2e69699fa0485bdface64fd0 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Mon, 28 Jun 2021 16:52:55 +0100 Subject: [PATCH 24/45] Documentation typos --- docs/sample_tasks.md | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/docs/sample_tasks.md b/docs/sample_tasks.md index 665403086..92d6e567d 100644 --- a/docs/sample_tasks.md +++ b/docs/sample_tasks.md @@ -3,13 +3,14 @@ This document contains two sample tasks for the classification and segmentation pipelines. 
The document will walk through the steps in [Training Steps](building_models.md), but with specific examples for each task. -Before trying tp train these models, you should have followed steps to set up an [environment](environment.md) and [AzureML](setting_up_aml.md) +Before trying to train these models, you should have followed steps to set up an [environment](environment.md) and [AzureML](setting_up_aml.md) ## Sample classification task: Glaucoma Detection on OCT volumes This example is based on the paper [A feature agnostic approach for glaucoma detection in OCT volumes](https://arxiv.org/pdf/1807.04855v3.pdf). ### Downloading and preparing the dataset + The dataset is available [here](https://zenodo.org/record/1481223#.Xs-ehzPiuM_) [[1]](#1). After downloading and extracting the zip file, run the [create_glaucoma_dataset_csv.py](https://github.com/microsoft/InnerEye-DeepLearning/blob/main/InnerEye/Scripts/create_glaucoma_dataset_csv.py) @@ -26,7 +27,6 @@ description below). ### Creating the model configuration and starting training - Next, you need to create a configuration file `InnerEye/ML/configs/MyGlaucoma.py` which extends the GlaucomaPublic class like this: ```python @@ -75,6 +75,7 @@ into a folder in the `datasets` container, for example `my_lung_dataset`. This f `azure_dataset_id` field of the model configuration, see below. ### Creating the model configuration and starting training + You can then create a new model configuration, based on the template [Lung.py](../InnerEye/ML/configs/segmentation/Lung.py). To do this, create a file `InnerEye/ML/configs/segmentation/MyLungModel.py`, where you create a subclass of the template Lung model, and From 45c4d2ae5effd4790ff3fe557a8b9d0985d8a07e Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Mon, 28 Jun 2021 16:54:04 +0100 Subject: [PATCH 25/45] Adding allow_incomplete_labels to Seg'ModelBase --- InnerEye/ML/config.py | 7 +++++++ InnerEye/ML/model_testing.py | 15 ++++----------- Tests/ML/test_model_testing.py | 4 ++-- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index 6769bbbcc..aad1e05b5 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -474,17 +474,24 @@ class SegmentationModelBase(ModelConfigBase): is_plotting_enabled: bool = param.Boolean(True, doc="If true, various overview plots with results are generated " "during model evaluation. Set to False if you see " "non-deterministic pull request build failures.") + show_patch_sampling: int = param.Integer(1, bounds=(0, None), doc="Number of patients from the training set for which the effect of" "patch sampling will be shown. Nifti images and thumbnails for each" "of the first N subjects in the training set will be " "written to the outputs folder.") + #: If true an error is raised in InnerEye.ML.utils.io_util.load_labels_from_dataset_source if the labels are not #: mutually exclusive. Some loss functions (e.g. SoftDice) may produce results on overlapping labels, but others (e.g. #: FocalLoss) will fail with a cryptic error message. Set to false if you are sure that you want to use labels that #: are not mutually exclusive. check_exclusive: bool = param.Boolean(True, doc="Raise an error if the segmentation labels are not mutually exclusive.") + allow_incomplete_labels: bool = param.Boolean( + default=False, + doc="If some test data includes patients with missing ground truth data then their data will be ignored " + "completely unless this flag is set. 
Only used for segmentation models.") + def __init__(self, center_size: Optional[TupleInt3] = None, inference_stride_size: Optional[TupleInt3] = None, min_l_rate: float = 0, diff --git a/InnerEye/ML/model_testing.py b/InnerEye/ML/model_testing.py index 608c79230..9b6c7521e 100644 --- a/InnerEye/ML/model_testing.py +++ b/InnerEye/ML/model_testing.py @@ -78,8 +78,7 @@ def model_test(config: ModelConfigBase, def segmentation_model_test(config: SegmentationModelBase, execution_mode: ModelExecutionMode, checkpoint_handler: CheckpointHandler, - model_proc: ModelProcessing = ModelProcessing.DEFAULT, - allow_incomplete_labels: bool = False) -> InferenceMetricsForSegmentation: + model_proc: ModelProcessing = ModelProcessing.DEFAULT) -> InferenceMetricsForSegmentation: """ The main testing loop for segmentation models. It loads the model and datasets, then proceeds to test the model for all requested checkpoints. @@ -88,8 +87,6 @@ def segmentation_model_test(config: SegmentationModelBase, :param checkpoint_handler: Checkpoint handler object to find checkpoint paths for model initialization. :param model_proc: Whether we are testing an ensemble or single model. :param patient_id: String which contains subject identifier. - :param allow_incomplete_labels: Boolean flag. If false, all ground truth files must be provided. - If true, ground truth files are optional. (Defaults to False.) :return: InferenceMetric object that contains metrics related for all of the checkpoint epochs. """ checkpoints_to_test = checkpoint_handler.get_checkpoints_to_test() @@ -105,8 +102,7 @@ def segmentation_model_test(config: SegmentationModelBase, execution_mode=execution_mode, checkpoint_paths=checkpoints_to_test, results_folder=epoch_results_folder, - epoch_and_split=epoch_and_split, - allow_incomplete_labels=allow_incomplete_labels) + epoch_and_split=epoch_and_split) if epoch_dice_per_image is None: raise ValueError("There was no single checkpoint file available for model testing.") else: @@ -124,8 +120,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, execution_mode: ModelExecutionMode, checkpoint_paths: List[Path], results_folder: Path, - epoch_and_split: str, - allow_incomplete_labels: bool = False) -> Optional[List[float]]: + epoch_and_split: str) -> Optional[List[float]]: """ The main testing loop for a given epoch. It loads the model and datasets, then proceeds to test the model. Returns a list with an entry for each image in the dataset. The entry is the average Dice score, @@ -137,8 +132,6 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, :param epoch_and_split: A string that should uniquely identify the epoch and the data split (train/val/test). :raises TypeError: If the arguments are of the wrong type. :raises ValueError: When there are issues loading the model. - :param allow_incomplete_labels: boolean flag. If false, all ground truth files must be provided. - If true, ground truth files are optional. (Defaults to False.) :return A list with the mean dice score (across all structures apart from background) for each image. 
""" ml_util.set_random_seed(config.get_effective_random_seed(), "Model testing") @@ -192,7 +185,7 @@ def segmentation_model_test_epoch(config: SegmentationModelBase, metrics_writer, average_dice = populate_metrics_writer(pool_outputs, config) metrics_writer.to_csv(results_folder / SUBJECT_METRICS_FILE_NAME) - metrics_writer.save_aggregates_to_csv(results_folder / METRICS_AGGREGATES_FILE, allow_incomplete_labels) + metrics_writer.save_aggregates_to_csv(results_folder / METRICS_AGGREGATES_FILE, config.allow_incomplete_labels) if config.is_plotting_enabled: plt.figure() boxplot_per_structure(metrics_writer.to_data_frame(), diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index f353aa779..11ecfc49b 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -37,6 +37,7 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth train_and_test_data_dir = full_ml_test_data_path("train_and_test_data") seed_everything(42) config = DummyModel() + config.allow_incomplete_labels = partial_ground_truth config.set_output_to(test_output_dirs.root_dir) placeholder_dataset_id = "place_holder_dataset_id" config.azure_dataset_id = placeholder_dataset_id @@ -77,8 +78,7 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth checkpoint_handler.additional_training_done() inference_results = model_testing.segmentation_model_test(config, execution_mode=execution_mode, - checkpoint_handler=checkpoint_handler, - allow_incomplete_labels=partial_ground_truth) + checkpoint_handler=checkpoint_handler) epoch_dir = config.outputs_folder / get_best_epoch_results_path(execution_mode) total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() if not total_num_patients_column_name.endswith("s"): From 86a8be7103758bc36573db3a493f9eb5f66bab2b Mon Sep 17 00:00:00 2001 From: dumbledad Date: Tue, 29 Jun 2021 15:59:44 +0100 Subject: [PATCH 26/45] Checking that partial ground truth is not allowed unless explicit --- InnerEye/ML/utils/metrics_util.py | 14 +++--- Tests/ML/test_model_testing.py | 73 ++++++++++++++++++++++++------- Tests/ML/util.py | 20 ++++++--- 3 files changed, 77 insertions(+), 30 deletions(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index 040920be2..719eea243 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -4,7 +4,7 @@ # ------------------------------------------------------------------------------------------ from functools import reduce from pathlib import Path -from typing import Any, Dict, List, Tuple, Union +from typing import Any, Dict, List, Tuple, Type, Union import numpy as np import pandas as pd @@ -110,14 +110,14 @@ def to_data_frame(self) -> DataFrame: # slow, and should be avoided. Hence, work with dictionary as long as possible, and only finally # convert to a DataFrame. - # dtype is specified as (an instance of) str, not the str class itself, but this seems correct. 
# noinspection PyTypeChecker + dtypes: Dict[str, Union[Type[float], Type[str]]] = {column: str for column in self.columns} + dtypes[MetricsFileColumns.Dice.value] = float + dtypes[MetricsFileColumns.HausdorffDistanceMM.value] = float + dtypes[MetricsFileColumns.MeanDistanceMM.value] = float df = DataFrame(self.columns, dtype=str) - df[MetricsFileColumns.DiceNumeric.value] = pd.Series(data=df[MetricsFileColumns.Dice.value].apply(float)) - df[MetricsFileColumns.HausdorffDistanceMM.value] = pd.Series( - data=df[MetricsFileColumns.HausdorffDistanceMM.value].apply(float)) - df[MetricsFileColumns.MeanDistanceMM.value] = pd.Series( - data=df[MetricsFileColumns.MeanDistanceMM.value].apply(float)) + df = df.astype(dtypes) + df[MetricsFileColumns.DiceNumeric.value] = df[MetricsFileColumns.Dice.value] df = df.sort_values(by=[MetricsFileColumns.Patient.value, MetricsFileColumns.Structure.value]) return df diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 11ecfc49b..5b02e2a5a 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -4,12 +4,13 @@ # ------------------------------------------------------------------------------------------ import numpy as np +from numpy.core.shape_base import _block_dispatcher import pandas as pd import pytest from pytorch_lightning import seed_everything from InnerEye.Common import common_util -from InnerEye.Common.common_util import METRICS_AGGREGATES_FILE, get_best_epoch_results_path +from InnerEye.Common.common_util import METRICS_AGGREGATES_FILE, SUBJECT_METRICS_FILE_NAME, get_best_epoch_results_path from InnerEye.Common.fixed_paths_for_tests import full_ml_test_data_path from InnerEye.Common.metrics_constants import MetricsFileColumns from InnerEye.Common.output_directories import OutputFolderForTests @@ -23,28 +24,38 @@ from InnerEye.ML.pipelines.inference import InferencePipeline from InnerEye.ML.pipelines.scalar_inference import ScalarEnsemblePipeline, ScalarInferencePipeline from InnerEye.ML.utils import io_util -from InnerEye.ML.visualizers.plot_cross_validation import get_config_and_results_for_offline_runs +from InnerEye.ML.visualizers.plot_cross_validation import (METRICS_BY_MODE_AND_STRUCTURE_FILE, + get_config_and_results_for_offline_runs) from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting from Tests.ML.configs.DummyModel import DummyModel from Tests.ML.util import (assert_file_contains_string, assert_nifti_content, assert_text_files_match, - assert_csv_column_contains_value, get_default_checkpoint_handler, get_image_shape) + csv_column_contains_value, get_default_checkpoint_handler, get_image_shape) from Tests.ML.utils.test_model_util import create_model_and_store_checkpoint @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") -@pytest.mark.parametrize("partial_ground_truth", [True, False]) -def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth: bool) -> None: +@pytest.mark.parametrize(["use_partial_ground_truth", "allow_partial_ground_truth"], [[True, True], [True, False], [False, False]]) +def test_model_test( + test_output_dirs: OutputFolderForTests, + use_partial_ground_truth: bool, + allow_partial_ground_truth: bool) -> None: + """ + Check the CSVs (and image files) output by InnerEye.ML.model_testing.segmentation_model_test + :param test_output_dirs: The fixture in conftest.py + :param use_partial_ground_truth: Whether to remove some ground truth labels from some test users + :param 
allow_partial_ground_truth: What to set the allow_incomplete_labels flag to + """ train_and_test_data_dir = full_ml_test_data_path("train_and_test_data") seed_everything(42) config = DummyModel() - config.allow_incomplete_labels = partial_ground_truth + config.allow_incomplete_labels = allow_partial_ground_truth config.set_output_to(test_output_dirs.root_dir) placeholder_dataset_id = "place_holder_dataset_id" config.azure_dataset_id = placeholder_dataset_id transform = config.get_full_image_sample_transforms().test df = pd.read_csv(full_ml_test_data_path(DATASET_CSV_FILE_NAME)) - if partial_ground_truth: + if use_partial_ground_truth: config.check_exclusive = False config.ground_truth_ids = ["region", "region_1"] @@ -67,10 +78,26 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth else: df = df[df.subject.isin([1, 2])] - # noinspection PyTypeHints - config._datasets_for_inference = { - ModelExecutionMode.TEST: - FullImageDataset(config, df, full_image_sample_transforms=transform, allow_incomplete_labels=partial_ground_truth)} # type: ignore + if use_partial_ground_truth and not allow_partial_ground_truth: + with pytest.raises(ValueError) as value_error: + # noinspection PyTypeHints + config._datasets_for_inference = { + ModelExecutionMode.TEST: + FullImageDataset( + config, + df, + full_image_sample_transforms=transform, # type: ignore + allow_incomplete_labels=allow_partial_ground_truth)} + assert "Patient 3 does not have channel 'region'" in str(value_error.value) + else: + # noinspection PyTypeHints + config._datasets_for_inference = { + ModelExecutionMode.TEST: + FullImageDataset( + config, + df, + full_image_sample_transforms=transform, # type: ignore + allow_incomplete_labels=allow_partial_ground_truth)} execution_mode = ModelExecutionMode.TEST checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder. 
@@ -84,17 +111,29 @@ def test_model_test(test_output_dirs: OutputFolderForTests, partial_ground_truth if not total_num_patients_column_name.endswith("s"): total_num_patients_column_name += "s" - if partial_ground_truth: + if use_partial_ground_truth: num_subjects = len(pd.unique(df["subject"])) - assert_csv_column_contains_value( - csv_file_path=epoch_dir / METRICS_AGGREGATES_FILE, - column_name=total_num_patients_column_name, - value=num_subjects, - contains_only_value=True) + if allow_partial_ground_truth: + assert csv_column_contains_value( + csv_file_path=epoch_dir / METRICS_AGGREGATES_FILE, + column_name=total_num_patients_column_name, + value=num_subjects, + contains_only_value=True) + assert csv_column_contains_value( + csv_file_path=epoch_dir / SUBJECT_METRICS_FILE_NAME, + column_name=MetricsFileColumns.Dice.value, + value='', + contains_only_value=False) else: aggregates_df = pd.read_csv(epoch_dir / METRICS_AGGREGATES_FILE) assert total_num_patients_column_name not in aggregates_df.columns # Only added if using partial ground truth + assert not csv_column_contains_value( + csv_file_path=epoch_dir / SUBJECT_METRICS_FILE_NAME, + column_name=MetricsFileColumns.Dice.value, + value='', + contains_only_value=False) + assert inference_results.metrics == pytest.approx(0.66606902, abs=1e-6) assert config.outputs_folder.is_dir() assert epoch_dir.is_dir() diff --git a/Tests/ML/util.py b/Tests/ML/util.py index db6916ae0..38397ad96 100644 --- a/Tests/ML/util.py +++ b/Tests/ML/util.py @@ -190,27 +190,35 @@ def assert_binary_files_match(actual_file: Path, expected_file: Path) -> None: assert False, f"File contents does not match: len(actual)={len(actual)}, len(expected)={len(expected)}" -def assert_csv_column_contains_value( +def csv_column_contains_value( csv_file_path: Path, column_name: str, value: Any, - contains_only_value: bool = True) -> None: + contains_only_value: bool = True) -> bool: """ Checks that the column in the csv file contains the given value (and perhaps only contains that value) :param csv_file_path: The path to the CSV :param column_name: The name of the column in which we look for the value :param value: The value to look for :param contains_only_value: Check that this is the only value in the column (default True) + :returns: Boolean, whether the CSV column contains the value (and perhaps only the value) """ + result = True if not csv_file_path.exists: raise ValueError(f"The CSV at {csv_file_path} does not exist.") df = pd.read_csv(csv_file_path) if column_name not in df.columns: - raise ValueError(f"The column {column_name} is not in the CSV at {csv_file_path}, which has columns {df.columns}.") - if contains_only_value: - assert int(df[[column_name]].nunique(dropna=True)) == 1 + ValueError(f"The column {column_name} is not in the CSV at {csv_file_path}, which has columns {df.columns}.") + if value: + result = result and value in df[column_name].unique() else: - assert int(df[[column_name]].nunique(dropna=True)) > 0 + result = result and df[column_name].isnull().any() + if contains_only_value: + if value: + result = result and df[column_name].nunique(dropna=True) == 1 + else: + result = result and df[column_name].nunique(dropna=True) == 0 + return result DummyPatientMetadata = PatientMetadata(patient_id='42') From 58ab13b6233d5d9c21f40e2ae929a8ca1c9ce2f5 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Wed, 30 Jun 2021 13:30:09 +0100 Subject: [PATCH 27/45] Added unit test of IPYNB-HTML for partial ground truth --- Tests/ML/reports/test_segmentation_report.py | 49 
+++++++++++++++++++- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/Tests/ML/reports/test_segmentation_report.py b/Tests/ML/reports/test_segmentation_report.py index efabac6d0..57a29e67a 100644 --- a/Tests/ML/reports/test_segmentation_report.py +++ b/Tests/ML/reports/test_segmentation_report.py @@ -2,22 +2,30 @@ # Copyright (c) Microsoft Corporation. All rights reserved. # Licensed under the MIT License (MIT). See LICENSE in the repo root for license information. # ------------------------------------------------------------------------------------------ +import re from io import StringIO from pathlib import Path import pandas as pd import pytest +from numpy.core.numeric import NaN +from InnerEye.Common.common_util import is_windows from InnerEye.Common.metrics_constants import MetricsFileColumns from InnerEye.Common.output_directories import OutputFolderForTests -from InnerEye.Common.common_util import is_windows from InnerEye.ML.reports.notebook_report import generate_segmentation_notebook from InnerEye.ML.reports.segmentation_report import describe_score, worst_patients_and_outliers from InnerEye.ML.utils.csv_util import COL_IS_OUTLIER @pytest.mark.skipif(is_windows(), reason="Random timeout errors on windows.") -def test_generate_segmentation_report(test_output_dirs: OutputFolderForTests) -> None: +@pytest.mark.parametrize("use_partial_ground_truth", [False, True]) +def test_generate_segmentation_report(test_output_dirs: OutputFolderForTests, use_partial_ground_truth: bool) -> None: + if use_partial_ground_truth: + return _test_generate_segmentation_report_with_partial_ground_truth(test_output_dirs) + return _test_generate_segmentation_report_without_partial_ground_truth(test_output_dirs) + +def _test_generate_segmentation_report_without_partial_ground_truth(test_output_dirs: OutputFolderForTests) -> None: reports_folder = Path(__file__).parent metrics_file = reports_folder / "metrics_hn.csv" current_dir = test_output_dirs.make_sub_dir("test_segmentation_report") @@ -31,6 +39,43 @@ def test_generate_segmentation_report(test_output_dirs: OutputFolderForTests) -> contents = result_html.read_text(encoding='utf-8') assert 'parotid_r' in contents +def _test_generate_segmentation_report_with_partial_ground_truth(test_output_dirs: OutputFolderForTests) -> None: + """ + The test without partial ground truth should cover more detail, here we just check that providing + partial ground truth results in some labels having a lower user count. 
+ """ + reports_folder = Path(__file__).parent + original_metrics_file = reports_folder / "metrics_hn.csv" + original_metrics = pd.read_csv(original_metrics_file) + partial_metrics = original_metrics + partial_metrics.loc[partial_metrics['Structure'].eq('brainstem') & partial_metrics['Patient'].isin([14, 15, 19]), + ['Dice', 'HausdorffDistance_mm', 'MeanDistance_mm']] = NaN + current_dir = test_output_dirs.make_sub_dir("test_segmentation_report") + partial_metrics_file = current_dir / "metrics_hn.csv" + result_file = current_dir / "report.ipynb" + partial_metrics.to_csv(partial_metrics_file, index=False, float_format="%.3f", na_rep="") + result_html = generate_segmentation_notebook(result_notebook=result_file, test_metrics=partial_metrics_file) + result_html_text = result_html.read_text(encoding='utf-8') + # Look for this row in the HTML Dice table: + # brainstem\n 0.82600\n 0.8570\n 0.87600\n 17.0\n + # It shows that for the brainstem label there are only 17, not 20, patients with that label, + # because we removed the brainstem label for patients 14, 15, and 19. + + def get_patient_count_for_structure(structure: str, text: str) -> float: + regex = f"{structure}" + r"<\/td>(\n\s*[0-9\.]*<\/td>){3}\n\s*([0-9\.]*)" + # which results in, for example, this regex: + # regex = "brainstem<\/td>(\n\s*[0-9\.]*<\/td>){3}\n\s*([0-9\.]*)" + match = re.search(regex, text) + if not match: + return NaN + patient_count_as_string = match.group(2) + return float(patient_count_as_string) + + num_patients_with_lacrimal_gland_l_label = get_patient_count_for_structure("lacrimal_gland_l", result_html_text) + num_patients_with_brainstem_label = get_patient_count_for_structure("brainstem", result_html_text) + assert num_patients_with_lacrimal_gland_l_label == 20.0 + assert num_patients_with_brainstem_label == 17.0 + def test_describe_metric() -> None: data = """Patient,Structure,Dice,HausdorffDistance_mm,MeanDistance_mm From a4402f7410d1721da4ada1b99e6bf1f7a9c05922 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Wed, 30 Jun 2021 13:35:16 +0100 Subject: [PATCH 28/45] flake fixes --- Tests/ML/test_model_testing.py | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 5b02e2a5a..443ddd171 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -4,7 +4,6 @@ # ------------------------------------------------------------------------------------------ import numpy as np -from numpy.core.shape_base import _block_dispatcher import pandas as pd import pytest from pytorch_lightning import seed_everything @@ -24,8 +23,7 @@ from InnerEye.ML.pipelines.inference import InferencePipeline from InnerEye.ML.pipelines.scalar_inference import ScalarEnsemblePipeline, ScalarInferencePipeline from InnerEye.ML.utils import io_util -from InnerEye.ML.visualizers.plot_cross_validation import (METRICS_BY_MODE_AND_STRUCTURE_FILE, - get_config_and_results_for_offline_runs) +from InnerEye.ML.visualizers.plot_cross_validation import get_config_and_results_for_offline_runs from Tests.ML.configs.ClassificationModelForTesting import ClassificationModelForTesting from Tests.ML.configs.DummyModel import DummyModel from Tests.ML.util import (assert_file_contains_string, assert_nifti_content, assert_text_files_match, @@ -36,9 +34,9 @@ @pytest.mark.skipif(common_util.is_windows(), reason="Too slow on windows") @pytest.mark.parametrize(["use_partial_ground_truth", "allow_partial_ground_truth"], [[True, True], [True, False], [False, 
False]]) def test_model_test( - test_output_dirs: OutputFolderForTests, - use_partial_ground_truth: bool, - allow_partial_ground_truth: bool) -> None: + test_output_dirs: OutputFolderForTests, + use_partial_ground_truth: bool, + allow_partial_ground_truth: bool) -> None: """ Check the CSVs (and image files) output by InnerEye.ML.model_testing.segmentation_model_test :param test_output_dirs: The fixture in conftest.py From 5f68941ae25404f71820dfc4d41c589bd92a2b56 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Thu, 1 Jul 2021 11:59:43 +0100 Subject: [PATCH 29/45] temp change for end2end test --- InnerEye/ML/configs/segmentation/BasicModel2Epochs.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py index efc4d1d10..e1eb56298 100644 --- a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py +++ b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py @@ -42,7 +42,10 @@ def __init__(self, **kwargs: Any) -> None: num_epochs=2, recovery_checkpoint_save_interval=1, use_mixed_precision=True, - azure_dataset_id=AZURE_DATASET_ID, + # azure_dataset_id=AZURE_DATASET_ID, + # TEMP CHANGE FOR END-TO-END TEST: + # python InnerEye/ML/runner.py --allow_incomplete_labels --azureml --model=BasicModel2EpochsMoreData --run_recovery_id=main_1625070510_eec68070 + azure_dataset_id='geonorm_with_train_test_split_partial_test_data_2021_06_30', comparison_blob_storage_paths=comparison_blob_storage_paths, dataset_mountpoint="/tmp/innereye", # Use an LR scheduler with a pronounced and clearly visible decay, to be able to easily see if that From 987234a4360ea8fad00109acae70217dd080d078 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Thu, 1 Jul 2021 15:09:00 +0100 Subject: [PATCH 30/45] WSL package for .Net on linux --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 120d80cf5..eeed8fae8 100644 --- a/.gitignore +++ b/.gitignore @@ -42,6 +42,7 @@ wheels/ .installed.cfg *.egg MANIFEST +packages-microsoft-prod.deb # PyInstaller # Usually these files are written by a python script from a template From b6241d24b61c1590f045a4799d991bd9b0bb4370 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Thu, 1 Jul 2021 15:09:48 +0100 Subject: [PATCH 31/45] Passing on allow_partial in lightning model base --- InnerEye/ML/lightning_base.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py index 105dc6260..6e58a1c49 100644 --- a/InnerEye/ML/lightning_base.py +++ b/InnerEye/ML/lightning_base.py @@ -161,11 +161,15 @@ def setup(self) -> None: unique_ids = set(split_data[CSV_SUBJECT_HEADER]) for patient_id in unique_ids: rows = split_data.loc[split_data[CSV_SUBJECT_HEADER] == patient_id] + allow_incomplete_labels = False + if isinstance(self.config, SegmentationModelBase): + allow_incomplete_labels = self.config.allow_incomplete_labels # type: ignore # Converts channels from data frame to file paths and gets errors if any __, failed_channel_info = convert_channels_to_file_paths(all_channels, - rows, - local_dataset_root_folder, - patient_id) + rows, + local_dataset_root_folder, + patient_id, + allow_incomplete_labels) full_failed_channel_info += failed_channel_info if full_failed_channel_info: From 5f173f483acfa8a9105098ef3944c1bd0d66ef96 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Thu, 1 Jul 2021 17:07:25 +0100 Subject: [PATCH 32/45] Reverting change made for end2end test only --- 
InnerEye/ML/configs/segmentation/BasicModel2Epochs.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py index e1eb56298..efc4d1d10 100644 --- a/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py +++ b/InnerEye/ML/configs/segmentation/BasicModel2Epochs.py @@ -42,10 +42,7 @@ def __init__(self, **kwargs: Any) -> None: num_epochs=2, recovery_checkpoint_save_interval=1, use_mixed_precision=True, - # azure_dataset_id=AZURE_DATASET_ID, - # TEMP CHANGE FOR END-TO-END TEST: - # python InnerEye/ML/runner.py --allow_incomplete_labels --azureml --model=BasicModel2EpochsMoreData --run_recovery_id=main_1625070510_eec68070 - azure_dataset_id='geonorm_with_train_test_split_partial_test_data_2021_06_30', + azure_dataset_id=AZURE_DATASET_ID, comparison_blob_storage_paths=comparison_blob_storage_paths, dataset_mountpoint="/tmp/innereye", # Use an LR scheduler with a pronounced and clearly visible decay, to be able to easily see if that From edb88d6c0b225fe24a69e99a76b61ac402dccb86 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 2 Jul 2021 14:39:36 +0100 Subject: [PATCH 33/45] Fixing param comment --- InnerEye/ML/config.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index aad1e05b5..6f29870f2 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -489,8 +489,9 @@ class SegmentationModelBase(ModelConfigBase): allow_incomplete_labels: bool = param.Boolean( default=False, - doc="If some test data includes patients with missing ground truth data then their data will be ignored " - "completely unless this flag is set. Only used for segmentation models.") + doc="If False, the default, then test patient data must include all of the ground truth labels. 
If true then " + "some test patient data with missing ground truth data is allowed and will be reflected in the patient " + "counts in the metrics and report.") def __init__(self, center_size: Optional[TupleInt3] = None, inference_stride_size: Optional[TupleInt3] = None, From 53ea4fc58f29fbe07c39a9c4e88c971fb3defaa9 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 2 Jul 2021 17:16:09 +0100 Subject: [PATCH 34/45] Fixing unnecessary allow_incomplete, already in args https://github.com/microsoft/InnerEye-DeepLearning/pull/465/files?file-filters%5B%5D=.py&file-filters%5B%5D=dotfile#r662859625 --- InnerEye/ML/config.py | 3 +-- InnerEye/ML/dataset/full_image_dataset.py | 8 +++----- Tests/ML/pipelines/test_inference.py | 1 + Tests/ML/test_model_testing.py | 7 +++---- 4 files changed, 8 insertions(+), 11 deletions(-) diff --git a/InnerEye/ML/config.py b/InnerEye/ML/config.py index 6f29870f2..8e3aab58c 100644 --- a/InnerEye/ML/config.py +++ b/InnerEye/ML/config.py @@ -775,8 +775,7 @@ def create_and_set_torch_datasets(self, for_training: bool = True, for_inference mode: FullImageDataset( self, dataset_splits[mode], - full_image_sample_transforms=full_image_transforms.test, # type: ignore - allow_incomplete_labels=True) + full_image_sample_transforms=full_image_transforms.test) # type: ignore for mode in ModelExecutionMode if len(dataset_splits[mode]) > 0 } diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 654a82959..48021c1d0 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -213,11 +213,10 @@ class FullImageDataset(GeneralDataset): """ def __init__(self, args: SegmentationModelBase, data_frame: pd.DataFrame, - full_image_sample_transforms: Optional[Compose3D[Sample]] = None, - allow_incomplete_labels: bool = False): + full_image_sample_transforms: Optional[Compose3D[Sample]] = None): super().__init__(args, data_frame) self.full_image_sample_transforms = full_image_sample_transforms - self.allow_incomplete_labels = allow_incomplete_labels + self.allow_incomplete_labels = args.allow_incomplete_labels # Check base_path assert self.args.local_dataset is not None if not self.args.local_dataset.is_dir(): @@ -262,8 +261,7 @@ def _load_dataset_sources(self) -> Dict[str, PatientDatasetSource]: image_channels=self.args.image_channels, ground_truth_channels=self.args.ground_truth_ids, mask_channel=self.args.mask_id, - allow_incomplete_labels=self.allow_incomplete_labels - ) + allow_incomplete_labels=self.allow_incomplete_labels) def convert_channels_to_file_paths(channels: List[str], diff --git a/Tests/ML/pipelines/test_inference.py b/Tests/ML/pipelines/test_inference.py index f49f975a1..01a3202b6 100644 --- a/Tests/ML/pipelines/test_inference.py +++ b/Tests/ML/pipelines/test_inference.py @@ -270,6 +270,7 @@ def test_evaluate_model_predictions() -> None: ["5", "train_and_test_data/id2_region.nii.gz", "region_1"]] config = create_config_from_dataset(input_list, train=['1'], val=['2'], test=['3', '4', '5']) + config.allow_incomplete_labels = True ds = config.get_torch_dataset_for_inference(ModelExecutionMode.TEST) results_folder = config.outputs_folder if not results_folder.is_dir(): diff --git a/Tests/ML/test_model_testing.py b/Tests/ML/test_model_testing.py index 443ddd171..39ab580e5 100644 --- a/Tests/ML/test_model_testing.py +++ b/Tests/ML/test_model_testing.py @@ -84,9 +84,9 @@ def test_model_test( FullImageDataset( config, df, - full_image_sample_transforms=transform, # type: ignore - 
allow_incomplete_labels=allow_partial_ground_truth)} + full_image_sample_transforms=transform)} # type: ignore assert "Patient 3 does not have channel 'region'" in str(value_error.value) + return else: # noinspection PyTypeHints config._datasets_for_inference = { @@ -94,8 +94,7 @@ def test_model_test( FullImageDataset( config, df, - full_image_sample_transforms=transform, # type: ignore - allow_incomplete_labels=allow_partial_ground_truth)} + full_image_sample_transforms=transform)} # type: ignore execution_mode = ModelExecutionMode.TEST checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=test_output_dirs.root_dir) # Mimic the behaviour that checkpoints are downloaded from blob storage into the checkpoints folder. From 3a86cf91b014804c454802cc34c793dcbd71cee0 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 2 Jul 2021 17:25:15 +0100 Subject: [PATCH 35/45] Removing redundant isinstance --- InnerEye/ML/lightning_base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/InnerEye/ML/lightning_base.py b/InnerEye/ML/lightning_base.py index 6e58a1c49..417e02d1b 100644 --- a/InnerEye/ML/lightning_base.py +++ b/InnerEye/ML/lightning_base.py @@ -161,9 +161,7 @@ def setup(self) -> None: unique_ids = set(split_data[CSV_SUBJECT_HEADER]) for patient_id in unique_ids: rows = split_data.loc[split_data[CSV_SUBJECT_HEADER] == patient_id] - allow_incomplete_labels = False - if isinstance(self.config, SegmentationModelBase): - allow_incomplete_labels = self.config.allow_incomplete_labels # type: ignore + allow_incomplete_labels = self.config.allow_incomplete_labels # type: ignore # Converts channels from data frame to file paths and gets errors if any __, failed_channel_info = convert_channels_to_file_paths(all_channels, rows, From b47e1973e0b4edc8144ec1b6f25bad49935a7760 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Fri, 2 Jul 2021 17:52:48 +0100 Subject: [PATCH 36/45] Moving duplicated fragment to util function --- InnerEye/ML/metrics.py | 9 +++++---- InnerEye/ML/plotting.py | 3 ++- InnerEye/ML/utils/metrics_util.py | 12 ++++++++++++ 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/InnerEye/ML/metrics.py b/InnerEye/ML/metrics.py index a60488197..a87dde368 100644 --- a/InnerEye/ML/metrics.py +++ b/InnerEye/ML/metrics.py @@ -22,12 +22,13 @@ from InnerEye.Common.type_annotations import DictStrFloat, TupleFloat3 from InnerEye.ML.common import ModelExecutionMode from InnerEye.ML.config import BACKGROUND_CLASS_NAME -from InnerEye.ML.metrics_dict import DataframeLogger, INTERNAL_TO_LOGGING_COLUMN_NAMES, MetricsDict, \ - ScalarMetricsDict +from InnerEye.ML.metrics_dict import (DataframeLogger, INTERNAL_TO_LOGGING_COLUMN_NAMES, MetricsDict, + ScalarMetricsDict) from InnerEye.ML.scalar_config import ScalarLoss from InnerEye.ML.utils.image_util import binaries_from_multi_label_array, is_binary_array from InnerEye.ML.utils.io_util import reverse_tuple_float3 -from InnerEye.ML.utils.metrics_util import binary_classification_accuracy, mean_absolute_error, r2_score +from InnerEye.ML.utils.metrics_util import (binary_classification_accuracy, mean_absolute_error, + r2_score, is_missing_ground_truth) from InnerEye.ML.utils.ml_util import check_size_matches from InnerEye.ML.utils.sequence_utils import get_masked_model_outputs_and_labels @@ -247,7 +248,7 @@ def calculate_metrics_per_class(segmentation: np.ndarray, binary_classes = [is_binary_array(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] # If ground truth image is nan, then will 
not be used for metrics computation. - nan_images = [np.isnan(ground_truth[label_id][0, 0, 0]) for label_id in range(ground_truth.shape[0])] + nan_images = [is_missing_ground_truth(ground_truth[label_id]) for label_id in range(ground_truth.shape[0])] # Compares element-wise if not binary then nan and checks all elements are True. assert np.all(np.array(binary_classes) == ~np.array(nan_images)) diff --git a/InnerEye/ML/plotting.py b/InnerEye/ML/plotting.py index d0b4f5487..b467eeb2c 100644 --- a/InnerEye/ML/plotting.py +++ b/InnerEye/ML/plotting.py @@ -16,6 +16,7 @@ from InnerEye.ML.photometric_normalization import PhotometricNormalization from InnerEye.ML.utils import plotting_util from InnerEye.ML.utils.image_util import binaries_from_multi_label_array, get_largest_z_slice +from InnerEye.ML.utils.metrics_util import is_missing_ground_truth from InnerEye.ML.utils.ml_util import check_size_matches from InnerEye.ML.utils.surface_distance_utils import Plane, extract_border @@ -307,7 +308,7 @@ def plot_contours_for_all_classes(sample: Sample, continue ground_truth = sample.labels[class_index, ...] - if np.isnan(ground_truth[0, 0, 0]): + if is_missing_ground_truth(ground_truth): continue largest_gt_slice = get_largest_z_slice(ground_truth) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index 719eea243..a9ccd5661 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -256,3 +256,15 @@ def convert_input_and_label(model_output: Union[torch.Tensor, np.ndarray], if not torch.is_tensor(label): label = torch.tensor(label) return model_output.float(), label.float() + + +def is_missing_ground_truth(ground_truth: np.array) -> bool: + """ + calculate_metrics_per_class in metrics.py and plot_contours_for_all_classes in plotting.py both + check whether there is ground truth missing using this simple check for NaN value at 0, 0, 0. + To avoid duplicate code we bring it here as a utility function. + :param ground_truth: ground truth binary array with dimensions: [Z x Y x X]. + :param label_id: Integer index of the label to check. + :returns: True if the label is missing (signified by NaN), False otherwise. + """ + return np.isnan(ground_truth[0, 0, 0] \ No newline at end of file From 615ae1a26d498149445fba19d4f3d68a433f4f3f Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 15:28:47 +0100 Subject: [PATCH 37/45] missing bracket --- InnerEye/ML/utils/metrics_util.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index a9ccd5661..c140e07b9 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -267,4 +267,4 @@ def is_missing_ground_truth(ground_truth: np.array) -> bool: :param label_id: Integer index of the label to check. :returns: True if the label is missing (signified by NaN), False otherwise. 
""" - return np.isnan(ground_truth[0, 0, 0] \ No newline at end of file + return np.isnan(ground_truth[0, 0, 0]) \ No newline at end of file From 6a0bd3a76370245bd4bb03e3fcf5e565975773dc Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 16:08:44 +0100 Subject: [PATCH 38/45] Clearer comments for save_aggregates_to_csv --- InnerEye/ML/utils/metrics_util.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index c140e07b9..eafd6995d 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -70,22 +70,26 @@ def save_aggregates_to_csv(self, file_path: Path, allow_incomplete_labels: bool :param file_path: The name of the file to write to. :param allow_incomplete_labels: boolean flag. If false, all ground truth files must be provided. - If true, ground truth files are optional. (Defaults to False.) + If true, ground truth files are optional and we add a total_patients count column for easy + comparison. (Defaults to False.) """ stats_columns = ['mean', 'std', 'min', 'max'] # get aggregates for all metrics df = self.to_data_frame() aggregates = df.groupby(MetricsFileColumns.Structure.value).describe() - num_subjects = len(pd.unique(df[MetricsFileColumns.Patient.value])) - total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() - if not total_num_patients_column_name.endswith("s"): - total_num_patients_column_name += "s" def filter_rename_metric_columns(_metric_column: str, is_count_column: bool = False) -> pd.DataFrame: _columns = ["count"] + stats_columns if is_count_column else stats_columns _df = aggregates[_metric_column][_columns] if is_count_column and allow_incomplete_labels: + # For this condition we add a total_patient count column so that readers can make + # more sense of aggregated metrics where some patients were missing the label (i.e. + # partial ground truth). 
+ num_subjects = len(pd.unique(df[MetricsFileColumns.Patient.value])) + total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() + if not total_num_patients_column_name.endswith("s"): + total_num_patients_column_name += "s" _df[total_num_patients_column_name] = num_subjects _df = _df[["count", total_num_patients_column_name] + stats_columns] _columns_to_rename = [x for x in _df.columns if x != "count" and x != total_num_patients_column_name] From b3ed3f1564f2bd4ce1559c098d28bbd866af893d Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 18:37:36 +0100 Subject: [PATCH 39/45] Unit test for InnerEyeContainer setup pass through --- Tests/ML/test_lightning_containers.py | 36 ++++++++++++++++++++++++++- 1 file changed, 35 insertions(+), 1 deletion(-) diff --git a/Tests/ML/test_lightning_containers.py b/Tests/ML/test_lightning_containers.py index eafb1283c..6682fbccc 100644 --- a/Tests/ML/test_lightning_containers.py +++ b/Tests/ML/test_lightning_containers.py @@ -4,7 +4,7 @@ # ------------------------------------------------------------------------------------------ from io import StringIO from pathlib import Path -from typing import List +from typing import List, Optional, Tuple from unittest import mock import pandas as pd @@ -283,6 +283,7 @@ def test_container_hooks(test_output_dirs: OutputFolderForTests) -> None: for file in ["global_rank_zero.txt", "local_rank_zero.txt", "all_ranks.txt"]: assert (runner.container.outputs_folder / file).is_file(), f"Missing file: {file}" + @pytest.mark.parametrize("number_of_cross_validation_splits", [0, 2]) def test_get_hyperdrive_config(number_of_cross_validation_splits: int, test_output_dirs: OutputFolderForTests) -> None: @@ -312,3 +313,36 @@ def test_get_hyperdrive_config(number_of_cross_validation_splits: int, else: hd_config = container.get_hyperdrive_config(run_config=run_config) assert isinstance(hd_config, HyperDriveConfig) + + +@pytest.mark.parametrize("allow_partial_ground_truth", [True, False]) +def test_innereyecontainer_setup_passes_on_allow_incomplete_labels( + test_output_dirs: OutputFolderForTests, + allow_partial_ground_truth: bool) -> None: + """ + Test that InnerEyeContainer.setup passes on the correct value of allow_incomplete_labels to + full_image_dataset.convert_channels_to_file_paths + :param test_output_dirs: Test fixture. + :param allow_partial_ground_truth: The value to set allow_incomplete_labels to and check it is + passed through. + """ + config = DummyModel() + config.set_output_to(test_output_dirs.root_dir) + config.allow_incomplete_labels = allow_partial_ground_truth + container = InnerEyeContainer(config) + test_done_message = "Stop now, the test has passed." + + def mocked_convert_channels_to_file_paths( + _: List[str], + __: pd.DataFrame, + ___: Path, + ____: str, + allow_incomplete_labels: bool) -> Tuple[List[Optional[Path]], str]: + assert allow_incomplete_labels == allow_partial_ground_truth + raise RuntimeError(test_done_message) + + with pytest.raises(RuntimeError) as runtime_error: + with mock.patch("InnerEye.ML.lightning_base.convert_channels_to_file_paths") as convert_channels_to_file_paths_mock: + convert_channels_to_file_paths_mock.side_effect = mocked_convert_channels_to_file_paths + container.setup() + assert str(runtime_error.value) == test_done_message From da79dd3d9a3fe28ed0dbbdbdf339fa26d67fff8a Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 19:15:16 +0100 Subject: [PATCH 40/45] flake fixes (I hope!) 
--- InnerEye/ML/utils/metrics_util.py | 2 +- Tests/ML/test_lightning_containers.py | 16 ++++++++-------- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index eafd6995d..d23e6cf14 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -271,4 +271,4 @@ def is_missing_ground_truth(ground_truth: np.array) -> bool: :param label_id: Integer index of the label to check. :returns: True if the label is missing (signified by NaN), False otherwise. """ - return np.isnan(ground_truth[0, 0, 0]) \ No newline at end of file + return np.isnan(ground_truth[0, 0, 0]) diff --git a/Tests/ML/test_lightning_containers.py b/Tests/ML/test_lightning_containers.py index 6682fbccc..5e1367816 100644 --- a/Tests/ML/test_lightning_containers.py +++ b/Tests/ML/test_lightning_containers.py @@ -333,16 +333,16 @@ def test_innereyecontainer_setup_passes_on_allow_incomplete_labels( test_done_message = "Stop now, the test has passed." def mocked_convert_channels_to_file_paths( - _: List[str], - __: pd.DataFrame, - ___: Path, - ____: str, - allow_incomplete_labels: bool) -> Tuple[List[Optional[Path]], str]: - assert allow_incomplete_labels == allow_partial_ground_truth - raise RuntimeError(test_done_message) + _: List[str], + __: pd.DataFrame, + ___: Path, + ____: str, + allow_incomplete_labels: bool) -> Tuple[List[Optional[Path]], str]: + assert allow_incomplete_labels == allow_partial_ground_truth + raise RuntimeError(test_done_message) with pytest.raises(RuntimeError) as runtime_error: with mock.patch("InnerEye.ML.lightning_base.convert_channels_to_file_paths") as convert_channels_to_file_paths_mock: convert_channels_to_file_paths_mock.side_effect = mocked_convert_channels_to_file_paths container.setup() - assert str(runtime_error.value) == test_done_message + assert str(runtime_error.value) == test_done_message From eadd7f9effb5fcd142e063edcc3974b9368e50f6 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 19:23:25 +0100 Subject: [PATCH 41/45] changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 92f6631b8..7211066ca 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -13,6 +13,8 @@ created. ## Upcoming ### Added +- ([#465](https://github.com/microsoft/InnerEye-DeepLearning/pull/465/)) Adding ability to run segmentation inference +module in the test data without or partial ground truth files. - ([#492](https://github.com/microsoft/InnerEye-DeepLearning/pull/492)) Adding capability for regression tests for test jobs that run in AzureML. 
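Taken together, the patches up to this point plumb the new allow_incomplete_labels flag from SegmentationModelBase through dataset loading (FullImageDataset and lightning_base.setup), inference (segmentation_model_test) and metrics aggregation (save_aggregates_to_csv). The sketch below is illustrative only and is not part of any patch in this series: the imports and calls mirror the ones exercised in Tests/ML/test_model_testing.py above, while the output folder, the project root, and the assumption that a dataset and checkpoint are already in place are made up for the example.

    # Illustrative sketch, not part of the patch series.
    from pathlib import Path

    from InnerEye.ML import model_testing
    from InnerEye.ML.common import ModelExecutionMode
    from Tests.ML.configs.DummyModel import DummyModel
    from Tests.ML.util import get_default_checkpoint_handler

    config = DummyModel()
    config.set_output_to(Path("outputs"))   # hypothetical output location
    config.allow_incomplete_labels = True   # tolerate test subjects with missing ground truth channels

    # A dataset (config.local_dataset / config.dataset_data_frame) and a stored checkpoint are
    # assumed to be in place, prepared as in Tests/ML/test_model_testing.py above.
    checkpoint_handler = get_default_checkpoint_handler(model_config=config, project_root=Path("."))
    checkpoint_handler.additional_training_done()

    inference_results = model_testing.segmentation_model_test(config,
                                                               execution_mode=ModelExecutionMode.TEST,
                                                               checkpoint_handler=checkpoint_handler)
    # With the flag set, the aggregates CSV (METRICS_AGGREGATES_FILE) gains an extra
    # total-patient-count column, and the per-subject metrics CSV may contain empty Dice
    # entries for structures whose ground truth was missing.
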
From f9540ed13cf12967231f283081e5e78a42d39204 Mon Sep 17 00:00:00 2001 From: dumbledad Date: Sun, 4 Jul 2021 19:28:34 +0100 Subject: [PATCH 42/45] Odd fix, moved definition to enclosing scope --- InnerEye/ML/utils/metrics_util.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/InnerEye/ML/utils/metrics_util.py b/InnerEye/ML/utils/metrics_util.py index d23e6cf14..63a812dd1 100644 --- a/InnerEye/ML/utils/metrics_util.py +++ b/InnerEye/ML/utils/metrics_util.py @@ -79,6 +79,10 @@ def save_aggregates_to_csv(self, file_path: Path, allow_incomplete_labels: bool df = self.to_data_frame() aggregates = df.groupby(MetricsFileColumns.Structure.value).describe() + total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() + if not total_num_patients_column_name.endswith("s"): + total_num_patients_column_name += "s" + def filter_rename_metric_columns(_metric_column: str, is_count_column: bool = False) -> pd.DataFrame: _columns = ["count"] + stats_columns if is_count_column else stats_columns _df = aggregates[_metric_column][_columns] @@ -87,9 +91,6 @@ def filter_rename_metric_columns(_metric_column: str, is_count_column: bool = Fa # more sense of aggregated metrics where some patients were missing the label (i.e. # partial ground truth). num_subjects = len(pd.unique(df[MetricsFileColumns.Patient.value])) - total_num_patients_column_name = f"total_{MetricsFileColumns.Patient.value}".lower() - if not total_num_patients_column_name.endswith("s"): - total_num_patients_column_name += "s" _df[total_num_patients_column_name] = num_subjects _df = _df[["count", total_num_patients_column_name] + stats_columns] _columns_to_rename = [x for x in _df.columns if x != "count" and x != total_num_patients_column_name] From 5f4fe5b73da13165c488e34ae1726281c6b64cbf Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Mon, 5 Jul 2021 12:22:00 +0100 Subject: [PATCH 43/45] removing Path(__file__) --- Tests/ML/reports/test_segmentation_report.py | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/Tests/ML/reports/test_segmentation_report.py b/Tests/ML/reports/test_segmentation_report.py index 57a29e67a..12c51fa91 100644 --- a/Tests/ML/reports/test_segmentation_report.py +++ b/Tests/ML/reports/test_segmentation_report.py @@ -11,6 +11,7 @@ from numpy.core.numeric import NaN from InnerEye.Common.common_util import is_windows +from InnerEye.Common.fixed_paths_for_tests import tests_root_directory from InnerEye.Common.metrics_constants import MetricsFileColumns from InnerEye.Common.output_directories import OutputFolderForTests from InnerEye.ML.reports.notebook_report import generate_segmentation_notebook @@ -21,13 +22,15 @@ @pytest.mark.skipif(is_windows(), reason="Random timeout errors on windows.") @pytest.mark.parametrize("use_partial_ground_truth", [False, True]) def test_generate_segmentation_report(test_output_dirs: OutputFolderForTests, use_partial_ground_truth: bool) -> None: + reports_folder = tests_root_directory() / "ML" / "reports" + metrics_file = reports_folder / "metrics_hn.csv" if use_partial_ground_truth: - return _test_generate_segmentation_report_with_partial_ground_truth(test_output_dirs) - return _test_generate_segmentation_report_without_partial_ground_truth(test_output_dirs) + return _test_generate_segmentation_report_with_partial_ground_truth(test_output_dirs, metrics_file) + return _test_generate_segmentation_report_without_partial_ground_truth(test_output_dirs, metrics_file) -def 
_test_generate_segmentation_report_without_partial_ground_truth(test_output_dirs: OutputFolderForTests) -> None: - reports_folder = Path(__file__).parent - metrics_file = reports_folder / "metrics_hn.csv" +def _test_generate_segmentation_report_without_partial_ground_truth( + test_output_dirs: OutputFolderForTests, + metrics_file: Path) -> None: current_dir = test_output_dirs.make_sub_dir("test_segmentation_report") result_file = current_dir / "report.ipynb" result_html = generate_segmentation_notebook(result_notebook=result_file, @@ -39,13 +42,13 @@ def _test_generate_segmentation_report_without_partial_ground_truth(test_output_ contents = result_html.read_text(encoding='utf-8') assert 'parotid_r' in contents -def _test_generate_segmentation_report_with_partial_ground_truth(test_output_dirs: OutputFolderForTests) -> None: +def _test_generate_segmentation_report_with_partial_ground_truth( + test_output_dirs: OutputFolderForTests, + original_metrics_file: Path) -> None: """ The test without partial ground truth should cover more detail, here we just check that providing partial ground truth results in some labels having a lower user count. """ - reports_folder = Path(__file__).parent - original_metrics_file = reports_folder / "metrics_hn.csv" original_metrics = pd.read_csv(original_metrics_file) partial_metrics = original_metrics partial_metrics.loc[partial_metrics['Structure'].eq('brainstem') & partial_metrics['Patient'].isin([14, 15, 19]), From 411277cc1d7586243e4ef246c51b42b045f7bed7 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Mon, 5 Jul 2021 14:58:50 +0100 Subject: [PATCH 44/45] using is_missing again --- InnerEye/ML/utils/io_util.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/InnerEye/ML/utils/io_util.py b/InnerEye/ML/utils/io_util.py index 9df51191f..02faeafb6 100644 --- a/InnerEye/ML/utils/io_util.py +++ b/InnerEye/ML/utils/io_util.py @@ -29,6 +29,7 @@ from InnerEye.ML.utils.hdf5_util import HDF5Object from InnerEye.ML.utils.image_util import ImageDataType, ImageHeader, check_array_range, get_center_crop, \ get_unit_image_header, is_binary_array +from InnerEye.ML.utils.metrics_util import is_missing_ground_truth from InnerEye.ML.utils.transforms import LinearTransform, get_range_for_window_level RESULTS_POSTERIOR_FILE_NAME_PREFIX = "posterior_" @@ -439,7 +440,7 @@ def load_labels_from_dataset_source(dataset_source: PatientDatasetSource, check_ # If ground truth image is nan, then will not be used to check check_exclusive # Image is nan, if voxel at index [0, 0, 0] is NaN not_nan_label_images = [labels[label_id] for label_id in range(labels.shape[0]) - if not np.isnan(labels[label_id][0, 0, 0])] + if not is_missing_ground_truth(labels[label_id])] if check_exclusive and (sum(np.array(not_nan_label_images)) > 1.).any(): # type: ignore raise ValueError(f'The labels for patient {dataset_source.metadata.patient_id} are not mutually exclusive. ' From 5a5f946aef08b6f49411993247add5abf145a608 Mon Sep 17 00:00:00 2001 From: Tim Regan Date: Mon, 5 Jul 2021 15:41:55 +0100 Subject: [PATCH 45/45] no partial images! 
--- InnerEye/ML/dataset/full_image_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/InnerEye/ML/dataset/full_image_dataset.py b/InnerEye/ML/dataset/full_image_dataset.py index 48021c1d0..63883c006 100644 --- a/InnerEye/ML/dataset/full_image_dataset.py +++ b/InnerEye/ML/dataset/full_image_dataset.py @@ -363,7 +363,7 @@ def get_paths_for_channel_ids(channels: List[str], allow_incomplete_labels_flag: dataset_sources[patient_id] = PatientDatasetSource( metadata=metadata, image_channels=get_paths_for_channel_ids(channels=image_channels, # type: ignore - allow_incomplete_labels_flag=allow_incomplete_labels), + allow_incomplete_labels_flag=False), mask_channel=get_mask_channel_or_default(), ground_truth_channels=get_paths_for_channel_ids(channels=ground_truth_channels, # type: ignore allow_incomplete_labels_flag=allow_incomplete_labels),
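PATCH 44 and PATCH 45 both lean on the same convention: image channels must always be present, while a missing ground-truth channel shows up as a NaN-valued volume and is recognised by inspecting voxel [0, 0, 0] (is_missing_ground_truth). The snippet below replays that convention in isolation; check_labels_mutually_exclusive and the toy shapes are hypothetical, a hedged sketch of the idea rather than the actual loading code in io_util.py.

    # Sketch only: NaN-filled volumes as the sentinel for "this label was not provided".
    import numpy as np


    def is_missing_ground_truth(ground_truth: np.ndarray) -> bool:
        # Missing channels are written as all-NaN volumes, so one voxel is enough to test.
        return bool(np.isnan(ground_truth[0, 0, 0]))


    def check_labels_mutually_exclusive(labels: np.ndarray) -> None:
        # Only channels that are actually present take part in the exclusivity check.
        present = [labels[i] for i in range(labels.shape[0]) if not is_missing_ground_truth(labels[i])]
        if present and (np.sum(np.stack(present), axis=0) > 1.0).any():
            raise ValueError("Labels are not mutually exclusive.")


    labels = np.zeros((3, 4, 4, 4), dtype=np.float32)
    labels[0, :2] = 1.0   # first structure occupies the top slices
    labels[1, 2:] = 1.0   # second structure occupies the bottom slices, disjoint from the first
    labels[2] = np.nan    # third structure: no ground truth provided for this subject
    check_labels_mutually_exclusive(labels)  # passes, because the NaN channel is skipped

Keeping the sentinel test in one helper, as PATCH 44 does, means the [0, 0, 0] indexing convention lives in a single place instead of being repeated at every call site; PATCH 45 makes the complementary point that only ground-truth channels may be incomplete, by pinning allow_incomplete_labels_flag to False for image channels.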