From c4e0d78813a85fc7f6176ce67def781635b7b3de Mon Sep 17 00:00:00 2001
From: Cory Cornelius
Date: Mon, 12 Jun 2023 16:04:53 -0700
Subject: [PATCH 1/3] Make metric logging keys configurable

---
 mart/models/modular.py | 37 ++++++++++++++++---------------------
 1 file changed, 16 insertions(+), 21 deletions(-)

diff --git a/mart/models/modular.py b/mart/models/modular.py
index eb5dd934..4fcfe783 100644
--- a/mart/models/modular.py
+++ b/mart/models/modular.py
@@ -36,6 +36,9 @@ def __init__(
         test_step_log=None,
         test_metrics=None,
         load_state_dict=None,
+        output_loss_key="loss",
+        output_preds_key="preds",
+        output_target_key="target",
     ):
         super().__init__()
 
@@ -88,6 +91,10 @@ def __init__(
             logger.info(f"Loading state_dict {path} for {module.__class__.__name__}...")
             module.load_state_dict(torch.load(path, map_location="cpu"))
 
+        self.output_loss_key = output_loss_key
+        self.output_preds_key = output_preds_key
+        self.output_target_key = output_target_key
+
     def configure_optimizers(self):
         config = {}
         config["optimizer"] = self.optimizer_fn(self.model)
@@ -118,19 +125,15 @@ def training_step(self, batch, batch_idx):
         for name in self.training_step_log:
             self.log(f"training/{name}", output[name])
 
-        assert "loss" in output
-        return output
-
-    def training_step_end(self, output):
         if self.training_metrics is not None:
             # Some models only return loss in the training mode.
-            if "preds" not in output or "target" not in output:
+            if self.output_preds_key not in output or self.output_target_key not in output:
                 raise ValueError(
-                    "You have specified training_metrics, but the model does not return preds and target during training. You can either nullify training_metrics or configure the model to return preds and target in the training output."
+                    f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output."
                 )
-            self.training_metrics(output["preds"], output["target"])
-        loss = output.pop("loss")
-        return loss
+            self.training_metrics(output[self.output_preds_key], output[self.output_target_key])
+
+        return output[self.output_loss_key]
 
     def training_epoch_end(self, outputs):
         if self.training_metrics is not None:
@@ -152,13 +155,9 @@ def validation_step(self, batch, batch_idx):
         for name in self.validation_step_log:
             self.log(f"validation/{name}", output[name])
 
-        return output
+        self.validation_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-    def validation_step_end(self, output):
-        self.validation_metrics(output["preds"], output["target"])
-
-        # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.)
-        output.clear()
+        return None
 
     def validation_epoch_end(self, outputs):
         metrics = self.validation_metrics.compute()
@@ -178,13 +177,9 @@ def test_step(self, batch, batch_idx):
         for name in self.test_step_log:
             self.log(f"test/{name}", output[name])
 
-        return output
-
-    def test_step_end(self, output):
-        self.test_metrics(output["preds"], output["target"])
+        self.test_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-        # I don't know why this is required to prevent CUDA memory leak in validaiton and test. (Not required in training.)
-        output.clear()
+        return None
 
     def test_epoch_end(self, outputs):
         metrics = self.test_metrics.compute()

From 508798ca12d98d2ce757bcb18779b7c3fe474cdd Mon Sep 17 00:00:00 2001
From: Cory Cornelius
Date: Tue, 13 Jun 2023 14:15:36 -0700
Subject: [PATCH 2/3] cleanup

---
 mart/models/modular.py | 20 ++++++++++++++++----
 1 file changed, 16 insertions(+), 4 deletions(-)

diff --git a/mart/models/modular.py b/mart/models/modular.py
index 4fcfe783..d663dda3 100644
--- a/mart/models/modular.py
+++ b/mart/models/modular.py
@@ -125,6 +125,10 @@ def training_step(self, batch, batch_idx):
         for name in self.training_step_log:
             self.log(f"training/{name}", output[name])
 
+        assert "loss" in output
+        return output
+
+    def training_step_end(self, output):
         if self.training_metrics is not None:
             # Some models only return loss in the training mode.
             if self.output_preds_key not in output or self.output_target_key not in output:
@@ -132,8 +136,8 @@ def training_step(self, batch, batch_idx):
                 f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output."
             )
             self.training_metrics(output[self.output_preds_key], output[self.output_target_key])
-
-        return output[self.output_loss_key]
+        loss = output.pop(self.output_loss_key)
+        return loss
 
     def training_epoch_end(self, outputs):
         if self.training_metrics is not None:
@@ -155,9 +159,13 @@ def validation_step(self, batch, batch_idx):
         for name in self.validation_step_log:
             self.log(f"validation/{name}", output[name])
 
+        return output
+
+    def validation_step_end(self, output):
         self.validation_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-        return None
+        # I don't know why this is required to prevent CUDA memory leak in validation and test. (Not required in training.)
+        output.clear()
 
     def validation_epoch_end(self, outputs):
         metrics = self.validation_metrics.compute()
@@ -177,9 +185,13 @@ def test_step(self, batch, batch_idx):
         for name in self.test_step_log:
             self.log(f"test/{name}", output[name])
 
+        return output
+
+    def test_step_end(self, output):
         self.test_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-        return None
+        # I don't know why this is required to prevent CUDA memory leak in validation and test. (Not required in training.)
+        output.clear()
 
     def test_epoch_end(self, outputs):
         metrics = self.test_metrics.compute()

From fc770e81d7783edf7e0ef7bf04b4b12a25a2eaa0 Mon Sep 17 00:00:00 2001
From: Cory Cornelius
Date: Tue, 13 Jun 2023 17:12:51 -0700
Subject: [PATCH 3/3] Remove *_step_end

---
 mart/models/modular.py | 20 ++++----------------
 1 file changed, 4 insertions(+), 16 deletions(-)

diff --git a/mart/models/modular.py b/mart/models/modular.py
index d663dda3..4fcfe783 100644
--- a/mart/models/modular.py
+++ b/mart/models/modular.py
@@ -125,10 +125,6 @@ def training_step(self, batch, batch_idx):
         for name in self.training_step_log:
             self.log(f"training/{name}", output[name])
 
-        assert "loss" in output
-        return output
-
-    def training_step_end(self, output):
         if self.training_metrics is not None:
             # Some models only return loss in the training mode.
             if self.output_preds_key not in output or self.output_target_key not in output:
@@ -136,8 +132,8 @@ def training_step_end(self, output):
                 f"You have specified training_metrics, but the model does not return {self.output_preds_key} or {self.output_target_key} during training. You can either nullify training_metrics or configure the model to return {self.output_preds_key} and {self.output_target_key} in the training output."
             )
             self.training_metrics(output[self.output_preds_key], output[self.output_target_key])
-        loss = output.pop(self.output_loss_key)
-        return loss
+
+        return output[self.output_loss_key]
 
     def training_epoch_end(self, outputs):
         if self.training_metrics is not None:
@@ -159,13 +155,9 @@ def validation_step(self, batch, batch_idx):
         for name in self.validation_step_log:
             self.log(f"validation/{name}", output[name])
 
-        return output
-
-    def validation_step_end(self, output):
         self.validation_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-        # I don't know why this is required to prevent CUDA memory leak in validation and test. (Not required in training.)
-        output.clear()
+        return None
 
     def validation_epoch_end(self, outputs):
         metrics = self.validation_metrics.compute()
@@ -185,13 +177,9 @@ def test_step(self, batch, batch_idx):
         for name in self.test_step_log:
             self.log(f"test/{name}", output[name])
 
-        return output
-
-    def test_step_end(self, output):
         self.test_metrics(output[self.output_preds_key], output[self.output_target_key])
 
-        # I don't know why this is required to prevent CUDA memory leak in validation and test. (Not required in training.)
-        output.clear()
+        return None
 
     def test_epoch_end(self, outputs):
         metrics = self.test_metrics.compute()
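
Note for reviewers: the end state of this series is that each *_step method logs, updates metrics, and returns only what the trainer needs, with the dictionary key names supplied at construction time. Below is a minimal, self-contained sketch of that configurable-key pattern. It is illustrative only: it stands in for LitModular without depending on MART or Lightning, and the "total_loss"/"logits"/"labels" key names are hypothetical.

# Illustrative sketch only -- a plain-Python stand-in for LitModular's
# configurable output keys; not MART code.


class ConfigurableKeysModule:
    def __init__(
        self,
        training_metrics=None,
        output_loss_key="loss",
        output_preds_key="preds",
        output_target_key="target",
    ):
        self.training_metrics = training_metrics
        self.output_loss_key = output_loss_key
        self.output_preds_key = output_preds_key
        self.output_target_key = output_target_key

    def training_step(self, output):
        # Mirrors the end state of this series: look up preds/target only
        # when metrics are configured, then return just the loss value.
        if self.training_metrics is not None:
            if self.output_preds_key not in output or self.output_target_key not in output:
                raise ValueError("model output is missing the configured preds/target keys")
            self.training_metrics(output[self.output_preds_key], output[self.output_target_key])
        return output[self.output_loss_key]


# A model that emits non-default key names is adapted by configuration
# rather than by renaming its outputs:
module = ConfigurableKeysModule(
    output_loss_key="total_loss",
    output_preds_key="logits",
    output_target_key="labels",
)
assert module.training_step({"total_loss": 0.5}) == 0.5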