From c2b0e7344fe6647ece907f478f42794299dcff00 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Apr 2021 19:04:08 +0200 Subject: [PATCH] Fix calculate loss (#1123) * FIX an issue with calculate loss * simplify code * fix unit test --- autosklearn/automl.py | 11 +- autosklearn/ensemble_builder.py | 7 +- autosklearn/metrics/__init__.py | 170 ++++++++++++++++++++++--------- test/test_automl/test_automl.py | 2 +- test/test_metric/test_metrics.py | 36 +++++-- 5 files changed, 160 insertions(+), 66 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 414a0ee8d8..658d35c1b2 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -40,7 +40,7 @@ from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import _fit_with_budget -from autosklearn.metrics import calculate_score +from autosklearn.metrics import calculate_metric from autosklearn.util.backend import Backend from autosklearn.util.stopwatch import StopWatch from autosklearn.util.logging_ import ( @@ -1153,11 +1153,10 @@ def score(self, X, y): # same representation domain prediction = self.InputValidator.target_validator.transform(prediction) - return calculate_score(solution=y, - prediction=prediction, - task_type=self._task, - metric=self._metric, - scoring_functions=None) + return calculate_metric(solution=y, + prediction=prediction, + task_type=self._task, + metric=self._metric, ) @property def cv_results_(self): diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 84e19ee397..ca92830c98 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -420,7 +420,7 @@ def __init__( performance_range_threshold: float Keep only models that are better than: dummy + (best - dummy)*performance_range_threshold - E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + E.g dummy=2, best=4, thresh=0.5 --> only consider models with loss > 3 Will at most return the minimum between ensemble_nbest models, and max_models_on_disc. Might return less seed: int @@ -978,7 +978,7 @@ def get_n_best_preds(self): # no model left; try to use dummy loss (num_run==0) # log warning when there are other models but not better than dummy model if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" + self.logger.warning("No models better than random - using Dummy loss!" "Number of models besides current dummy model: %d. 
" "Number of dummy models: %d", num_keys - 1, @@ -1105,8 +1105,7 @@ def get_n_best_preds(self): # only if the model ends up in the ensemble self.read_losses[k]['loaded'] = 1 - # return best scored keys of self.read_losses - # That is, the one with the lowest loss + # return keys of self.read_losses with lowest losses return sorted_keys[:ensemble_n_best] def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], List[str]]: diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 89be6ab8c5..95167c01cc 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -342,18 +342,40 @@ def calculate_score( metric: Scorer, scoring_functions: Optional[List[Scorer]] = None ) -> Union[float, Dict[str, float]]: + """ + Returns a score (a magnitude that allows casting the + optimization problem as a maximization one) for the + given Auto-Sklearn Scorer object + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns + ------- + float or Dict[str, float] + """ if task_type not in TASK_TYPES: raise NotImplementedError(task_type) if scoring_functions: score_dict = dict() if task_type in REGRESSION_TASKS: - # TODO put this into the regression metric itself - cprediction = sanitize_array(prediction) - for metric_ in scoring_functions: + for metric_ in scoring_functions + [metric]: try: - score_dict[metric_.name] = metric_._sign * metric_(solution, cprediction) + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type) except ValueError as e: print(e, e.args[0]) if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ @@ -363,13 +385,14 @@ def calculate_score( raise e else: - for metric_ in scoring_functions: + for metric_ in scoring_functions + [metric]: # TODO maybe annotate metrics to define which cases they can # handle? try: - score_dict[metric_.name] = metric_._sign * metric_(solution, prediction) + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type) except ValueError as e: if e.args[0] == 'multiclass format is not supported': continue @@ -383,34 +406,10 @@ def calculate_score( else: raise e - if metric.name not in score_dict.keys(): - score_dict[metric.name] = get_metric_score(metric, prediction, solution, task_type) return score_dict else: - return get_metric_score(metric, prediction, solution, task_type) - - -def get_metric_score( - metric_: Scorer, - prediction: np.ndarray, - solution: np.ndarray, - task_type: int -) -> float: - # We match the behaviour of GridSearchCV - # In scikit learn, the exact value of the score_func - # is returned (not that of the 'Scorer' which might be - # negative in functions like mse, as scikit learn - # maximizes.) 
If an user wants to use GridSearchCV - # They are expected to pass neg_mean_squared_error - # For this reason we multiply back by metric_._sign - if task_type in REGRESSION_TASKS: - # TODO put this into the regression metric itself - cprediction = sanitize_array(prediction) - score = metric_._sign * metric_(solution, cprediction) - else: - score = metric_._sign * metric_(solution, prediction) - return score + return _compute_scorer(metric, prediction, solution, task_type) def calculate_loss( @@ -422,26 +421,28 @@ def calculate_loss( ) -> Union[float, Dict[str, float]]: """ Returns a loss (a magnitude that allows casting the - optimization problem, as a minimization one) for the + optimization problem as a minimization one) for the given Auto-Sklearn Scorer object + Parameters ---------- - solution: np.ndarray - The ground truth of the targets - prediction: np.ndarray - The best estimate from the model, of the given targets - task_type: int - To understand if the problem task is classification - or regression - metric: Scorer - Object that host a function to calculate how good the - prediction is according to the solution. - scoring_functions: List[Scorer] - A list of metrics to calculate multiple losses + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns ------- - float or Dict[str, float] - A loss function for each of the provided scorer objects + float or Dict[str, float] + A loss function for each of the provided scorer objects """ score = calculate_score( solution=solution, @@ -463,7 +464,80 @@ def calculate_loss( # maybe metric argument is not in scoring_functions # so append it to the list. Rather than check if such # is the case, redefining loss_dict[metric] is less expensive - loss_dict[metric_.name] = metric_._optimum - metric_._sign * score[metric_.name] + loss_dict[metric_.name] = metric_._optimum - score[metric_.name] return loss_dict else: - return metric._optimum - metric._sign * cast(float, score) + rval = metric._optimum - cast(float, score) + return rval + + +def calculate_metric( + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int +) -> float: + """ + Returns a metric for the given Auto-Sklearn Scorer object. + It's direction is determined by the metric itself. + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. 
+ + Returns + ------- + float + """ + score = _compute_scorer( + solution=solution, + prediction=prediction, + metric=metric, + task_type=task_type, + ) + return metric._sign * score + + +def _compute_scorer( + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int +) -> float: + """ + Returns a score (a magnitude that allows casting the + optimization problem as a maximization one) for the + given Auto-Sklearn Scorer object + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + Returns + ------- + float + """ + if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself + cprediction = sanitize_array(prediction) + score = metric(solution, cprediction) + else: + score = metric(solution, prediction) + return score diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 71da849358..759479f9b8 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -637,7 +637,7 @@ def test_load_best_individual_model(metric, backend, dask_client): assert automl.score(X_test, Y_test) > 0.9 elif metric.name == 'log_loss': # Seen values in github actions of 0.6978304740364537 - assert automl.score(X_test, Y_test) <= 0.72 + assert automl.score(X_test, Y_test) < 0.7 else: raise ValueError(metric.name) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 0f4a57e07c..ea00da9275 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -7,7 +7,7 @@ import autosklearn.metrics -from autosklearn.metrics import calculate_score, calculate_loss +from autosklearn.metrics import calculate_score, calculate_loss, calculate_metric from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION from smac.utils.constants import MAXINT @@ -536,7 +536,7 @@ def test_regression_only_metric(self): def test_calculate_loss(): # In a 0-1 ranged scorer, make sure that the loss - # has a expected positive value + # has an expected positive value y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) @@ -546,8 +546,7 @@ def test_calculate_loss(): task_type=BINARY_CLASSIFICATION, metric=autosklearn.metrics.accuracy, ) - loss = 1.0 - score - assert pytest.approx(loss) == calculate_loss( + assert pytest.approx(1.0 - score) == calculate_loss( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, @@ -582,14 +581,37 @@ def test_calculate_loss(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(score) == calculate_score( + assert pytest.approx(0 - score) == calculate_score( + solution=y_true, + prediction=y_pred, + task_type=REGRESSION, + metric=autosklearn.metrics.mean_squared_error, + ) + assert pytest.approx(score) == calculate_loss( solution=y_true, prediction=y_pred, task_type=REGRESSION, metric=autosklearn.metrics.mean_squared_error, ) - loss = score - assert pytest.approx(loss) == calculate_loss( + + +def test_calculate_metric(): + # metric to be maximized + y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 
0]) + y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) + score = sklearn.metrics.accuracy_score(y_true, y_pred) + assert pytest.approx(score) == calculate_metric( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + ) + + # metric to be minimized + y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) + score = sklearn.metrics.mean_squared_error(y_true, y_pred) + assert pytest.approx(score) == calculate_metric( solution=y_true, prediction=y_pred, task_type=REGRESSION,
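
Note (illustrative, not part of the patch): for a minimized metric such as mean squared error, the new tests pin down how the three helpers relate after this change. calculate_score returns the sign-adjusted value used for maximization (-MSE), calculate_loss returns the metric's optimum minus that score (MSE), and calculate_metric, which AutoML.score now calls, returns the metric on its original scale (MSE). A minimal sketch, assuming auto-sklearn at this revision is installed:

import numpy as np
import sklearn.metrics

import autosklearn.metrics
from autosklearn.constants import REGRESSION
from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score

y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66])
mse = sklearn.metrics.mean_squared_error(y_true, y_pred)

# Shared keyword arguments; mean_squared_error is a Scorer whose sign is -1
# and whose optimum is 0, since lower values are better.
common = dict(
    solution=y_true,
    prediction=y_pred,
    task_type=REGRESSION,
    metric=autosklearn.metrics.mean_squared_error,
)

assert np.isclose(calculate_score(**common), -mse)   # maximization view
assert np.isclose(calculate_loss(**common), mse)     # minimization view: optimum - score
assert np.isclose(calculate_metric(**common), mse)   # metric on its original scale

These three assertions restate the behaviour checked by test_calculate_loss and test_calculate_metric above.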