Calculate loss support (#1075)
* Calculate loss support

* Relaxed log loss test for individual models

* Feedback from #1075

* Missing loss in comment

* Revert back test as well
franchuterivera authored Feb 16, 2021
1 parent a275763 commit cf27323
Showing 8 changed files with 329 additions and 212 deletions.
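The common thread across these files is replacing hand-rolled "optimum minus score" arithmetic with a single calculate_loss helper, so that ensemble selection, the evaluators, and the tests all minimize a loss instead of maximizing a score. A minimal sketch of the conversion rule (score_to_loss is an illustrative name, not part of the auto-sklearn API; optimum and sign mirror the Scorer._optimum and Scorer._sign attributes used in the diffs below):

def score_to_loss(score: float, optimum: float, sign: int) -> float:
    # greater-is-better metrics (sign=+1): loss = optimum - score
    # smaller-is-better metrics (sign=-1): loss = optimum + score
    return optimum - sign * score

# accuracy: optimum=1, sign=+1  ->  1.0 - 0.9  = 0.1
print(score_to_loss(0.9, optimum=1.0, sign=1))
# log loss: optimum=0, sign=-1  ->  0.0 + 0.35 = 0.35
print(score_to_loss(0.35, optimum=0.0, sign=-1))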
220 changes: 113 additions & 107 deletions autosklearn/ensemble_builder.py

Large diffs are not rendered by default.

32 changes: 15 additions & 17 deletions autosklearn/ensembles/ensemble_selection.py
@@ -6,7 +6,7 @@

from autosklearn.constants import TASK_TYPES
from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble
from autosklearn.metrics import Scorer, calculate_score
from autosklearn.metrics import Scorer, calculate_loss
from autosklearn.pipeline.base import BasePipeline


@@ -100,7 +100,7 @@ def _fast(
dtype=np.float64,
)
for i in range(ensemble_size):
scores = np.zeros(
losses = np.zeros(
(len(predictions)),
dtype=np.float64,
)
@@ -129,24 +129,23 @@ def _fast(
out=fant_ensemble_prediction
)

# Calculate score is versatile and can return a dict of score
# calculate_loss is versatile and can return a dict of losses
# when scoring_functions=None, we know it will be a float
calculated_score = cast(
losses[j] = cast(
float,
calculate_score(
calculate_loss(
solution=labels,
prediction=fant_ensemble_prediction,
task_type=self.task_type,
metric=self.metric,
scoring_functions=None
)
)
scores[j] = self.metric._optimum - calculated_score

all_best = np.argwhere(scores == np.nanmin(scores)).flatten()
all_best = np.argwhere(losses == np.nanmin(losses)).flatten()
best = self.random_state.choice(all_best)
ensemble.append(predictions[best])
trajectory.append(scores[best])
trajectory.append(losses[best])
order.append(best)

# Handle special case
@@ -155,7 +154,7 @@ def _fast(

self.indices_ = order
self.trajectory_ = trajectory
self.train_score_ = trajectory[-1]
self.train_loss_ = trajectory[-1]

def _slow(
self,
@@ -172,30 +171,29 @@ def _slow(
ensemble_size = self.ensemble_size

for i in range(ensemble_size):
scores = np.zeros(
losses = np.zeros(
[np.shape(predictions)[0]],
dtype=np.float64,
)
for j, pred in enumerate(predictions):
ensemble.append(pred)
ensemble_prediction = np.mean(np.array(ensemble), axis=0)
# Calculate score is versatile and can return a dict of score
# calculate_loss is versatile and can return a dict of losses
# when scoring_functions=None, we know it will be a float
calculated_score = cast(
losses[j] = cast(
float,
calculate_score(
calculate_loss(
solution=labels,
prediction=ensemble_prediction,
task_type=self.task_type,
metric=self.metric,
scoring_functions=None
)
)
scores[j] = self.metric._optimum - calculated_score
ensemble.pop()
best = np.nanargmin(scores)
best = np.nanargmin(losses)
ensemble.append(predictions[best])
trajectory.append(scores[best])
trajectory.append(losses[best])
order.append(best)

# Handle special case
@@ -210,7 +208,7 @@ def _slow(
trajectory,
dtype=np.float64,
)
self.train_score_ = trajectory[-1]
self.train_loss_ = trajectory[-1]

def _calculate_weights(self) -> None:
ensemble_members = Counter(self.indices_).most_common()
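Both _fast and _slow now follow the same pattern: form a candidate mean prediction per model, turn it into a loss with calculate_loss, and keep the argmin. A hedged sketch of a single greedy step under those assumptions (greedy_step is an illustrative name; it omits the pre-allocated weighted-average buffer the real _fast uses and is not the verbatim implementation):

import numpy as np
from autosklearn.metrics import calculate_loss

def greedy_step(ensemble, predictions, labels, task_type, metric, rng):
    losses = np.zeros(len(predictions), dtype=np.float64)
    for j, pred in enumerate(predictions):
        # candidate ensemble prediction if model j were added next
        candidate = np.mean(np.array(ensemble + [pred]), axis=0)
        losses[j] = calculate_loss(
            solution=labels,
            prediction=candidate,
            task_type=task_type,
            metric=metric,
            scoring_functions=None,  # guarantees a float, not a dict
        )
    # break ties at random, as _fast does via self.random_state
    all_best = np.argwhere(losses == np.nanmin(losses)).flatten()
    best = rng.choice(all_best)
    return best, losses[best]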
40 changes: 16 additions & 24 deletions autosklearn/evaluation/abstract_evaluator.py
@@ -19,7 +19,7 @@
from autosklearn.pipeline.implementations.util import (
convert_multioutput_multiclass_to_multilabel
)
from autosklearn.metrics import calculate_score
from autosklearn.metrics import calculate_loss, Scorer
from autosklearn.util.logging_ import get_named_client_logger

from ConfigSpace import Configuration
@@ -236,17 +236,18 @@ def _get_model(self):
init_params=self._init_params)
return model

def _loss(self, y_true, y_hat, scoring_functions=None):
"""Auto-sklearn follows a minimization goal, so the make_scorer
sign is used as a guide to obtain the value to reduce.
On this regard, to optimize a metric:
1- the score is calculated with calculate_score, with the caveat that if
greater is not better for the metric, a negative score is returned.
2- the err (the optimization goal) is then:
optimum - (metric.sign * actual_score)
For accuracy for example: optimum(1) - (+1 * actual score)
For logloss for example: optimum(0) - (-1 * actual score)
def _loss(self, y_true: np.ndarray, y_hat: np.ndarray,
scoring_functions: typing.Optional[typing.List[Scorer]] = None
) -> typing.Union[float, typing.Dict[str, float]]:
"""Auto-sklearn follows a minimization goal.
The calculate_loss internally translate a score function to
a minimization problem.
For a dummy prediction, the worst result is assumed.
Parameters
----------
y_true : np.ndarray
    The ground truth of the targets
y_hat : np.ndarray
    The model's predictions for the targets
scoring_functions : Optional[List[Scorer]]
    Additional metrics for which losses are also computed
"""
scoring_functions = (
self.scoring_functions
@@ -255,23 +256,14 @@ def _loss(self, y_true, y_hat, scoring_functions=None):
)
if not isinstance(self.configuration, Configuration):
if scoring_functions:
return {self.metric.name: 1.0}
return {self.metric.name: self.metric._worst_possible_result}
else:
return 1.0
return self.metric._worst_possible_result

score = calculate_score(
return calculate_loss(
y_true, y_hat, self.task_type, self.metric,
scoring_functions=scoring_functions)

if hasattr(score, '__len__'):
err = {metric.name: metric._optimum - score[metric.name]
for metric in scoring_functions}
err[self.metric.name] = self.metric._optimum - score[self.metric.name]
else:
err = self.metric._optimum - score

return err

def finish_up(self, loss, train_loss, opt_pred, valid_pred, test_pred,
additional_run_info, file_output, final_call, status):
"""This function does everything necessary after the fitting is done:
17 changes: 3 additions & 14 deletions autosklearn/evaluation/test_evaluator.py
@@ -5,8 +5,7 @@
AbstractEvaluator,
_fit_and_suppress_warnings,
)
from autosklearn.metrics import calculate_score, CLASSIFICATION_METRICS, REGRESSION_METRICS
from autosklearn.constants import CLASSIFICATION_TASKS
from autosklearn.metrics import calculate_loss


__all__ = [
@@ -71,7 +70,7 @@ def predict_and_loss(self, train=False):
if train:
Y_pred = self.predict_function(self.X_train, self.model,
self.task_type, self.Y_train)
score = calculate_score(
err = calculate_loss(
solution=self.Y_train,
prediction=Y_pred,
task_type=self.task_type,
@@ -80,23 +79,13 @@ def predict_and_loss(self, train=False):
else:
Y_pred = self.predict_function(self.X_test, self.model,
self.task_type, self.Y_train)
score = calculate_score(
err = calculate_loss(
solution=self.Y_test,
prediction=Y_pred,
task_type=self.task_type,
metric=self.metric,
scoring_functions=self.scoring_functions)

if hasattr(score, '__len__'):
if self.task_type in CLASSIFICATION_TASKS:
err = {key: metric._optimum - score[key] for key, metric in
CLASSIFICATION_METRICS.items() if key in score}
else:
err = {key: metric._optimum - score[key] for key, metric in
REGRESSION_METRICS.items() if key in score}
else:
err = self.metric._optimum - score

return err, Y_pred, None, None


73 changes: 68 additions & 5 deletions autosklearn/metrics/__init__.py
@@ -1,6 +1,6 @@
from abc import ABCMeta, abstractmethod
from functools import partial
from typing import Any, Callable, Dict, List, Optional, Union
from typing import Any, Callable, Dict, List, Optional, Union, cast

import numpy as np

@@ -353,7 +353,7 @@ def calculate_score(
for metric_ in scoring_functions:

try:
score_dict[metric_.name] = metric_(solution, cprediction)
score_dict[metric_.name] = metric_._sign * metric_(solution, cprediction)
except ValueError as e:
print(e, e.args[0])
if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \
@@ -369,7 +369,7 @@
# handle?

try:
score_dict[metric_.name] = metric_(solution, prediction)
score_dict[metric_.name] = metric_._sign * metric_(solution, prediction)
except ValueError as e:
if e.args[0] == 'multiclass format is not supported':
continue
@@ -397,10 +397,73 @@ def get_metric_score(
solution: np.ndarray,
task_type: int
) -> float:
# We match the behaviour of GridSearchCV:
# in scikit-learn, the exact value of the score_func
# is returned (not that of the 'Scorer', which might be
# negative for functions like mse, because scikit-learn
# maximizes). A user who wants this behaviour in GridSearchCV
# is expected to pass neg_mean_squared_error.
# For this reason we multiply back by metric_._sign
if task_type in REGRESSION_TASKS:
# TODO put this into the regression metric itself
cprediction = sanitize_array(prediction)
score = metric_(solution, cprediction)
score = metric_._sign * metric_(solution, cprediction)
else:
score = metric_(solution, prediction)
score = metric_._sign * metric_(solution, prediction)
return score
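The sign handling is easiest to see with a smaller-is-better metric. The sketch below assumes autosklearn.metrics.mean_squared_error is available as a built-in Scorer and that the private _sign attribute keeps the meaning used in this diff; it shows why multiplying back by metric_._sign recovers the plain scikit-learn value, mirroring the neg_mean_squared_error convention mentioned above:

import numpy as np
from autosklearn.metrics import mean_squared_error  # built-in Scorer, greater_is_better=False

y_true = np.array([1.0, 2.5, 3.0])
y_pred = np.array([1.0, 2.0, 4.0])

# Scorer.__call__ already applies _sign, so the value comes back negative
# (like sklearn's neg_mean_squared_error under GridSearchCV).
signed = mean_squared_error(y_true, y_pred)

# Multiplying back by _sign yields the raw score_func value, which is what
# calculate_score / get_metric_score now return.
raw = mean_squared_error._sign * signed
print(signed, raw)  # roughly -0.4167 and 0.4167 for these inputs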


def calculate_loss(
solution: np.ndarray,
prediction: np.ndarray,
task_type: int,
metric: Scorer,
scoring_functions: Optional[List[Scorer]] = None
) -> Union[float, Dict[str, float]]:
"""
Returns a loss (a magnitude that casts the optimization
problem as a minimization one) for the given
Auto-sklearn Scorer object
Parameters
----------
solution: np.ndarray
    The ground truth of the targets
prediction: np.ndarray
    The model's best estimate of the given targets
task_type: int
    Indicates whether the problem task is classification
    or regression
metric: Scorer
    Object that hosts a function to calculate how good the
    prediction is with respect to the solution.
scoring_functions: List[Scorer]
    A list of additional metrics for which to calculate losses
Returns
-------
float or Dict[str, float]
    A loss for each of the provided scorer objects
"""
score = calculate_score(
solution=solution,
prediction=prediction,
task_type=task_type,
metric=metric,
scoring_functions=scoring_functions,
)

if scoring_functions:
score = cast(Dict, score)
# we expect a dict() object for which we should calculate the loss
loss_dict = dict()
for metric_ in scoring_functions + [metric]:
# TODO: When metrics are annotated with type_of_target support
# we can remove this check
if metric_.name not in score:
continue
# the metric argument might not be in scoring_functions,
# so it is appended to the list. Rather than checking whether
# that is the case, redefining loss_dict[metric] is less expensive
loss_dict[metric_.name] = metric_._optimum - metric_._sign * score[metric_.name]
return loss_dict
else:
return metric._optimum - metric._sign * cast(float, score)
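A hedged usage sketch of the two return shapes of calculate_loss (a float when only metric is given, a dict keyed by metric name when scoring_functions is passed). The imports below are assumed to exist in the installed auto-sklearn; exact values depend on the version:

import numpy as np
from autosklearn.constants import BINARY_CLASSIFICATION
from autosklearn.metrics import accuracy, log_loss, calculate_loss

y_true = np.array([0, 1, 1, 0])
# class-probability predictions, shape (n_samples, n_classes)
y_hat = np.array([[0.9, 0.1], [0.2, 0.8], [0.4, 0.6], [0.7, 0.3]])

# Only `metric` and scoring_functions=None -> a single float:
# accuracy here is 1.0, so the loss is 1.0 - 1.0 = 0.0.
print(calculate_loss(solution=y_true, prediction=y_hat,
                     task_type=BINARY_CLASSIFICATION, metric=accuracy))

# With scoring_functions -> a dict that also contains `metric` itself.
losses = calculate_loss(solution=y_true, prediction=y_hat,
                        task_type=BINARY_CLASSIFICATION, metric=accuracy,
                        scoring_functions=[log_loss])
print(losses)  # e.g. {'accuracy': 0.0, 'log_loss': ...}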
5 changes: 3 additions & 2 deletions test/test_automl/test_automl.py
@@ -308,7 +308,7 @@ def test_automl_outputs(backend, dask_client):
'start_time_100',
'datamanager.pkl',
'ensemble_read_preds.pkl',
'ensemble_read_scores.pkl',
'ensemble_read_losses.pkl',
'runs',
'ensembles',
'ensemble_history.json',
@@ -625,7 +625,8 @@ def test_load_best_individual_model(metric, backend, dask_client):
if metric.name == 'balanced_accuracy':
assert automl.score(X_test, Y_test) > 0.9
elif metric.name == 'log_loss':
assert automl.score(X_test, Y_test) <= 0.2
# Values of 0.6978304740364537 have been observed in GitHub Actions
assert automl.score(X_test, Y_test) <= 0.72
else:
raise ValueError(metric.name)

(diffs for the remaining changed files are not shown)
