From c2b0e7344fe6647ece907f478f42794299dcff00 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 12 Apr 2021 19:04:08 +0200 Subject: [PATCH] Fix calculate loss (#1123) * FIX an issue with calculate loss * simplify code * fix unit test --- autosklearn/automl.py | 11 +- autosklearn/ensemble_builder.py | 7 +- autosklearn/metrics/__init__.py | 170 ++++++++++++++++++++++--------- test/test_automl/test_automl.py | 2 +- test/test_metric/test_metrics.py | 36 +++++-- 5 files changed, 160 insertions(+), 66 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 414a0ee8d8..658d35c1b2 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -40,7 +40,7 @@ from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import _fit_with_budget -from autosklearn.metrics import calculate_score +from autosklearn.metrics import calculate_metric from autosklearn.util.backend import Backend from autosklearn.util.stopwatch import StopWatch from autosklearn.util.logging_ import ( @@ -1153,11 +1153,10 @@ def score(self, X, y): # same representation domain prediction = self.InputValidator.target_validator.transform(prediction) - return calculate_score(solution=y, - prediction=prediction, - task_type=self._task, - metric=self._metric, - scoring_functions=None) + return calculate_metric(solution=y, + prediction=prediction, + task_type=self._task, + metric=self._metric, ) @property def cv_results_(self): diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 84e19ee397..ca92830c98 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -420,7 +420,7 @@ def __init__( performance_range_threshold: float Keep only models that are better than: dummy + (best - dummy)*performance_range_threshold - E.g dummy=2, best=4, thresh=0.5 --> only consider models with score > 3 + E.g dummy=2, best=4, thresh=0.5 --> only consider models with loss > 3 Will at most return the minimum between ensemble_nbest models, and max_models_on_disc. Might return less seed: int @@ -978,7 +978,7 @@ def get_n_best_preds(self): # no model left; try to use dummy loss (num_run==0) # log warning when there are other models but not better than dummy model if num_keys > num_dummy: - self.logger.warning("No models better than random - using Dummy Score!" + self.logger.warning("No models better than random - using Dummy loss!" "Number of models besides current dummy model: %d. 
" "Number of dummy models: %d", num_keys - 1, @@ -1105,8 +1105,7 @@ def get_n_best_preds(self): # only if the model ends up in the ensemble self.read_losses[k]['loaded'] = 1 - # return best scored keys of self.read_losses - # That is, the one with the lowest loss + # return keys of self.read_losses with lowest losses return sorted_keys[:ensemble_n_best] def get_valid_test_preds(self, selected_keys: List[str]) -> Tuple[List[str], List[str]]: diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 89be6ab8c5..95167c01cc 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -342,18 +342,40 @@ def calculate_score( metric: Scorer, scoring_functions: Optional[List[Scorer]] = None ) -> Union[float, Dict[str, float]]: + """ + Returns a score (a magnitude that allows casting the + optimization problem as a maximization one) for the + given Auto-Sklearn Scorer object + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns + ------- + float or Dict[str, float] + """ if task_type not in TASK_TYPES: raise NotImplementedError(task_type) if scoring_functions: score_dict = dict() if task_type in REGRESSION_TASKS: - # TODO put this into the regression metric itself - cprediction = sanitize_array(prediction) - for metric_ in scoring_functions: + for metric_ in scoring_functions + [metric]: try: - score_dict[metric_.name] = metric_._sign * metric_(solution, cprediction) + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type) except ValueError as e: print(e, e.args[0]) if e.args[0] == "Mean Squared Logarithmic Error cannot be used when " \ @@ -363,13 +385,14 @@ def calculate_score( raise e else: - for metric_ in scoring_functions: + for metric_ in scoring_functions + [metric]: # TODO maybe annotate metrics to define which cases they can # handle? try: - score_dict[metric_.name] = metric_._sign * metric_(solution, prediction) + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type) except ValueError as e: if e.args[0] == 'multiclass format is not supported': continue @@ -383,34 +406,10 @@ def calculate_score( else: raise e - if metric.name not in score_dict.keys(): - score_dict[metric.name] = get_metric_score(metric, prediction, solution, task_type) return score_dict else: - return get_metric_score(metric, prediction, solution, task_type) - - -def get_metric_score( - metric_: Scorer, - prediction: np.ndarray, - solution: np.ndarray, - task_type: int -) -> float: - # We match the behaviour of GridSearchCV - # In scikit learn, the exact value of the score_func - # is returned (not that of the 'Scorer' which might be - # negative in functions like mse, as scikit learn - # maximizes.) 
If an user wants to use GridSearchCV - # They are expected to pass neg_mean_squared_error - # For this reason we multiply back by metric_._sign - if task_type in REGRESSION_TASKS: - # TODO put this into the regression metric itself - cprediction = sanitize_array(prediction) - score = metric_._sign * metric_(solution, cprediction) - else: - score = metric_._sign * metric_(solution, prediction) - return score + return _compute_scorer(metric, prediction, solution, task_type) def calculate_loss( @@ -422,26 +421,28 @@ def calculate_loss( ) -> Union[float, Dict[str, float]]: """ Returns a loss (a magnitude that allows casting the - optimization problem, as a minimization one) for the + optimization problem as a minimization one) for the given Auto-Sklearn Scorer object + Parameters ---------- - solution: np.ndarray - The ground truth of the targets - prediction: np.ndarray - The best estimate from the model, of the given targets - task_type: int - To understand if the problem task is classification - or regression - metric: Scorer - Object that host a function to calculate how good the - prediction is according to the solution. - scoring_functions: List[Scorer] - A list of metrics to calculate multiple losses + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + Returns ------- - float or Dict[str, float] - A loss function for each of the provided scorer objects + float or Dict[str, float] + A loss function for each of the provided scorer objects """ score = calculate_score( solution=solution, @@ -463,7 +464,80 @@ def calculate_loss( # maybe metric argument is not in scoring_functions # so append it to the list. Rather than check if such # is the case, redefining loss_dict[metric] is less expensive - loss_dict[metric_.name] = metric_._optimum - metric_._sign * score[metric_.name] + loss_dict[metric_.name] = metric_._optimum - score[metric_.name] return loss_dict else: - return metric._optimum - metric._sign * cast(float, score) + rval = metric._optimum - cast(float, score) + return rval + + +def calculate_metric( + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int +) -> float: + """ + Returns a metric for the given Auto-Sklearn Scorer object. + It's direction is determined by the metric itself. + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. 
+ + Returns + ------- + float + """ + score = _compute_scorer( + solution=solution, + prediction=prediction, + metric=metric, + task_type=task_type, + ) + return metric._sign * score + + +def _compute_scorer( + metric: Scorer, + prediction: np.ndarray, + solution: np.ndarray, + task_type: int +) -> float: + """ + Returns a score (a magnitude that allows casting the + optimization problem as a maximization one) for the + given Auto-Sklearn Scorer object + + Parameters + ---------- + solution: np.ndarray + The ground truth of the targets + prediction: np.ndarray + The best estimate from the model, of the given targets + task_type: int + To understand if the problem task is classification + or regression + metric: Scorer + Object that host a function to calculate how good the + prediction is according to the solution. + Returns + ------- + float + """ + if task_type in REGRESSION_TASKS: + # TODO put this into the regression metric itself + cprediction = sanitize_array(prediction) + score = metric(solution, cprediction) + else: + score = metric(solution, prediction) + return score diff --git a/test/test_automl/test_automl.py b/test/test_automl/test_automl.py index 71da849358..759479f9b8 100644 --- a/test/test_automl/test_automl.py +++ b/test/test_automl/test_automl.py @@ -637,7 +637,7 @@ def test_load_best_individual_model(metric, backend, dask_client): assert automl.score(X_test, Y_test) > 0.9 elif metric.name == 'log_loss': # Seen values in github actions of 0.6978304740364537 - assert automl.score(X_test, Y_test) <= 0.72 + assert automl.score(X_test, Y_test) < 0.7 else: raise ValueError(metric.name) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 0f4a57e07c..ea00da9275 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -7,7 +7,7 @@ import autosklearn.metrics -from autosklearn.metrics import calculate_score, calculate_loss +from autosklearn.metrics import calculate_score, calculate_loss, calculate_metric from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION from smac.utils.constants import MAXINT @@ -536,7 +536,7 @@ def test_regression_only_metric(self): def test_calculate_loss(): # In a 0-1 ranged scorer, make sure that the loss - # has a expected positive value + # has an expected positive value y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) @@ -546,8 +546,7 @@ def test_calculate_loss(): task_type=BINARY_CLASSIFICATION, metric=autosklearn.metrics.accuracy, ) - loss = 1.0 - score - assert pytest.approx(loss) == calculate_loss( + assert pytest.approx(1.0 - score) == calculate_loss( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, @@ -582,14 +581,37 @@ def test_calculate_loss(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(score) == calculate_score( + assert pytest.approx(0 - score) == calculate_score( + solution=y_true, + prediction=y_pred, + task_type=REGRESSION, + metric=autosklearn.metrics.mean_squared_error, + ) + assert pytest.approx(score) == calculate_loss( solution=y_true, prediction=y_pred, task_type=REGRESSION, metric=autosklearn.metrics.mean_squared_error, ) - loss = score - assert pytest.approx(loss) == calculate_loss( + + +def test_calculate_metric(): + # metric to be maximized + y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 
0]) + y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) + score = sklearn.metrics.accuracy_score(y_true, y_pred) + assert pytest.approx(score) == calculate_metric( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metric=autosklearn.metrics.accuracy, + ) + + # metric to be minimized + y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) + y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) + score = sklearn.metrics.mean_squared_error(y_true, y_pred) + assert pytest.approx(score) == calculate_metric( solution=y_true, prediction=y_pred, task_type=REGRESSION,
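
Note (illustrative, not part of the patch): for a minimized metric such as mean squared error, the new tests pin down how the three helpers relate after this change. calculate_score returns the sign-adjusted value used for maximization (-MSE), calculate_loss returns the metric's optimum minus that score (MSE), and calculate_metric, which AutoML.score now calls, returns the metric on its original scale (MSE). A minimal sketch, assuming auto-sklearn at this revision is installed:

import numpy as np
import sklearn.metrics

import autosklearn.metrics
from autosklearn.constants import REGRESSION
from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score

y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6])
y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66])
mse = sklearn.metrics.mean_squared_error(y_true, y_pred)

# Shared keyword arguments; mean_squared_error is a Scorer whose sign is -1
# and whose optimum is 0, since lower values are better.
common = dict(
    solution=y_true,
    prediction=y_pred,
    task_type=REGRESSION,
    metric=autosklearn.metrics.mean_squared_error,
)

assert np.isclose(calculate_score(**common), -mse)   # maximization view
assert np.isclose(calculate_loss(**common), mse)     # minimization view: optimum - score
assert np.isclose(calculate_metric(**common), mse)   # metric on its original scale

These three assertions restate the behaviour checked by test_calculate_loss and test_calculate_metric above.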