From 288569253d09bfa28a7f19dd59b280b12e91b7e9 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 3 May 2022 14:51:32 +0200 Subject: [PATCH 01/24] First draft of multi-objective optimization Co-authored-by: Katharina Eggensperger --- autosklearn/automl.py | 31 ++++-- autosklearn/estimators.py | 2 +- autosklearn/evaluation/__init__.py | 42 ++++++- autosklearn/evaluation/abstract_evaluator.py | 31 ++++-- autosklearn/evaluation/train_evaluator.py | 42 ++++--- autosklearn/metrics/__init__.py | 110 ++++++++++--------- autosklearn/smbo.py | 20 +++- 7 files changed, 183 insertions(+), 95 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 50e4591346..ecb468426b 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Callable, Iterable, Mapping, Optional, Tuple +from typing import Any, Callable, Iterable, List, Mapping, Optional, Tuple, Union import copy import io @@ -210,7 +210,7 @@ def __init__( get_smac_object_callback: Optional[Callable] = None, smac_scenario_args: Optional[Mapping] = None, logging_config: Optional[Mapping] = None, - metric: Optional[Scorer] = None, + metric: Optional[Union[Scorer, List[Scorer], Tuple[Scorer]]] = None, scoring_functions: Optional[list[Scorer]] = None, get_trials_callback: Optional[IncorporateRunResultCallback] = None, dataset_compression: bool | Mapping[str, Any] = True, @@ -265,7 +265,7 @@ def __init__( initial_configurations_via_metalearning ) - self._scoring_functions = scoring_functions or {} + self._scoring_functions = scoring_functions or [] self._resampling_strategy_arguments = resampling_strategy_arguments or {} # Single core, local runs should use fork to prevent the __main__ requirements @@ -692,10 +692,14 @@ def fit( # defined in the estimator fit call if self._metric is None: raise ValueError("No metric given.") - if not isinstance(self._metric, Scorer): - raise ValueError( - "Metric must be instance of " "autosklearn.metrics.Scorer." - ) + if isinstance(self._metric, (List, Tuple)): + for entry in self._metric: + if not isinstance(entry, Scorer): + raise ValueError( + "Metric must be instance of autosklearn.metrics.Scorer." + ) + elif not isinstance(self._metric, Scorer): + raise ValueError("Metric must be instance of autosklearn.metrics.Scorer.") # If no dask client was provided, we create one, so that we can # start a ensemble process in parallel to smbo optimize @@ -790,7 +794,11 @@ def fit( backend=copy.deepcopy(self._backend), dataset_name=dataset_name, task=self._task, - metric=self._metric, + metric=( + self._metric[0] + if isinstance(self._metric, (List, Tuple)) + else self._metric + ), ensemble_size=self._ensemble_size, ensemble_nbest=self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -1492,7 +1500,11 @@ def fit_ensemble( backend=copy.deepcopy(self._backend), dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, - metric=self._metric, + metric=( + self._metric[0] + if isinstance(self._metric, (List, Tuple)) + else self._metric + ), ensemble_size=ensemble_size if ensemble_size else self._ensemble_size, ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -1912,7 +1924,6 @@ def show_models(self) -> dict[int, Any]: .. 
code-block:: python import sklearn.datasets - import sklearn.metrics import autosklearn.regression X, y = sklearn.datasets.load_diabetes(return_X_y=True) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index db931a338a..fa1e83693a 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -46,7 +46,7 @@ def __init__( smac_scenario_args=None, logging_config=None, metadata_directory=None, - metric=None, + metric: Optional[Union[Scorer, List[Scorer], Tuple[Scorer]]] = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, get_trials_callback=None, diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 89c61d144d..d26aa7fd4e 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -85,10 +85,13 @@ def fit_predict_try_except_decorator( queue.close() -def get_cost_of_crash(metric: Scorer) -> float: +def get_cost_of_crash( + metric: Union[Scorer, List[Scorer], Tuple[Scorer]] +) -> Union[float, List[float]]: - # The metric must always be defined to extract optimum/worst - if not isinstance(metric, Scorer): + if isinstance(metric, (List, Tuple)): + return [cast(float, get_cost_of_crash(metric_)) for metric_ in metric] + elif not isinstance(metric, Scorer): raise ValueError("The metric must be stricly be an instance of Scorer") # Autosklearn optimizes the err. This function translates @@ -126,7 +129,7 @@ def __init__( resampling_strategy: Union[ str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], cost_for_crash: float, abort_on_first_run_crash: bool, port: int, @@ -144,7 +147,7 @@ def __init__( disable_file_output: bool = False, init_params: Optional[Dict[str, Any]] = None, budget_type: Optional[str] = None, - ta: Optional[Callable] = None, + ta: Optional[Callable] = None, # Required by SMAC's parent class **resampling_strategy_args: Any, ): if resampling_strategy == "holdout": @@ -186,6 +189,7 @@ def __init__( par_factor=par_factor, cost_for_crash=self.worst_possible_result, abort_on_first_run_crash=abort_on_first_run_crash, + multi_objectives=multi_objectives, ) self.backend = backend @@ -550,4 +554,32 @@ def run( autosklearn.evaluation.util.empty_queue(queue) self.logger.info("Finished evaluating configuration %d" % config_id) + + # Do some sanity checking (for multi objective) + if len(self.multi_objectives) > 1: + error = ( + f"Returned costs {cost} does not match the number of objectives" + f" {len(self.multi_objectives)}." + ) + + # If dict convert to array + # Make sure the ordering is correct + if isinstance(cost, dict): + ordered_cost = [] + for name in self.multi_objectives: + if name not in cost: + raise RuntimeError( + f"Objective {name} was not found in the returned costs." 
+ ) + + ordered_cost.append(cost[name]) + cost = ordered_cost + + if isinstance(cost, list): + if len(cost) != len(self.multi_objectives): + raise RuntimeError(error) + + if isinstance(cost, float): + raise RuntimeError(error) + return status, cost, runtime, additional_run_info diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 7843de6a8a..9ece32d9d4 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -184,7 +184,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -326,7 +326,6 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, - scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """Auto-sklearn follows a minimization goal. The calculate_loss internally translate a score function to @@ -338,21 +337,30 @@ def _loss( ---------- y_true """ - scoring_functions = ( - self.scoring_functions if scoring_functions is None else scoring_functions - ) if not isinstance(self.configuration, Configuration): - if scoring_functions: - return {self.metric.name: self.metric._worst_possible_result} + if self.scoring_functions: + if isinstance(self.metric, Scorer): + return {self.metric.name: self.metric._worst_possible_result} + else: + return { + metric.name: metric._worst_possible_result + for metric in self.metric + } else: - return self.metric._worst_possible_result + if isinstance(self.metric, Scorer): + return self.metric._worst_possible_result + else: + return { + metric.name: metric._worst_possible_result + for metric in self.metric + } return calculate_loss( y_true, y_hat, self.task_type, self.metric, - scoring_functions=scoring_functions, + scoring_functions=self.scoring_functions, ) def finish_up( @@ -402,7 +410,10 @@ def finish_up( if isinstance(loss, dict): loss_ = loss - loss = loss_[self.metric.name] + if isinstance(self.metric, Scorer): + loss = loss_[self.metric.name] + else: + loss = {metric: loss_[metric] for metric in loss_} else: loss_ = {} diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 7a047d3e10..a859e36ab8 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -182,7 +182,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -645,19 +645,31 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # train_losses is a list of either scalars or dicts. If it contains dicts, # then train_loss is computed using the target metric (self.metric). 
if all(isinstance(elem, dict) for elem in train_losses): - train_loss = np.average( - [ - train_losses[i][str(self.metric)] - for i in range(self.num_cv_folds) - ], - weights=train_fold_weights, - ) + if isinstance(self.metric, Scorer): + train_loss = np.average( + [ + train_losses[i][str(self.metric)] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights, + ) + else: + train_loss = [ + np.average( + [ + train_losses[i][str(metric)] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights, + ) + for metric in self.metric + ] else: train_loss = np.average(train_losses, weights=train_fold_weights) # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. - if self.scoring_functions: + if self.scoring_functions or not isinstance(self.metric, Scorer): opt_loss = {} for metric in opt_losses[0].keys(): opt_loss[metric] = np.average( @@ -1316,7 +1328,7 @@ def eval_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, @@ -1363,7 +1375,7 @@ def eval_iterative_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, @@ -1410,7 +1422,7 @@ def eval_partial_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, @@ -1463,7 +1475,7 @@ def eval_partial_cv_iterative( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, @@ -1511,7 +1523,7 @@ def eval_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, @@ -1559,7 +1571,7 @@ def eval_iterative_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], seed: int, num_run: int, instance: str, diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 3234329658..88002a87e6 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,5 +1,5 @@ from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Union, cast +from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast from functools import partial from itertools import product @@ -388,7 +388,7 @@ def calculate_score( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ @@ -417,64 +417,70 @@ def calculate_score( if task_type not in TASK_TYPES: raise NotImplementedError(task_type) + to_score = [] if scoring_functions: - score_dict = dict() - if task_type in 
REGRESSION_TASKS: - for metric_ in scoring_functions + [metric]: - - try: - score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type - ) - except ValueError as e: - print(e, e.args[0]) - if ( - e.args[0] - == "Mean Squared Logarithmic Error cannot be used when " - "targets contain negative values." - ): - continue - else: - raise e - - else: - for metric_ in scoring_functions + [metric]: - - # TODO maybe annotate metrics to define which cases they can - # handle? + to_score.extend(scoring_functions) + if isinstance(metric, (list, tuple)): + to_score.extend(metric) + else: + to_score.append(metric) - try: - score_dict[metric_.name] = _compute_scorer( - metric_, prediction, solution, task_type - ) - except ValueError as e: - if e.args[0] == "multiclass format is not supported": - continue - elif ( - e.args[0] == "Samplewise metrics are not available " - "outside of multilabel classification." - ): - continue - elif ( - e.args[0] == "Target is multiclass but " - "average='binary'. Please choose another average " - "setting, one of [None, 'micro', 'macro', 'weighted']." - ): - continue - else: - raise e + score_dict = dict() + if task_type in REGRESSION_TASKS: + for metric_ in to_score: + + try: + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type + ) + except ValueError as e: + print(e, e.args[0]) + if ( + e.args[0] == "Mean Squared Logarithmic Error cannot be used when " + "targets contain negative values." + ): + continue + else: + raise e - return score_dict + else: + for metric_ in to_score: + + # TODO maybe annotate metrics to define which cases they can + # handle? + + try: + score_dict[metric_.name] = _compute_scorer( + metric_, prediction, solution, task_type + ) + except ValueError as e: + if e.args[0] == "multiclass format is not supported": + continue + elif ( + e.args[0] == "Samplewise metrics are not available " + "outside of multilabel classification." + ): + continue + elif ( + e.args[0] == "Target is multiclass but " + "average='binary'. Please choose another average " + "setting, one of [None, 'micro', 'macro', 'weighted']." 
+ ): + continue + else: + raise e + if scoring_functions is None and isinstance(metric, Scorer): + return score_dict[metric.name] else: - return _compute_scorer(metric, prediction, solution, task_type) + return score_dict def calculate_loss( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ @@ -510,11 +516,13 @@ def calculate_loss( scoring_functions=scoring_functions, ) - if scoring_functions: + if scoring_functions or isinstance(metric, (list, tuple)): score = cast(Dict, score) + scoring_functions = cast(List, scoring_functions) + metric_list = list(cast(List, metric)) # Please mypy # we expect a dict() object for which we should calculate the loss loss_dict = dict() - for metric_ in scoring_functions + [metric]: + for metric_ in scoring_functions + metric_list: # TODO: When metrics are annotated with type_of_target support # we can remove this check if metric_.name not in score: diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 608c58921d..ba286049dc 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,5 +1,5 @@ import typing -from typing import Dict, List, Optional +from typing import Dict, List, Optional, Tuple, Union import copy import json @@ -16,6 +16,7 @@ from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.intensification import Intensifier from smac.intensification.simple_intensifier import SimpleIntensifier +from smac.optimizer.multi_objective.parego import ParEGO from smac.runhistory.runhistory2epm import RunHistory2EPM4LogCost from smac.scenario.scenario import Scenario from smac.tae.dask_runner import DaskParallelRunner @@ -40,6 +41,7 @@ ) from autosklearn.metalearning.metalearning.meta_base import MetaBase from autosklearn.metalearning.mismbo import suggest_via_metalearning +from autosklearn.metrics import Scorer from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules from autosklearn.util.stopwatch import StopWatch @@ -218,6 +220,8 @@ def get_smac_object( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): if len(scenario_dict["instances"]) > 1: intensifier = Intensifier @@ -242,6 +246,8 @@ def get_smac_object( intensifier=intensifier, dask_client=dask_client, n_jobs=n_jobs, + multi_objective_algorithm=multi_objective_algorithm, + multi_objective_kwargs=multi_objective_kwargs, ) @@ -254,7 +260,7 @@ def __init__( total_walltime_limit, func_eval_time_limit, memory_limit, - metric, + metric: Union[Scorer, List[Scorer], Tuple[Scorer]], stopwatch: StopWatch, n_jobs, dask_client: dask.distributed.Client, @@ -355,7 +361,11 @@ def collect_metalearning_suggestions(self, meta_base): metalearning_configurations = _get_metalearning_configurations( meta_base=meta_base, basename=self.dataset_name, - metric=self.metric, + metric=( + self.metric[0] + if isinstance(self.metric, (List, Tuple)) + else self.metric + ), configuration_space=self.config_space, task=self.task, is_sparse=self.datamanager.info["is_sparse"], @@ -535,6 +545,10 @@ def run_smbo(self): "n_jobs": self.n_jobs, "dask_client": self.dask_client, } + if not isinstance(self.metric, Scorer): + smac_args["multi_objective_algorithm"] = ParEGO + smac_args["multi_objective_kwargs"] = {"rho": 0.05} + scenario_dict["multi_objectives"] = [metric.name for metric in self.metric] if 
self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(**smac_args) else: From 4e361b718106f87bf79103dce46a7281be4a03d5 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 4 May 2022 09:58:10 +0200 Subject: [PATCH 02/24] Feedback from Eddie --- autosklearn/automl.py | 12 +++++------ autosklearn/estimators.py | 6 ++++-- autosklearn/evaluation/__init__.py | 21 ++++++++++++++++---- autosklearn/evaluation/abstract_evaluator.py | 7 +++++-- autosklearn/evaluation/test_evaluator.py | 8 +++++--- autosklearn/evaluation/train_evaluator.py | 18 +++++++++-------- autosklearn/metrics/__init__.py | 12 ++++++----- autosklearn/smbo.py | 10 +++++----- 8 files changed, 58 insertions(+), 36 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index ecb468426b..1c419e2433 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1,6 +1,6 @@ from __future__ import annotations -from typing import Any, Callable, Iterable, List, Mapping, Optional, Tuple, Union +from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Tuple import copy import io @@ -210,7 +210,7 @@ def __init__( get_smac_object_callback: Optional[Callable] = None, smac_scenario_args: Optional[Mapping] = None, logging_config: Optional[Mapping] = None, - metric: Optional[Union[Scorer, List[Scorer], Tuple[Scorer]]] = None, + metric: Optional[Scorer | Sequence[Scorer]] = None, scoring_functions: Optional[list[Scorer]] = None, get_trials_callback: Optional[IncorporateRunResultCallback] = None, dataset_compression: bool | Mapping[str, Any] = True, @@ -692,7 +692,7 @@ def fit( # defined in the estimator fit call if self._metric is None: raise ValueError("No metric given.") - if isinstance(self._metric, (List, Tuple)): + if isinstance(self._metric, Sequence): for entry in self._metric: if not isinstance(entry, Scorer): raise ValueError( @@ -796,7 +796,7 @@ def fit( task=self._task, metric=( self._metric[0] - if isinstance(self._metric, (List, Tuple)) + if isinstance(self._metric, Sequence) else self._metric ), ensemble_size=self._ensemble_size, @@ -1501,9 +1501,7 @@ def fit_ensemble( dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, metric=( - self._metric[0] - if isinstance(self._metric, (List, Tuple)) - else self._metric + self._metric[0] if isinstance(self._metric, Sequence) else self._metric ), ensemble_size=ensemble_size if ensemble_size else self._ensemble_size, ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest, diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index fa1e83693a..b7beec7693 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1,5 +1,7 @@ # -*- encoding: utf-8 -*- -from typing import Any, Dict, Iterable, List, Mapping, Optional, Tuple, Union +from __future__ import annotations + +from typing import Any, Dict, Iterable, List, Mapping, Optional, Sequence, Tuple, Union import dask.distributed import joblib @@ -46,7 +48,7 @@ def __init__( smac_scenario_args=None, logging_config=None, metadata_directory=None, - metric: Optional[Union[Scorer, List[Scorer], Tuple[Scorer]]] = None, + metric: Optional[Scorer | Sequence[Scorer]] = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, get_trials_callback=None, diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index d26aa7fd4e..d295d321a6 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -1,5 +1,18 @@ # -*- 
encoding: utf-8 -*- -from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast +from __future__ import annotations + +from typing import ( + Any, + Callable, + Dict, + List, + Optional, + Sequence, + Tuple, + Type, + Union, + cast, +) import functools import json @@ -86,10 +99,10 @@ def fit_predict_try_except_decorator( def get_cost_of_crash( - metric: Union[Scorer, List[Scorer], Tuple[Scorer]] + metric: Union[Scorer | Sequence[Scorer]], ) -> Union[float, List[float]]: - if isinstance(metric, (List, Tuple)): + if isinstance(metric, Sequence): return [cast(float, get_cost_of_crash(metric_)) for metric_ in metric] elif not isinstance(metric, Scorer): raise ValueError("The metric must be stricly be an instance of Scorer") @@ -129,7 +142,7 @@ def __init__( resampling_strategy: Union[ str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], cost_for_crash: float, abort_on_first_run_crash: bool, port: int, diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 9ece32d9d4..2c967f0b3e 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, List, Optional, TextIO, Tuple, Type, Union, cast +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, TextIO, Tuple, Type, Union, cast import logging import multiprocessing @@ -184,7 +186,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -338,6 +340,7 @@ def _loss( y_true """ if not isinstance(self.configuration, Configuration): + # Dummy prediction if self.scoring_functions: if isinstance(self.metric, Scorer): return {self.metric.name: self.metric._worst_possible_result} diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 4b6cf8452c..c7ea592c88 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -1,5 +1,7 @@ # -*- encoding: utf-8 -*- -from typing import Any, Dict, List, Optional, Tuple, Union +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import multiprocessing @@ -23,7 +25,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Scorer, + metric: Union[Scorer | Sequence[Scorer]], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -111,7 +113,7 @@ def eval_t( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - metric: Scorer, + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: Dict[str, Any], diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a859e36ab8..e45b2e01a1 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -1,4 +1,6 @@ -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from __future__ import annotations + +from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast import copy import json @@ -182,7 +184,7 @@ 
def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -1328,7 +1330,7 @@ def eval_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, @@ -1375,7 +1377,7 @@ def eval_iterative_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, @@ -1422,7 +1424,7 @@ def eval_partial_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, @@ -1475,7 +1477,7 @@ def eval_partial_cv_iterative( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, @@ -1523,7 +1525,7 @@ def eval_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, @@ -1571,7 +1573,7 @@ def eval_iterative_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Union[Scorer | Sequence[Scorer]], seed: int, num_run: int, instance: str, diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 88002a87e6..430336009e 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,5 +1,7 @@ +from __future__ import annotations + from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Callable, Dict, List, Optional, Sequence, Union, cast from functools import partial from itertools import product @@ -388,7 +390,7 @@ def calculate_score( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Scorer | Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ @@ -420,7 +422,7 @@ def calculate_score( to_score = [] if scoring_functions: to_score.extend(scoring_functions) - if isinstance(metric, (list, tuple)): + if isinstance(metric, Sequence): to_score.extend(metric) else: to_score.append(metric) @@ -480,7 +482,7 @@ def calculate_loss( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Scorer | Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, ) -> Union[float, Dict[str, float]]: """ @@ -516,7 +518,7 @@ def calculate_loss( 
scoring_functions=scoring_functions, ) - if scoring_functions or isinstance(metric, (list, tuple)): + if scoring_functions or isinstance(metric, Sequence): score = cast(Dict, score) scoring_functions = cast(List, scoring_functions) metric_list = list(cast(List, metric)) # Please mypy diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index ba286049dc..e1ed33e024 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,5 +1,7 @@ +from __future__ import annotations + import typing -from typing import Dict, List, Optional, Tuple, Union +from typing import Dict, List, Optional, Sequence import copy import json @@ -260,7 +262,7 @@ def __init__( total_walltime_limit, func_eval_time_limit, memory_limit, - metric: Union[Scorer, List[Scorer], Tuple[Scorer]], + metric: Scorer | Sequence[Scorer], stopwatch: StopWatch, n_jobs, dask_client: dask.distributed.Client, @@ -362,9 +364,7 @@ def collect_metalearning_suggestions(self, meta_base): meta_base=meta_base, basename=self.dataset_name, metric=( - self.metric[0] - if isinstance(self.metric, (List, Tuple)) - else self.metric + self.metric[0] if isinstance(self.metric, Sequence) else self.metric ), configuration_space=self.config_space, task=self.task, From 77a223ec26ed3ec5770c15e42a274d4392d1ff4b Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 4 May 2022 10:37:29 +0200 Subject: [PATCH 03/24] Make metric internally always a list --- autosklearn/automl.py | 4 +- autosklearn/ensemble_builder.py | 12 +-- autosklearn/ensembles/ensemble_selection.py | 19 ++--- autosklearn/evaluation/__init__.py | 11 ++- autosklearn/evaluation/abstract_evaluator.py | 59 ++++++------- autosklearn/evaluation/test_evaluator.py | 4 +- autosklearn/evaluation/train_evaluator.py | 87 ++++++++++---------- autosklearn/metrics/__init__.py | 69 +++++++--------- autosklearn/smbo.py | 18 ++-- 9 files changed, 129 insertions(+), 154 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 1c419e2433..fcec2c57a1 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -447,7 +447,9 @@ def _do_dummy_prediction(self) -> None: resampling_strategy=self._resampling_strategy, initial_num_run=dummy_run_num, stats=stats, - metric=self._metric, + metrics=( + [self._metric] if isinstance(self._metric, Scorer) else self._metric + ), memory_limit=memory_limit, disable_file_output=self._disable_evaluator_output, abort_on_first_run_crash=False, diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 3707ce84c9..8002c11b1d 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -1003,9 +1003,9 @@ def compute_loss_per_model(self): solution=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] if np.isfinite(self.read_losses[y_ens_fn]["ens_loss"]): self.logger.debug( @@ -1515,9 +1515,9 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): solution=self.y_true_ensemble, prediction=train_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ), + )[self.metric.name], } if valid_pred is not None: # TODO: valid_pred are a legacy from competition manager @@ -1526,9 +1526,9 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): solution=self.y_valid, prediction=valid_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] # In case 
test_pred was provided if test_pred is not None: diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 3ae216da01..7649fb8a07 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -164,18 +164,13 @@ def _fast( out=fant_ensemble_prediction, ) - # calculate_loss is versatile and can return a dict of losses - # when scoring_functions=None, we know it will be a float - losses[j] = cast( - float, - calculate_loss( - solution=labels, - prediction=fant_ensemble_prediction, - task_type=self.task_type, - metric=self.metric, - scoring_functions=None, - ), - ) + losses[j] = calculate_loss( + solution=labels, + prediction=fant_ensemble_prediction, + task_type=self.task_type, + metrics=[self.metric], + scoring_functions=None, + )[self.metric.name] all_best = np.argwhere(losses == np.nanmin(losses)).flatten() diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index d295d321a6..48662f2bf6 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -105,7 +105,10 @@ def get_cost_of_crash( if isinstance(metric, Sequence): return [cast(float, get_cost_of_crash(metric_)) for metric_ in metric] elif not isinstance(metric, Scorer): - raise ValueError("The metric must be stricly be an instance of Scorer") + raise ValueError( + "The metric must be stricly be an instance of Scorer or a sequence of " + "Scorers" + ) # Autosklearn optimizes the err. This function translates # worst_possible_result to be a minimization problem. @@ -142,7 +145,7 @@ def __init__( resampling_strategy: Union[ str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], cost_for_crash: float, abort_on_first_run_crash: bool, port: int, @@ -209,7 +212,7 @@ def __init__( self.autosklearn_seed = autosklearn_seed self.resampling_strategy = resampling_strategy self.initial_num_run = initial_num_run - self.metric = metric + self.metrics = metrics self.resampling_strategy = resampling_strategy self.resampling_strategy_args = resampling_strategy_args self.scoring_functions = scoring_functions @@ -373,7 +376,7 @@ def run( config=config, backend=self.backend, port=self.port, - metric=self.metric, + metrics=self.metrics, seed=self.autosklearn_seed, num_run=num_run, scoring_functions=self.scoring_functions, diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 2c967f0b3e..03934387af 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -186,7 +186,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -221,7 +221,7 @@ def __init__( self.X_test = self.datamanager.data.get("X_test") self.y_test = self.datamanager.data.get("Y_test") - self.metric = metric + self.metrics = metrics self.task_type = self.datamanager.info["task"] self.seed = seed @@ -328,7 +328,7 @@ def _loss( self, y_true: np.ndarray, y_hat: np.ndarray, - ) -> Union[float, Dict[str, float]]: + ) -> Dict[str, float]: """Auto-sklearn follows a minimization goal. The calculate_loss internally translate a score function to a minimization problem. 
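Aside: a minimal sketch (not part of the patch) of the score-to-loss translation that the hunk below relies on. ``autosklearn.metrics.calculate_loss`` computes ``optimum - score`` for each Scorer, so SMAC can minimize every objective uniformly. The sketch assumes the Scorer attributes used throughout this patch (``name``, ``_optimum``, the callable interface) keep their current semantics:

    import numpy as np

    from autosklearn.metrics import accuracy, balanced_accuracy

    def losses(metrics, y_true, y_pred):
        # one loss per metric; lower is better for maximized and minimized
        # metrics alike, because each Scorer already applies its sign
        return {m.name: m._optimum - m(y_true, y_pred) for m in metrics}

    y_true = np.array([0, 1, 1, 0])
    y_pred = np.array([0, 1, 0, 0])
    print(losses([accuracy, balanced_accuracy], y_true, y_pred))
    # e.g. {'accuracy': 0.25, 'balanced_accuracy': 0.25}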
@@ -341,30 +341,21 @@ def _loss( """ if not isinstance(self.configuration, Configuration): # Dummy prediction - if self.scoring_functions: - if isinstance(self.metric, Scorer): - return {self.metric.name: self.metric._worst_possible_result} - else: - return { - metric.name: metric._worst_possible_result - for metric in self.metric - } - else: - if isinstance(self.metric, Scorer): - return self.metric._worst_possible_result - else: - return { - metric.name: metric._worst_possible_result - for metric in self.metric - } - - return calculate_loss( - y_true, - y_hat, - self.task_type, - self.metric, - scoring_functions=self.scoring_functions, - ) + rval = {} + for metric in self.scoring_functions if self.scoring_functions else []: + rval[metric.name] = metric._worst_possible_result + for metric in self.metrics: + rval[metric.name] = metric._worst_possible_result + return rval + + else: + return calculate_loss( + y_true, + y_hat, + self.task_type, + self.metrics, + scoring_functions=self.scoring_functions, + ) def finish_up( self, @@ -413,8 +404,8 @@ def finish_up( if isinstance(loss, dict): loss_ = loss - if isinstance(self.metric, Scorer): - loss = loss_[self.metric.name] + if len(self.metrics) == 1: + loss = loss_[self.metrics[0].name] else: loss = {metric: loss_[metric] for metric in loss_} else: @@ -447,14 +438,14 @@ def calculate_auxiliary_losses( self, Y_valid_pred: np.ndarray, Y_test_pred: np.ndarray, - ) -> Tuple[Optional[float], Optional[float]]: + ) -> Tuple[Optional[float | Sequence[float]], Optional[float | Sequence[float]]]: if Y_valid_pred is not None: if self.y_valid is not None: validation_loss: Optional[Union[float, Dict[str, float]]] = self._loss( self.y_valid, Y_valid_pred ) - if isinstance(validation_loss, dict): - validation_loss = validation_loss[self.metric.name] + if len(self.metrics) == 1: + validation_loss = validation_loss[self.metrics[0].name] else: validation_loss = None else: @@ -465,8 +456,8 @@ def calculate_auxiliary_losses( test_loss: Optional[Union[float, Dict[str, float]]] = self._loss( self.y_test, Y_test_pred ) - if isinstance(test_loss, dict): - test_loss = test_loss[self.metric.name] + if len(self.metrics) == 1: + test_loss = test_loss[self.metrics[0].name] else: test_loss = None else: diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index c7ea592c88..2d7fa5cbb5 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -113,7 +113,7 @@ def eval_t( queue: multiprocessing.Queue, config: Union[int, Configuration], backend: Backend, - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: Dict[str, Any], @@ -131,7 +131,7 @@ def eval_t( evaluator = TestEvaluator( configuration=config, backend=backend, - metric=metric, + metric=metrics, seed=seed, port=port, queue=queue, diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index e45b2e01a1..30280c5fe6 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -184,7 +184,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -212,7 +212,7 @@ def __init__( queue=queue, port=port, configuration=configuration, - metric=metric, + metrics=metrics, 
additional_components=additional_components, scoring_functions=scoring_functions, seed=seed, @@ -330,8 +330,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: y = _get_y_array(self.Y_train, self.task_type) - # stores train loss of each fold. - train_losses = [np.NaN] * self.num_cv_folds + # stores train loss(es) of each fold. + train_losses = [dict()] * self.num_cv_folds # used as weights when averaging train losses. train_fold_weights = [np.NaN] * self.num_cv_folds # stores opt (validation) loss of each fold. @@ -439,21 +439,25 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: w / sum(opt_fold_weights) for w in opt_fold_weights ] - # train_losses is a list of either scalars or dicts. If it contains - # dicts, then train_loss is computed using the target metric - # (self.metric). - if all(isinstance(elem, dict) for elem in train_losses): + if len(self.metrics) == 1: train_loss = np.average( [ - train_losses[i][str(self.metric)] + train_losses[i][str(self.metrics[0])] for i in range(self.num_cv_folds) ], weights=train_fold_weights_percentage, ) else: - train_loss = np.average( - train_losses, weights=train_fold_weights_percentage - ) + train_loss = [ + np.average( + [ + train_losses[i][str(metric)] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights_percentage, + ) + for metric in self.metrics + ] # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. @@ -644,34 +648,29 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ] opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] - # train_losses is a list of either scalars or dicts. If it contains dicts, - # then train_loss is computed using the target metric (self.metric). - if all(isinstance(elem, dict) for elem in train_losses): - if isinstance(self.metric, Scorer): - train_loss = np.average( + if len(self.metrics) == 1: + train_loss = np.average( + [ + train_losses[i][str(self.metrics[0])] + for i in range(self.num_cv_folds) + ], + weights=train_fold_weights, + ) + else: + train_loss = [ + np.average( [ - train_losses[i][str(self.metric)] + train_losses[i][str(metric)] for i in range(self.num_cv_folds) ], weights=train_fold_weights, ) - else: - train_loss = [ - np.average( - [ - train_losses[i][str(metric)] - for i in range(self.num_cv_folds) - ], - weights=train_fold_weights, - ) - for metric in self.metric - ] - else: - train_loss = np.average(train_losses, weights=train_fold_weights) + for metric in self.metrics + ] # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. 
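Aside: with ``self.metrics`` now always a sequence, the surrounding hunk keeps one weighted average per metric name instead of a single scalar. A minimal sketch (not part of the patch) of that per-metric, per-fold reduction, with hypothetical fold losses and weights standing in for the values computed by the evaluator:

    import numpy as np

    fold_losses = [{"accuracy": 0.10, "recall": 0.20},
                   {"accuracy": 0.30, "recall": 0.40}]
    fold_weights = [0.5, 0.5]  # e.g. proportional to fold size, summing to 1

    averaged = {
        name: np.average([fold[name] for fold in fold_losses], weights=fold_weights)
        for name in fold_losses[0]
    }
    # -> {'accuracy': 0.2, 'recall': 0.3} (up to floating-point rounding)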
- if self.scoring_functions or not isinstance(self.metric, Scorer): + if self.scoring_functions or len(self.metrics) > 1: opt_loss = {} for metric in opt_losses[0].keys(): opt_loss[metric] = np.average( @@ -1330,7 +1329,7 @@ def eval_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1352,7 +1351,7 @@ def eval_holdout( queue=queue, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, - metric=metric, + metrics=metrics, configuration=config, seed=seed, num_run=num_run, @@ -1377,7 +1376,7 @@ def eval_iterative_holdout( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1397,7 +1396,7 @@ def eval_iterative_holdout( port=port, config=config, backend=backend, - metric=metric, + metrics=metrics, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, @@ -1424,7 +1423,7 @@ def eval_partial_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1449,7 +1448,7 @@ def eval_partial_cv( backend=backend, port=port, queue=queue, - metric=metric, + metrics=metrics, configuration=config, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, @@ -1477,7 +1476,7 @@ def eval_partial_cv_iterative( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1498,7 +1497,7 @@ def eval_partial_cv_iterative( queue=queue, config=config, backend=backend, - metric=metric, + metrics=metrics, resampling_strategy=resampling_strategy, resampling_strategy_args=resampling_strategy_args, seed=seed, @@ -1525,7 +1524,7 @@ def eval_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1545,7 +1544,7 @@ def eval_cv( backend=backend, port=port, queue=queue, - metric=metric, + metrics=metrics, configuration=config, seed=seed, num_run=num_run, @@ -1573,7 +1572,7 @@ def eval_iterative_cv( str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit ], resampling_strategy_args: Dict[str, Optional[Union[float, int, str]]], - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], seed: int, num_run: int, instance: str, @@ -1592,7 +1591,7 @@ def eval_iterative_cv( eval_cv( backend=backend, queue=queue, - metric=metric, + metrics=metrics, config=config, seed=seed, num_run=num_run, diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 430336009e..9d1ee2a062 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,7 +1,7 @@ from __future__ import annotations from abc import ABCMeta, abstractmethod -from typing import Any, Callable, Dict, List, Optional, Sequence, 
Union, cast +from typing import Any, Callable, Dict, List, Optional, Sequence from functools import partial from itertools import product @@ -390,9 +390,9 @@ def calculate_score( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer | Sequence[Scorer], + metrics: Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, -) -> Union[float, Dict[str, float]]: +) -> Dict[str, float]: """ Returns a score (a magnitude that allows casting the optimization problem as a maximization one) for the @@ -407,25 +407,21 @@ def calculate_score( task_type: int To understand if the problem task is classification or regression - metric: Scorer - Object that host a function to calculate how good the + metrics: Sequence[Scorer] + A lost of objets that host a function to calculate how good the prediction is according to the solution. scoring_functions: List[Scorer] A list of metrics to calculate multiple losses Returns ------- - float or Dict[str, float] + Dict[str, float] """ if task_type not in TASK_TYPES: raise NotImplementedError(task_type) - to_score = [] + to_score = list(metrics) if scoring_functions: to_score.extend(scoring_functions) - if isinstance(metric, Sequence): - to_score.extend(metric) - else: - to_score.append(metric) score_dict = dict() if task_type in REGRESSION_TASKS: @@ -472,19 +468,16 @@ def calculate_score( else: raise e - if scoring_functions is None and isinstance(metric, Scorer): - return score_dict[metric.name] - else: - return score_dict + return score_dict def calculate_loss( solution: np.ndarray, prediction: np.ndarray, task_type: int, - metric: Scorer | Sequence[Scorer], + metrics: Sequence[Scorer], scoring_functions: Optional[List[Scorer]] = None, -) -> Union[float, Dict[str, float]]: +) -> Dict[str, float]: """ Returns a loss (a magnitude that allows casting the optimization problem as a minimization one) for the @@ -499,44 +492,38 @@ def calculate_loss( task_type: int To understand if the problem task is classification or regression - metric: Scorer - Object that host a function to calculate how good the + metric: Sequence[Scorer] + A lost of objets that host a function to calculate how good the prediction is according to the solution. scoring_functions: List[Scorer] A list of metrics to calculate multiple losses Returns ------- - float or Dict[str, float] + Dict[str, float] A loss function for each of the provided scorer objects """ score = calculate_score( solution=solution, prediction=prediction, task_type=task_type, - metric=metric, + metrics=metrics, scoring_functions=scoring_functions, ) - - if scoring_functions or isinstance(metric, Sequence): - score = cast(Dict, score) - scoring_functions = cast(List, scoring_functions) - metric_list = list(cast(List, metric)) # Please mypy - # we expect a dict() object for which we should calculate the loss - loss_dict = dict() - for metric_ in scoring_functions + metric_list: - # TODO: When metrics are annotated with type_of_target support - # we can remove this check - if metric_.name not in score: - continue - # maybe metric argument is not in scoring_functions - # so append it to the list. 
Rather than check if such - # is the case, redefining loss_dict[metric] is less expensive - loss_dict[metric_.name] = metric_._optimum - score[metric_.name] - return loss_dict - else: - rval = metric._optimum - cast(float, score) - return rval + scoring_functions = scoring_functions if scoring_functions else [] + + # we expect a dict() object for which we should calculate the loss + loss_dict = dict() + for metric_ in scoring_functions + list(metrics): + # TODO: When metrics are annotated with type_of_target support + # we can remove this check + if metric_.name not in score: + continue + # maybe metric argument is not in scoring_functions + # so append it to the list. Rather than check if such + # is the case, redefining loss_dict[metric] is less expensive + loss_dict[metric_.name] = metric_._optimum - score[metric_.name] + return loss_dict def calculate_metric( diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index e1ed33e024..605ac93a11 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -289,7 +289,7 @@ def __init__( # data related self.dataset_name = dataset_name self.datamanager = None - self.metric = metric + self.metrics = metric if isinstance(metric, Sequence) else [metric] self.task = None self.backend = backend self.port = port @@ -308,7 +308,7 @@ def __init__( self.resampling_strategy_args = resampling_strategy_args # and a bunch of useful limits - self.worst_possible_result = get_cost_of_crash(self.metric) + self.worst_possible_result = get_cost_of_crash(self.metrics) self.total_walltime_limit = int(total_walltime_limit) self.func_eval_time_limit = int(func_eval_time_limit) self.memory_limit = memory_limit @@ -363,9 +363,7 @@ def collect_metalearning_suggestions(self, meta_base): metalearning_configurations = _get_metalearning_configurations( meta_base=meta_base, basename=self.dataset_name, - metric=( - self.metric[0] if isinstance(self.metric, Sequence) else self.metric - ), + metric=self.metrics[0], configuration_space=self.config_space, task=self.task, is_sparse=self.datamanager.info["is_sparse"], @@ -479,7 +477,7 @@ def run_smbo(self): initial_num_run=num_run, include=self.include, exclude=self.exclude, - metric=self.metric, + metrics=self.metrics, memory_limit=self.memory_limit, disable_file_output=self.disable_file_output, scoring_functions=self.scoring_functions, @@ -545,10 +543,10 @@ def run_smbo(self): "n_jobs": self.n_jobs, "dask_client": self.dask_client, } - if not isinstance(self.metric, Scorer): + if len(self.metrics) > 1: smac_args["multi_objective_algorithm"] = ParEGO smac_args["multi_objective_kwargs"] = {"rho": 0.05} - scenario_dict["multi_objectives"] = [metric.name for metric in self.metric] + scenario_dict["multi_objectives"] = [metric.name for metric in self.metrics] if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(**smac_args) else: @@ -594,7 +592,7 @@ def get_metalearning_suggestions(self): "files", "%s_%s_%s" % ( - self.metric, + self.metrics[0], TASK_TYPES_TO_STRING[meta_task], "sparse" if self.datamanager.info["is_sparse"] else "dense", ), @@ -621,7 +619,7 @@ def get_metalearning_suggestions(self): self.metadata_directory, "%s_%s_%s" % ( - self.metric, + self.metrics[0], TASK_TYPES_TO_STRING[meta_task], "sparse" if self.datamanager.info["is_sparse"] else "dense", ), From c3b5072f18b8c3078659653705b626b1555d7798 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 4 May 2022 13:07:31 +0200 Subject: [PATCH 04/24] Fix most examples --- autosklearn/automl.py | 24 +++++++++++++------ 
autosklearn/ensemble_builder.py | 4 ++-- autosklearn/estimators.py | 5 +++- autosklearn/evaluation/train_evaluator.py | 19 ++++++++------- autosklearn/smbo.py | 3 +++ examples/60_search/example_random_search.py | 12 +++++++++- .../60_search/example_successive_halving.py | 2 ++ 7 files changed, 49 insertions(+), 20 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index fcec2c57a1..2e7fff49f0 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1299,7 +1299,13 @@ def fit_pipeline( if "resampling_strategy" not in kwargs: kwargs["resampling_strategy"] = self._resampling_strategy if "metric" not in kwargs: - kwargs["metric"] = self._metric + kwargs["metric"] = ( + [self._metric] if isinstance(self._metric, Scorer) else self._metric + ) + elif "metric" in kwargs and isinstance(kwargs["metric"], Scorer): + kwargs["metric"] = [kwargs["metric"]] + kwargs["metrics"] = kwargs["metric"] + del kwargs["metric"] if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: @@ -1317,7 +1323,7 @@ def fit_pipeline( autosklearn_seed=self._seed, abort_on_first_run_crash=False, multi_objectives=["cost"], - cost_for_crash=get_cost_of_crash(kwargs["metric"]), + cost_for_crash=get_cost_of_crash(kwargs["metrics"]), port=self._logger_port, **kwargs, **self._resampling_strategy_arguments, @@ -1644,7 +1650,7 @@ def score(self, X, y): ) def _get_runhistory_models_performance(self): - metric = self._metric + metric = self._metric if isinstance(self._metric, Scorer) else self._metric[0] data = self.runhistory_.data performance_list = [] for run_key, run_value in data.items(): @@ -1656,7 +1662,10 @@ def _get_runhistory_models_performance(self): endtime = pd.Timestamp( time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_value.endtime)) ) - val_score = metric._optimum - (metric._sign * run_value.cost) + cost = run_value.cost + if not isinstance(self._metric, Scorer): + cost = cost[0] + val_score = metric._optimum - (metric._sign * cost) train_score = metric._optimum - ( metric._sign * run_value.additional_info["train_loss"] ) @@ -1668,9 +1677,10 @@ def _get_runhistory_models_performance(self): # Append test-scores, if data for test_loss are available. # This is the case, if X_test and y_test where provided. if "test_loss" in run_value.additional_info: - test_score = metric._optimum - ( - metric._sign * run_value.additional_info["test_loss"] - ) + test_loss = run_value.additional_info["test_loss"] + if not isinstance(self._metric, Scorer): + test_loss = test_loss[0] + test_score = metric._optimum - (metric._sign * test_loss) scores["single_best_test_score"] = test_score performance_list.append(scores) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 8002c11b1d..738427500f 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -1536,9 +1536,9 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): solution=self.y_test, prediction=test_pred, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, - ) + )[self.metric.name] self.ensemble_history.append(performance_stamp) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index b7beec7693..f3b97d0328 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -794,6 +794,9 @@ def leaderboard( What column to sort by. If that column is not present, the sorting defaults to the ``"model_id"`` index column. 
+ Defaults to the metric optimized. Sort by the first objective + in case of a multi-objective optimization problem + sort_order: "auto" or "ascending" or "descending" = "auto" Which sort order to apply to the ``sort_by`` column. If left as ``"auto"``, it will sort by a sensible default where "better" is @@ -886,7 +889,7 @@ def has_key(rv, key): "start_time": rval.starttime, "end_time": rval.endtime, "status": str(rval.status), - "cost": rval.cost, + "cost": rval.cost if isinstance(rval.cost, float) else rval.cost[0], "train_loss": rval.additional_info["train_loss"] if has_key(rval, "train_loss") else None, diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 30280c5fe6..777458b5ff 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -670,15 +670,16 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. - if self.scoring_functions or len(self.metrics) > 1: - opt_loss = {} - for metric in opt_losses[0].keys(): - opt_loss[metric] = np.average( - [opt_losses[i][metric] for i in range(self.num_cv_folds)], - weights=opt_fold_weights, - ) - else: - opt_loss = np.average(opt_losses, weights=opt_fold_weights) + opt_loss = {} + for metric_name in list(opt_losses[0].keys()) + [ + metric.name for metric in self.metrics + ]: + opt_loss[metric_name] = np.average( + [opt_losses[i][metric_name] for i in range(self.num_cv_folds)], + weights=opt_fold_weights, + ) + if len(self.metrics) == 1: + opt_loss = opt_loss[self.metrics[0].name] Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 605ac93a11..5f60c6ba92 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -547,6 +547,9 @@ def run_smbo(self): smac_args["multi_objective_algorithm"] = ParEGO smac_args["multi_objective_kwargs"] = {"rho": 0.05} scenario_dict["multi_objectives"] = [metric.name for metric in self.metrics] + else: + smac_args["multi_objective_algorithm"] = None + smac_args["multi_objective_kwargs"] = {} if self.get_smac_object_callback is not None: smac = self.get_smac_object_callback(**smac_args) else: diff --git a/examples/60_search/example_random_search.py b/examples/60_search/example_random_search.py index 520c8c18b0..908fe44ffe 100644 --- a/examples/60_search/example_random_search.py +++ b/examples/60_search/example_random_search.py @@ -45,6 +45,8 @@ def get_roar_object_callback( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, # This argument will be ignored as ROAR does not yet support multi-objective optimization + multi_objective_kwargs, ): """Random online adaptive racing.""" @@ -91,7 +93,15 @@ def get_roar_object_callback( # Fit a classifier using Random Search # ==================================== def get_random_search_object_callback( - scenario_dict, seed, ta, ta_kwargs, metalearning_configurations, n_jobs, dask_client + scenario_dict, + seed, + ta, + ta_kwargs, + metalearning_configurations, + n_jobs, + dask_client, + multi_objective_algorithm, # This argument will be ignored as ROAR does not yet support multi-objective optimization + multi_objective_kwargs, ): """Random search""" diff --git a/examples/60_search/example_successive_halving.py b/examples/60_search/example_successive_halving.py index e57be7f157..71749f5668 100644 --- a/examples/60_search/example_successive_halving.py +++ 
b/examples/60_search/example_successive_halving.py @@ -37,6 +37,8 @@ def get_smac_object( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, # This argument will be ignored as SH does not yet support multi-objective optimization + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.successive_halving import SuccessiveHalving From fc89c68f5a466a1ac7101aba1799c844fed46c80 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 5 May 2022 16:09:30 +0200 Subject: [PATCH 05/24] Take further feedback into account --- autosklearn/automl.py | 78 +++++++++++------------ autosklearn/estimators.py | 4 +- autosklearn/evaluation/__init__.py | 45 ++++++------- autosklearn/evaluation/test_evaluator.py | 6 +- autosklearn/evaluation/train_evaluator.py | 47 ++++---------- autosklearn/metrics/__init__.py | 16 ++--- autosklearn/smbo.py | 6 +- test/test_metric/test_metrics.py | 6 +- 8 files changed, 89 insertions(+), 119 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 2e7fff49f0..0cdc502a4d 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -66,7 +66,7 @@ from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget -from autosklearn.metrics import Scorer, calculate_metric, default_metric_for_task +from autosklearn.metrics import Scorer, compute_single_metric, default_metric_for_task from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( @@ -210,7 +210,7 @@ def __init__( get_smac_object_callback: Optional[Callable] = None, smac_scenario_args: Optional[Mapping] = None, logging_config: Optional[Mapping] = None, - metric: Optional[Scorer | Sequence[Scorer]] = None, + metrics: Sequence[Scorer] | None = None, scoring_functions: Optional[list[Scorer]] = None, get_trials_callback: Optional[IncorporateRunResultCallback] = None, dataset_compression: bool | Mapping[str, Any] = True, @@ -244,7 +244,7 @@ def __init__( self._delete_tmp_folder_after_terminate = delete_tmp_folder_after_terminate self._time_for_task = time_left_for_this_task self._per_run_time_limit = per_run_time_limit - self._metric = metric + self._metrics = metrics self._ensemble_size = ensemble_size self._ensemble_nbest = ensemble_nbest self._max_models_on_disc = max_models_on_disc @@ -422,8 +422,8 @@ def _do_dummy_prediction(self) -> None: if self._resampling_strategy in ["partial-cv", "partial-cv-iterative-fit"]: return - if self._metric is None: - raise ValueError("Metric was not set") + if self._metrics is None: + raise ValueError("Metric/Metrics was/were not set") # Dummy prediction always have num_run set to 1 dummy_run_num = 1 @@ -447,13 +447,11 @@ def _do_dummy_prediction(self) -> None: resampling_strategy=self._resampling_strategy, initial_num_run=dummy_run_num, stats=stats, - metrics=( - [self._metric] if isinstance(self._metric, Scorer) else self._metric - ), + metrics=self._metrics, memory_limit=memory_limit, disable_file_output=self._disable_evaluator_output, abort_on_first_run_crash=False, - cost_for_crash=get_cost_of_crash(self._metric), + cost_for_crash=get_cost_of_crash(self._metrics), port=self._logger_port, pynisher_context=self._multiprocessing_context, 
**self._resampling_strategy_arguments, @@ -613,8 +611,8 @@ def fit( self._task = task # Assign a metric if it doesnt exist - if self._metric is None: - self._metric = default_metric_for_task[self._task] + if self._metrics is None: + self._metrics = [default_metric_for_task[self._task]] if dataset_name is None: dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) @@ -692,16 +690,19 @@ def fit( # The metric must exist as of this point # It can be provided in the constructor, or automatically # defined in the estimator fit call - if self._metric is None: - raise ValueError("No metric given.") - if isinstance(self._metric, Sequence): - for entry in self._metric: + if self._metrics is None: + raise ValueError("No metrics given.") + if isinstance(self._metrics, Sequence): + for entry in self._metrics: if not isinstance(entry, Scorer): raise ValueError( - "Metric must be instance of autosklearn.metrics.Scorer." + "Metric {entry} must be instance of autosklearn.metrics.Scorer." ) - elif not isinstance(self._metric, Scorer): - raise ValueError("Metric must be instance of autosklearn.metrics.Scorer.") + else: + raise ValueError( + "Metric must be a sequence of instances of " + "autosklearn.metrics.Scorer." + ) # If no dask client was provided, we create one, so that we can # start a ensemble process in parallel to smbo optimize @@ -796,11 +797,7 @@ def fit( backend=copy.deepcopy(self._backend), dataset_name=dataset_name, task=self._task, - metric=( - self._metric[0] - if isinstance(self._metric, Sequence) - else self._metric - ), + metric=self._metrics[0], ensemble_size=self._ensemble_size, ensemble_nbest=self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -872,7 +869,7 @@ def fit( config_file=configspace_path, seed=self._seed, metadata_directory=self._metadata_directory, - metric=self._metric, + metrics=self._metrics, resampling_strategy=self._resampling_strategy, resampling_strategy_args=self._resampling_strategy_arguments, include=self._include, @@ -1011,7 +1008,10 @@ def _log_fit_setup(self) -> None: ) self._logger.debug(" smac_scenario_args: %s", str(self._smac_scenario_args)) self._logger.debug(" logging_config: %s", str(self.logging_config)) - self._logger.debug(" metric: %s", str(self._metric)) + if len(self._metrics) == 1: + self._logger.debug(" metric: %s", str(self._metrics[0])) + else: + self._logger.debug(" metrics: %s", str(self._metrics)) self._logger.debug("Done printing arguments to auto-sklearn") self._logger.debug("Starting to print available components") for choice in ( @@ -1264,8 +1264,8 @@ def fit_pipeline( self._task = task # Assign a metric if it doesnt exist - if self._metric is None: - self._metric = default_metric_for_task[self._task] + if self._metrics is None: + self._metrics = [default_metric_for_task[self._task]] # Get the configuration space # This also ensures that the Backend has processed the @@ -1298,12 +1298,8 @@ def fit_pipeline( kwargs["memory_limit"] = self._memory_limit if "resampling_strategy" not in kwargs: kwargs["resampling_strategy"] = self._resampling_strategy - if "metric" not in kwargs: - kwargs["metric"] = ( - [self._metric] if isinstance(self._metric, Scorer) else self._metric - ) - elif "metric" in kwargs and isinstance(kwargs["metric"], Scorer): - kwargs["metric"] = [kwargs["metric"]] + if "metrics" not in kwargs: + kwargs["metric"] = self._metrics kwargs["metrics"] = kwargs["metric"] del kwargs["metric"] if "disable_file_output" not in kwargs: @@ -1508,9 +1504,7 @@ def fit_ensemble( backend=copy.deepcopy(self._backend), 
dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, - metric=( - self._metric[0] if isinstance(self._metric, Sequence) else self._metric - ), + metric=self._metric[0], ensemble_size=ensemble_size if ensemble_size else self._ensemble_size, ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -1608,7 +1602,7 @@ def _load_best_individual_model(self): # SingleBest contains the best model found by AutoML ensemble = SingleBest( - metric=self._metric, + metric=self._metrics[0], seed=self._seed, run_history=self.runhistory_, backend=self._backend, @@ -1642,15 +1636,15 @@ def score(self, X, y): # same representation domain prediction = self.InputValidator.target_validator.transform(prediction) - return calculate_metric( + return compute_single_metric( solution=y, prediction=prediction, task_type=self._task, - metric=self._metric, + metric=self._metrics[0], ) def _get_runhistory_models_performance(self): - metric = self._metric if isinstance(self._metric, Scorer) else self._metric[0] + metric = self._metrics[0] data = self.runhistory_.data performance_list = [] for run_key, run_value in data.items(): @@ -1663,7 +1657,7 @@ def _get_runhistory_models_performance(self): time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_value.endtime)) ) cost = run_value.cost - if not isinstance(self._metric, Scorer): + if len(self._metrics) > 1: cost = cost[0] val_score = metric._optimum - (metric._sign * cost) train_score = metric._optimum - ( @@ -1678,7 +1672,7 @@ def _get_runhistory_models_performance(self): # This is the case, if X_test and y_test where provided. if "test_loss" in run_value.additional_info: test_loss = run_value.additional_info["test_loss"] - if not isinstance(self._metric, Scorer): + if len(self._metrics) > 1: test_loss = test_loss[0] test_score = metric._optimum - (metric._sign * test_loss) scores["single_best_test_score"] = test_score diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index f3b97d0328..8cbc428986 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -48,7 +48,7 @@ def __init__( smac_scenario_args=None, logging_config=None, metadata_directory=None, - metric: Optional[Scorer | Sequence[Scorer]] = None, + metric: Scorer | Sequence[Scorer] | None = None, scoring_functions: Optional[List[Scorer]] = None, load_models: bool = True, get_trials_callback=None, @@ -419,7 +419,7 @@ def build_automl(self): smac_scenario_args=self.smac_scenario_args, logging_config=self.logging_config, metadata_directory=self.metadata_directory, - metric=self.metric, + metrics=[self.metric] if isinstance(self.metric, Scorer) else self.metric, scoring_functions=self.scoring_functions, get_trials_callback=self.get_trials_callback, dataset_compression=self.dataset_compression, diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index 48662f2bf6..c4b688edae 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -98,29 +98,30 @@ def fit_predict_try_except_decorator( queue.close() -def get_cost_of_crash( - metric: Union[Scorer | Sequence[Scorer]], -) -> Union[float, List[float]]: - - if isinstance(metric, Sequence): - return [cast(float, get_cost_of_crash(metric_)) for metric_ in metric] - elif not isinstance(metric, Scorer): - raise ValueError( - "The metric must be stricly be an instance of Scorer or a sequence of " - "Scorers" - ) +def get_cost_of_crash(metrics: Sequence[Scorer]) -> 
List[float] | float: + """Return the cost of crash. + + Return value can be either a list (multi-objective optimization) or a + raw float (single objective) because SMAC assumes different types in the + two different cases. + """ + costs = [] + for metric in metrics: + if not isinstance(metric, Scorer): + raise ValueError("The metric {metric} must be an instance of Scorer") + + # Autosklearn optimizes the err. This function translates + # worst_possible_result to be a minimization problem. + # For metrics like accuracy that are bounded to [0,1] + # metric.optimum==1 is the worst cost. + # A simple guide is to use greater_is_better embedded as sign + if metric._sign < 0: + worst_possible_result = metric._worst_possible_result + else: + worst_possible_result = metric._optimum - metric._worst_possible_result + costs.append(worst_possible_result) - # Autosklearn optimizes the err. This function translates - # worst_possible_result to be a minimization problem. - # For metrics like accuracy that are bounded to [0,1] - # metric.optimum==1 is the worst cost. - # A simple guide is to use greater_is_better embedded as sign - if metric._sign < 0: - worst_possible_result = metric._worst_possible_result - else: - worst_possible_result = metric._optimum - metric._worst_possible_result - - return worst_possible_result + return costs if len(costs) > 1 else costs[0] def _encode_exit_status( diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index 2d7fa5cbb5..fd5739e9ea 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -1,6 +1,4 @@ # -*- encoding: utf-8 -*- -from __future__ import annotations - from typing import Any, Dict, List, Optional, Sequence, Tuple, Union import multiprocessing @@ -25,7 +23,7 @@ def __init__( self, backend: Backend, queue: multiprocessing.Queue, - metric: Union[Scorer | Sequence[Scorer]], + metrics: Sequence[Scorer], additional_components: Dict[str, ThirdPartyComponents], port: Optional[int], configuration: Optional[Union[int, Configuration]] = None, @@ -41,7 +39,7 @@ def __init__( queue=queue, port=port, configuration=configuration, - metric=metric, + metrics=metrics, additional_components=additional_components, scoring_functions=scoring_functions, seed=seed, diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 777458b5ff..ae5e88d2e1 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from typing import Any, Dict, List, Optional, Sequence, Tuple, Union, cast import copy @@ -439,25 +437,18 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: w / sum(opt_fold_weights) for w in opt_fold_weights ] - if len(self.metrics) == 1: - train_loss = np.average( + train_loss = [ + np.average( [ - train_losses[i][str(self.metrics[0])] + train_losses[i][str(metric)] for i in range(self.num_cv_folds) ], weights=train_fold_weights_percentage, ) - else: - train_loss = [ - np.average( - [ - train_losses[i][str(metric)] - for i in range(self.num_cv_folds) - ], - weights=train_fold_weights_percentage, - ) - for metric in self.metrics - ] + for metric in self.metrics + ] + if len(self.metrics) == 1: + train_loss = train_loss[self.metrics[0].name] # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. 
@@ -648,25 +639,15 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ] opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] - if len(self.metrics) == 1: - train_loss = np.average( - [ - train_losses[i][str(self.metrics[0])] - for i in range(self.num_cv_folds) - ], + train_loss = [ + np.average( + [train_losses[i][str(metric)] for i in range(self.num_cv_folds)], weights=train_fold_weights, ) - else: - train_loss = [ - np.average( - [ - train_losses[i][str(metric)] - for i in range(self.num_cv_folds) - ], - weights=train_fold_weights, - ) - for metric in self.metrics - ] + for metric in self.metrics + ] + if len(self.metrics) == 1: + train_loss = train_loss[self.metrics[0].name] # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 9d1ee2a062..f0cf02947c 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,5 +1,3 @@ -from __future__ import annotations - from abc import ABCMeta, abstractmethod from typing import Any, Callable, Dict, List, Optional, Sequence @@ -408,7 +406,7 @@ def calculate_score( To understand if the problem task is classification or regression metrics: Sequence[Scorer] - A lost of objets that host a function to calculate how good the + A list of objects that hosts a function to calculate how good the prediction is according to the solution. scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -428,7 +426,7 @@ def calculate_score( for metric_ in to_score: try: - score_dict[metric_.name] = _compute_scorer( + score_dict[metric_.name] = _compute_single_scorer( metric_, prediction, solution, task_type ) except ValueError as e: @@ -448,7 +446,7 @@ def calculate_score( # handle? try: - score_dict[metric_.name] = _compute_scorer( + score_dict[metric_.name] = _compute_single_scorer( metric_, prediction, solution, task_type ) except ValueError as e: @@ -493,7 +491,7 @@ def calculate_loss( To understand if the problem task is classification or regression metric: Sequence[Scorer] - A lost of objets that host a function to calculate how good the + A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
scoring_functions: List[Scorer] A list of metrics to calculate multiple losses @@ -526,7 +524,7 @@ def calculate_loss( return loss_dict -def calculate_metric( +def compute_single_metric( metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ @@ -550,7 +548,7 @@ def calculate_metric( ------- float """ - score = _compute_scorer( + score = _compute_single_scorer( solution=solution, prediction=prediction, metric=metric, @@ -559,7 +557,7 @@ def calculate_metric( return metric._sign * score -def _compute_scorer( +def _compute_single_scorer( metric: Scorer, prediction: np.ndarray, solution: np.ndarray, task_type: int ) -> float: """ diff --git a/autosklearn/smbo.py b/autosklearn/smbo.py index 5f60c6ba92..cd83e94e1e 100644 --- a/autosklearn/smbo.py +++ b/autosklearn/smbo.py @@ -1,5 +1,3 @@ -from __future__ import annotations - import typing from typing import Dict, List, Optional, Sequence @@ -262,7 +260,7 @@ def __init__( total_walltime_limit, func_eval_time_limit, memory_limit, - metric: Scorer | Sequence[Scorer], + metrics: Sequence[Scorer], stopwatch: StopWatch, n_jobs, dask_client: dask.distributed.Client, @@ -289,7 +287,7 @@ def __init__( # data related self.dataset_name = dataset_name self.datamanager = None - self.metrics = metric if isinstance(metric, Sequence) else [metric] + self.metrics = metrics self.task = None self.backend = backend self.port = port diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 541b2d6783..de27ed0451 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -6,7 +6,7 @@ import autosklearn.metrics from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION -from autosklearn.metrics import calculate_loss, calculate_metric, calculate_score +from autosklearn.metrics import calculate_loss, calculate_score, compute_single_metric import pytest import unittest @@ -702,7 +702,7 @@ def test_calculate_metric(): y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) - assert pytest.approx(score) == calculate_metric( + assert pytest.approx(score) == compute_single_metric( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, @@ -713,7 +713,7 @@ def test_calculate_metric(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(score) == calculate_metric( + assert pytest.approx(score) == compute_single_metric( solution=y_true, prediction=y_pred, task_type=REGRESSION, From 1a09c12d5c8c8593bf7e3bb03574918338c74988 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 6 May 2022 13:29:43 +0200 Subject: [PATCH 06/24] Fix unit tests --- autosklearn/automl.py | 13 +- autosklearn/ensemble_builder.py | 10 +- autosklearn/ensembles/ensemble_selection.py | 6 +- autosklearn/evaluation/abstract_evaluator.py | 15 +- autosklearn/evaluation/test_evaluator.py | 12 +- autosklearn/evaluation/train_evaluator.py | 30 +- autosklearn/experimental/askl2.py | 8 + autosklearn/metrics/__init__.py | 16 +- test/test_automl/test_dummy_predictions.py | 8 +- .../test_abstract_evaluator.py | 12 +- test/test_evaluation/test_evaluation.py | 44 +-- test/test_evaluation/test_test_evaluator.py | 6 +- test/test_evaluation/test_train_evaluator.py | 259 +++++++++--------- test/test_metric/test_metrics.py | 79 ++++-- test/test_optimizer/test_smbo.py | 6 +- 15 
files changed, 286 insertions(+), 238 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 0cdc502a4d..b051e7217c 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1504,7 +1504,7 @@ def fit_ensemble( backend=copy.deepcopy(self._backend), dataset_name=dataset_name if dataset_name else self._dataset_name, task=task if task else self._task, - metric=self._metric[0], + metric=self._metrics[0], ensemble_size=ensemble_size if ensemble_size else self._ensemble_size, ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest, max_models_on_disc=self._max_models_on_disc, @@ -1804,7 +1804,7 @@ def cv_results_(self): param_dict = config.get_dictionary() params.append(param_dict) mean_test_score.append( - self._metric._optimum - (self._metric._sign * run_value.cost) + self._metrics[0]._optimum - (self._metrics[0]._sign * run_value.cost) ) mean_fit_time.append(run_value.time) budgets.append(run_key.budget) @@ -1838,7 +1838,7 @@ def cv_results_(self): results["mean_fit_time"] = np.array(mean_fit_time) results["params"] = params - rank_order = -1 * self._metric._sign * results["mean_test_score"] + rank_order = -1 * self._metrics[0]._sign * results["mean_test_score"] results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") results["status"] = status results["budgets"] = budgets @@ -1857,7 +1857,10 @@ def sprint_statistics(self) -> str: sio = io.StringIO() sio.write("auto-sklearn results:\n") sio.write(" Dataset name: %s\n" % self._dataset_name) - sio.write(" Metric: %s\n" % self._metric) + if len(self._metrics) == 1: + sio.write(" Metric: %s\n" % self._metrics[0]) + else: + sio.write(" Metrics: %s\n" % self._metrics) idx_success = np.where( np.array( [ @@ -1868,7 +1871,7 @@ def sprint_statistics(self) -> str: ) )[0] if len(idx_success) > 0: - if not self._metric._optimum: + if not self._metrics[0]._optimum: idx_best_run = np.argmin(cv_results["mean_test_score"][idx_success]) else: idx_best_run = np.argmax(cv_results["mean_test_score"][idx_success]) diff --git a/autosklearn/ensemble_builder.py b/autosklearn/ensemble_builder.py index 738427500f..3dec9828ef 100644 --- a/autosklearn/ensemble_builder.py +++ b/autosklearn/ensemble_builder.py @@ -30,7 +30,7 @@ from autosklearn.automl_common.common.utils.backend import Backend from autosklearn.constants import BINARY_CLASSIFICATION from autosklearn.ensembles.ensemble_selection import EnsembleSelection -from autosklearn.metrics import Scorer, calculate_loss, calculate_score +from autosklearn.metrics import Scorer, calculate_losses, calculate_scores from autosklearn.util.logging_ import get_named_client_logger from autosklearn.util.parallel import preload_modules @@ -999,7 +999,7 @@ def compute_loss_per_model(self): # actually read the predictions and compute their respective loss try: y_ensemble = self._read_np_fn(y_ens_fn) - loss = calculate_loss( + loss = calculate_losses( solution=self.y_true_ensemble, prediction=y_ensemble, task_type=self.task_type, @@ -1511,7 +1511,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): performance_stamp = { "Timestamp": pd.Timestamp.now(), - "ensemble_optimization_score": calculate_score( + "ensemble_optimization_score": calculate_scores( solution=self.y_true_ensemble, prediction=train_pred, task_type=self.task_type, @@ -1522,7 +1522,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): if valid_pred is not None: # TODO: valid_pred are a legacy from competition manager # and this if never happens. 
Re-evaluate Y_valid support - performance_stamp["ensemble_val_score"] = calculate_score( + performance_stamp["ensemble_val_score"] = calculate_scores( solution=self.y_valid, prediction=valid_pred, task_type=self.task_type, @@ -1532,7 +1532,7 @@ def _add_ensemble_trajectory(self, train_pred, valid_pred, test_pred): # In case test_pred was provided if test_pred is not None: - performance_stamp["ensemble_test_score"] = calculate_score( + performance_stamp["ensemble_test_score"] = calculate_scores( solution=self.y_test, prediction=test_pred, task_type=self.task_type, diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 7649fb8a07..d5bffcd596 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -8,7 +8,7 @@ from autosklearn.constants import TASK_TYPES from autosklearn.ensembles.abstract_ensemble import AbstractEnsemble -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.base import BasePipeline @@ -164,7 +164,7 @@ def _fast( out=fant_ensemble_prediction, ) - losses[j] = calculate_loss( + losses[j] = calculate_losses( solution=labels, prediction=fant_ensemble_prediction, task_type=self.task_type, @@ -210,7 +210,7 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: # when scoring_functions=None, we know it will be a float losses[j] = cast( float, - calculate_loss( + calculate_losses( solution=labels, prediction=ensemble_prediction, task_type=self.task_type, diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 03934387af..852c47e472 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -25,7 +25,7 @@ MULTIOUTPUT_REGRESSION, REGRESSION_TASKS, ) -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents, _addons from autosklearn.pipeline.implementations.util import ( convert_multioutput_multiclass_to_multilabel, @@ -349,7 +349,7 @@ def _loss( return rval else: - return calculate_loss( + return calculate_losses( y_true, y_hat, self.task_type, @@ -402,14 +402,11 @@ def finish_up( if file_out_loss is not None: return self.duration, file_out_loss, self.seed, additional_run_info_ - if isinstance(loss, dict): - loss_ = loss - if len(self.metrics) == 1: - loss = loss_[self.metrics[0].name] - else: - loss = {metric: loss_[metric] for metric in loss_} + loss_ = loss + if len(self.metrics) == 1: + loss = loss_[self.metrics[0].name] else: - loss_ = {} + loss = {metric: loss_[metric] for metric in loss_} additional_run_info = {} if additional_run_info is None else additional_run_info for metric_name, value in loss_.items(): diff --git a/autosklearn/evaluation/test_evaluator.py b/autosklearn/evaluation/test_evaluator.py index fd5739e9ea..e76186aa06 100644 --- a/autosklearn/evaluation/test_evaluator.py +++ b/autosklearn/evaluation/test_evaluator.py @@ -12,7 +12,7 @@ AbstractEvaluator, _fit_and_suppress_warnings, ) -from autosklearn.metrics import Scorer, calculate_loss +from autosklearn.metrics import Scorer, calculate_losses from autosklearn.pipeline.components.base import ThirdPartyComponents __all__ = ["eval_t", "TestEvaluator"] @@ -83,22 +83,22 @@ def predict_and_loss( Y_pred = self.predict_function( self.X_train, self.model, self.task_type, 
self.Y_train ) - err = calculate_loss( + err = calculate_losses( solution=self.Y_train, prediction=Y_pred, task_type=self.task_type, - metric=self.metric, + metrics=self.metrics, scoring_functions=self.scoring_functions, ) else: Y_pred = self.predict_function( self.X_test, self.model, self.task_type, self.Y_train ) - err = calculate_loss( + err = calculate_losses( solution=self.Y_test, prediction=Y_pred, task_type=self.task_type, - metric=self.metric, + metrics=self.metrics, scoring_functions=self.scoring_functions, ) @@ -129,7 +129,7 @@ def eval_t( evaluator = TestEvaluator( configuration=config, backend=backend, - metric=metrics, + metrics=metrics, seed=seed, port=port, queue=queue, diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index ae5e88d2e1..240e9fecbc 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -448,23 +448,15 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: for metric in self.metrics ] if len(self.metrics) == 1: - train_loss = train_loss[self.metrics[0].name] + train_loss = train_loss[0] # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. - if self.scoring_functions: - opt_loss = {} - for metric in opt_losses[0].keys(): - opt_loss[metric] = np.average( - [ - opt_losses[i][metric] - for i in range(self.num_cv_folds) - ], - weights=opt_fold_weights_percentage, - ) - else: - opt_loss = np.average( - opt_losses, weights=opt_fold_weights_percentage + opt_loss = {} + for metric in opt_losses[0].keys(): + opt_loss[metric] = np.average( + [opt_losses[i][metric] for i in range(self.num_cv_folds)], + weights=opt_fold_weights_percentage, ) Y_targets = self.Y_targets @@ -647,7 +639,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: for metric in self.metrics ] if len(self.metrics) == 1: - train_loss = train_loss[self.metrics[0].name] + train_loss = train_loss[0] # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. 
@@ -659,8 +651,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: [opt_losses[i][metric_name] for i in range(self.num_cv_folds)], weights=opt_fold_weights, ) - if len(self.metrics) == 1: - opt_loss = opt_loss[self.metrics[0].name] Y_targets = self.Y_targets Y_train_targets = self.Y_train_targets @@ -798,6 +788,8 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No add_model_to_self=True, ) train_loss = self._loss(self.Y_actual_train, train_pred) + if len(self.metrics) == 1: + train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_targets[fold], opt_pred) if self.model.estimator_supports_iterative_fit(): @@ -898,6 +890,8 @@ def _partial_fit_and_predict_iterative( else self.Y_train[train_indices], Y_train_pred, ) + if len(self.metrics) == 1: + train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) additional_run_info = model.get_additional_run_info() @@ -943,6 +937,8 @@ def _partial_fit_and_predict_iterative( else self.Y_train[train_indices], Y_train_pred, ) + if len(self.metrics) == 1: + train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) if self.model.estimator_supports_iterative_fit(): model_max_iter = self.model.get_max_iter() diff --git a/autosklearn/experimental/askl2.py b/autosklearn/experimental/askl2.py index 65ef9b2def..8e9112e48f 100644 --- a/autosklearn/experimental/askl2.py +++ b/autosklearn/experimental/askl2.py @@ -93,6 +93,8 @@ def __call__( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.simple_intensifier import SimpleIntensifier @@ -122,6 +124,8 @@ def __call__( run_id=seed, n_jobs=n_jobs, dask_client=dask_client, + multi_objective_algorithm=multi_objective_algorithm, + multi_objective_kwargs=multi_objective_kwargs, ) @@ -141,6 +145,8 @@ def __call__( metalearning_configurations, n_jobs, dask_client, + multi_objective_algorithm, + multi_objective_kwargs, ): from smac.facade.smac_ac_facade import SMAC4AC from smac.intensification.successive_halving import SuccessiveHalving @@ -178,6 +184,8 @@ def __call__( }, dask_client=dask_client, n_jobs=n_jobs, + multi_objective_algorithm=multi_objective_algorithm, + multi_objective_kwargs=multi_objective_kwargs, ) smac4ac.solver.epm_chooser.min_samples_model = int( len(scenario.cs.get_hyperparameters()) / 2 diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index f0cf02947c..6e1ba745b8 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -384,7 +384,7 @@ def make_scorer( CLASSIFICATION_METRICS[name] = scorer -def calculate_score( +def calculate_scores( solution: np.ndarray, prediction: np.ndarray, task_type: int, @@ -392,9 +392,9 @@ def calculate_score( scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ - Returns a score (a magnitude that allows casting the + Returns the scores (a magnitude that allows casting the optimization problem as a maximization one) for the - given Auto-Sklearn Scorer object + given Auto-Sklearn Scorer objects. 
Parameters ---------- @@ -469,7 +469,7 @@ def calculate_score( return score_dict -def calculate_loss( +def calculate_losses( solution: np.ndarray, prediction: np.ndarray, task_type: int, @@ -477,9 +477,9 @@ def calculate_loss( scoring_functions: Optional[List[Scorer]] = None, ) -> Dict[str, float]: """ - Returns a loss (a magnitude that allows casting the + Returns the losses (a magnitude that allows casting the optimization problem as a minimization one) for the - given Auto-Sklearn Scorer object + given Auto-Sklearn Scorer objects. Parameters ---------- @@ -490,7 +490,7 @@ def calculate_loss( task_type: int To understand if the problem task is classification or regression - metric: Sequence[Scorer] + metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. scoring_functions: List[Scorer] @@ -501,7 +501,7 @@ def calculate_loss( Dict[str, float] A loss function for each of the provided scorer objects """ - score = calculate_score( + score = calculate_scores( solution=solution, prediction=prediction, task_type=task_type, diff --git a/test/test_automl/test_dummy_predictions.py b/test/test_automl/test_dummy_predictions.py index 9a268d1a2c..3b9350ce8b 100644 --- a/test/test_automl/test_dummy_predictions.py +++ b/test/test_automl/test_dummy_predictions.py @@ -66,7 +66,7 @@ def test_produces_correct_output( * It should produce predictions "predictions_ensemble_1337_1_0.0.npy" """ seed = 1337 - automl = make_automl(metric=metric, seed=seed) + automl = make_automl(metrics=[metric], seed=seed) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -115,7 +115,7 @@ def test_runs_with_correct_args( dataset = "iris" task = MULTICLASS_CLASSIFICATION - automl = make_automl(metric=accuracy) + automl = make_automl(metrics=[accuracy]) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -159,7 +159,7 @@ def test_crash_due_to_memory_exception( dataset = "iris" task = MULTICLASS_CLASSIFICATION - automl = make_automl(metric=accuracy) + automl = make_automl(metrics=[accuracy]) automl._logger = mock_logger datamanager = make_sklearn_dataset( @@ -181,5 +181,5 @@ def test_crash_due_to_memory_exception( def test_raises_if_no_metric_set(make_automl: Callable[..., AutoML]) -> None: automl = make_automl() - with pytest.raises(ValueError, match="Metric was not set"): + with pytest.raises(ValueError, match="Metric/Metrics was/were not set"): automl._do_dummy_prediction() diff --git a/test/test_evaluation/test_abstract_evaluator.py b/test/test_evaluation/test_abstract_evaluator.py index 7f88383bcd..7bd52c0f76 100644 --- a/test/test_evaluation/test_abstract_evaluator.py +++ b/test/test_evaluation/test_abstract_evaluator.py @@ -65,7 +65,7 @@ def test_finish_up_model_predicts_NaN(self): port=self.port, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) ae.Y_optimization = rs.rand(33, 3) @@ -143,7 +143,7 @@ def test_disable_file_output(self): backend=self.backend_mock, queue=queue_mock, disable_file_output=True, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -169,7 +169,7 @@ def test_disable_file_output(self): output_y_hat_optimization=False, queue=queue_mock, disable_file_output=[disable], - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -226,7 +226,7 @@ def test_disable_file_output(self): backend=self.backend_mock, output_y_hat_optimization=False, 
queue=queue_mock, - metric=accuracy, + metrics=[accuracy], disable_file_output=["y_optimization"], port=self.port, additional_components=dict(), @@ -286,7 +286,7 @@ def test_file_output(self): backend=backend, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -357,7 +357,7 @@ def test_add_additional_components(self): backend=backend, output_y_hat_optimization=False, queue=queue_mock, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=additional_components, ) diff --git a/test/test_evaluation/test_evaluation.py b/test/test_evaluation/test_evaluation.py index 1723b208f2..f5292060a6 100644 --- a/test/test_evaluation/test_evaluation.py +++ b/test/test_evaluation/test_evaluation.py @@ -95,8 +95,8 @@ def test_eval_with_limits_holdout(self, pynisher_mock): multi_objectives=["cost"], stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -126,8 +126,8 @@ def test_zero_or_negative_cutoff(self, pynisher_mock): multi_objectives=["cost"], resampling_strategy="holdout", stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -156,8 +156,8 @@ def test_cutoff_lower_than_remaining_time(self, pynisher_mock): multi_objectives=["cost"], resampling_strategy="holdout", stats=self.stats, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -189,8 +189,8 @@ def test_eval_with_limits_holdout_fail_silent(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -260,8 +260,8 @@ def test_eval_with_limits_holdout_fail_memory_error(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=log_loss, - cost_for_crash=get_cost_of_crash(log_loss), + metrics=[log_loss], + cost_for_crash=get_cost_of_crash([log_loss]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -302,8 +302,8 @@ def test_eval_with_limits_holdout_fail_timeout(self, pynisher_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -352,8 +352,8 @@ def side_effect(**kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -388,8 +388,8 @@ def side_effect(**kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="forkserver", ) @@ -432,8 
+432,8 @@ def side_effect(*args, **kwargs): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -468,8 +468,8 @@ def test_exception_in_target_function(self, eval_holdout_mock): resampling_strategy="holdout", stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, pynisher_context="fork", ) @@ -504,8 +504,8 @@ def test_silent_exception_in_target_function(self): multi_objectives=["cost"], stats=self.stats, memory_limit=3072, - metric=accuracy, - cost_for_crash=get_cost_of_crash(accuracy), + metrics=[accuracy], + cost_for_crash=get_cost_of_crash([accuracy]), abort_on_first_run_crash=False, iterative=False, pynisher_context="fork", diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 8615682ce7..813ad7a35e 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -72,7 +72,7 @@ def test_datasets(self): evaluator = TestEvaluator( backend_mock, queue_, - metric=metric_lookup[D.info["task"]], + metrics=[metric_lookup[D.info["task"]]], port=logging.handlers.DEFAULT_TCP_LOGGING_PORT, additional_components=dict(), ) @@ -110,7 +110,7 @@ def test_eval_test(self): queue=self.queue, backend=self.backend, config=self.configuration, - metric=accuracy, + metrics=[accuracy], seed=1, num_run=1, scoring_functions=None, @@ -133,7 +133,7 @@ def test_eval_test_all_loss_functions(self): queue=self.queue, backend=self.backend, config=self.configuration, - metric=accuracy, + metrics=[accuracy], seed=1, num_run=1, scoring_functions=SCORER_LIST, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index afed8b5ce1..0651443f2a 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -139,7 +139,7 @@ def test_holdout(self, pipeline_mock): resampling_strategy_args={"train_size": 0.66}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], port=self.port, additional_components=dict(), ) @@ -229,7 +229,7 @@ def configuration_fully_fitted(self): resampling_strategy="holdout", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -239,42 +239,45 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, + {"accuracy": value} + for value in [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] ] self.iteration = 0 @@ -381,7 +384,7 @@ def configuration_fully_fitted(self): resampling_strategy="holdout-iterative-fit", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, 
additional_components=dict(), ) @@ -391,26 +394,29 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 0.8, - 0.8, - 0.8, - 0.8, - 0.6, - 0.6, - 0.6, - 0.6, - 0.4, - 0.4, - 0.4, - 0.4, - 0.2, - 0.2, - 0.2, - 0.2, - 0.0, - 0.0, - 0.0, - 0.0, + {"accuracy": value} + for value in [ + 0.8, + 0.8, + 0.8, + 0.8, + 0.6, + 0.6, + 0.6, + 0.6, + 0.4, + 0.4, + 0.4, + 0.4, + 0.2, + 0.2, + 0.2, + 0.2, + 0.0, + 0.0, + 0.0, + 0.0, + ] ] self.iteration = 0 @@ -482,7 +488,7 @@ def test_iterative_holdout_not_iterative(self, pipeline_mock): resampling_strategy="holdout-iterative-fit", scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -538,7 +544,7 @@ def test_cv(self, pipeline_mock): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -606,7 +612,7 @@ def test_partial_cv(self, pipeline_mock): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -688,7 +694,7 @@ def configuration_fully_fitted(self): resampling_strategy_args={"folds": 5}, scoring_functions=None, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -698,42 +704,45 @@ def configuration_fully_fitted(self): class LossSideEffect(object): def __init__(self): self.losses = [ - 1.0, - 1.0, - 1.0, - 1.0, - 0.9, - 0.9, - 0.9, - 0.9, - 0.8, - 0.8, - 0.8, - 0.8, - 0.7, - 0.7, - 0.7, - 0.7, - 0.6, - 0.6, - 0.6, - 0.6, - 0.5, - 0.5, - 0.5, - 0.5, - 0.4, - 0.4, - 0.4, - 0.4, - 0.3, - 0.3, - 0.3, - 0.3, - 0.2, - 0.2, - 0.2, - 0.2, + {"accuracy": value} + for value in [ + 1.0, + 1.0, + 1.0, + 1.0, + 0.9, + 0.9, + 0.9, + 0.9, + 0.8, + 0.8, + 0.8, + 0.8, + 0.7, + 0.7, + 0.7, + 0.7, + 0.6, + 0.6, + 0.6, + 0.6, + 0.5, + 0.5, + 0.5, + 0.5, + 0.4, + 0.4, + 0.4, + 0.4, + 0.3, + 0.3, + 0.3, + 0.3, + 0.2, + 0.2, + 0.2, + 0.2, + ] ] self.iteration = 0 @@ -791,7 +800,7 @@ def test_file_output(self, loss_mock, model_mock): resampling_strategy_args={"folds": 5}, scoring_functions=SCORER_LIST, output_y_hat_optimization=True, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -903,7 +912,7 @@ def test_subsample_indices_classification(self, mock, backend_mock): configuration=configuration, resampling_strategy="cv", resampling_strategy_args={"folds": 10}, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) train_indices = np.arange(69, dtype=int) @@ -974,7 +983,7 @@ def test_subsample_indices_regression(self, mock, backend_mock): configuration=configuration, resampling_strategy="cv", resampling_strategy_args={"folds": 10}, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) train_indices = np.arange(69, dtype=int) @@ -1043,7 +1052,7 @@ def test_predict_proba_binary_classification(self, mock): resampling_strategy="cv", resampling_strategy_args={"folds": 10}, output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) @@ -1091,7 +1100,7 @@ def test_fit_predict_and_loss_standard_additional_run_info( configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - 
metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1139,7 +1148,7 @@ def __call__(self, *args, **kwargs): resampling_strategy="cv", resampling_strategy_args={"folds": 2}, output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1182,7 +1191,7 @@ def __call__(self): mock.get_current_iter.side_effect = Counter() mock.get_max_iter.return_value = 1 mock.get_additional_run_info.return_value = 14678 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1199,7 +1208,7 @@ def __call__(self): configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget=0.0, additional_components=dict(), ) @@ -1228,7 +1237,7 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( mock.estimator_supports_iterative_fit.return_value = False mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = 14678 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1245,7 +1254,7 @@ def test_fit_predict_and_loss_iterative_noniterativemodel_additional_run_info( configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) evaluator.file_output = unittest.mock.Mock(spec=evaluator.file_output) @@ -1285,7 +1294,7 @@ def __call__(self): mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = {"val": 14678} mock.get_max_iter.return_value = 512 - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1302,7 +1311,7 @@ def __call__(self): configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget_type="iterations", budget=50, additional_components=dict(), @@ -1335,7 +1344,7 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( mock.estimator_supports_iterative_fit.return_value = False mock.fit_transformer.return_value = ("Xt", {}) mock.get_additional_run_info.return_value = {"val": 14678} - loss_mock.return_value = 0.5 + loss_mock.return_value = {"accuracy": 0.5} D = get_binary_classification_datamanager() backend_mock.load_datamanager.return_value = D @@ -1352,7 +1361,7 @@ def test_fit_predict_and_loss_budget_2_additional_run_info( configuration=configuration, resampling_strategy="holdout", output_y_hat_optimization=False, - metric=accuracy, + metrics=[accuracy], budget_type="subsample", budget=50, additional_components=dict(), @@ -1406,7 +1415,7 @@ def test_datasets(self): resampling_strategy="cv", resampling_strategy_args={"folds": 2}, output_y_hat_optimization=False, - metric=metric_lookup[D.info["task"]], + metrics=[metric_lookup[D.info["task"]]], additional_components=dict(), ) @@ -2984,7 +2993,7 @@ def test_eval_holdout(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) info = read_queue(self.queue) @@ -3009,7 
+3018,7 @@ def test_eval_holdout_all_loss_functions(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3063,7 +3072,7 @@ def test_eval_holdout_iterative_fit_no_timeout(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3088,7 +3097,7 @@ def test_eval_holdout_budget_iterations(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=1, budget_type="iterations", additional_components=dict(), @@ -3119,7 +3128,7 @@ def test_eval_holdout_budget_iterations_converged(self): exclude={"classifier": ["random_forest", "liblinear_svc"]}, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=80, budget_type="iterations", additional_components=dict(), @@ -3146,7 +3155,7 @@ def test_eval_holdout_budget_subsample(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=30, budget_type="subsample", additional_components=dict(), @@ -3174,7 +3183,7 @@ def test_eval_holdout_budget_mixed_iterations(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=1, budget_type="mixed", additional_components=dict(), @@ -3204,7 +3213,7 @@ def test_eval_holdout_budget_mixed_subsample(self): exclude={"classifier": ["random_forest"]}, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], budget=40, budget_type="mixed", additional_components=dict(), @@ -3231,7 +3240,7 @@ def test_eval_cv(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3256,7 +3265,7 @@ def test_eval_cv_all_loss_functions(self): exclude=None, disable_file_output=False, instance=self.dataset_name, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) @@ -3330,7 +3339,7 @@ def test_eval_partial_cv(self): include=None, exclude=None, disable_file_output=False, - metric=accuracy, + metrics=[accuracy], additional_components=dict(), ) rval = read_queue(self.queue) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index de27ed0451..6b5d591a80 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -6,7 +6,11 @@ import autosklearn.metrics from autosklearn.constants import BINARY_CLASSIFICATION, REGRESSION -from autosklearn.metrics import calculate_loss, calculate_score, compute_single_metric +from autosklearn.metrics import ( + calculate_losses, + calculate_scores, + compute_single_metric, +) import pytest import unittest @@ -543,7 +547,7 @@ def test_unsupported_task_type(self): raised = False try: - calculate_score(y_true, y_pred, 6, scorer) + calculate_scores(y_true, y_pred, 6, scorer) except NotImplementedError: raised = True self.assertTrue(raised) @@ -561,11 +565,11 @@ def test_classification_scoring_functions(self): y_pred = np.array( [[0.0, 1.0], [0.0, 1.0], [0.0, 1.0], [1.0, 0.0], [1.0, 0.0], [1.0, 0.0]] ) - score_dict = calculate_score( + score_dict = calculate_scores( y_true, y_pred, BINARY_CLASSIFICATION, - autosklearn.metrics.accuracy, + 
[autosklearn.metrics.accuracy], scoring_functions, ) @@ -591,11 +595,11 @@ def test_regression_scoring_functions(self): y_true = np.array([1, 2, 3, -4]) y_pred = y_true.copy() - score_dict = calculate_score( + score_dict = calculate_scores( y_true, y_pred, REGRESSION, - autosklearn.metrics.root_mean_squared_error, + [autosklearn.metrics.root_mean_squared_error], scoring_functions, ) @@ -615,7 +619,9 @@ def test_classification_only_metric(self): ) scorer = autosklearn.metrics.accuracy - score = calculate_score(y_true, y_pred, BINARY_CLASSIFICATION, scorer) + score = calculate_scores(y_true, y_pred, BINARY_CLASSIFICATION, [scorer])[ + "accuracy" + ] previous_score = scorer._optimum self.assertAlmostEqual(score, previous_score) @@ -625,36 +631,65 @@ def test_regression_only_metric(self): y_pred = y_true.copy() scorer = autosklearn.metrics.root_mean_squared_error - score = calculate_score(y_true, y_pred, REGRESSION, scorer) + score = calculate_scores(y_true, y_pred, REGRESSION, [scorer])[ + "root_mean_squared_error" + ] previous_score = scorer._optimum self.assertAlmostEqual(score, previous_score) -def test_calculate_loss(): +def test_calculate_losses(): # In a 0-1 ranged scorer, make sure that the loss # has an expected positive value y_pred = np.array([0, 1, 0, 1, 1, 1, 0, 0, 0, 0]) y_true = np.array([0, 1, 0, 1, 1, 0, 0, 0, 0, 0]) score = sklearn.metrics.accuracy_score(y_true, y_pred) - assert pytest.approx(score) == calculate_score( + assert {"accuracy": pytest.approx(score)} == calculate_scores( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], ) - assert pytest.approx(1.0 - score) == calculate_loss( + assert {"accuracy": pytest.approx(1.0 - score)} == calculate_losses( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], ) - # Test the dictionary case - score_dict = calculate_score( + # Test two metrics + score_dict = calculate_scores( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + expected_score_dict = { + "accuracy": 0.9, + "balanced_accuracy": 0.9285714285714286, + } + loss_dict = calculate_losses( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + for expected_metric, expected_score in expected_score_dict.items(): + assert pytest.approx(expected_score) == score_dict[expected_metric] + assert pytest.approx(1 - expected_score) == loss_dict[expected_metric] + + # Test additional scoring functions + score_dict = calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], scoring_functions=[ autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy, @@ -664,11 +699,11 @@ def test_calculate_loss(): "accuracy": 0.9, "balanced_accuracy": 0.9285714285714286, } - loss_dict = calculate_loss( + loss_dict = calculate_losses( solution=y_true, prediction=y_pred, task_type=BINARY_CLASSIFICATION, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], scoring_functions=[ autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy, @@ -683,17 +718,17 @@ def test_calculate_loss(): y_true = np.array([0.1, 0.2, 0.3, 0.4, 
0.5, 0.6]) y_pred = np.array([0.11, 0.22, 0.33, 0.44, 0.55, 0.66]) score = sklearn.metrics.mean_squared_error(y_true, y_pred) - assert pytest.approx(0 - score) == calculate_score( + assert {"mean_squared_error": pytest.approx(0 - score)} == calculate_scores( solution=y_true, prediction=y_pred, task_type=REGRESSION, - metric=autosklearn.metrics.mean_squared_error, + metrics=[autosklearn.metrics.mean_squared_error], ) - assert pytest.approx(score) == calculate_loss( + assert {"mean_squared_error": pytest.approx(score)} == calculate_losses( solution=y_true, prediction=y_pred, task_type=REGRESSION, - metric=autosklearn.metrics.mean_squared_error, + metrics=[autosklearn.metrics.mean_squared_error], ) diff --git a/test/test_optimizer/test_smbo.py b/test/test_optimizer/test_smbo.py index 0b14f4a722..8462c67baf 100644 --- a/test/test_optimizer/test_smbo.py +++ b/test/test_optimizer/test_smbo.py @@ -13,13 +13,13 @@ import pytest -@pytest.mark.parametrize("context", ["fork", "forkserver"]) +@pytest.mark.parametrize("context", ["fork", "forkserver", "spawn"]) def test_smbo_metalearning_configurations(backend, context, dask_client) -> None: # Get the inputs to the optimizer X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") config_space = AutoML( delete_tmp_folder_after_terminate=False, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], time_left_for_this_task=20, per_run_time_limit=5, ).fit( @@ -38,7 +38,7 @@ def test_smbo_metalearning_configurations(backend, context, dask_client) -> None total_walltime_limit=10, func_eval_time_limit=5, memory_limit=4096, - metric=autosklearn.metrics.accuracy, + metrics=[autosklearn.metrics.accuracy], stopwatch=stopwatch, n_jobs=1, dask_client=dask_client, From 19251e846f928add04a2aba4ad9d8527cf4ea8b0 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 6 May 2022 15:39:20 +0200 Subject: [PATCH 07/24] Fix one more example --- examples/40_advanced/example_metrics.py | 20 ++++++++------------ 1 file changed, 8 insertions(+), 12 deletions(-) diff --git a/examples/40_advanced/example_metrics.py b/examples/40_advanced/example_metrics.py index 33d0f678fd..7784491746 100644 --- a/examples/40_advanced/example_metrics.py +++ b/examples/40_advanced/example_metrics.py @@ -71,18 +71,18 @@ def error_wk(solution, prediction, extra_argument): print("#" * 80) print("Use predefined accuracy metric") +scorer = autosklearn.metrics.accuracy cls = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=60, per_run_time_limit=30, seed=1, - metric=autosklearn.metrics.accuracy, + metric=scorer, ) cls.fit(X_train, y_train) predictions = cls.predict(X_test) -score = sklearn.metrics.accuracy_score(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name}") +score = scorer(y_test, predictions) +print(f"Accuracy score {score:.3f} using {scorer.name}") ############################################################################ # Second example: Use own accuracy metric @@ -108,8 +108,7 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = accuracy_scorer(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name:s}") +print(f"Accuracy score {score:.3f} using {accuracy_scorer.name:s}") ############################################################################ # Third example: Use own error metric @@ -135,8 +134,7 @@ def error_wk(solution, prediction, extra_argument): 
cls.predictions = cls.predict(X_test) score = error_rate(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Error score {score:.3f} using {metric_name:s}") +print(f"Error score {score:.3f} using {error_rate.name:s}") ############################################################################ # Fourth example: Use own accuracy metric with additional argument @@ -160,8 +158,7 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = accuracy_scorer(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Accuracy score {score:.3f} using {metric_name:s}") +print(f"Accuracy score {score:.3f} using {accuracy_scorer.name:s}") ############################################################################ # Fifth example: Use own accuracy metric with additional argument @@ -188,5 +185,4 @@ def error_wk(solution, prediction, extra_argument): predictions = cls.predict(X_test) score = error_rate(y_test, predictions) -metric_name = cls.automl_._metric.name -print(f"Error score {score:.3f} using {metric_name:s}") +print(f"Error score {score:.3f} using {error_rate.name:s}") From 177d9135f62795a283355076faa2ed72903d9977 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Fri, 6 May 2022 16:31:41 +0200 Subject: [PATCH 08/24] Add multi-objective example --- autosklearn/automl.py | 5 +- .../40_advanced/example_multi_objective.py | 65 +++++++++++++++++++ 2 files changed, 69 insertions(+), 1 deletion(-) create mode 100644 examples/40_advanced/example_multi_objective.py diff --git a/autosklearn/automl.py b/autosklearn/automl.py index b051e7217c..aa86137ad7 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1803,8 +1803,11 @@ def cv_results_(self): param_dict = config.get_dictionary() params.append(param_dict) + cost = run_value.cost + if len(self._metrics) > 1: + cost = cost[0] mean_test_score.append( - self._metrics[0]._optimum - (self._metrics[0]._sign * run_value.cost) + self._metrics[0]._optimum - (self._metrics[0]._sign * cost) ) mean_fit_time.append(run_value.time) budgets.append(run_key.budget) diff --git a/examples/40_advanced/example_multi_objective.py b/examples/40_advanced/example_multi_objective.py new file mode 100644 index 0000000000..403bf54dfd --- /dev/null +++ b/examples/40_advanced/example_multi_objective.py @@ -0,0 +1,65 @@ +# -*- encoding: utf-8 -*- +""" +============== +Classification +============== + +The following example shows how to fit *auto-sklearn* to optimize for two +competing metrics: `precision` and `recall` (read more on this tradeoff +in the `scikit-learn docs `_. + +Auto-sklearn uses `SMAC3's implementation of ParEGO `_. +Multi-objective ensembling and proper access to the full Pareto front will be added in the near +future. 
+""" +from pprint import pprint + +import sklearn.datasets +import sklearn.metrics + +import autosklearn.classification +import autosklearn.metrics + + +############################################################################ +# Data Loading +# ============ + +X, y = sklearn.datasets.load_breast_cancer(return_X_y=True) +X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split( + X, y, random_state=1 +) + +############################################################################ +# Build and fit a classifier +# ========================== + +automl = autosklearn.classification.AutoSklearnClassifier( + time_left_for_this_task=30, + tmp_folder="/tmp/autosklearn_classification_example_tmp", + metric=[autosklearn.metrics.precision, autosklearn.metrics.recall], +) +automl.fit(X_train, y_train, dataset_name="breast_cancer") + +############################################################################ +# Compute the two competing metrics +# ================================= + +predictions = automl.predict(X_test) +print("Precision", sklearn.metrics.precision_score(y_test, predictions)) +print("Recall", sklearn.metrics.recall_score(y_test, predictions)) + +############################################################################ +# View the models found by auto-sklearn +# ===================================== +# They are by default sorted by the first metric given to *auto-sklearn*. + +print(automl.leaderboard()) + +############################################################################ +# ``cv_results`` also contains both metrics +# ========================================= +# Similarly to the leaderboard, they are sorted by the first metric given +# to *auto-sklearn*. + +pprint(automl.cv_results_) From 98beb3aee8e95abf77ec40a360952e60e3bb4f2b Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Sat, 7 May 2022 21:10:02 +0200 Subject: [PATCH 09/24] Simplify internal interface --- autosklearn/evaluation/abstract_evaluator.py | 9 ++++++-- autosklearn/evaluation/train_evaluator.py | 22 ++++++-------------- 2 files changed, 13 insertions(+), 18 deletions(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 852c47e472..82ad211560 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -360,7 +360,7 @@ def _loss( def finish_up( self, loss: Union[Dict[str, float], float], - train_loss: Optional[Union[float, Dict[str, float]]], + train_loss: Optional[Dict[str, float]], opt_pred: np.ndarray, valid_pred: np.ndarray, test_pred: np.ndarray, @@ -414,7 +414,12 @@ def finish_up( additional_run_info["duration"] = self.duration additional_run_info["num_run"] = self.num_run if train_loss is not None: - additional_run_info["train_loss"] = train_loss + if len(self.metrics) == 1: + additional_run_info["train_loss"] = train_loss[self.metrics[0].name] + else: + additional_run_info["train_loss"] = [ + train_loss[metric.name] for metric in self.metrics + ] if validation_loss is not None: additional_run_info["validation_loss"] = validation_loss if test_loss is not None: diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index 240e9fecbc..a52420b8aa 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -437,8 +437,8 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: w / sum(opt_fold_weights) for w in opt_fold_weights ] - train_loss = [ - np.average( + 
train_loss = { + metric.name: np.average( [ train_losses[i][str(metric)] for i in range(self.num_cv_folds) @@ -446,9 +446,7 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: weights=train_fold_weights_percentage, ) for metric in self.metrics - ] - if len(self.metrics) == 1: - train_loss = train_loss[0] + } # if all_scoring_function is true, return a dict of opt_loss. # Otherwise, return a scalar. @@ -631,15 +629,13 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: ] opt_fold_weights = [w / sum(opt_fold_weights) for w in opt_fold_weights] - train_loss = [ - np.average( + train_loss = { + metric.name: np.average( [train_losses[i][str(metric)] for i in range(self.num_cv_folds)], weights=train_fold_weights, ) for metric in self.metrics - ] - if len(self.metrics) == 1: - train_loss = train_loss[0] + } # if all_scoring_function is true, return a dict of opt_loss. Otherwise, # return a scalar. @@ -788,8 +784,6 @@ def partial_fit_predict_and_loss(self, fold: int, iterative: bool = False) -> No add_model_to_self=True, ) train_loss = self._loss(self.Y_actual_train, train_pred) - if len(self.metrics) == 1: - train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_targets[fold], opt_pred) if self.model.estimator_supports_iterative_fit(): @@ -890,8 +884,6 @@ def _partial_fit_and_predict_iterative( else self.Y_train[train_indices], Y_train_pred, ) - if len(self.metrics) == 1: - train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) additional_run_info = model.get_additional_run_info() @@ -937,8 +929,6 @@ def _partial_fit_and_predict_iterative( else self.Y_train[train_indices], Y_train_pred, ) - if len(self.metrics) == 1: - train_loss = train_loss[self.metrics[0].name] loss = self._loss(self.Y_train[test_indices], Y_optimization_pred) if self.model.estimator_supports_iterative_fit(): model_max_iter = self.model.get_max_iter() From 93cdbfc87387cba39063fd5b43ee7c8f5076720b Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 9 May 2022 10:13:01 +0200 Subject: [PATCH 10/24] Act on further feedback --- autosklearn/automl.py | 8 +++++--- autosklearn/evaluation/train_evaluator.py | 4 ---- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index aa86137ad7..d305ee8a08 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1299,9 +1299,11 @@ def fit_pipeline( if "resampling_strategy" not in kwargs: kwargs["resampling_strategy"] = self._resampling_strategy if "metrics" not in kwargs: - kwargs["metric"] = self._metrics - kwargs["metrics"] = kwargs["metric"] - del kwargs["metric"] + if "metric" in kwargs: + kwargs["metrics"] = kwargs["metric"] + del kwargs["metric"] + else: + kwargs["metrics"] = self._metrics if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: diff --git a/autosklearn/evaluation/train_evaluator.py b/autosklearn/evaluation/train_evaluator.py index a52420b8aa..f6317ca94e 100644 --- a/autosklearn/evaluation/train_evaluator.py +++ b/autosklearn/evaluation/train_evaluator.py @@ -395,8 +395,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_test_pred[i] = test_pred train_splits[i] = train_indices - # Compute train loss of this fold and store it. train_loss could - # either be a scalar or a dict of scalars with metrics as keys. 
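# Editor's sketch, not part of the patch: the hunks above replace the per-fold
# train-loss list (with its single-metric special case) by a dict keyed by
# metric name. A minimal, self-contained illustration of that reduction, with
# made-up fold losses, metric names, and weights:
import numpy as np

fold_train_losses = [
    {"accuracy": 0.10, "balanced_accuracy": 0.12},  # fold 0
    {"accuracy": 0.20, "balanced_accuracy": 0.18},  # fold 1
]
fold_weights = [0.5, 0.5]

# mirrors the `train_loss = {metric.name: np.average(...)}` comprehension above
averaged_train_loss = {
    name: np.average(
        [losses[name] for losses in fold_train_losses],
        weights=fold_weights,
    )
    for name in ("accuracy", "balanced_accuracy")
}
# -> {"accuracy": 0.15, "balanced_accuracy": 0.15}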
train_loss = self._loss( self.Y_train.iloc[train_indices] if hasattr(self.Y_train, "iloc") @@ -601,8 +599,6 @@ def fit_predict_and_loss(self, iterative: bool = False) -> None: Y_test_pred[i] = test_pred train_splits[i] = train_split - # Compute train loss of this fold and store it. train_loss could - # either be a scalar or a dict of scalars with metrics as keys. train_loss = self._loss( self.Y_train_targets[train_split], train_pred, From 610cda86a16da9c746bd0c15d6d9e4b470199d58 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 9 May 2022 13:00:17 +0200 Subject: [PATCH 11/24] Fix bug --- autosklearn/automl.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index d305ee8a08..0bdbcfb9fb 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1304,6 +1304,8 @@ def fit_pipeline( del kwargs["metric"] else: kwargs["metrics"] = self._metrics + if not isinstance(kwargs["metrics"], Sequence): + kwargs["metrics"] = [kwargs["metrics"]] if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: From 155713189de382979d579346d9b01e9d37463caa Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 9 May 2022 14:01:01 +0200 Subject: [PATCH 12/24] Update cv_results_ for multi-objective sklearn compliance --- autosklearn/automl.py | 44 +++++++++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 16 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 0bdbcfb9fb..cdb71acefe 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -4,6 +4,7 @@ import copy import io +import itertools import json import logging.handlers import multiprocessing @@ -1767,14 +1768,11 @@ def cv_results_(self): metric_mask = dict() metric_dict = dict() - metric_name = [] - for metric in self._scoring_functions: - metric_name.append(metric.name) + for metric in itertools.chain(self._metrics, self._scoring_functions): metric_dict[metric.name] = [] metric_mask[metric.name] = [] - mean_test_score = [] mean_fit_time = [] params = [] status = [] @@ -1807,12 +1805,7 @@ def cv_results_(self): param_dict = config.get_dictionary() params.append(param_dict) - cost = run_value.cost - if len(self._metrics) > 1: - cost = cost[0] - mean_test_score.append( - self._metrics[0]._optimum - (self._metrics[0]._sign * cost) - ) + mean_fit_time.append(run_value.time) budgets.append(run_key.budget) @@ -1827,6 +1820,14 @@ def cv_results_(self): parameter_dictionaries[hp_name].append(hp_value) masks[hp_name].append(mask_value) + cost = [run_value.cost] if len(self._metrics) == 1 else run_value.cost + for metric_idx, metric in enumerate(self._metrics): + metric_cost = cost[metric_idx] + metric_value = metric._optimum - (metric._sign * metric_cost) + mask_value = False + metric_dict[metric.name].append(metric_value) + metric_mask[metric.name].append(mask_value) + for metric in self._scoring_functions: if metric.name in run_value.additional_info.keys(): metric_cost = run_value.additional_info[metric.name] @@ -1838,15 +1839,26 @@ def cv_results_(self): metric_dict[metric.name].append(metric_value) metric_mask[metric.name].append(mask_value) - results["mean_test_score"] = np.array(mean_test_score) - for name in metric_name: - masked_array = ma.MaskedArray(metric_dict[name], metric_mask[name]) - results["metric_%s" % name] = masked_array + if len(self._metrics) == 1: + results["mean_test_score"] = np.array(metric_dict[self._metrics[0].name]) + rank_order = -1 * 
self._metrics[0]._sign * results["mean_test_score"] + results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") + else: + for metric in self._metrics: + key = f"mean_test_{metric.name}" + results[key] = np.array(metric_dict[metric.name]) + rank_order = -1 * metric._sign * results[key] + results[f"rank_test_{metric.name}"] = scipy.stats.rankdata( + rank_order, method="min" + ) + for metric in self._scoring_functions: + masked_array = ma.MaskedArray( + metric_dict[metric.name], metric_mask[metric.name] + ) + results[f"metric_{metric.name}"] = masked_array results["mean_fit_time"] = np.array(mean_fit_time) results["params"] = params - rank_order = -1 * self._metrics[0]._sign * results["mean_test_score"] - results["rank_test_scores"] = scipy.stats.rankdata(rank_order, method="min") results["status"] = status results["budgets"] = budgets From 75c3eb43ac54770b5ad20a459b4b9a539ba6a2ec Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 9 May 2022 16:15:13 +0200 Subject: [PATCH 13/24] Update leaderboard for multi-objective optimization --- autosklearn/estimators.py | 206 ++++++++++++------ .../40_advanced/example_multi_objective.py | 2 +- test/test_estimators/test_estimators.py | 2 +- 3 files changed, 141 insertions(+), 69 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 8cbc428986..b23ec89d55 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -821,7 +821,12 @@ def leaderboard( # TODO validate that `self` is fitted. This is required for # self.ensemble_ to get the identifiers of models it will generate # weights for. - column_types = AutoSklearnEstimator._leaderboard_columns() + num_metrics = ( + 1 + if self.metric is None or isinstance(self.metric, Scorer) + else len(self.metric) + ) + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics) # Validation of top_k if ( @@ -862,11 +867,26 @@ def leaderboard( columns = column_types["simple"] # Validation of sorting - if sort_by not in column_types["all"]: - raise ValueError( - f"sort_by='{sort_by}' must be one of included " - f"columns {set(column_types['all'])}" - ) + if sort_by == "cost": + sort_by_cost = True + if num_metrics == 1: + sort_by = ["cost", "model_id"] + else: + sort_by = [f"cost_{i}" for i in range(num_metrics)] + ["model_id"] + else: + sort_by_cost = False + if isinstance(sort_by, str): + if sort_by not in column_types["all"]: + raise ValueError( + f"sort_by='{sort_by}' must be one of included " + f"columns {set(column_types['all'])}" + ) + elif len(set(sort_by) - set(column_types["all"])) > 0: + too_much = set(sort_by) - set(column_types["all"]) + raise ValueError( + f"sort_by='{too_much}' must be in the included columns " + f"{set(column_types['all'])}" + ) valid_sort_orders = ["auto", "ascending", "descending"] if not (isinstance(sort_order, str) and sort_order in valid_sort_orders): @@ -876,30 +896,37 @@ def leaderboard( # To get all the models that were optmized, we collect what we can from # runhistory first. 
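# Editor's illustration with hypothetical values, not part of the patch: with
# two optimization objectives, each finished run collected from the runhistory
# below is flattened into one cost entry per metric ("cost_0", "cost_1", ...)
# instead of the single "cost" field used for single-objective runs, e.g.:
example_model_run = {
    "model_id": 2,
    "seed": 1,
    "budget": 0.0,
    "duration": 1.7,
    "status": "StatusType.SUCCESS",
    "cost_0": 0.042,  # loss under the first optimization metric
    "cost_1": 0.037,  # loss under the second optimization metric
}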
- def has_key(rv, key): + def additional_info_has_key(rv, key): return rv.additional_info and key in rv.additional_info - model_runs = { - rval.additional_info["num_run"]: { - "model_id": rval.additional_info["num_run"], - "seed": rkey.seed, - "budget": rkey.budget, - "duration": rval.time, - "config_id": rkey.config_id, - "start_time": rval.starttime, - "end_time": rval.endtime, - "status": str(rval.status), - "cost": rval.cost if isinstance(rval.cost, float) else rval.cost[0], - "train_loss": rval.additional_info["train_loss"] - if has_key(rval, "train_loss") - else None, - "config_origin": rval.additional_info["configuration_origin"] - if has_key(rval, "configuration_origin") - else None, - } - for rkey, rval in self.automl_.runhistory_.data.items() - if has_key(rval, "num_run") - } + model_runs = {} + for rkey, rval in self.automl_.runhistory_.data.items(): + if not additional_info_has_key(rval, "num_run"): + continue + else: + model_key = rval.additional_info["num_run"] + model_run = { + "model_id": rval.additional_info["num_run"], + "seed": rkey.seed, + "budget": rkey.budget, + "duration": rval.time, + "config_id": rkey.config_id, + "start_time": rval.starttime, + "end_time": rval.endtime, + "status": str(rval.status), + "train_loss": rval.additional_info["train_loss"] + if additional_info_has_key(rval, "train_loss") + else None, + "config_origin": rval.additional_info["configuration_origin"] + if additional_info_has_key(rval, "configuration_origin") + else None, + } + if num_metrics == 1: + model_run["cost"] = rval.cost + else: + for cost_idx, cost in enumerate(rval.cost): + model_run[f"cost_{cost_idx}"] = cost + model_runs[model_key] = model_run # Next we get some info about the model itself model_class_strings = { @@ -947,7 +974,7 @@ def has_key(rv, key): # collected. I have no clue why this is but to prevent failures, we fill # the values with NaN if model_id not in model_runs: - model_runs[model_id] = { + model_run = { "model_id": model_id, "seed": pd.NA, "budget": pd.NA, @@ -956,10 +983,15 @@ def has_key(rv, key): "start_time": pd.NA, "end_time": pd.NA, "status": pd.NA, - "cost": pd.NA, "train_loss": pd.NA, "config_origin": pd.NA, } + if num_metrics == 1: + model_run[model_id]["cost"] = pd.NA + else: + for cost_idx in range(num_metrics): + model_run[model_id][f"cost_{cost_idx}"] = pd.NA + model_runs[model_id] = model_run model_runs[model_id]["ensemble_weight"] = weight @@ -978,8 +1010,13 @@ def has_key(rv, key): # `rank` relies on `cost` so we include `cost` # We drop it later if it's not requested - if "rank" in columns and "cost" not in columns: - columns = [*columns, "cost"] + if "rank" in columns: + if num_metrics == 1 and "cost" not in columns: + columns = [*columns, "cost"] + elif num_metrics > 1 and any( + f"cost_{i}" not in columns for i in range(num_metrics) + ): + columns = columns + [f"cost_{i}" for i in range(num_metrics)] # Finally, convert into a tabular format by converting the dict into # column wise orientation. 
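# Editor's sketch of the resulting user-facing behaviour. This is a
# hypothetical run that mirrors the multi-objective example added earlier in
# this series; the models found and the printed values differ between runs.
import sklearn.datasets
import sklearn.model_selection

import autosklearn.classification
import autosklearn.metrics

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=60,
    per_run_time_limit=10,
    metric=[autosklearn.metrics.precision, autosklearn.metrics.recall],
)
automl.fit(X_train, y_train)

# One cost column per objective; "rank" is not computed for multi-objective
# runs, and any subset of the cost columns can be used for sorting.
print(automl.leaderboard(sort_by=["cost_1", "cost_0"]))

# cv_results_ exposes per-metric score and rank columns instead of the single
# mean_test_score / rank_test_scores entries.
print(automl.cv_results_["mean_test_precision"][:3])
print(automl.cv_results_["rank_test_recall"][:3])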
@@ -997,40 +1034,63 @@ def has_key(rv, key): # Add the `rank` column if needed, dropping `cost` if it's not # requested by the user if "rank" in columns: - dataframe.sort_values(by="cost", ascending=True, inplace=True) - dataframe.insert( - column="rank", - value=range(1, len(dataframe) + 1), - loc=list(columns).index("rank") - 1, - ) # account for `model_id` - - if "cost" not in columns: - dataframe.drop("cost", inplace=True) + if num_metrics == 1: + dataframe.sort_values(by="cost", ascending=True, inplace=True) + dataframe.insert( + column="rank", + value=range(1, len(dataframe) + 1), + loc=list(columns).index("rank") - 1, + ) # account for `model_id` + else: + self.automl_._logger.warning( + "Cannot compute rank for multi-objective optimization porblems." + ) # Decide on the sort order depending on what it gets sorted by descending_columns = ["ensemble_weight", "duration"] if sort_order == "auto": - ascending_param = False if sort_by in descending_columns else True + ascending_param = [ + False if sby in descending_columns else True for sby in sort_by + ] else: ascending_param = False if sort_order == "descending" else True # Sort by the given column name, defaulting to 'model_id' if not present - if sort_by not in dataframe.columns: + if ( + (not sort_by_cost and len(set(sort_by) - set(dataframe.columns)) > 0) + or (sort_by_cost and "cost" not in dataframe.columns) + or ( + sort_by_cost + and any( + f"cost_{i}" not in dataframe.columns for i in range(num_metrics) + ) + ) + ): self.automl_._logger.warning( f"sort_by = '{sort_by}' was not present" ", defaulting to sort on the index " "'model_id'" ) sort_by = "model_id" + sort_by_cost = False + ascending_param = True - # Cost can be the same but leave rank all over the place - if "rank" in columns and sort_by == "cost": + # Single objective + if sort_by_cost: dataframe.sort_values( - by=[sort_by, "rank"], ascending=[ascending_param, True], inplace=True + by=sort_by, ascending=[True] * len(sort_by), inplace=True ) else: dataframe.sort_values(by=sort_by, ascending=ascending_param, inplace=True) + if num_metrics: + if "cost" not in columns and "cost" in dataframe.columns: + dataframe.drop("cost", inplace=True) + else: + for i in range(num_metrics): + if f"cost_{i}" not in columns and f"cost_{i}" in dataframe.columns: + dataframe.drop(f"cost_{i}", inplace=True) + # Lastly, just grab the top_k if top_k == "all" or top_k >= len(dataframe): top_k = len(dataframe) @@ -1040,27 +1100,39 @@ def has_key(rv, key): return dataframe @staticmethod - def _leaderboard_columns() -> Dict[Literal["all", "simple", "detailed"], List[str]]: - all = [ - "model_id", - "rank", - "ensemble_weight", - "type", - "cost", - "duration", - "config_id", - "train_loss", - "seed", - "start_time", - "end_time", - "budget", - "status", - "data_preprocessors", - "feature_preprocessors", - "balancing_strategy", - "config_origin", - ] - simple = ["model_id", "rank", "ensemble_weight", "type", "cost", "duration"] + def _leaderboard_columns( + num_metrics: int, + ) -> Dict[Literal["all", "simple", "detailed"], List[str]]: + if num_metrics == 1: + cost_list = ["cost"] + else: + cost_list = [f"cost_{i}" for i in range(num_metrics)] + all = ( + [ + "model_id", + "rank", + "ensemble_weight", + "type", + ] + + cost_list + + [ + "duration", + "config_id", + "train_loss", + "seed", + "start_time", + "end_time", + "budget", + "status", + "data_preprocessors", + "feature_preprocessors", + "balancing_strategy", + "config_origin", + ] + ) + simple = ( + ["model_id", "rank", "ensemble_weight", 
"type"] + cost_list + ["duration"] + ) detailed = all return {"all": all, "detailed": detailed, "simple": simple} diff --git a/examples/40_advanced/example_multi_objective.py b/examples/40_advanced/example_multi_objective.py index 403bf54dfd..f81f0d4709 100644 --- a/examples/40_advanced/example_multi_objective.py +++ b/examples/40_advanced/example_multi_objective.py @@ -36,7 +36,7 @@ automl = autosklearn.classification.AutoSklearnClassifier( time_left_for_this_task=30, - tmp_folder="/tmp/autosklearn_classification_example_tmp", + tmp_folder="/tmp/autosklearn_multi_objective_example_tmp", metric=[autosklearn.metrics.precision, autosklearn.metrics.recall], ) automl.fit(X_train, y_train, dataset_name="breast_cancer") diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index cd4b0922de..dac4febe19 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -388,7 +388,7 @@ def test_leaderboard( # Comprehensive test tasks a substantial amount of time, manually set if # required. MAX_COMBO_SIZE_FOR_INCLUDE_PARAM = 3 # [0, len(valid_columns) + 1] - column_types = AutoSklearnEstimator._leaderboard_columns() + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics=1) # Create a dict of all possible param values for each param # with some invalid one's of the incorrect type From 1f90ec8497bc34757166ccc203757f965449b9db Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Mon, 9 May 2022 18:19:09 +0200 Subject: [PATCH 14/24] Include Feedback from Katharina --- autosklearn/automl.py | 2 -- autosklearn/ensembles/ensemble_selection.py | 2 +- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index cdb71acefe..90e70392b4 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -691,8 +691,6 @@ def fit( # The metric must exist as of this point # It can be provided in the constructor, or automatically # defined in the estimator fit call - if self._metrics is None: - raise ValueError("No metrics given.") if isinstance(self._metrics, Sequence): for entry in self._metrics: if not isinstance(entry, Scorer): diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index d5bffcd596..14a8a01445 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -214,7 +214,7 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: solution=labels, prediction=ensemble_prediction, task_type=self.task_type, - metric=self.metric, + metrics=[self.metric], scoring_functions=None, ), ) From a9227b8f34c52920e16b7bbce9de65ee1a813030 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 10 May 2022 18:18:17 +0200 Subject: [PATCH 15/24] Take offline feedback into account --- autosklearn/ensembles/ensemble_selection.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index 14a8a01445..b252f32149 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, List, Optional, Tuple, Union, cast +from typing import Any, Dict, List, Optional, Tuple, Union import random from collections import Counter @@ -208,16 +208,13 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: ensemble_prediction = 
np.mean(np.array(ensemble), axis=0) # calculate_loss is versatile and can return a dict of losses # when scoring_functions=None, we know it will be a float - losses[j] = cast( - float, - calculate_losses( - solution=labels, - prediction=ensemble_prediction, - task_type=self.task_type, - metrics=[self.metric], - scoring_functions=None, - ), - ) + losses[j] = calculate_losses( + solution=labels, + prediction=ensemble_prediction, + task_type=self.task_type, + metrics=[self.metric], + scoring_functions=None, + )[self.metric.name] ensemble.pop() best = np.nanargmin(losses) ensemble.append(predictions[best]) From 2c36d456a76ca11aa5caacc8964ff6d2eb0ce5c6 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Tue, 10 May 2022 18:20:34 +0200 Subject: [PATCH 16/24] Take offline feedback into account --- autosklearn/ensembles/ensemble_selection.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/autosklearn/ensembles/ensemble_selection.py b/autosklearn/ensembles/ensemble_selection.py index b252f32149..0c99db64c1 100644 --- a/autosklearn/ensembles/ensemble_selection.py +++ b/autosklearn/ensembles/ensemble_selection.py @@ -206,8 +206,6 @@ def _slow(self, predictions: List[np.ndarray], labels: np.ndarray) -> None: for j, pred in enumerate(predictions): ensemble.append(pred) ensemble_prediction = np.mean(np.array(ensemble), axis=0) - # calculate_loss is versatile and can return a dict of losses - # when scoring_functions=None, we know it will be a float losses[j] = calculate_losses( solution=labels, prediction=ensemble_prediction, From 1f1d2184ecf9c4c126d4e4265788bf6ab344e636 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 11 May 2022 09:24:39 +0200 Subject: [PATCH 17/24] Eddie's feedback --- autosklearn/estimators.py | 26 ++++++++++++-------- autosklearn/evaluation/__init__.py | 3 ++- autosklearn/evaluation/abstract_evaluator.py | 8 +++--- autosklearn/metrics/__init__.py | 4 +-- 4 files changed, 23 insertions(+), 18 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index b23ec89d55..85ffbeec76 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -827,6 +827,10 @@ def leaderboard( else len(self.metric) ) column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics) + if num_metrics == 1: + multi_objective_cost_names = [] + else: + multi_objective_cost_names = [f"cost_{i}" for i in range(num_metrics)] # Validation of top_k if ( @@ -872,7 +876,7 @@ def leaderboard( if num_metrics == 1: sort_by = ["cost", "model_id"] else: - sort_by = [f"cost_{i}" for i in range(num_metrics)] + ["model_id"] + sort_by = list(multi_objective_cost_names) + ["model_id"] else: sort_by_cost = False if isinstance(sort_by, str): @@ -985,12 +989,13 @@ def additional_info_has_key(rv, key): "status": pd.NA, "train_loss": pd.NA, "config_origin": pd.NA, + "type": pd.NA, } if num_metrics == 1: - model_run[model_id]["cost"] = pd.NA + model_run["cost"] = pd.NA else: for cost_idx in range(num_metrics): - model_run[model_id][f"cost_{cost_idx}"] = pd.NA + model_run[f"cost_{cost_idx}"] = pd.NA model_runs[model_id] = model_run model_runs[model_id]["ensemble_weight"] = weight @@ -1014,9 +1019,9 @@ def additional_info_has_key(rv, key): if num_metrics == 1 and "cost" not in columns: columns = [*columns, "cost"] elif num_metrics > 1 and any( - f"cost_{i}" not in columns for i in range(num_metrics) + cost_name not in columns for cost_name in multi_objective_cost_names ): - columns = columns + [f"cost_{i}" for i in range(num_metrics)] + columns = columns + 
list(multi_objective_cost_names) # Finally, convert into a tabular format by converting the dict into # column wise orientation. @@ -1062,7 +1067,8 @@ def additional_info_has_key(rv, key): or ( sort_by_cost and any( - f"cost_{i}" not in dataframe.columns for i in range(num_metrics) + cost_name not in dataframe.columns + for cost_name in multi_objective_cost_names ) ) ): @@ -1083,13 +1089,13 @@ def additional_info_has_key(rv, key): else: dataframe.sort_values(by=sort_by, ascending=ascending_param, inplace=True) - if num_metrics: + if num_metrics == 1: if "cost" not in columns and "cost" in dataframe.columns: dataframe.drop("cost", inplace=True) else: - for i in range(num_metrics): - if f"cost_{i}" not in columns and f"cost_{i}" in dataframe.columns: - dataframe.drop(f"cost_{i}", inplace=True) + for cost_name in multi_objective_cost_names: + if cost_name not in columns and cost_name in dataframe.columns: + dataframe.drop(cost_name, inplace=True) # Lastly, just grab the top_k if top_k == "all" or top_k >= len(dataframe): diff --git a/autosklearn/evaluation/__init__.py b/autosklearn/evaluation/__init__.py index c4b688edae..52794fec03 100644 --- a/autosklearn/evaluation/__init__.py +++ b/autosklearn/evaluation/__init__.py @@ -586,7 +586,8 @@ def run( for name in self.multi_objectives: if name not in cost: raise RuntimeError( - f"Objective {name} was not found in the returned costs." + f"Objective {name} was not found " + f"in the returned costs ({cost})" ) ordered_cost.append(cost[name]) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index 82ad211560..adeb251440 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -226,7 +226,7 @@ def __init__( self.seed = seed self.output_y_hat_optimization = output_y_hat_optimization - self.scoring_functions = scoring_functions + self.scoring_functions = scoring_functions if scoring_functions else [] if isinstance(disable_file_output, (bool, list)): self.disable_file_output: Union[bool, List[str]] = disable_file_output @@ -406,11 +406,11 @@ def finish_up( if len(self.metrics) == 1: loss = loss_[self.metrics[0].name] else: - loss = {metric: loss_[metric] for metric in loss_} + loss = {metric.name: loss_[metric.name] for metric in self.metrics} additional_run_info = {} if additional_run_info is None else additional_run_info - for metric_name, value in loss_.items(): - additional_run_info[metric_name] = value + for metric in self.scoring_functions: + additional_run_info[metric.name] = loss_[metric.name] additional_run_info["duration"] = self.duration additional_run_info["num_run"] = self.num_run if train_loss is not None: diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 6e1ba745b8..6558670191 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -513,13 +513,11 @@ def calculate_losses( # we expect a dict() object for which we should calculate the loss loss_dict = dict() for metric_ in scoring_functions + list(metrics): + # maybe metric argument is not in scoring_functions # TODO: When metrics are annotated with type_of_target support # we can remove this check if metric_.name not in score: continue - # maybe metric argument is not in scoring_functions - # so append it to the list. 
Rather than check if such - # is the case, redefining loss_dict[metric] is less expensive loss_dict[metric_.name] = metric_._optimum - score[metric_.name] return loss_dict From 8de1e5a8a8b8f1530c59a15115ec312913f0abec Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 11 May 2022 11:58:52 +0200 Subject: [PATCH 18/24] Fix metadata generation unit test --- autosklearn/evaluation/abstract_evaluator.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/autosklearn/evaluation/abstract_evaluator.py b/autosklearn/evaluation/abstract_evaluator.py index adeb251440..ab9e961128 100644 --- a/autosklearn/evaluation/abstract_evaluator.py +++ b/autosklearn/evaluation/abstract_evaluator.py @@ -410,7 +410,8 @@ def finish_up( additional_run_info = {} if additional_run_info is None else additional_run_info for metric in self.scoring_functions: - additional_run_info[metric.name] = loss_[metric.name] + if metric.name in loss_: + additional_run_info[metric.name] = loss_[metric.name] additional_run_info["duration"] = self.duration additional_run_info["num_run"] = self.num_run if train_loss is not None: From 08c3bd0d8b0470c7fa6f190e1c6b72b7e5c75248 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 11 May 2022 16:30:48 +0200 Subject: [PATCH 19/24] Test for metrics with the same name --- autosklearn/automl.py | 15 +++++++++- autosklearn/metrics/__init__.py | 33 +++++++++++++++++++++ test/test_metric/test_metrics.py | 50 ++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 1 deletion(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 90e70392b4..9b17045f5f 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -67,7 +67,12 @@ from autosklearn.evaluation import ExecuteTaFuncWithQueue, get_cost_of_crash from autosklearn.evaluation.abstract_evaluator import _fit_and_suppress_warnings from autosklearn.evaluation.train_evaluator import TrainEvaluator, _fit_with_budget -from autosklearn.metrics import Scorer, compute_single_metric, default_metric_for_task +from autosklearn.metrics import ( + Scorer, + _validate_metrics, + compute_single_metric, + default_metric_for_task, +) from autosklearn.pipeline.base import BasePipeline from autosklearn.pipeline.components.classification import ClassifierChoice from autosklearn.pipeline.components.data_preprocessing.categorical_encoding import ( @@ -614,6 +619,7 @@ def fit( # Assign a metric if it doesnt exist if self._metrics is None: self._metrics = [default_metric_for_task[self._task]] + _validate_metrics(self._metrics, self._scoring_functions) if dataset_name is None: dataset_name = str(uuid.uuid1(clock_seq=os.getpid())) @@ -1305,6 +1311,8 @@ def fit_pipeline( kwargs["metrics"] = self._metrics if not isinstance(kwargs["metrics"], Sequence): kwargs["metrics"] = [kwargs["metrics"]] + if "scoring_functions" not in kwargs: + kwargs["scoring_function"] = self._scoring_functions if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: @@ -1315,6 +1323,8 @@ def fit_pipeline( kwargs["stats"] = Stats(scenario_mock) kwargs["stats"].start_timing() + _validate_metrics(kwargs["metrics"], kwargs["scoring_functions"]) + # Fit a pipeline, which will be stored on disk # which we can later load via the backend ta = ExecuteTaFuncWithQueue( @@ -1826,7 +1836,10 @@ def cv_results_(self): metric_dict[metric.name].append(metric_value) metric_mask[metric.name].append(mask_value) + optimization_metric_names = set(m.name for m in self._metrics) for 
metric in self._scoring_functions: + if metric.name in optimization_metric_names: + continue if metric.name in run_value.additional_info.keys(): metric_cost = run_value.additional_info[metric.name] metric_value = metric._optimum - (metric._sign * metric_cost) diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index 6558670191..c3ffb3c424 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -1,6 +1,7 @@ from abc import ABCMeta, abstractmethod from typing import Any, Callable, Dict, List, Optional, Sequence +import collections from functools import partial from itertools import product @@ -384,6 +385,36 @@ def make_scorer( CLASSIFICATION_METRICS[name] = scorer +def _validate_metrics( + metrics: Sequence[Scorer], + scoring_functions: Optional[List[Scorer]] = None, +) -> None: + """ + Validate metrics given to Auto-sklearn. Raises an Exception in case of a problem. + + metrics: Sequence[Scorer] + A list of objects that hosts a function to calculate how good the + prediction is according to the solution. + scoring_functions: List[Scorer] + A list of metrics to calculate multiple losses + """ + + to_score = list(metrics) + if scoring_functions: + to_score.extend(scoring_functions) + + if len(metrics) == 0: + raise ValueError("Number of metrics to compute must be greater than zero.") + + metric_counter = collections.Counter(to_score) + metric_names_counter = collections.Counter(metric.name for metric in to_score) + if len(metric_counter) != len(metric_names_counter): + raise ValueError( + "Error in metrics passed to Auto-sklearn. A metric name was used " + "multiple times for different metrics!" + ) + + def calculate_scores( solution: np.ndarray, prediction: np.ndarray, @@ -417,6 +448,8 @@ def calculate_scores( if task_type not in TASK_TYPES: raise NotImplementedError(task_type) + _validate_metrics(metrics, scoring_functions) + to_score = list(metrics) if scoring_functions: to_score.extend(scoring_functions) diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 6b5d591a80..6375b34e76 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -684,6 +684,56 @@ def test_calculate_losses(): assert pytest.approx(expected_score) == score_dict[expected_metric] assert pytest.approx(1 - expected_score) == loss_dict[expected_metric] + # Test no metric + with pytest.raises( + ValueError, match="Number of metrics to compute must be greater than zero." + ): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[], + ) + + with pytest.raises( + ValueError, match="Number of metrics to compute must be greater than zero." 
+ ): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[], + scoring_functions=[ + autosklearn.metrics.accuracy, + autosklearn.metrics.balanced_accuracy, + ], + ) + + # Test the same metric twice + accuracy_fixture = {"accuracy": pytest.approx(0.9)} + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy, autosklearn.metrics.accuracy], + ) + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], + scoring_functions=[autosklearn.metrics.accuracy], + ) + assert accuracy_fixture == calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy], + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.accuracy], + ) + + # Test the same name for multiple metrics! + # Test additional scoring functions score_dict = calculate_scores( solution=y_true, From fdfed8516eb389c057bc01a3001c888be3ceda0c Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Wed, 11 May 2022 17:45:58 +0200 Subject: [PATCH 20/24] Fix? --- autosklearn/automl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/autosklearn/automl.py b/autosklearn/automl.py index 9b17045f5f..8c0f4717dc 100644 --- a/autosklearn/automl.py +++ b/autosklearn/automl.py @@ -1312,7 +1312,7 @@ def fit_pipeline( if not isinstance(kwargs["metrics"], Sequence): kwargs["metrics"] = [kwargs["metrics"]] if "scoring_functions" not in kwargs: - kwargs["scoring_function"] = self._scoring_functions + kwargs["scoring_functions"] = self._scoring_functions if "disable_file_output" not in kwargs: kwargs["disable_file_output"] = self._disable_evaluator_output if "pynisher_context" not in kwargs: From fe676181fd27adcc1c64cd361b211f007b07ad43 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 12 May 2022 10:18:17 +0200 Subject: [PATCH 21/24] Test CV results --- test/test_estimators/test_estimators.py | 68 +++++++++++++++++++++++++ 1 file changed, 68 insertions(+) diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index dac4febe19..1ba3657f9b 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -378,6 +378,74 @@ def test_cv_results(tmp_dir): assert hasattr(cls, "classes_") +def test_cv_results_multi_objective(tmp_dir): + # TODO restructure and actually use real SMAC output from a long run + # to do this unittest! 
+ X_train, Y_train, X_test, Y_test = putil.get_dataset("iris") + + cls = AutoSklearnClassifier( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=os.path.join(tmp_dir, "backend"), + seed=1, + initial_configurations_via_metalearning=0, + metric=[autosklearn.metrics.precision_macro, autosklearn.metrics.roc_auc], + scoring_functions=[autosklearn.metrics.accuracy, autosklearn.metrics.roc_auc], + ) + + params = cls.get_params() + original_params = copy.deepcopy(params) + + cls.fit(X_train, Y_train) + + cv_results = cls.cv_results_ + assert isinstance(cv_results, dict), type(cv_results) + assert "mean_test_score" not in cv_results + assert "rank_test_scores" not in cv_results + for expected_column in ( + "mean_test_precision_macro", + "mean_test_roc_auc", + "mean_fit_time", + "rank_test_precision_macro", + "rank_test_roc_auc", + "metric_roc_auc", + "metric_accuracy", + ): + assert isinstance(cv_results[expected_column], np.ndarray), type( + cv_results[expected_column] + ) + + assert isinstance(cv_results["params"], list), type(cv_results["params"]) + cv_result_items = [ + isinstance(val, npma.MaskedArray) + for key, val in cv_results.items() + if key.startswith("param_") + ] + assert all(cv_result_items), cv_results.items() + + # Compare the state of the model parameters with the original parameters + new_params = clone(cls).get_params() + for param_name, original_value in original_params.items(): + new_value = new_params[param_name] + + # Taken from Sklearn code: + # We should never change or mutate the internal state of input + # parameters by default. To check this we use the joblib.hash function + # that introspects recursively any subobjects to compute a checksum. + # The only exception to this rule of immutable constructor parameters + # is possible RandomState instance but in this check we explicitly + # fixed the random_state params recursively to be integer seeds. + assert joblib.hash(new_value) == joblib.hash(original_value), ( + "Estimator %s should not change or mutate " + " the parameter %s from %s to %s during fit." 
+ % (cls, param_name, original_value, new_value) + ) + + # Comply with https://scikit-learn.org/dev/glossary.html#term-classes + is_classifier(cls) + assert hasattr(cls, "classes_") + + @pytest.mark.parametrize( "estimator_type,dataset_name", [(AutoSklearnClassifier, "iris"), (AutoSklearnRegressor, "boston")], From 0aa65e9b96710421de282a6712cc303ae5d1a8dd Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 12 May 2022 13:51:25 +0200 Subject: [PATCH 22/24] Test leaderboard for multi-objective optimization --- autosklearn/estimators.py | 3 + test/test_estimators/test_estimators.py | 147 +++++++++++++++++++++++- 2 files changed, 149 insertions(+), 1 deletion(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 85ffbeec76..b1837214cb 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1139,6 +1139,9 @@ def _leaderboard_columns( simple = ( ["model_id", "rank", "ensemble_weight", "type"] + cost_list + ["duration"] ) + if num_metrics > 1: + simple.remove("rank") + all.remove("rank") detailed = all return {"all": all, "detailed": detailed, "simple": simple} diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index 1ba3657f9b..2e132b6023 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -1,4 +1,4 @@ -from typing import Any, Dict, Type, Union, cast +from typing import Any, Dict, Sequence, Type, Union, cast import copy import glob @@ -564,6 +564,151 @@ def exclude(lst, s): assert all(leaderboard["ensemble_weight"] > 0) +@pytest.mark.parametrize( + "estimator_type,dataset_name,metrics", + [ + ( + AutoSklearnClassifier, + "iris", + (autosklearn.metrics.accuracy, autosklearn.metrics.balanced_accuracy), + ), + ( + AutoSklearnRegressor, + "boston", + (autosklearn.metrics.r2, autosklearn.metrics.root_mean_squared_error), + ), + ], +) +def test_leaderboard_multi_objective( + tmp_dir: str, + estimator_type: Type[AutoSklearnEstimator], + dataset_name: str, + metrics: Sequence[autosklearn.metrics.Scorer], +): + # Comprehensive test tasks a substantial amount of time, manually set if + # required. 
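# (Editor's note, illustrative only:) with num_metrics=2 the column helper
# queried just below returns one cost column per objective and, after the
# estimators.py change above, no "rank" column, i.e. roughly:
#   simple -> ["model_id", "ensemble_weight", "type", "cost_0", "cost_1",
#              "duration"]
#   all    -> simple + ["config_id", "train_loss", "seed", "start_time",
#                       "end_time", "budget", "status", "data_preprocessors",
#                       "feature_preprocessors", "balancing_strategy",
#                       "config_origin"]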
+ MAX_COMBO_SIZE_FOR_INCLUDE_PARAM = 3 # [0, len(valid_columns) + 1] + column_types = AutoSklearnEstimator._leaderboard_columns(num_metrics=2) + + # Create a dict of all possible param values for each param + # with some invalid one's of the incorrect type + include_combinations = itertools.chain( + itertools.combinations(column_types["all"], item_count) + for item_count in range(1, MAX_COMBO_SIZE_FOR_INCLUDE_PARAM) + ) + valid_params = { + "detailed": [True, False], + "ensemble_only": [True, False], + "top_k": [-10, 0, 1, 10, "all"], + "sort_by": [ + "cost", + "cost_0", + "cost_1", + ["cost_1", "cost_0"], + *column_types["all"], + "invalid", + ], + "sort_order": ["ascending", "descending", "auto", "invalid", None], + "include": itertools.chain([None, "invalid", "type"], include_combinations), + } + + # Create a generator of all possible combinations of valid_params + params_generator = iter( + dict(zip(valid_params.keys(), param_values)) + for param_values in itertools.product(*valid_params.values()) + ) + + X_train, Y_train, _, _ = putil.get_dataset(dataset_name) + model = estimator_type( + time_left_for_this_task=30, + per_run_time_limit=5, + tmp_folder=os.path.join(tmp_dir, "backend"), + seed=1, + metric=metrics, + ) + + model.fit(X_train, Y_train) + + for params in params_generator: + # Convert from iterator to solid list + if params["include"] is not None and not isinstance(params["include"], str): + params["include"] = list(params["include"]) + + # Invalid top_k should raise an error, is a positive int or 'all' + if not (params["top_k"] == "all" or params["top_k"] > 0): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Invalid sort_by column + elif ( + params["sort_by"] not in column_types["all"] + and params["sort_by"] != "cost" + and params["sort_by"] != ["cost_1", "cost_0"] + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Shouldn't accept an invalid sort order + elif params["sort_order"] not in ["ascending", "descending", "auto"]: + with pytest.raises(ValueError): + model.leaderboard(**params) + + # include is single str but not valid + elif ( + isinstance(params["include"], str) + and params["include"] not in column_types["all"] + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Crash if include is list but contains invalid column + elif ( + isinstance(params["include"], list) + and len(set(params["include"]) - set(column_types["all"])) != 0 + ): + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Can't have just model_id, in both single str and list case + elif params["include"] == "model_id" or params["include"] == ["model_id"]: + with pytest.raises(ValueError): + model.leaderboard(**params) + + # Else all valid combinations should be validated + else: + leaderboard = model.leaderboard(**params) + assert "cost" not in leaderboard.columns + + # top_k should never be less than the rows given back + # It can however be larger + if isinstance(params["top_k"], int): + assert params["top_k"] >= len(leaderboard) + + # Check the right columns are present and in the right order + # The model_id is set as the index, not included in pandas columns + columns = list(leaderboard.columns) + + def exclude(lst, s): + return [x for x in lst if x != s] + + if params["include"] is not None: + # Include with only single str should be the only column + if isinstance(params["include"], str): + assert params["include"] in columns and len(columns) == 1 + # Include as a list should have all the columns without 
model_id + else: + assert columns == exclude(params["include"], "model_id") + elif params["detailed"]: + assert columns == exclude(column_types["detailed"], "model_id") + else: + assert columns == exclude(column_types["simple"], "model_id") + + # Ensure that if it's ensemble only + # Can only check if 'ensemble_weight' is present + if params["ensemble_only"] and "ensemble_weight" in columns: + assert all(leaderboard["ensemble_weight"] > 0) + + @pytest.mark.parametrize("estimator", [AutoSklearnRegressor]) @pytest.mark.parametrize("resampling_strategy", ["holdout"]) @pytest.mark.parametrize( From b1a0c7285af3b563f65ba14f30c5c34f7147a882 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 12 May 2022 14:52:19 +0200 Subject: [PATCH 23/24] Last batch of unit tests added --- autosklearn/estimators.py | 4 +- test/test_evaluation/test_test_evaluator.py | 30 ++- test/test_evaluation/test_train_evaluator.py | 208 ++++++++++++++++++- 3 files changed, 237 insertions(+), 5 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index b1837214cb..08311f0458 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -1036,7 +1036,7 @@ def additional_info_has_key(rv, key): # Give it an index, even if not in the `include` dataframe.set_index("model_id", inplace=True) - # Add the `rank` column if needed, dropping `cost` if it's not + # Add the `rank` column if needed # requested by the user if "rank" in columns: if num_metrics == 1: @@ -1048,7 +1048,7 @@ def additional_info_has_key(rv, key): ) # account for `model_id` else: self.automl_._logger.warning( - "Cannot compute rank for multi-objective optimization porblems." + "Cannot compute rank for multi-objective optimization problems." ) # Decide on the sort order depending on what it gets sorted by diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 813ad7a35e..69eafccc51 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -20,7 +20,7 @@ ) from autosklearn.evaluation.test_evaluator import TestEvaluator, eval_t from autosklearn.evaluation.util import read_queue -from autosklearn.metrics import accuracy, f1_macro, r2 +from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space import unittest @@ -128,6 +128,34 @@ def test_eval_test(self): self.assertEqual(rval[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + def test_eval_test_multi_objective(self): + metrics = { + accuracy: 0.040000000000000036, + balanced_accuracy: 0.02777777777777779, + } + eval_t( + queue=self.queue, + backend=self.backend, + config=self.configuration, + metrics=list(metrics.keys()), + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=False, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + port=self.port, + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[0]["loss"][metric.name], loss) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) + def test_eval_test_all_loss_functions(self): eval_t( queue=self.queue, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 0651443f2a..53bb3277ca 100644 --- 
a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -48,7 +48,7 @@ subsample_indices, ) from autosklearn.evaluation.util import read_queue -from autosklearn.metrics import accuracy, f1_macro, r2 +from autosklearn.metrics import accuracy, balanced_accuracy, f1_macro, r2 from autosklearn.util.pipeline import get_configuration_space import unittest @@ -3002,6 +3002,36 @@ def test_eval_holdout(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_multi_objective(self): + metrics = { + accuracy: 0.030303030303030276, + balanced_accuracy: 0.033333333333333326, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_all_loss_functions(self): eval_holdout( queue=self.queue, @@ -3081,6 +3111,36 @@ def test_eval_holdout_iterative_fit_no_timeout(self): self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + def test_eval_holdout_iterative_fit_no_timeout_multi_objective(self): + metrics = { + accuracy: 0.030303030303030276, + balanced_accuracy: 0.033333333333333326, + } + eval_iterative_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 9) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[-1]["loss"][metric.name], loss) + self.assertEqual(rval[0]["status"], StatusType.DONOTADVANCE) + self.assertEqual(rval[-1]["status"], StatusType.SUCCESS) + def test_eval_holdout_budget_iterations(self): eval_holdout( queue=self.queue, @@ -3108,7 +3168,39 @@ def test_eval_holdout_budget_iterations(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) - def test_eval_holdout_budget_iterations_converged(self): + def test_eval_holdout_budget_iterations_multi_objective(self): + metrics = { + accuracy: 0.06060606060606055, + balanced_accuracy: 0.06666666666666676, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=1, # Not iterative, but only for 1% of the budget + budget_type="iterations", + additional_components=dict(), 
+ ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + + def test_eval_holdout_budget_iterations_converged_multi_objective(self): configuration = get_configuration_space( exclude={"classifier": ["random_forest", "liblinear_svc"]}, info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, @@ -3139,6 +3231,42 @@ def test_eval_holdout_budget_iterations_converged(self): self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_iterations_converged(self): + metrics = { + accuracy: 0.18181818181818177, + balanced_accuracy: 0.18787878787878787, + } + configuration = get_configuration_space( + exclude={"classifier": ["random_forest", "liblinear_svc"]}, + info={"task": MULTICLASS_CLASSIFICATION, "is_sparse": False}, + ).get_default_configuration() + eval_holdout( + queue=self.queue, + port=self.port, + config=configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude={"classifier": ["random_forest", "liblinear_svc"]}, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=80, + budget_type="iterations", + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.DONOTADVANCE) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_subsample(self): eval_holdout( queue=self.queue, @@ -3166,6 +3294,38 @@ def test_eval_holdout_budget_subsample(self): self.assertEqual(info[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_subsample_multi_objective(self): + metrics = { + accuracy: 0.0, + balanced_accuracy: 0.0, + } + eval_holdout( + queue=self.queue, + port=self.port, + config=self.configuration, + backend=self.backend, + resampling_strategy="holdout", + resampling_strategy_args=None, + seed=1, + num_run=1, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + instance=self.dataset_name, + metrics=list(metrics.keys()), + budget=30, + budget_type="subsample", + additional_components=dict(), + ) + info = read_queue(self.queue) + self.assertEqual(len(info), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(info[0]["loss"][metric.name], loss) + self.assertEqual(info[0]["status"], StatusType.SUCCESS) + self.assertNotIn("bac_metric", info[0]["additional_run_info"]) + def test_eval_holdout_budget_mixed_iterations(self): print(self.configuration) eval_holdout( @@ -3346,3 +3506,47 @@ def test_eval_partial_cv(self): self.assertEqual(len(rval), 1) self.assertAlmostEqual(rval[0]["loss"], results[fold]) self.assertEqual(rval[0]["status"], StatusType.SUCCESS) + + def test_eval_partial_cv_multi_objective(self): + metrics = { + accuracy: [ + 0.050000000000000044, + 0.0, + 0.09999999999999998, + 0.09999999999999998, + 0.050000000000000044, + ], + balanced_accuracy: [ + 0.04761904761904756, + 0.0, + 
0.10317460317460314, + 0.11111111111111116, + 0.05555555555555547, + ], + } + + for fold in range(5): + instance = json.dumps({"task_id": "data", "fold": fold}) + eval_partial_cv( + port=self.port, + queue=self.queue, + config=self.configuration, + backend=self.backend, + seed=1, + num_run=1, + instance=instance, + resampling_strategy="partial-cv", + resampling_strategy_args={"folds": 5}, + scoring_functions=None, + output_y_hat_optimization=True, + include=None, + exclude=None, + disable_file_output=False, + metrics=list(metrics.keys()), + additional_components=dict(), + ) + rval = read_queue(self.queue) + self.assertEqual(len(rval), 1) + for metric, loss in metrics.items(): + self.assertAlmostEqual(rval[0]["loss"][metric.name], loss[fold]) + self.assertEqual(rval[0]["status"], StatusType.SUCCESS) From d432e076e3ca7af1c2b99fd919713cc29df104f2 Mon Sep 17 00:00:00 2001 From: Matthias Feurer Date: Thu, 12 May 2022 20:24:10 +0200 Subject: [PATCH 24/24] Include Eddie's feedback --- autosklearn/estimators.py | 19 +++++++------------ autosklearn/metrics/__init__.py | 4 ++-- test/test_estimators/test_estimators.py | 12 +++++++++++- test/test_evaluation/test_test_evaluator.py | 12 +++++++----- test/test_evaluation/test_train_evaluator.py | 6 ++++-- test/test_metric/test_metrics.py | 11 +++++++++++ 6 files changed, 42 insertions(+), 22 deletions(-) diff --git a/autosklearn/estimators.py b/autosklearn/estimators.py index 08311f0458..1c283e06e6 100644 --- a/autosklearn/estimators.py +++ b/autosklearn/estimators.py @@ -876,7 +876,7 @@ def leaderboard( if num_metrics == 1: sort_by = ["cost", "model_id"] else: - sort_by = list(multi_objective_cost_names) + ["model_id"] + sort_by = multi_objective_cost_names + ["model_id"] else: sort_by_cost = False if isinstance(sort_by, str): @@ -1041,15 +1041,13 @@ def additional_info_has_key(rv, key): if "rank" in columns: if num_metrics == 1: dataframe.sort_values(by="cost", ascending=True, inplace=True) - dataframe.insert( - column="rank", - value=range(1, len(dataframe) + 1), - loc=list(columns).index("rank") - 1, - ) # account for `model_id` else: - self.automl_._logger.warning( - "Cannot compute rank for multi-objective optimization problems." - ) + dataframe.sort_values(by="cost_0", ascending=True, inplace=True) + dataframe.insert( + column="rank", + value=range(1, len(dataframe) + 1), + loc=list(columns).index("rank") - 1, + ) # account for `model_id` # Decide on the sort order depending on what it gets sorted by descending_columns = ["ensemble_weight", "duration"] @@ -1139,9 +1137,6 @@ def _leaderboard_columns( simple = ( ["model_id", "rank", "ensemble_weight", "type"] + cost_list + ["duration"] ) - if num_metrics > 1: - simple.remove("rank") - all.remove("rank") detailed = all return {"all": all, "detailed": detailed, "simple": simple} diff --git a/autosklearn/metrics/__init__.py b/autosklearn/metrics/__init__.py index c3ffb3c424..3104716da3 100644 --- a/autosklearn/metrics/__init__.py +++ b/autosklearn/metrics/__init__.py @@ -395,7 +395,7 @@ def _validate_metrics( metrics: Sequence[Scorer] A list of objects that hosts a function to calculate how good the prediction is according to the solution. 
- scoring_functions: List[Scorer] + scoring_functions: Optional[List[Scorer]] A list of metrics to calculate multiple losses """ @@ -448,7 +448,7 @@ def calculate_scores( if task_type not in TASK_TYPES: raise NotImplementedError(task_type) - _validate_metrics(metrics, scoring_functions) + _validate_metrics(metrics=metrics, scoring_functions=scoring_functions) to_score = list(metrics) if scoring_functions: diff --git a/test/test_estimators/test_estimators.py b/test/test_estimators/test_estimators.py index 2e132b6023..4dd13d4c17 100644 --- a/test/test_estimators/test_estimators.py +++ b/test/test_estimators/test_estimators.py @@ -42,6 +42,7 @@ import unittest import unittest.mock +import test.conftest from test.test_automl.automl_utils import ( count_succeses, include_single_scores, @@ -623,7 +624,7 @@ def test_leaderboard_multi_objective( time_left_for_this_task=30, per_run_time_limit=5, tmp_folder=os.path.join(tmp_dir, "backend"), - seed=1, + seed=test.conftest.DEFAULT_SEED, metric=metrics, ) @@ -644,6 +645,7 @@ def test_leaderboard_multi_objective( params["sort_by"] not in column_types["all"] and params["sort_by"] != "cost" and params["sort_by"] != ["cost_1", "cost_0"] + and params["sort_by"] not in ["cost_0", "cost_1"] ): with pytest.raises(ValueError): model.leaderboard(**params) @@ -679,6 +681,14 @@ def test_leaderboard_multi_objective( leaderboard = model.leaderboard(**params) assert "cost" not in leaderboard.columns + if params["include"] is None: + assert "cost_0" in leaderboard.columns + assert "cost_1" in leaderboard.columns + else: + for cost_name in ["cost_0", "cost_1"]: + if cost_name in params["include"]: + assert cost_name in leaderboard.columns + # top_k should never be less than the rows given back # It can however be larger if isinstance(params["top_k"], int): diff --git a/test/test_evaluation/test_test_evaluator.py b/test/test_evaluation/test_test_evaluator.py index 69eafccc51..457661df03 100644 --- a/test/test_evaluation/test_test_evaluator.py +++ b/test/test_evaluation/test_test_evaluator.py @@ -26,6 +26,8 @@ import unittest import unittest.mock +import test.conftest + this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) @@ -111,7 +113,7 @@ def test_eval_test(self): backend=self.backend, config=self.configuration, metrics=[accuracy], - seed=1, + seed=test.conftest.DEFAULT_SEED, num_run=1, scoring_functions=None, output_y_hat_optimization=False, @@ -124,21 +126,21 @@ def test_eval_test(self): ) rval = read_queue(self.queue) self.assertEqual(len(rval), 1) - self.assertAlmostEqual(rval[0]["loss"], 0.040000000000000036) + self.assertAlmostEqual(rval[0]["loss"], 0.07999999999999996) self.assertEqual(rval[0]["status"], StatusType.SUCCESS) self.assertNotIn("bac_metric", rval[0]["additional_run_info"]) def test_eval_test_multi_objective(self): metrics = { - accuracy: 0.040000000000000036, - balanced_accuracy: 0.02777777777777779, + accuracy: 0.07999999999999996, + balanced_accuracy: 0.05555555555555547, } eval_t( queue=self.queue, backend=self.backend, config=self.configuration, metrics=list(metrics.keys()), - seed=1, + seed=test.conftest.DEFAULT_SEED, num_run=1, scoring_functions=None, output_y_hat_optimization=False, diff --git a/test/test_evaluation/test_train_evaluator.py b/test/test_evaluation/test_train_evaluator.py index 53bb3277ca..23607b8e4d 100644 --- a/test/test_evaluation/test_train_evaluator.py +++ b/test/test_evaluation/test_train_evaluator.py @@ -54,6 +54,8 
@@ import unittest import unittest.mock +import test.conftest + this_directory = os.path.dirname(__file__) sys.path.append(this_directory) from evaluation_util import ( # noqa (E402: module level import not at top of file) @@ -3014,7 +3016,7 @@ def test_eval_holdout_multi_objective(self): backend=self.backend, resampling_strategy="holdout", resampling_strategy_args=None, - seed=1, + seed=test.conftest.DEFAULT_SEED, num_run=1, scoring_functions=None, output_y_hat_optimization=True, @@ -3297,7 +3299,7 @@ def test_eval_holdout_budget_subsample(self): def test_eval_holdout_budget_subsample_multi_objective(self): metrics = { accuracy: 0.0, - balanced_accuracy: 0.0, + f1_macro: 0.0, } eval_holdout( queue=self.queue, diff --git a/test/test_metric/test_metrics.py b/test/test_metric/test_metrics.py index 6375b34e76..2cb7dc2158 100644 --- a/test/test_metric/test_metrics.py +++ b/test/test_metric/test_metrics.py @@ -733,6 +733,17 @@ def test_calculate_losses(): ) # Test the same name for multiple metrics! + bogus_accuracy = autosklearn.metrics.make_scorer( + "accuracy", + score_func=sklearn.metrics.roc_auc_score, + ) + with pytest.raises(ValueError, match="used multiple times"): + calculate_scores( + solution=y_true, + prediction=y_pred, + task_type=BINARY_CLASSIFICATION, + metrics=[autosklearn.metrics.accuracy, bogus_accuracy], + ) # Test additional scoring functions score_dict = calculate_scores(
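For orientation, below is a minimal usage sketch of the multi-objective metric support that these tests exercise through the estimator API. It is not part of the patch series; the classifier entry point, dataset, time budgets, and seed are illustrative assumptions, while the `metric=[...]` argument and the `cost_0`/`cost_1` leaderboard columns are the behaviour introduced above.

.. code-block:: python

    import sklearn.datasets
    import sklearn.model_selection

    import autosklearn.classification
    from autosklearn.metrics import accuracy, balanced_accuracy

    # Illustrative data and budgets; any classification setup works here
    X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
        X, y, random_state=1
    )

    # Passing a list of Scorer objects optimizes several metrics at once
    automl = autosklearn.classification.AutoSklearnClassifier(
        time_left_for_this_task=120,
        per_run_time_limit=30,
        metric=[accuracy, balanced_accuracy],
        seed=1,
    )
    automl.fit(X_train, y_train)

    # With more than one metric the leaderboard drops the single `cost`
    # column and reports one column per objective: cost_0, cost_1, ...
    print(automl.leaderboard(sort_by="cost_0"))

Per the leaderboard tests above, `sort_by` also accepts a list of cost columns, e.g. `["cost_1", "cost_0"]`.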