First draft of multi-objective optimization #1455
@@ -1,6 +1,6 @@
  from __future__ import annotations

- from typing import Any, Callable, Iterable, Mapping, Optional, Tuple
+ from typing import Any, Callable, Iterable, Mapping, Optional, Sequence, Tuple

  import copy
  import io
@@ -210,7 +210,7 @@ def __init__(
      get_smac_object_callback: Optional[Callable] = None,
      smac_scenario_args: Optional[Mapping] = None,
      logging_config: Optional[Mapping] = None,
-     metric: Optional[Scorer] = None,
+     metric: Optional[Scorer | Sequence[Scorer]] = None,
      scoring_functions: Optional[list[Scorer]] = None,
      get_trials_callback: Optional[IncorporateRunResultCallback] = None,
      dataset_compression: bool | Mapping[str, Any] = True,
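For orientation, a usage sketch of what the widened `metric` argument appears to enable: a sequence of Scorers instead of a single one. Whether the public AutoSklearnClassifier wrapper forwards the list unchanged is an assumption here, not something this hunk shows.

import autosklearn.classification
from autosklearn.metrics import accuracy, precision

# Hypothetical call under this PR: multiple objectives given as a list of Scorers.
automl = autosklearn.classification.AutoSklearnClassifier(
    time_left_for_this_task=120,
    metric=[accuracy, precision],  # previously only a single Scorer was accepted
)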
@@ -265,7 +265,7 @@ def __init__(
          initial_configurations_via_metalearning
      )

-     self._scoring_functions = scoring_functions or {}
+     self._scoring_functions = scoring_functions or []
      self._resampling_strategy_arguments = resampling_strategy_arguments or {}

      # Single core, local runs should use fork to prevent the __main__ requirements
@@ -447,7 +447,9 @@ def _do_dummy_prediction(self) -> None:
          resampling_strategy=self._resampling_strategy,
          initial_num_run=dummy_run_num,
          stats=stats,
-         metric=self._metric,
+         metrics=(
+             [self._metric] if isinstance(self._metric, Scorer) else self._metric
+         ),
          memory_limit=memory_limit,
          disable_file_output=self._disable_evaluator_output,
          abort_on_first_run_crash=False,
@@ -692,10 +694,14 @@ def fit(
          # defined in the estimator fit call
          if self._metric is None:
              raise ValueError("No metric given.")
-         if not isinstance(self._metric, Scorer):
-             raise ValueError(
-                 "Metric must be instance of " "autosklearn.metrics.Scorer."
-             )
+         if isinstance(self._metric, Sequence):
+             for entry in self._metric:
+                 if not isinstance(entry, Scorer):
+                     raise ValueError(
+                         "Metric must be instance of autosklearn.metrics.Scorer."
+                     )
+         elif not isinstance(self._metric, Scorer):
+             raise ValueError("Metric must be instance of autosklearn.metrics.Scorer.")

          # If no dask client was provided, we create one, so that we can
          # start a ensemble process in parallel to smbo optimize
@@ -790,7 +796,11 @@ def fit(
              backend=copy.deepcopy(self._backend),
              dataset_name=dataset_name,
              task=self._task,
-             metric=self._metric,
+             metric=(
+                 self._metric[0]
+                 if isinstance(self._metric, Sequence)
+                 else self._metric
+             ),
              ensemble_size=self._ensemble_size,
              ensemble_nbest=self._ensemble_nbest,
              max_models_on_disc=self._max_models_on_disc,
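Worth noting: the ensemble process stays single-objective in this draft, so when a sequence of scorers is given only its first entry reaches the ensemble builder. A standalone restatement of the selection rule, written as a hypothetical helper purely for illustration:

from typing import Sequence, Union

from autosklearn.metrics import Scorer

def first_metric(metric: Union[Scorer, Sequence[Scorer]]) -> Scorer:
    # Mirrors the inline expression in the hunk above: ensembles are built
    # against the first user-supplied objective only.
    return metric[0] if isinstance(metric, Sequence) else metric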
@@ -1289,7 +1299,13 @@ def fit_pipeline(
          if "resampling_strategy" not in kwargs:
              kwargs["resampling_strategy"] = self._resampling_strategy
          if "metric" not in kwargs:
-             kwargs["metric"] = self._metric
+             kwargs["metric"] = (
+                 [self._metric] if isinstance(self._metric, Scorer) else self._metric
+             )
+         elif "metric" in kwargs and isinstance(kwargs["metric"], Scorer):
+             kwargs["metric"] = [kwargs["metric"]]
+         kwargs["metrics"] = kwargs["metric"]
+         del kwargs["metric"]
          if "disable_file_output" not in kwargs:
              kwargs["disable_file_output"] = self._disable_evaluator_output
          if "pynisher_context" not in kwargs:
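Restated outside the method, the normalisation above keeps "metric" as the public keyword while downstream code consumes "metrics" as a list. In the sketch below, default_metric stands in for self._metric and is purely illustrative:

from autosklearn.metrics import Scorer

def normalise_metric_kwargs(kwargs: dict, default_metric) -> dict:
    # Fill in the default, wrapping a single Scorer into a one-element list.
    if "metric" not in kwargs:
        kwargs["metric"] = (
            [default_metric] if isinstance(default_metric, Scorer) else default_metric
        )
    elif isinstance(kwargs["metric"], Scorer):
        kwargs["metric"] = [kwargs["metric"]]
    # Downstream evaluators expect the plural keyword.
    kwargs["metrics"] = kwargs.pop("metric")
    return kwargs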
@@ -1307,7 +1323,7 @@ def fit_pipeline(
              autosklearn_seed=self._seed,
              abort_on_first_run_crash=False,
              multi_objectives=["cost"],
-             cost_for_crash=get_cost_of_crash(kwargs["metric"]),
+             cost_for_crash=get_cost_of_crash(kwargs["metrics"]),
              port=self._logger_port,
              **kwargs,
              **self._resampling_strategy_arguments,
@@ -1492,7 +1508,9 @@ def fit_ensemble(
              backend=copy.deepcopy(self._backend),
              dataset_name=dataset_name if dataset_name else self._dataset_name,
              task=task if task else self._task,
-             metric=self._metric,
+             metric=(
+                 self._metric[0] if isinstance(self._metric, Sequence) else self._metric
+             ),
              ensemble_size=ensemble_size if ensemble_size else self._ensemble_size,
              ensemble_nbest=ensemble_nbest if ensemble_nbest else self._ensemble_nbest,
              max_models_on_disc=self._max_models_on_disc,
@@ -1632,7 +1650,7 @@ def score(self, X, y):
          )

      def _get_runhistory_models_performance(self):
-         metric = self._metric
+         metric = self._metric if isinstance(self._metric, Scorer) else self._metric[0]
          data = self.runhistory_.data
          performance_list = []
          for run_key, run_value in data.items():
@@ -1644,7 +1662,10 @@ def _get_runhistory_models_performance(self):
          endtime = pd.Timestamp(
              time.strftime("%Y-%m-%d %H:%M:%S", time.localtime(run_value.endtime))
          )
-         val_score = metric._optimum - (metric._sign * run_value.cost)
+         cost = run_value.cost
+         if not isinstance(self._metric, Scorer):
+             cost = cost[0]
+         val_score = metric._optimum - (metric._sign * cost)
          train_score = metric._optimum - (
              metric._sign * run_value.additional_info["train_loss"]
          )

Review discussion on this hunk:
- "I assume this is a point of API conflict? It would be good to know about all the metrics for a model but at the end of the day, we currently only support one and so we choose the first?"
- "Yes, it would be good to know about all the metrics. I will look into returning multiple metrics here (should be possible)."
- "Please see my comment wrt this in the PR comment at the top."
@@ -1656,9 +1677,10 @@ def _get_runhistory_models_performance(self):
          # Append test-scores, if data for test_loss are available.
          # This is the case, if X_test and y_test where provided.
          if "test_loss" in run_value.additional_info:
-             test_score = metric._optimum - (
-                 metric._sign * run_value.additional_info["test_loss"]
-             )
+             test_loss = run_value.additional_info["test_loss"]
+             if not isinstance(self._metric, Scorer):
+                 test_loss = test_loss[0]
+             test_score = metric._optimum - (metric._sign * test_loss)
              scores["single_best_test_score"] = test_score

          performance_list.append(scores)
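The reporting path above converts the stored cost back into a score and, for multi-objective runs, keeps only the first entry, matching the reviewer exchange quoted above. A compact restatement, under the assumption that run_value.cost is a list when several metrics were optimised:

from autosklearn.metrics import Scorer

def cost_to_score(metric: Scorer, cost) -> float:
    # Multi-objective runs store one cost per objective; this draft reports
    # only the first one (the first metric the user passed).
    if isinstance(cost, (list, tuple)):
        cost = cost[0]
    return metric._optimum - metric._sign * cost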
@@ -1912,7 +1934,6 @@ def show_models(self) -> dict[int, Any]:
          .. code-block:: python

              import sklearn.datasets
              import sklearn.metrics
              import autosklearn.regression

              X, y = sklearn.datasets.load_diabetes(return_X_y=True)
Second file in the diff:
@@ -1,5 +1,18 @@
  # -*- encoding: utf-8 -*-
- from typing import Any, Callable, Dict, List, Optional, Tuple, Type, Union, cast
+ from __future__ import annotations
+
+ from typing import (
+     Any,
+     Callable,
+     Dict,
+     List,
+     Optional,
+     Sequence,
+     Tuple,
+     Type,
+     Union,
+     cast,
+ )

  import functools
  import json
@@ -85,11 +98,17 @@ def fit_predict_try_except_decorator(
      queue.close()


- def get_cost_of_crash(metric: Scorer) -> float:
+ def get_cost_of_crash(
+     metric: Union[Scorer | Sequence[Scorer]],
+ ) -> Union[float, List[float]]:

      # The metric must always be defined to extract optimum/worst
-     if not isinstance(metric, Scorer):
-         raise ValueError("The metric must be stricly be an instance of Scorer")
+     if isinstance(metric, Sequence):
+         return [cast(float, get_cost_of_crash(metric_)) for metric_ in metric]
+     elif not isinstance(metric, Scorer):
+         raise ValueError(
+             "The metric must be stricly be an instance of Scorer or a sequence of "
+             "Scorers"
+         )

      # Autosklearn optimizes the err. This function translates
      # worst_possible_result to be a minimization problem.

Review discussion on the new signature:
- "Likewise here, i.e."
- "Thanks for catching."
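A quick sketch of the widened contract: a single scorer still yields one float, while a sequence yields one crash cost per objective in the same order. The import path assumes the function stays in autosklearn.evaluation, as in current releases:

from autosklearn.evaluation import get_cost_of_crash
from autosklearn.metrics import accuracy, log_loss

single = get_cost_of_crash(accuracy)                       # one float
per_objective = get_cost_of_crash([accuracy, log_loss])    # one crash cost per scorer
assert isinstance(per_objective, list) and len(per_objective) == 2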
@@ -126,7 +145,7 @@ def __init__(
          resampling_strategy: Union[
              str, BaseCrossValidator, _RepeatedSplits, BaseShuffleSplit
          ],
-         metric: Scorer,
+         metrics: Sequence[Scorer],
          cost_for_crash: float,
          abort_on_first_run_crash: bool,
          port: int,
@@ -144,7 +163,7 @@ def __init__(
          disable_file_output: bool = False,
          init_params: Optional[Dict[str, Any]] = None,
          budget_type: Optional[str] = None,
-         ta: Optional[Callable] = None,
+         ta: Optional[Callable] = None,  # Required by SMAC's parent class
          **resampling_strategy_args: Any,
      ):
          if resampling_strategy == "holdout":
@@ -186,13 +205,14 @@ def __init__(
              par_factor=par_factor,
              cost_for_crash=self.worst_possible_result,
              abort_on_first_run_crash=abort_on_first_run_crash,
+             multi_objectives=multi_objectives,
          )

          self.backend = backend
          self.autosklearn_seed = autosklearn_seed
          self.resampling_strategy = resampling_strategy
          self.initial_num_run = initial_num_run
-         self.metric = metric
+         self.metrics = metrics
          self.resampling_strategy = resampling_strategy
          self.resampling_strategy_args = resampling_strategy_args
          self.scoring_functions = scoring_functions
@@ -356,7 +376,7 @@ def run(
              config=config,
              backend=self.backend,
              port=self.port,
-             metric=self.metric,
+             metrics=self.metrics,
              seed=self.autosklearn_seed,
              num_run=num_run,
              scoring_functions=self.scoring_functions,
@@ -550,4 +570,32 @@ def run(

          autosklearn.evaluation.util.empty_queue(queue)
          self.logger.info("Finished evaluating configuration %d" % config_id)
+
+         # Do some sanity checking (for multi objective)
+         if len(self.multi_objectives) > 1:
+             error = (
+                 f"Returned costs {cost} does not match the number of objectives"
+                 f" {len(self.multi_objectives)}."
+             )
+
+             # If dict convert to array
+             # Make sure the ordering is correct
+             if isinstance(cost, dict):
+                 ordered_cost = []
+                 for name in self.multi_objectives:
+                     if name not in cost:
+                         raise RuntimeError(
+                             f"Objective {name} was not found in the returned costs."
+                         )
+
+                     ordered_cost.append(cost[name])
+                 cost = ordered_cost
+
+             if isinstance(cost, list):
+                 if len(cost) != len(self.multi_objectives):
+                     raise RuntimeError(error)
+
+             if isinstance(cost, float):
+                 raise RuntimeError(error)
+
          return status, cost, runtime, additional_run_info
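To make the accepted shapes concrete, here is the dict-to-list reordering in isolation, with hypothetical objective names that are not taken from the diff:

multi_objectives = ["cost", "runtime"]  # hypothetical objective names
cost = {"runtime": 12.3, "cost": 0.08}  # as a target function might return them

# Reorder into the sequence SMAC expects, failing loudly on missing objectives.
ordered_cost = []
for name in multi_objectives:
    if name not in cost:
        raise RuntimeError(f"Objective {name} was not found in the returned costs.")
    ordered_cost.append(cost[name])

assert ordered_cost == [0.08, 12.3]  # order follows multi_objectives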
Review discussion on the Optional annotation:
- "Not necessary, just something to know: Optional[X] == Union[X, None] == X | None, i.e. you could write Scorer | Sequence[Scorer] | None = None."
- "That looks nice, will do."