Skip to content

Commit

Permalink
Matthias Feurer: Multi-objective ensemble API (#1485)
Browse files Browse the repository at this point in the history
  • Loading branch information
Github Actions committed May 30, 2022
1 parent 669e4b3 commit 159cf70
Show file tree
Hide file tree
Showing 101 changed files with 6,819 additions and 6,672 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
############################################################################
# Data Loading
# ======================================
from autosklearn.ensembles.ensemble_selection import EnsembleSelection

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
Expand All @@ -31,20 +32,19 @@
# ======================================

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=120,
per_run_time_limit=30,
time_left_for_this_task=60,
tmp_folder="/tmp/autosklearn_sequential_example_tmp",
# Do not construct ensembles in parallel to avoid using more than one
# core at a time. The ensemble will be constructed after auto-sklearn
# finished fitting all machine learning models.
ensemble_size=0,
ensemble_class=None,
delete_tmp_folder_after_terminate=False,
)
automl.fit(X_train, y_train, dataset_name="breast_cancer")

# This call to fit_ensemble uses all models trained in the previous call
# to fit to build an ensemble which can be used with automl.predict()
automl.fit_ensemble(y_train, ensemble_size=50)
automl.fit_ensemble(y_train, ensemble_class=EnsembleSelection)

############################################################################
# Print the final ensemble constructed by auto-sklearn
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,6 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
scorer = autosklearn.metrics.accuracy
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=30,
seed=1,
metric=scorer,
)
Expand All @@ -107,7 +106,6 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=30,
seed=1,
metric=accuracy_scorer,
)
Expand All @@ -133,7 +131,6 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=30,
seed=1,
metric=error_rate,
)
Expand Down Expand Up @@ -184,7 +181,6 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=30,
seed=1,
metric=error_rate,
)
Expand Down Expand Up @@ -217,10 +213,8 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
)
cls = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=60,
per_run_time_limit=30,
seed=1,
metric=accuracy_scorer,
ensemble_size=0,
)
cls.fit(X_train, y_train)

Expand All @@ -232,4 +226,4 @@ def metric_which_needs_x(solution, prediction, X_data, consider_col, val_thresho
consider_col=1,
val_threshold=18.8,
)
print(f"Error score {score:.3f} using {error_rate.name:s}")
print(f"Error score {score:.3f} using {accuracy_scorer.name:s}")
Original file line number Diff line number Diff line change
Expand Up @@ -9,11 +9,13 @@
in the `scikit-learn docs <https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html>`_.
Auto-sklearn uses `SMAC3's implementation of ParEGO <https://automl.github.io/SMAC3/main/details/multi_objective.html>`_.
Multi-objective ensembling and proper access to the full Pareto front will be added in the near
Multi-objective ensembling and proper access to the full Pareto set will be added in the near
future.
"""
from pprint import pprint

import matplotlib.pyplot as plt
import numpy as np
import sklearn.datasets
import sklearn.metrics

Expand All @@ -25,7 +27,11 @@
# Data Loading
# ============

X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X, y = sklearn.datasets.fetch_openml(data_id=31, return_X_y=True, as_frame=True)
# Change the target to align with scikit-learn's convention that
# ``1`` is the minority class. In this example it is predicting
# that a credit is "bad", i.e. that it will default.
y = np.array([1 if val == "bad" else 0 for val in y])
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
X, y, random_state=1
)
Expand All @@ -35,11 +41,11 @@
# ==========================

automl = autosklearn.classification.AutoSklearnClassifier(
time_left_for_this_task=30,
tmp_folder="/tmp/autosklearn_multi_objective_example_tmp",
time_left_for_this_task=120,
metric=[autosklearn.metrics.precision, autosklearn.metrics.recall],
delete_tmp_folder_after_terminate=False,
)
automl.fit(X_train, y_train, dataset_name="breast_cancer")
automl.fit(X_train, y_train, dataset_name="German Credit")

############################################################################
# Compute the two competing metrics
Expand All @@ -63,3 +69,22 @@
# to *auto-sklearn*.

pprint(automl.cv_results_)

############################################################################
# Visualize the Pareto set
# ==========================
# Collect (precision, recall) pairs, one per ensemble on the Pareto set.
plot_values = []
# Each element is a fitted ensemble that can be used like a classifier.
# NOTE(review): despite the name, this is the Pareto *set* returned by
# ``get_pareto_set()``, not a sorted front.
pareto_front = automl.get_pareto_set()
for ensemble in pareto_front:
    predictions = ensemble.predict(X_test)
    # Score each ensemble on the held-out test set with the same two
    # metrics that were optimized during the search.
    precision = sklearn.metrics.precision_score(y_test, predictions)
    recall = sklearn.metrics.recall_score(y_test, predictions)
    plot_values.append((precision, recall))
# Scatter-plot the trade-off between the two competing objectives.
fig = plt.figure()
ax = fig.add_subplot(111)
for precision, recall in plot_values:
    ax.scatter(precision, recall, c="blue")
ax.set_xlabel("Precision")
ax.set_ylabel("Recall")
ax.set_title("Pareto set")
plt.show()
Original file line number Diff line number Diff line change
Expand Up @@ -44,7 +44,7 @@
},
"outputs": [],
"source": [
"X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X, y, random_state=1\n)"
"from autosklearn.ensembles.ensemble_selection import EnsembleSelection\n\nX, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X, y, random_state=1\n)"
]
},
{
Expand All @@ -62,7 +62,7 @@
},
"outputs": [],
"source": [
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=120,\n per_run_time_limit=30,\n tmp_folder=\"/tmp/autosklearn_sequential_example_tmp\",\n # Do not construct ensembles in parallel to avoid using more than one\n # core at a time. The ensemble will be constructed after auto-sklearn\n # finished fitting all machine learning models.\n ensemble_size=0,\n delete_tmp_folder_after_terminate=False,\n)\nautoml.fit(X_train, y_train, dataset_name=\"breast_cancer\")\n\n# This call to fit_ensemble uses all models trained in the previous call\n# to fit to build an ensemble which can be used with automl.predict()\nautoml.fit_ensemble(y_train, ensemble_size=50)"
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n tmp_folder=\"/tmp/autosklearn_sequential_example_tmp\",\n # Do not construct ensembles in parallel to avoid using more than one\n # core at a time. The ensemble will be constructed after auto-sklearn\n # finished fitting all machine learning models.\n ensemble_class=None,\n delete_tmp_folder_after_terminate=False,\n)\nautoml.fit(X_train, y_train, dataset_name=\"breast_cancer\")\n\n# This call to fit_ensemble uses all models trained in the previous call\n# to fit to build an ensemble which can be used with automl.predict()\nautoml.fit_ensemble(y_train, ensemble_class=EnsembleSelection)"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -67,7 +67,7 @@
"select_percentile_classification",
],
},
ensemble_size=1,
ensemble_kwargs={"ensemble_size": 1},
)
automl.fit(X_train, y_train, dataset_name="breast_cancer")

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
},
"outputs": [],
"source": [
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=120,\n per_run_time_limit=30,\n tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n include={\n \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n \"feature_preprocessor\": [\n \"no_preprocessing\",\n \"polynomial\",\n \"select_percentile_classification\",\n ],\n },\n ensemble_size=1,\n)\nautoml.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=120,\n per_run_time_limit=30,\n tmp_folder=\"/tmp/autosklearn_interpretable_models_example_tmp\",\n include={\n \"classifier\": [\"decision_tree\", \"lda\", \"sgd\"],\n \"feature_preprocessor\": [\n \"no_preprocessing\",\n \"polynomial\",\n \"select_percentile_classification\",\n ],\n },\n ensemble_kwargs={\"ensemble_size\": 1},\n)\nautoml.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
]
},
{
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -98,7 +98,7 @@
},
"outputs": [],
"source": [
"print(\"#\" * 80)\nprint(\"Use predefined accuracy metric\")\nscorer = autosklearn.metrics.accuracy\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n per_run_time_limit=30,\n seed=1,\n metric=scorer,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = scorer(y_test, predictions)\nprint(f\"Accuracy score {score:.3f} using {scorer.name}\")"
"print(\"#\" * 80)\nprint(\"Use predefined accuracy metric\")\nscorer = autosklearn.metrics.accuracy\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n seed=1,\n metric=scorer,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = scorer(y_test, predictions)\nprint(f\"Accuracy score {score:.3f} using {scorer.name}\")"
]
},
{
Expand All @@ -116,7 +116,7 @@
},
"outputs": [],
"source": [
"print(\"#\" * 80)\nprint(\"Use self defined accuracy metric\")\naccuracy_scorer = autosklearn.metrics.make_scorer(\n name=\"accu\",\n score_func=accuracy,\n optimum=1,\n greater_is_better=True,\n needs_proba=False,\n needs_threshold=False,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n per_run_time_limit=30,\n seed=1,\n metric=accuracy_scorer,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = accuracy_scorer(y_test, predictions)\nprint(f\"Accuracy score {score:.3f} using {accuracy_scorer.name:s}\")"
"print(\"#\" * 80)\nprint(\"Use self defined accuracy metric\")\naccuracy_scorer = autosklearn.metrics.make_scorer(\n name=\"accu\",\n score_func=accuracy,\n optimum=1,\n greater_is_better=True,\n needs_proba=False,\n needs_threshold=False,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n seed=1,\n metric=accuracy_scorer,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = accuracy_scorer(y_test, predictions)\nprint(f\"Accuracy score {score:.3f} using {accuracy_scorer.name:s}\")"
]
},
{
Expand All @@ -134,7 +134,7 @@
},
"outputs": [],
"source": [
"print(\"#\" * 80)\nprint(\"Use self defined error metric\")\nerror_rate = autosklearn.metrics.make_scorer(\n name=\"error\",\n score_func=error,\n optimum=0,\n greater_is_better=False,\n needs_proba=False,\n needs_threshold=False,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n per_run_time_limit=30,\n seed=1,\n metric=error_rate,\n)\ncls.fit(X_train, y_train)\n\ncls.predictions = cls.predict(X_test)\nscore = error_rate(y_test, predictions)\nprint(f\"Error score {score:.3f} using {error_rate.name:s}\")"
"print(\"#\" * 80)\nprint(\"Use self defined error metric\")\nerror_rate = autosklearn.metrics.make_scorer(\n    name=\"error\",\n    score_func=error,\n    optimum=0,\n    greater_is_better=False,\n    needs_proba=False,\n    needs_threshold=False,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n    time_left_for_this_task=60,\n    seed=1,\n    metric=error_rate,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = error_rate(y_test, predictions)\nprint(f\"Error score {score:.3f} using {error_rate.name:s}\")"
]
},
{
Expand Down Expand Up @@ -170,7 +170,7 @@
},
"outputs": [],
"source": [
"print(\"#\" * 80)\nprint(\"Use self defined error with additional argument\")\nerror_rate = autosklearn.metrics.make_scorer(\n name=\"error_add\",\n score_func=error_wk,\n optimum=0,\n greater_is_better=True,\n needs_proba=False,\n needs_threshold=False,\n extra_argument=None,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n per_run_time_limit=30,\n seed=1,\n metric=error_rate,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = error_rate(y_test, predictions)\nprint(f\"Error score {score:.3f} using {error_rate.name:s}\")"
"print(\"#\" * 80)\nprint(\"Use self defined error with additional argument\")\nerror_rate = autosklearn.metrics.make_scorer(\n name=\"error_add\",\n score_func=error_wk,\n optimum=0,\n greater_is_better=True,\n needs_proba=False,\n needs_threshold=False,\n extra_argument=None,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n seed=1,\n metric=error_rate,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = error_rate(y_test, predictions)\nprint(f\"Error score {score:.3f} using {error_rate.name:s}\")"
]
},
{
Expand All @@ -188,7 +188,7 @@
},
"outputs": [],
"source": [
"\"\"\"\nFinally, *Auto-sklearn* also support metric that require the train data (aka X_data) to\ncompute a value. This can be useful if one only cares about the score on a subset of the\ndata.\n\"\"\"\n\naccuracy_scorer = autosklearn.metrics.make_scorer(\n name=\"accu_X\",\n score_func=metric_which_needs_x,\n optimum=1,\n greater_is_better=True,\n needs_proba=False,\n needs_X=True,\n needs_threshold=False,\n consider_col=1,\n val_threshold=18.8,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n per_run_time_limit=30,\n seed=1,\n metric=accuracy_scorer,\n ensemble_size=0,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = metric_which_needs_x(\n y_test,\n predictions,\n X_data=X_test,\n consider_col=1,\n val_threshold=18.8,\n)\nprint(f\"Error score {score:.3f} using {error_rate.name:s}\")"
"\"\"\"\nFinally, *Auto-sklearn* also support metric that require the train data (aka X_data) to\ncompute a value. This can be useful if one only cares about the score on a subset of the\ndata.\n\"\"\"\n\naccuracy_scorer = autosklearn.metrics.make_scorer(\n name=\"accu_X\",\n score_func=metric_which_needs_x,\n optimum=1,\n greater_is_better=True,\n needs_proba=False,\n needs_X=True,\n needs_threshold=False,\n consider_col=1,\n val_threshold=18.8,\n)\ncls = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=60,\n seed=1,\n metric=accuracy_scorer,\n)\ncls.fit(X_train, y_train)\n\npredictions = cls.predict(X_test)\nscore = metric_which_needs_x(\n y_test,\n predictions,\n X_data=X_test,\n consider_col=1,\n val_threshold=18.8,\n)\nprint(f\"Error score {score:.3f} using {accuracy_scorer.name:s}\")"
]
}
],
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"\n# Classification\n\nThe following example shows how to fit *auto-sklearn* to optimize for two\ncompeting metrics: `precision` and `recall` (read more on this tradeoff\nin the `scikit-learn docs <https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html>`_.\n\nAuto-sklearn uses `SMAC3's implementation of ParEGO <https://automl.github.io/SMAC3/main/details/multi_objective.html>`_.\nMulti-objective ensembling and proper access to the full Pareto front will be added in the near\nfuture.\n"
"\n# Classification\n\nThe following example shows how to fit *auto-sklearn* to optimize for two\ncompeting metrics: `precision` and `recall` (read more on this tradeoff\nin the `scikit-learn docs <https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html>`_.\n\nAuto-sklearn uses `SMAC3's implementation of ParEGO <https://automl.github.io/SMAC3/main/details/multi_objective.html>`_.\nMulti-objective ensembling and proper access to the full Pareto set will be added in the near\nfuture.\n"
]
},
{
Expand All @@ -26,7 +26,7 @@
},
"outputs": [],
"source": [
"from pprint import pprint\n\nimport sklearn.datasets\nimport sklearn.metrics\n\nimport autosklearn.classification\nimport autosklearn.metrics"
"from pprint import pprint\n\nimport matplotlib.pyplot as plt\nimport numpy as np\nimport sklearn.datasets\nimport sklearn.metrics\n\nimport autosklearn.classification\nimport autosklearn.metrics"
]
},
{
Expand All @@ -44,7 +44,7 @@
},
"outputs": [],
"source": [
"X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X, y, random_state=1\n)"
"X, y = sklearn.datasets.fetch_openml(data_id=31, return_X_y=True, as_frame=True)\n# Change the target to align with scikit-learn's convention that\n# ``1`` is the minority class. In this example it is predicting\n# that a credit is \"bad\", i.e. that it will default.\ny = np.array([1 if val == \"bad\" else 0 for val in y])\nX_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(\n X, y, random_state=1\n)"
]
},
{
Expand All @@ -62,7 +62,7 @@
},
"outputs": [],
"source": [
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=30,\n tmp_folder=\"/tmp/autosklearn_multi_objective_example_tmp\",\n metric=[autosklearn.metrics.precision, autosklearn.metrics.recall],\n)\nautoml.fit(X_train, y_train, dataset_name=\"breast_cancer\")"
"automl = autosklearn.classification.AutoSklearnClassifier(\n time_left_for_this_task=120,\n metric=[autosklearn.metrics.precision, autosklearn.metrics.recall],\n delete_tmp_folder_after_terminate=False,\n)\nautoml.fit(X_train, y_train, dataset_name=\"German Credit\")"
]
},
{
Expand Down Expand Up @@ -118,6 +118,24 @@
"source": [
"pprint(automl.cv_results_)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize the Pareto set\n\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"plot_values = []\npareto_front = automl.get_pareto_set()\nfor ensemble in pareto_front:\n predictions = ensemble.predict(X_test)\n precision = sklearn.metrics.precision_score(y_test, predictions)\n recall = sklearn.metrics.recall_score(y_test, predictions)\n plot_values.append((precision, recall))\nfig = plt.figure()\nax = fig.add_subplot(111)\nfor precision, recall in plot_values:\n ax.scatter(precision, recall, c=\"blue\")\nax.set_xlabel(\"Precision\")\nax.set_ylabel(\"Recall\")\nax.set_title(\"Pareto set\")\nplt.show()"
]
}
],
"metadata": {
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ def cli_start_worker(scheduler_file_name):
y_train,
task=MULTICLASS_CLASSIFICATION,
dataset_name="digits",
ensemble_size=20,
ensemble_kwargs={"ensemble_size": 20},
ensemble_nbest=50,
)

Expand Down
Binary file not shown.
Loading

0 comments on commit 159cf70

Please sign in to comment.