Merge pull request #1192 from automl/development
Development
mfeurer authored Jul 28, 2021
2 parents 904a692 + 96b9ad0 commit 3d53cd9
Showing 15 changed files with 513 additions and 35 deletions.
5 changes: 5 additions & 0 deletions .gitignore
@@ -1,8 +1,13 @@
# Documentation
docs/build/*
docs/examples

*.py[cod]

# Examples
# examples 40_advanced generate a tmp_folder
examples/40_advanced/tmp_folder

# C extensions
*.c
*.so
2 changes: 1 addition & 1 deletion autosklearn/__version__.py
@@ -1,4 +1,4 @@
"""Version information."""

# The following line *must* be the last in the module, exactly as formatted:
__version__ = "0.12.8"
__version__ = "0.13.0"
1 change: 0 additions & 1 deletion autosklearn/automl.py
@@ -201,7 +201,6 @@ def __init__(self,
self.cv_models_ = None
self.ensemble_ = None
self._can_predict = False

self._debug_mode = debug_mode

self.InputValidator = None # type: Optional[InputValidator]
22 changes: 14 additions & 8 deletions autosklearn/ensembles/ensemble_selection.py
@@ -278,14 +278,20 @@ def predict(self, predictions: Union[np.ndarray, List[np.ndarray]]) -> np.ndarray:
return average

def __str__(self) -> str:
return 'Ensemble Selection:\n\tTrajectory: %s\n\tMembers: %s' \
'\n\tWeights: %s\n\tIdentifiers: %s' % \
(' '.join(['%d: %5f' % (idx, performance)
for idx, performance in enumerate(self.trajectory_)]),
self.indices_, self.weights_,
' '.join([str(identifier) for idx, identifier in
enumerate(self.identifiers_)
if self.weights_[idx] > 0]))
trajectory_str = ' '.join([
f'{id}: {perf:.5f}'
for id, perf in enumerate(self.trajectory_)
])
identifiers_str = ' '.join([
f'{identifier}'
for idx, identifier in enumerate(self.identifiers_)
if self.weights_[idx] > 0
])
return ("Ensemble Selection:\n"
f"\tTrajectory: {trajectory_str}\n"
f"\tMembers: {self.indices_}\n"
f"\tWeights: {self.weights_}\n"
f"\tIdentifiers: {identifiers_str}\n")

def get_models_with_weights(
self,
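To illustrate the refactored __str__ above, here is a minimal standalone sketch (not part of the diff); the trajectory, indices, weights and identifiers are made-up toy values standing in for the real trajectory_, indices_, weights_ and identifiers_ attributes:

# A minimal sketch of how the new f-string based __str__ renders; all values
# below are hypothetical stand-ins (identifiers are (seed, num_run, budget)
# tuples in auto-sklearn).
trajectory = [0.31, 0.26, 0.215]
indices = [4, 7, 4]
weights = [0.0, 0.5, 0.25, 0.25]
identifiers = [(1, 2, 0.0), (1, 3, 0.0), (1, 4, 0.0), (1, 5, 0.0)]

trajectory_str = ' '.join([
    f'{idx}: {perf:.5f}'
    for idx, perf in enumerate(trajectory)
])
identifiers_str = ' '.join([
    f'{identifier}'
    for idx, identifier in enumerate(identifiers)
    if weights[idx] > 0
])
print("Ensemble Selection:\n"
      f"\tTrajectory: {trajectory_str}\n"
      f"\tMembers: {indices}\n"
      f"\tWeights: {weights}\n"
      f"\tIdentifiers: {identifiers_str}\n")

Only members with a non-zero weight end up in the identifiers string, matching the `if self.weights_[idx] > 0` filter in the diff.
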
279 changes: 277 additions & 2 deletions autosklearn/estimators.py
@@ -1,11 +1,12 @@
# -*- encoding: utf-8 -*-

from typing import Optional, Dict, List, Tuple, Union
from typing import Optional, Dict, List, Tuple, Union, Iterable
from typing_extensions import Literal

from ConfigSpace.configuration_space import Configuration
import dask.distributed
import joblib
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
from sklearn.utils.multiclass import type_of_target
from smac.runhistory.runhistory import RunInfo, RunValue
@@ -550,6 +551,280 @@ def sprint_statistics(self):
"""
return self.automl_.sprint_statistics()

def leaderboard(
self,
detailed: bool = False,
ensemble_only: bool = True,
top_k: Union[int, Literal['all']] = 'all',
sort_by: str = 'cost',
sort_order: Literal['auto', 'ascending', 'descending'] = 'auto',
include: Optional[Union[str, Iterable[str]]] = None
) -> pd.DataFrame:
""" Returns a pandas table of results for all evaluated models.
Gives an overview of all models trained during the search process along
with various statistics about their training.
The availble statistics are:
**Simple**:
* ``"model_id"`` - The id given to a model by ``autosklearn``.
* ``"rank"`` - The rank of the model based on it's ``"cost"``.
* ``"ensemble_weight"`` - The weight given to the model in the ensemble.
* ``"type"`` - The type of classifier/regressor used.
* ``"cost"`` - The loss of the model on the validation set.
* ``"duration"`` - Length of time the model was optimized for.
**Detailed**:
The detailed view includes all of the simple statistics along with the
following.
* ``"config_id"`` - The id used by SMAC for optimization.
* ``"budget"`` - How much budget was allocated to this model.
* ``"status"`` - The return status of training the model with SMAC.
* ``"train_loss"`` - The loss of the model on the training set.
* ``"balancing_strategy"`` - The balancing strategy used for data preprocessing.
* ``"start_time"`` - Time the model began being optimized
* ``"end_time"`` - Time the model ended being optimized
* ``"data_preprocessors"`` - The preprocessors used on the data
* ``"feature_preprocessors"`` - The preprocessors for features types
Parameters
----------
detailed: bool = False
Whether to give detailed information or just a simple overview.
ensemble_only: bool = True
Whether to view only models included in the ensemble or all models
trained.
top_k: int or "all" = "all"
How many models to display.
sort_by: str = 'cost'
What column to sort by. If that column is not present, the
sorting defaults to the ``"model_id"`` index column.
sort_order: "auto" or "ascending" or "descending" = "auto"
Which sort order to apply to the ``sort_by`` column. If left
as ``"auto"``, it will sort by a sensible default where "better" is
on top, otherwise defaulting to the pandas default for
`DataFrame.sort_values`_ if there is no obvious "better".
.. _DataFrame.sort_values: https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.sort_values.html
include: Optional[str or Iterable[str]]
Items to include, other items not specified will be excluded.
The exception is the ``"model_id"`` index column which is always included.
If left as ``None``, it will fall back to using the ``detailed``
parameter to decide which columns to include.

Returns
-------
pd.DataFrame
A dataframe of statistics for the models, ordered by ``sort_by``.
""" # noqa (links are too long)
# TODO validate that `self` is fitted. This is required for
# self.ensemble_ to get the identifiers of models it will generate
# weights for.
column_types = AutoSklearnEstimator._leaderboard_columns()

# Validation of top_k
if (
not (isinstance(top_k, str) or isinstance(top_k, int))
or (isinstance(top_k, str) and top_k != 'all')
or (isinstance(top_k, int) and top_k <= 0)
):
raise ValueError(f"top_k={top_k} must be a positive integer or pass"
" `top_k`='all' to view results for all models")

# Validate columns to include
if isinstance(include, str):
include = [include]

if include == ['model_id']:
raise ValueError('Must provide more than just `model_id`')

if include is not None:
columns = [*include]

# 'model_id' should always be present as it is the unique index
# used for pandas
if 'model_id' not in columns:
columns.append('model_id')

invalid_include_items = set(columns) - set(column_types['all'])
if len(invalid_include_items) != 0:
raise ValueError(f"Values {invalid_include_items} are not known"
f" columns to include, must be contained in "
f"{column_types['all']}")
elif detailed:
columns = column_types['all']
else:
columns = column_types['simple']

# Validation of sorting
if sort_by not in column_types['all']:
raise ValueError(f"sort_by='{sort_by}' must be one of included "
f"columns {set(column_types['all'])}")

valid_sort_orders = ['auto', 'ascending', 'descending']
if not (isinstance(sort_order, str) and sort_order in valid_sort_orders):
raise ValueError(f"`sort_order` = {sort_order} must be a str in "
f"{valid_sort_orders}")

# To get all the models that were optimized, we collect what we can from
# runhistory first.
def has_key(rv, key):
return rv.additional_info and key in rv.additional_info

model_runs = {
rval.additional_info['num_run']: {
'model_id': rval.additional_info['num_run'],
'seed': rkey.seed,
'budget': rkey.budget,
'duration': rval.time,
'config_id': rkey.config_id,
'start_time': rval.starttime,
'end_time': rval.endtime,
'status': str(rval.status),
'cost': rval.cost,
'train_loss': rval.additional_info['train_loss']
if has_key(rval, 'train_loss') else None,
'config_origin': rval.additional_info['configuration_origin']
if has_key(rval, 'configuration_origin') else None
}
for rkey, rval in self.automl_.runhistory_.data.items()
if has_key(rval, 'num_run')
}

# Next we get some info about the model itself
model_class_strings = {
AutoMLClassifier: 'classifier',
AutoMLRegressor: 'regressor'
}
model_type = model_class_strings.get(self._get_automl_class(), None)
if model_type is None:
raise RuntimeError(f"Unknown `automl_class` {self._get_automl_class()}")

# A dict mapping model ids to their configurations
configurations = self.automl_.runhistory_.ids_config

for model_id, run_info in model_runs.items():
config_id = run_info['config_id']
run_config = configurations[config_id]._values

run_info.update({
'balancing_strategy': run_config.get('balancing:strategy', None),
'type': run_config[f'{model_type}:__choice__'],
'data_preprocessors': [
value for key, value in run_config.items()
if 'data_preprocessing' in key and '__choice__' in key
],
'feature_preprocessors': [
value for key, value in run_config.items()
if 'feature_preprocessor' in key and '__choice__' in key
]
})

# Get the model's ensemble weight if it has one
# TODO both implementing classes of AbstractEnsemble have a property
# `identifiers_` and `weights_`, might be good to put it as an
# abstract property
# TODO `ensemble_.identifiers_` and `ensemble_.weights_` are loosely
# tied together by ordering, might be better to store as tuple
for i, weight in enumerate(self.automl_.ensemble_.weights_):
(_, model_id, _) = self.automl_.ensemble_.identifiers_[i]
model_runs[model_id]['ensemble_weight'] = weight

# Filter out non-ensemble members if needed, else fill in a default
# value of 0 if it's missing
if ensemble_only:
model_runs = {
model_id: info
for model_id, info in model_runs.items()
if ('ensemble_weight' in info and info['ensemble_weight'] > 0)
}
else:
for model_id, info in model_runs.items():
if 'ensemble_weight' not in info:
info['ensemble_weight'] = 0

# `rank` relies on `cost` so we include `cost`
# We drop it later if it's not requested
if 'rank' in columns and 'cost' not in columns:
columns = [*columns, 'cost']

# Finally, convert into a tabular format by converting the dict into
# column wise orientation.
dataframe = pd.DataFrame({
col: [run_info[col] for run_info in model_runs.values()]
for col in columns if col != 'rank'
})

# Give it an index, even if not in the `include`
dataframe.set_index('model_id', inplace=True)

# Add the `rank` column if needed, dropping `cost` if it's not
# requested by the user
if 'rank' in columns:
dataframe.sort_values(by='cost', ascending=True, inplace=True)
dataframe.insert(column='rank',
value=range(1, len(dataframe) + 1),
loc=list(columns).index('rank') - 1) # account for `model_id`

if 'cost' not in columns:
dataframe.drop('cost', axis=1, inplace=True)

# Decide on the sort order depending on what it gets sorted by
descending_columns = ['ensemble_weight', 'duration']
if sort_order == 'auto':
ascending_param = False if sort_by in descending_columns else True
else:
ascending_param = False if sort_order == 'descending' else True

# Sort by the given column name, defaulting to 'model_id' if not present
if sort_by not in dataframe.columns:
self.automl_._logger.warning(f"sort_by = '{sort_by}' was not present"
", defaulting to sort on the index "
"'model_id'")
sort_by = 'model_id'

# Costs can be equal, which would leave the rank column out of order, so break ties on rank
if 'rank' in columns and sort_by == 'cost':
dataframe.sort_values(by=[sort_by, 'rank'],
ascending=[ascending_param, True],
inplace=True)
else:
dataframe.sort_values(by=sort_by,
ascending=ascending_param,
inplace=True)

# Lastly, just grab the top_k
if top_k == 'all' or top_k >= len(dataframe):
top_k = len(dataframe)

dataframe = dataframe.head(top_k)

return dataframe

@staticmethod
def _leaderboard_columns() -> Dict[Literal['all', 'simple', 'detailed'], List[str]]:
all = [
"model_id", "rank", "ensemble_weight", "type", "cost", "duration",
"config_id", "train_loss", "seed", "start_time", "end_time",
"budget", "status", "data_preprocessors", "feature_preprocessors",
"balancing_strategy", "config_origin"
]
simple = [
"model_id", "rank", "ensemble_weight", "type", "cost", "duration"
]
detailed = all
return {'all': all, 'detailed': detailed, 'simple': simple}

def _get_automl_class(self):
raise NotImplementedError()

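For context, the following is a hedged usage sketch of the new leaderboard() API added above; the dataset, time budget and column selections are illustrative assumptions rather than anything specified in the PR:

import sklearn.datasets
import sklearn.model_selection

from autosklearn.classification import AutoSklearnClassifier

# Hypothetical small run purely for illustration.
X, y = sklearn.datasets.load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=1
)

automl = AutoSklearnClassifier(time_left_for_this_task=120, per_run_time_limit=30)
automl.fit(X_train, y_train)

# Simple view: ensemble members only, best (lowest) cost ranked first.
print(automl.leaderboard())

# Detailed view of every evaluated model, sorted by ensemble weight.
print(automl.leaderboard(detailed=True, ensemble_only=False,
                         sort_by='ensemble_weight'))

# Restrict the columns shown and keep only the top five models.
print(automl.leaderboard(include=['rank', 'type', 'cost', 'duration'], top_k=5))

With sort_order='auto', the diff sorts 'ensemble_weight' and 'duration' in descending order and everything else ascending, so the "better" models appear at the top.
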
8 changes: 7 additions & 1 deletion doc/Makefile
@@ -19,7 +19,7 @@ ALLSPHINXOPTS = -d $(BUILDDIR)/doctrees $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .
# the i18n builder cannot share the environment and doctrees with the others
I18NSPHINXOPTS = $(PAPEROPT_$(PAPER)) $(SPHINXOPTS) .

.PHONY: help clean html dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext
.PHONY: help clean html html-noexamples dirhtml singlehtml pickle json htmlhelp qthelp devhelp epub latex latexpdf text man changes linkcheck doctest gettext

all: html

@@ -59,6 +59,12 @@ html:
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."

html-noexamples:
$(SPHINXBUILD) -D plot_gallery=0 -b html $(ALLSPHINXOPTS) $(SOURCEDIR) $(BUILDDIR)/html
@echo
@echo "Build finished. The HTML pages are in $(BUILDDIR)/html."


dirhtml:
$(SPHINXBUILD) -b dirhtml $(ALLSPHINXOPTS) $(BUILDDIR)/dirhtml
@echo