Skip to content

Commit

Permalink
Add option to AutoMLSearch to exclude featurizers from pipelines (#3631)
Browse files Browse the repository at this point in the history
* exclude featurizers for default algorithm

* fix init

* remove tmp KeyError

* remove tmp Error

* docstring cleanup

* fix iterative algo

* update release notes

* update test

* clean up docstring wording

* add default and iterative tests

* update automlsearch test

* fix default algo

* lint fix

* fix for None iteration

* update test assertions

* more assertion updates

* add check for invalid input

* lint fix
  • Loading branch information
thehomebrewnerd authored Aug 2, 2022
1 parent 05a38e0 commit 31d4708
Show file tree
Hide file tree
Showing 8 changed files with 390 additions and 56 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
* Fixes
* Changes
* Documentation Changes
Expand Down
20 changes: 18 additions & 2 deletions evalml/automl/automl_algorithm/default_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ class DefaultAlgorithm(AutoMLAlgorithm):
AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False.
features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature has not been computed yet.
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by DefaultAlgorithm.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
"""

def __init__(
Expand All @@ -86,6 +88,7 @@ def __init__(
allow_long_running_models=False,
features=None,
verbose=False,
exclude_featurizers=None,
):
super().__init__(
allowed_pipelines=[],
Expand Down Expand Up @@ -114,6 +117,7 @@ def __init__(
self._X_without_cat_cols = None
self.features = features
self.ensembling = ensembling
self.exclude_featurizers = exclude_featurizers or []

# TODO remove on resolution of 3186
if is_time_series(self.problem_type) and self.ensembling:
Expand Down Expand Up @@ -197,6 +201,7 @@ def _create_naive_pipelines(self, use_features=False):
None,
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
)
for estimator in estimators
]
Expand Down Expand Up @@ -339,6 +344,7 @@ def _make_pipelines_helper(self, estimators):
None,
),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
)
for estimator in estimators
]
Expand Down Expand Up @@ -421,14 +427,20 @@ def _parse_selected_categorical_features(self, pipeline):
self._selected_cols,
self._selected_cat_cols,
)
if list(self.X.ww.select("URL", return_schema=True).columns):
if (
list(self.X.ww.select("URL", return_schema=True).columns)
and "URLFeaturizer" not in self.exclude_featurizers
):
self._get_feature_provenance_and_remove_engineered_features(
pipeline,
URLFeaturizer.name,
self._selected_cat_cols,
self._selected_cat_cols,
)
if list(self.X.ww.select("EmailAddress", return_schema=True).columns):
if (
list(self.X.ww.select("EmailAddress", return_schema=True).columns)
and "EmailFeaturizer" not in self.exclude_featurizers
):
self._get_feature_provenance_and_remove_engineered_features(
pipeline,
EmailFeaturizer.name,
Expand Down Expand Up @@ -527,6 +539,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
parameters=categorical_pipeline_parameters,
extra_components_before=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
)

numeric_pipeline = make_pipeline(
Expand All @@ -539,6 +552,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
extra_components_before=[SelectByType],
extra_components_after=[SelectColumns],
use_estimator=False,
exclude_featurizers=self.exclude_featurizers,
)
prior_components = (
{"DFS Transformer": ["DFS Transformer", "X", "y"]}
Expand Down Expand Up @@ -572,6 +586,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
parameters=categorical_pipeline_parameters,
extra_components_before=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
)
return categorical_pipeline
else:
Expand All @@ -587,5 +602,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
parameters=numeric_pipeline_parameters,
extra_components_after=[SelectColumns],
features=self.features,
exclude_featurizers=self.exclude_featurizers,
)
return numeric_pipeline
5 changes: 5 additions & 0 deletions evalml/automl/automl_algorithm/iterative_algorithm.py
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,8 @@ class IterativeAlgorithm(AutoMLAlgorithm):
AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False.
features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input.
verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
"""

def __init__(
Expand All @@ -90,6 +92,7 @@ def __init__(
allow_long_running_models=False,
features=None,
verbose=False,
exclude_featurizers=None,
):
self.X = infer_feature_types(X)
self.y = infer_feature_types(y)
Expand Down Expand Up @@ -125,6 +128,7 @@ def __init__(
self.features = features
self.allowed_component_graphs = allowed_component_graphs
self._set_additional_pipeline_params()
self.exclude_featurizers = exclude_featurizers

super().__init__(
allowed_pipelines=self.allowed_pipelines,
Expand Down Expand Up @@ -175,6 +179,7 @@ def _create_pipelines(self):
{},
).get("known_in_advance", None),
features=self.features,
exclude_featurizers=self.exclude_featurizers,
)
for estimator in allowed_estimators
]
Expand Down
22 changes: 22 additions & 0 deletions evalml/automl/automl_search.py
Original file line number Diff line number Diff line change
Expand Up @@ -424,6 +424,8 @@ class AutoMLSearch:
verbose (boolean): Whether or not to display semi-real-time updates to stdout while search is running. Defaults to False.
timing (boolean): Whether or not to write pipeline search times to the logger. Defaults to False.
exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by search.
Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer"
holdout_set_size (float): The size of the holdout set that AutoML search will take for datasets larger than 500 rows. If set to 0, holdout set will not be taken regardless of number of rows. Must be between 0 and 1, exclusive. Defaults to 0.1.
"""
Expand Down Expand Up @@ -471,6 +473,7 @@ def __init__(
engine="sequential",
verbose=False,
timing=False,
exclude_featurizers=None,
holdout_set_size=0.1,
):
self.verbose = verbose
Expand Down Expand Up @@ -737,6 +740,19 @@ def __init__(
self.sampler_balanced_ratio = sampler_balanced_ratio
self._sampler_name = None

featurizer_names = [
"DatetimeFeaturizer",
"EmailFeaturizer",
"URLFeaturizer",
"NaturalLanguageFeaturizer",
"TimeSeriesFeaturizer",
]
if exclude_featurizers and (set(exclude_featurizers) - set(featurizer_names)):
raise ValueError(
f"Invalid value provided for exclude_featurizers. Must be one of: {', '.join(featurizer_names)}"
)
self.exclude_featurizers = exclude_featurizers or []

if is_classification(self.problem_type):
self._sampler_name = self.sampler_method
if self.sampler_method == "auto":
Expand Down Expand Up @@ -806,6 +822,7 @@ def __init__(
allow_long_running_models=allow_long_running_models,
features=features,
verbose=self.verbose,
exclude_featurizers=self.exclude_featurizers,
)
elif automl_algorithm == "default":
self.automl_algorithm = DefaultAlgorithm(
Expand All @@ -822,6 +839,7 @@ def __init__(
ensembling=self.ensembling,
verbose=self.verbose,
n_jobs=self.n_jobs,
exclude_featurizers=self.exclude_featurizers,
)
else:
raise ValueError("Please specify a valid automl algorithm.")
Expand Down Expand Up @@ -1248,11 +1266,15 @@ def _get_baseline_pipeline(self):
gap = self.problem_configuration["gap"]
forecast_horizon = self.problem_configuration["forecast_horizon"]
time_index = self.problem_configuration["time_index"]
exclude_timeseries_featurizer = (
"TimeSeriesFeaturizer" in self.exclude_featurizers
)
baseline = make_timeseries_baseline_pipeline(
self.problem_type,
gap,
forecast_horizon,
time_index,
exclude_timeseries_featurizer,
)
return baseline

Expand Down
Loading

0 comments on commit 31d4708

Please sign in to comment.