Add option to AutoMLSearch to exclude featurizers from pipelines (#3631)

* exclude featurizers for default algorithm * fix init * remove tmp KeyError * remove tmp Error * docstring cleanup * fix iterative algo * update release notes * update test * clean up docstring wording * add default and iterative tests * update automlsearch test * fix default algo * lint fix * fix for None iteration * update test assertions * more assertion updates * add check for invalid input * lint fix
alteryx · Aug 2, 2022 · 31d4708 · 31d4708
1 parent 05a38e0
commit 31d4708
Show file tree

Hide file tree

Showing 8 changed files with 390 additions and 56 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
     * Fixes
     * Changes
     * Documentation Changes

diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py
@@ -66,6 +66,8 @@ class DefaultAlgorithm(AutoMLAlgorithm):
             AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False.
         features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature has not been computed yet.
         verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
+        exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by DefaultAlgorithm.
+            Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", and "TimeSeriesFeaturizer". Defaults to None.
     """
 
     def __init__(
@@ -86,6 +88,7 @@ def __init__(
         allow_long_running_models=False,
         features=None,
         verbose=False,
+        exclude_featurizers=None,
     ):
         super().__init__(
             allowed_pipelines=[],
@@ -114,6 +117,7 @@ def __init__(
         self._X_without_cat_cols = None
         self.features = features
         self.ensembling = ensembling
+        self.exclude_featurizers = exclude_featurizers or []
 
         # TODO remove on resolution of 3186
         if is_time_series(self.problem_type) and self.ensembling:
@@ -197,6 +201,7 @@ def _create_naive_pipelines(self, use_features=False):
                     None,
                 ),
                 features=self.features,
+                exclude_featurizers=self.exclude_featurizers,
             )
             for estimator in estimators
         ]
@@ -339,6 +344,7 @@ def _make_pipelines_helper(self, estimators):
                         None,
                     ),
                     features=self.features,
+                    exclude_featurizers=self.exclude_featurizers,
                 )
                 for estimator in estimators
             ]
@@ -421,14 +427,20 @@ def _parse_selected_categorical_features(self, pipeline):
                 self._selected_cols,
                 self._selected_cat_cols,
             )
-        if list(self.X.ww.select("URL", return_schema=True).columns):
+        if (
+            list(self.X.ww.select("URL", return_schema=True).columns)
+            and "URLFeaturizer" not in self.exclude_featurizers
+        ):
             self._get_feature_provenance_and_remove_engineered_features(
                 pipeline,
                 URLFeaturizer.name,
                 self._selected_cat_cols,
                 self._selected_cat_cols,
             )
-        if list(self.X.ww.select("EmailAddress", return_schema=True).columns):
+        if (
+            list(self.X.ww.select("EmailAddress", return_schema=True).columns)
+            and "EmailFeaturizer" not in self.exclude_featurizers
+        ):
             self._get_feature_provenance_and_remove_engineered_features(
                 pipeline,
                 EmailFeaturizer.name,
@@ -527,6 +539,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
                 parameters=categorical_pipeline_parameters,
                 extra_components_before=[SelectColumns],
                 use_estimator=False,
+                exclude_featurizers=self.exclude_featurizers,
             )
 
             numeric_pipeline = make_pipeline(
@@ -539,6 +552,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
                 extra_components_before=[SelectByType],
                 extra_components_after=[SelectColumns],
                 use_estimator=False,
+                exclude_featurizers=self.exclude_featurizers,
             )
             prior_components = (
                 {"DFS Transformer": ["DFS Transformer", "X", "y"]}
@@ -572,6 +586,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
                 parameters=categorical_pipeline_parameters,
                 extra_components_before=[SelectColumns],
                 features=self.features,
+                exclude_featurizers=self.exclude_featurizers,
             )
             return categorical_pipeline
         else:
@@ -587,5 +602,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None):
                 parameters=numeric_pipeline_parameters,
                 extra_components_after=[SelectColumns],
                 features=self.features,
+                exclude_featurizers=self.exclude_featurizers,
             )
             return numeric_pipeline
diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py
@@ -66,6 +66,8 @@ class IterativeAlgorithm(AutoMLAlgorithm):
             AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False.
         features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input.
         verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False.
+        exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm.
+            Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", and "TimeSeriesFeaturizer". Defaults to None.
     """
 
     def __init__(
@@ -90,6 +92,7 @@ def __init__(
         allow_long_running_models=False,
         features=None,
         verbose=False,
+        exclude_featurizers=None,
     ):
         self.X = infer_feature_types(X)
         self.y = infer_feature_types(y)
@@ -125,6 +128,7 @@ def __init__(
         self.features = features
         self.allowed_component_graphs = allowed_component_graphs
         self._set_additional_pipeline_params()
+        self.exclude_featurizers = exclude_featurizers
 
         super().__init__(
             allowed_pipelines=self.allowed_pipelines,
@@ -175,6 +179,7 @@ def _create_pipelines(self):
                             {},
                         ).get("known_in_advance", None),
                         features=self.features,
+                        exclude_featurizers=self.exclude_featurizers,
                     )
                     for estimator in allowed_estimators
                 ]

diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py
@@ -424,6 +424,8 @@ class AutoMLSearch:
         verbose (boolean): Whether or not to display semi-real-time updates to stdout while search is running. Defaults to False.
 
         timing (boolean): Whether or not to write pipeline search times to the logger. Defaults to False.
+        exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by search.
+            Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", and "TimeSeriesFeaturizer". Defaults to None.
 
         holdout_set_size (float): The size of the holdout set that AutoML search will take for datasets larger than 500 rows. If set to 0, holdout set will not be taken regardless of number of rows. Must be between 0 and 1, exclusive. Defaults to 0.1.
     """
@@ -471,6 +473,7 @@ def __init__(
         engine="sequential",
         verbose=False,
         timing=False,
+        exclude_featurizers=None,
         holdout_set_size=0.1,
     ):
         self.verbose = verbose
@@ -737,6 +740,19 @@ def __init__(
         self.sampler_balanced_ratio = sampler_balanced_ratio
         self._sampler_name = None
 
+        featurizer_names = [
+            "DatetimeFeaturizer",
+            "EmailFeaturizer",
+            "URLFeaturizer",
+            "NaturalLanguageFeaturizer",
+            "TimeSeriesFeaturizer",
+        ]
+        if exclude_featurizers and (set(exclude_featurizers) - set(featurizer_names)):
+            raise ValueError(
+                f"Invalid value provided for exclude_featurizers. Must be one of: {', '.join(featurizer_names)}"
+            )
+        self.exclude_featurizers = exclude_featurizers or []
+
         if is_classification(self.problem_type):
             self._sampler_name = self.sampler_method
             if self.sampler_method == "auto":
@@ -806,6 +822,7 @@ def __init__(
                 allow_long_running_models=allow_long_running_models,
                 features=features,
                 verbose=self.verbose,
+                exclude_featurizers=self.exclude_featurizers,
             )
         elif automl_algorithm == "default":
             self.automl_algorithm = DefaultAlgorithm(
@@ -822,6 +839,7 @@ def __init__(
                 ensembling=self.ensembling,
                 verbose=self.verbose,
                 n_jobs=self.n_jobs,
+                exclude_featurizers=self.exclude_featurizers,
             )
         else:
             raise ValueError("Please specify a valid automl algorithm.")
@@ -1248,11 +1266,15 @@ def _get_baseline_pipeline(self):
             gap = self.problem_configuration["gap"]
             forecast_horizon = self.problem_configuration["forecast_horizon"]
             time_index = self.problem_configuration["time_index"]
+            exclude_timeseries_featurizer = (
+                "TimeSeriesFeaturizer" in self.exclude_featurizers
+            )
             baseline = make_timeseries_baseline_pipeline(
                 self.problem_type,
                 gap,
                 forecast_horizon,
                 time_index,
+                exclude_timeseries_featurizer,
             )
         return baseline