diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index b318a994af..01951e7405 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -2,6 +2,7 @@ Release Notes ------------- **Future Releases** * Enhancements + * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`` * Fixes * Changes * Documentation Changes diff --git a/evalml/automl/automl_algorithm/default_algorithm.py b/evalml/automl/automl_algorithm/default_algorithm.py index 5590fd8ac9..1d82499536 100644 --- a/evalml/automl/automl_algorithm/default_algorithm.py +++ b/evalml/automl/automl_algorithm/default_algorithm.py @@ -66,6 +66,8 @@ class DefaultAlgorithm(AutoMLAlgorithm): AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False. features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature has not been computed yet. verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by DefaultAlgorithm. + Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" """ def __init__( @@ -86,6 +88,7 @@ def __init__( allow_long_running_models=False, features=None, verbose=False, + exclude_featurizers=None, ): super().__init__( allowed_pipelines=[], @@ -114,6 +117,7 @@ def __init__( self._X_without_cat_cols = None self.features = features self.ensembling = ensembling + self.exclude_featurizers = exclude_featurizers or [] # TODO remove on resolution of 3186 if is_time_series(self.problem_type) and self.ensembling: @@ -197,6 +201,7 @@ def _create_naive_pipelines(self, use_features=False): None, ), features=self.features, + exclude_featurizers=self.exclude_featurizers, ) for estimator in estimators ] @@ -339,6 +344,7 @@ def _make_pipelines_helper(self, estimators): None, ), features=self.features, + exclude_featurizers=self.exclude_featurizers, ) for estimator in estimators ] @@ -421,14 +427,20 @@ def _parse_selected_categorical_features(self, pipeline): self._selected_cols, self._selected_cat_cols, ) - if list(self.X.ww.select("URL", return_schema=True).columns): + if ( + list(self.X.ww.select("URL", return_schema=True).columns) + and "URLFeaturizer" not in self.exclude_featurizers + ): self._get_feature_provenance_and_remove_engineered_features( pipeline, URLFeaturizer.name, self._selected_cat_cols, self._selected_cat_cols, ) - if list(self.X.ww.select("EmailAddress", return_schema=True).columns): + if ( + list(self.X.ww.select("EmailAddress", return_schema=True).columns) + and "EmailFeaturizer" not in self.exclude_featurizers + ): self._get_feature_provenance_and_remove_engineered_features( pipeline, EmailFeaturizer.name, @@ -527,6 +539,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): parameters=categorical_pipeline_parameters, extra_components_before=[SelectColumns], use_estimator=False, + exclude_featurizers=self.exclude_featurizers, ) numeric_pipeline = make_pipeline( @@ -539,6 +552,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): extra_components_before=[SelectByType], 
extra_components_after=[SelectColumns], use_estimator=False, + exclude_featurizers=self.exclude_featurizers, ) prior_components = ( {"DFS Transformer": ["DFS Transformer", "X", "y"]} @@ -572,6 +586,7 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): parameters=categorical_pipeline_parameters, extra_components_before=[SelectColumns], features=self.features, + exclude_featurizers=self.exclude_featurizers, ) return categorical_pipeline else: @@ -587,5 +602,6 @@ def _make_split_pipeline(self, estimator, pipeline_name=None): parameters=numeric_pipeline_parameters, extra_components_after=[SelectColumns], features=self.features, + exclude_featurizers=self.exclude_featurizers, ) return numeric_pipeline diff --git a/evalml/automl/automl_algorithm/iterative_algorithm.py b/evalml/automl/automl_algorithm/iterative_algorithm.py index 1493027d95..745d5a9690 100644 --- a/evalml/automl/automl_algorithm/iterative_algorithm.py +++ b/evalml/automl/automl_algorithm/iterative_algorithm.py @@ -66,6 +66,8 @@ class IterativeAlgorithm(AutoMLAlgorithm): AutoMLSearch will not use Elastic Net or XGBoost when there are more than 75 multiclass targets and will not use CatBoost when there are more than 150 multiclass targets. Defaults to False. features (list)[FeatureBase]: List of features to run DFS on in AutoML pipelines. Defaults to None. Features will only be computed if the columns used by the feature exist in the input and if the feature itself is not in input. verbose (boolean): Whether or not to display logging information regarding pipeline building. Defaults to False. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by IterativeAlgorithm. + Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" """ def __init__( @@ -90,6 +92,7 @@ def __init__( allow_long_running_models=False, features=None, verbose=False, + exclude_featurizers=None, ): self.X = infer_feature_types(X) self.y = infer_feature_types(y) @@ -125,6 +128,7 @@ def __init__( self.features = features self.allowed_component_graphs = allowed_component_graphs self._set_additional_pipeline_params() + self.exclude_featurizers = exclude_featurizers super().__init__( allowed_pipelines=self.allowed_pipelines, @@ -175,6 +179,7 @@ def _create_pipelines(self): {}, ).get("known_in_advance", None), features=self.features, + exclude_featurizers=self.exclude_featurizers, ) for estimator in allowed_estimators ] diff --git a/evalml/automl/automl_search.py b/evalml/automl/automl_search.py index 9d812ab99a..b4ebecb9c0 100644 --- a/evalml/automl/automl_search.py +++ b/evalml/automl/automl_search.py @@ -424,6 +424,8 @@ class AutoMLSearch: verbose (boolean): Whether or not to display semi-real-time updates to stdout while search is running. Defaults to False. timing (boolean): Whether or not to write pipeline search times to the logger. Defaults to False. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipelines built by search. + Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" holdout_set_size (float): The size of the holdout set that AutoML search will take for datasets larger than 500 rows. If set to 0, holdout set will not be taken regardless of number of rows. Must be between 0 and 1, exclusive. Defaults to 0.1. 
""" @@ -471,6 +473,7 @@ def __init__( engine="sequential", verbose=False, timing=False, + exclude_featurizers=None, holdout_set_size=0.1, ): self.verbose = verbose @@ -737,6 +740,19 @@ def __init__( self.sampler_balanced_ratio = sampler_balanced_ratio self._sampler_name = None + featurizer_names = [ + "DatetimeFeaturizer", + "EmailFeaturizer", + "URLFeaturizer", + "NaturalLanguageFeaturizer", + "TimeSeriesFeaturizer", + ] + if exclude_featurizers and (set(exclude_featurizers) - set(featurizer_names)): + raise ValueError( + f"Invalid value provided for exclude_featurizers. Must be one of: {', '.join(featurizer_names)}" + ) + self.exclude_featurizers = exclude_featurizers or [] + if is_classification(self.problem_type): self._sampler_name = self.sampler_method if self.sampler_method == "auto": @@ -806,6 +822,7 @@ def __init__( allow_long_running_models=allow_long_running_models, features=features, verbose=self.verbose, + exclude_featurizers=self.exclude_featurizers, ) elif automl_algorithm == "default": self.automl_algorithm = DefaultAlgorithm( @@ -822,6 +839,7 @@ def __init__( ensembling=self.ensembling, verbose=self.verbose, n_jobs=self.n_jobs, + exclude_featurizers=self.exclude_featurizers, ) else: raise ValueError("Please specify a valid automl algorithm.") @@ -1248,11 +1266,15 @@ def _get_baseline_pipeline(self): gap = self.problem_configuration["gap"] forecast_horizon = self.problem_configuration["forecast_horizon"] time_index = self.problem_configuration["time_index"] + exclude_timeseries_featurizer = ( + "TimeSeriesFeaturizer" in self.exclude_featurizers + ) baseline = make_timeseries_baseline_pipeline( self.problem_type, gap, forecast_horizon, time_index, + exclude_timeseries_featurizer, ) return baseline diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py index 8a5fabddaa..4259128727 100644 --- a/evalml/pipelines/utils.py +++ b/evalml/pipelines/utils.py @@ -109,12 +109,18 @@ def _get_drop_index_unknown(X, y, problem_type, estimator_class, sampler_name=No return component -def _get_url_email(X, y, problem_type, estimator_class, sampler_name=None): +def _get_email(X, y, problem_type, estimator_class, sampler_name=None): components = [] email_columns = list(X.ww.select("EmailAddress", return_schema=True).columns) if len(email_columns) > 0: components.append(EmailFeaturizer) + return components + + +def _get_url(X, y, problem_type, estimator_class, sampler_name=None): + components = [] + url_columns = list(X.ww.select("URL", return_schema=True).columns) if len(url_columns) > 0: components.append(URLFeaturizer) @@ -235,6 +241,7 @@ def _get_preprocessing_components( problem_type, estimator_class, sampler_name=None, + exclude_featurizers=None, ): """Given input data, target data and an estimator class, construct a recommended preprocessing chain to be combined with the estimator and trained on the provided data. @@ -244,6 +251,8 @@ def _get_preprocessing_components( problem_type (ProblemTypes or str): Problem type. estimator_class (class): A class which subclasses Estimator estimator for pipeline. sampler_name (str): The name of the sampler component to add to the pipeline. Defaults to None. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the returned components. + Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" Returns: list[Transformer]: A list of applicable preprocessing components to use with the estimator. 
@@ -254,7 +263,8 @@ def _get_preprocessing_components( _get_drop_all_null, _get_replace_null, _get_drop_index_unknown, - _get_url_email, + _get_url, + _get_email, _get_natural_language, _get_imputer, _get_time_series_featurizer, @@ -270,7 +280,8 @@ def _get_preprocessing_components( _get_drop_all_null, _get_replace_null, _get_drop_index_unknown, - _get_url_email, + _get_url, + _get_email, _get_datetime, _get_natural_language, _get_imputer, @@ -278,9 +289,25 @@ def _get_preprocessing_components( _get_sampler, _get_standard_scaler, ] + + functions_to_exclude = [] + if exclude_featurizers and "DatetimeFeaturizer" in exclude_featurizers: + functions_to_exclude.append(_get_datetime) + if exclude_featurizers and "EmailFeaturizer" in exclude_featurizers: + functions_to_exclude.append(_get_email) + if exclude_featurizers and "URLFeaturizer" in exclude_featurizers: + functions_to_exclude.append(_get_url) + if exclude_featurizers and "NaturalLanguageFeaturizer" in exclude_featurizers: + functions_to_exclude.append(_get_natural_language) + if exclude_featurizers and "TimeSeriesFeaturizer" in exclude_featurizers: + functions_to_exclude.append(_get_time_series_featurizer) + components = [] for function in components_functions: - components.extend(function(X, y, problem_type, estimator_class, sampler_name)) + if function not in functions_to_exclude: + components.extend( + function(X, y, problem_type, estimator_class, sampler_name) + ) return components @@ -310,6 +337,7 @@ def _make_pipeline_time_series( parameters=None, sampler_name=None, known_in_advance=None, + exclude_featurizers=None, ): """Make a pipeline for time series problems. @@ -318,15 +346,17 @@ def _make_pipeline_time_series( The known_in_advance features are treated like a non-time-series features since they don't change with time. Args: - X (pd.DataFrame): The input data of shape [n_samples, n_features]. - y (pd.Series): The target data of length [n_samples]. - estimator (Estimator): Estimator for pipeline. - problem_type (ProblemTypes or str): Problem type for pipeline to generate. - parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. - An empty dictionary or None implies using all default values for component parameters. - sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. - Defaults to None - known_in_advance (list[str], None): List of features that are known in advance. + X (pd.DataFrame): The input data of shape [n_samples, n_features]. + y (pd.Series): The target data of length [n_samples]. + estimator (Estimator): Estimator for pipeline. + problem_type (ProblemTypes or str): Problem type for pipeline to generate. + parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. + An empty dictionary or None implies using all default values for component parameters. + sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. + Defaults to None + known_in_advance (list[str], None): List of features that are known in advance. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipeline. 
+ Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" Returns: PipelineBase: TimeSeriesPipeline'' @@ -345,6 +375,7 @@ def _make_pipeline_time_series( problem_type, estimator, sampler_name, + exclude_featurizers, ) if known_in_advance: @@ -376,6 +407,7 @@ def _make_pipeline_time_series( ProblemTypes.REGRESSION, estimator, sampler_name, + exclude_featurizers, ) kina_component_graph = PipelineBase._make_component_dict_from_component_list( kina_preprocessing, @@ -422,23 +454,26 @@ def make_pipeline( use_estimator=True, known_in_advance=None, features=False, + exclude_featurizers=None, ): """Given input data, target data, an estimator class and the problem type, generates a pipeline class with a preprocessing chain which was recommended based on the inputs. The pipeline will be a subclass of the appropriate pipeline base class for the specified problem_type. Args: - X (pd.DataFrame): The input data of shape [n_samples, n_features]. - y (pd.Series): The target data of length [n_samples]. - estimator (Estimator): Estimator for pipeline. - problem_type (ProblemTypes or str): Problem type for pipeline to generate. - parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. - An empty dictionary or None implies using all default values for component parameters. - sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. - Defaults to None - extra_components_before (list[ComponentBase]): List of extra components to be added before preprocessing components. Defaults to None. - extra_components_after (list[ComponentBase]): List of extra components to be added after preprocessing components. Defaults to None. - use_estimator (bool): Whether to add the provided estimator to the pipeline or not. Defaults to True. - known_in_advance (list[str], None): List of features that are known in advance. - features (bool): Whether to add a DFSTransformer component to this pipeline. + X (pd.DataFrame): The input data of shape [n_samples, n_features]. + y (pd.Series): The target data of length [n_samples]. + estimator (Estimator): Estimator for pipeline. + problem_type (ProblemTypes or str): Problem type for pipeline to generate. + parameters (dict): Dictionary with component names as keys and dictionary of that component's parameters as values. + An empty dictionary or None implies using all default values for component parameters. + sampler_name (str): The name of the sampler component to add to the pipeline. Only used in classification problems. + Defaults to None + extra_components_before (list[ComponentBase]): List of extra components to be added before preprocessing components. Defaults to None. + extra_components_after (list[ComponentBase]): List of extra components to be added after preprocessing components. Defaults to None. + use_estimator (bool): Whether to add the provided estimator to the pipeline or not. Defaults to True. + known_in_advance (list[str], None): List of features that are known in advance. + features (bool): Whether to add a DFSTransformer component to this pipeline. + exclude_featurizers (list[str]): A list of featurizer components to exclude from the pipeline. 
+ Valid options are "DatetimeFeaturizer", "EmailFeaturizer", "URLFeaturizer", "NaturalLanguageFeaturizer", "TimeSeriesFeaturizer" Returns: @@ -470,6 +505,7 @@ def make_pipeline( parameters, sampler_name, known_in_advance, + exclude_featurizers, ) else: preprocessing_components = _get_preprocessing_components( @@ -478,6 +514,7 @@ def make_pipeline( problem_type, estimator, sampler_name, + exclude_featurizers, ) extra_components_before = extra_components_before or [] extra_components_after = extra_components_after or [] @@ -947,7 +984,9 @@ def get_actions_from_option_defaults(action_options): return actions -def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, time_index): +def make_timeseries_baseline_pipeline( + problem_type, gap, forecast_horizon, time_index, exclude_featurizer=False +): """Make a baseline pipeline for time series regression problems. Args: @@ -955,6 +994,8 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, time_ gap (int): Non-negative gap parameter. forecast_horizon (int): Positive forecast_horizon parameter. time_index (str): Column name of time_index parameter. + exclude_featurizer (bool): Whether or not to exclude the TimeSeriesFeaturizer from + the baseline graph. Defaults to False. Returns: TimeSeriesPipelineBase, a time series pipeline corresponding to the problem type. @@ -974,32 +1015,33 @@ def make_timeseries_baseline_pipeline(problem_type, gap, forecast_horizon, time_ "Time Series Baseline Binary Pipeline", ), }[problem_type] + component_graph = ["Time Series Baseline Estimator"] + parameters = { + "pipeline": { + "time_index": time_index, + "gap": gap, + "max_delay": 0, + "forecast_horizon": forecast_horizon, + }, + "Time Series Baseline Estimator": { + "gap": gap, + "forecast_horizon": forecast_horizon, + }, + } + if not exclude_featurizer: + component_graph = ["Time Series Featurizer"] + component_graph + parameters["Time Series Featurizer"] = { + "max_delay": 0, + "gap": gap, + "forecast_horizon": forecast_horizon, + "delay_target": True, + "delay_features": False, + "time_index": time_index, + } baseline = pipeline_class( - component_graph=[ - "Time Series Featurizer", - "Time Series Baseline Estimator", - ], + component_graph=component_graph, custom_name=pipeline_name, - parameters={ - "pipeline": { - "time_index": time_index, - "gap": gap, - "max_delay": 0, - "forecast_horizon": forecast_horizon, - }, - "Time Series Featurizer": { - "max_delay": 0, - "gap": gap, - "forecast_horizon": forecast_horizon, - "delay_target": True, - "delay_features": False, - "time_index": time_index, - }, - "Time Series Baseline Estimator": { - "gap": gap, - "forecast_horizon": forecast_horizon, - }, - }, + parameters=parameters, ) return baseline diff --git a/evalml/tests/automl_tests/test_automl.py b/evalml/tests/automl_tests/test_automl.py index 9d1997c5f6..6c7a1fb5db 100644 --- a/evalml/tests/automl_tests/test_automl.py +++ b/evalml/tests/automl_tests/test_automl.py @@ -61,7 +61,14 @@ RegressionPipeline, StackedEnsembleClassifier, ) -from evalml.pipelines.components import DecisionTreeClassifier +from evalml.pipelines.components import ( + DateTimeFeaturizer, + DecisionTreeClassifier, + EmailFeaturizer, + NaturalLanguageFeaturizer, + TimeSeriesFeaturizer, + URLFeaturizer, +) from evalml.pipelines.utils import ( _get_pipeline_base_class, _make_stacked_ensemble_pipeline, @@ -5047,6 +5054,101 @@ def test_default_algorithm_uses_n_jobs(X_y_binary, AutoMLTestEnv): assert n_checked and n_feature_selector_checked 
+@pytest.mark.parametrize("input_type", ["pd", "ww"]) +@pytest.mark.parametrize("automl_algorithm", ["default", "iterative"]) +@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) +def test_exclude_featurizers( + automl_algorithm, + problem_type, + input_type, + get_test_data_from_configuration, + AutoMLTestEnv, +): + parameters = {} + if is_time_series(problem_type): + parameters = { + "time_index": "dates", + "gap": 1, + "max_delay": 1, + "forecast_horizon": 1, + } + + X, y = get_test_data_from_configuration( + input_type, problem_type, column_names=["dates", "text", "email", "url"] + ) + + automl = AutoMLSearch( + X_train=X, + y_train=y, + problem_type=problem_type, + problem_configuration=parameters, + automl_algorithm=automl_algorithm, + exclude_featurizers=[ + "DatetimeFeaturizer", + "EmailFeaturizer", + "URLFeaturizer", + "NaturalLanguageFeaturizer", + "TimeSeriesFeaturizer", + ], + ) + + env = AutoMLTestEnv(problem_type) + with env.test_context(score_return_value={automl.objective.name: 1.0}): + automl.search() + + pipelines = [ + automl.get_pipeline(i) for i in range(len(automl.results["pipeline_results"])) + ] + + # A check to make sure we actually retrieve constructed pipelines from the algo. + assert len(pipelines) > 0 + + assert not any( + [ + DateTimeFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [EmailFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [URLFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [ + NaturalLanguageFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [ + TimeSeriesFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + + +def test_exclude_featurizers_error(X_y_binary): + X, y = X_y_binary + match_text = ( + "Invalid value provided for exclude_featurizers. 
Must be one of: " + "DatetimeFeaturizer, EmailFeaturizer, URLFeaturizer, NaturalLanguageFeaturizer, TimeSeriesFeaturizer" + ) + with pytest.raises( + ValueError, + match=match_text, + ): + AutoMLSearch( + X_train=X, + y_train=y, + problem_type="binary", + exclude_featurizers=[ + "InvalidNameFeaturizer", + ], + ) + + def test_init_holdout_set(X_y_binary, caplog): X, y = X_y_binary X_train, X_holdout, y_train, y_holdout = split_data(X, y, "binary") diff --git a/evalml/tests/automl_tests/test_default_algorithm.py b/evalml/tests/automl_tests/test_default_algorithm.py index 407b2ffa90..e8b7d71b1e 100644 --- a/evalml/tests/automl_tests/test_default_algorithm.py +++ b/evalml/tests/automl_tests/test_default_algorithm.py @@ -10,15 +10,20 @@ from evalml.model_family import ModelFamily from evalml.pipelines.components import ( ARIMARegressor, + DateTimeFeaturizer, ElasticNetClassifier, ElasticNetRegressor, + EmailFeaturizer, LogisticRegressionClassifier, + NaturalLanguageFeaturizer, ProphetRegressor, RandomForestClassifier, StackedEnsembleClassifier, StackedEnsembleRegressor, + TimeSeriesFeaturizer, + URLFeaturizer, ) -from evalml.problem_types import ProblemTypes +from evalml.problem_types import ProblemTypes, is_time_series def test_default_algorithm_init(X_y_binary): @@ -922,3 +927,71 @@ def test_default_algorithm_accepts_URL_email_features( assert pipeline.parameters["Categorical Pipeline - Select Columns Transformer"][ "columns" ] == ["url", "email"] + + +@pytest.mark.parametrize("input_type", ["pd", "ww"]) +@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) +def test_exclude_featurizers_default_algorithm( + problem_type, + input_type, + get_test_data_from_configuration, +): + parameters = {} + if is_time_series(problem_type): + parameters = { + "time_index": "dates", + "gap": 1, + "max_delay": 1, + "forecast_horizon": 3, + } + + X, y = get_test_data_from_configuration( + input_type, problem_type, column_names=["dates", "text", "email", "url"] + ) + + algo = DefaultAlgorithm( + X, + y, + problem_type, + sampler_name=None, + search_parameters={"pipeline": parameters}, + exclude_featurizers=[ + "DatetimeFeaturizer", + "EmailFeaturizer", + "URLFeaturizer", + "NaturalLanguageFeaturizer", + "TimeSeriesFeaturizer", + ], + ) + + pipelines = [] + for _ in range(4): + pipelines.extend([pl for pl in algo.next_batch()]) + + # A check to make sure we actually retrieve constructed pipelines from the algo. 
+ assert len(pipelines) > 0 + + assert not any( + [ + DateTimeFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [EmailFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [URLFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [ + NaturalLanguageFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [ + TimeSeriesFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) diff --git a/evalml/tests/automl_tests/test_iterative_algorithm.py b/evalml/tests/automl_tests/test_iterative_algorithm.py index bd386b362f..4bdf61b4e9 100644 --- a/evalml/tests/automl_tests/test_iterative_algorithm.py +++ b/evalml/tests/automl_tests/test_iterative_algorithm.py @@ -14,9 +14,16 @@ StackedEnsembleClassifier, StackedEnsembleRegressor, ) +from evalml.pipelines.components import ( + DateTimeFeaturizer, + EmailFeaturizer, + NaturalLanguageFeaturizer, + TimeSeriesFeaturizer, + URLFeaturizer, +) from evalml.pipelines.components.utils import get_estimators from evalml.pipelines.utils import make_pipeline -from evalml.problem_types import ProblemTypes +from evalml.problem_types import ProblemTypes, is_time_series @pytest.fixture @@ -1048,3 +1055,69 @@ def test_iterative_algorithm_add_result_cache( for values in algo._best_pipeline_info.values(): assert values["cached_data"] == cache + + +@pytest.mark.parametrize("input_type", ["pd", "ww"]) +@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) +def test_exclude_featurizers_iterative_algorithm( + problem_type, + input_type, + get_test_data_from_configuration, +): + parameters = {} + if is_time_series(problem_type): + parameters = { + "time_index": "dates", + "gap": 1, + "max_delay": 1, + "forecast_horizon": 3, + } + + X, y = get_test_data_from_configuration( + input_type, problem_type, column_names=["dates", "text", "email", "url"] + ) + + algo = IterativeAlgorithm( + X, + y, + problem_type, + sampler_name=None, + search_parameters={"pipeline": parameters}, + exclude_featurizers=[ + "DatetimeFeaturizer", + "EmailFeaturizer", + "URLFeaturizer", + "NaturalLanguageFeaturizer", + "TimeSeriesFeaturizer", + ], + ) + + pipelines = [pl for pl in algo.allowed_pipelines] + + # A check to make sure we actually retrieve constructed pipelines from the algo. + assert len(pipelines) > 0 + + assert not any( + [ + DateTimeFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [EmailFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [URLFeaturizer.name in pl.component_graph.compute_order for pl in pipelines] + ) + assert not any( + [ + NaturalLanguageFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + ) + assert not any( + [ + TimeSeriesFeaturizer.name in pl.component_graph.compute_order + for pl in pipelines + ] + )
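

Note (not part of the patch): a minimal usage sketch of the new ``exclude_featurizers`` parameter. The parameter name, the valid featurizer strings, and the ValueError behavior are taken from the changes above; the dataset and column names are purely illustrative.

    import pandas as pd
    from evalml.automl import AutoMLSearch

    # Hypothetical training data with datetime, text, email, and URL columns.
    X = pd.DataFrame(
        {
            "dates": pd.date_range("2022-01-01", periods=100),
            "text": ["some natural language text"] * 100,
            "email": ["user@example.com"] * 100,
            "url": ["https://example.com"] * 100,
        }
    )
    y = pd.Series([0, 1] * 50)

    # Skip the email and URL featurizers in every pipeline built by the search.
    automl = AutoMLSearch(
        X_train=X,
        y_train=y,
        problem_type="binary",
        automl_algorithm="default",
        exclude_featurizers=["EmailFeaturizer", "URLFeaturizer"],
    )

    # Passing a name outside the supported list raises:
    # ValueError: Invalid value provided for exclude_featurizers. Must be one of:
    # DatetimeFeaturizer, EmailFeaturizer, URLFeaturizer, NaturalLanguageFeaturizer,
    # TimeSeriesFeaturizer

For time series problems, excluding "TimeSeriesFeaturizer" also removes that component from the baseline pipeline, via the new ``exclude_featurizer`` flag on ``make_timeseries_baseline_pipeline``.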