diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 57c445b2da..d76fdaca07 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -5,6 +5,7 @@ Release Notes
     * Fixes
     * Changes
         * Remove unnecessary logic from imputer components prior to nullable type handling :pr:`4038`
+        * Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`
     * Documentation Changes
     * Testing Changes
diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
index b8e563594c..b508fe6c4d 100644
--- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
+++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py
@@ -189,8 +189,11 @@ def fit(self, X, y=None):
             self
         """
         X = infer_feature_types(X)
-        X_encoded = self._encode_categories(X, fit=True)
-        y_encoded = self._encode_labels(y)
+        if y is not None:
+            y = infer_feature_types(y)
+        X_d, y_d = self._handle_nullable_types(X, y)
+        X_encoded = self._encode_categories(X_d, fit=True)
+        y_encoded = self._encode_labels(y_d)
         self._component_obj.fit(X_encoded, y_encoded)
         return self
 
@@ -204,7 +207,8 @@ def predict(self, X):
             pd.DataFrame: Predicted values.
         """
         X_encoded = self._encode_categories(X)
-        predictions = super().predict(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        predictions = super().predict(X_d)
         if not self._label_encoder:
             return predictions
         predictions = self._label_encoder.inverse_transform(
@@ -222,4 +226,5 @@ def predict_proba(self, X):
             pd.DataFrame: Predicted probability values.
         """
         X_encoded = self._encode_categories(X)
-        return super().predict_proba(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        return super().predict_proba(X_d)
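
Reviewer note: the upshot of the ``fit``/``predict``/``predict_proba`` changes above is that callers no longer downcast pandas nullable dtypes before handing data to the component. A minimal sketch of the new calling pattern, with made-up data (the column name and values are illustrative, not from this PR):

    import pandas as pd
    import woodwork as ww  # importing woodwork registers the .ww accessor

    from evalml.pipelines.components import LightGBMClassifier

    # Nullable-typed inputs that LightGBM itself cannot ingest directly.
    X = pd.DataFrame({"num": pd.Series(range(20), dtype="Int64")})
    X.ww.init(logical_types={"num": "IntegerNullable"})
    y = ww.init_series(pd.Series([0, 1] * 10, dtype="Int64"))

    clf = LightGBMClassifier()
    clf.fit(X, y)           # _handle_nullable_types downcasts before encoding
    preds = clf.predict(X)  # predict/predict_proba apply the same handling
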
""" - if X is not None: - X = downcast_int_nullable_to_double(X) - X = X.fillna(X.mean()) X, y = self._manage_woodwork(X, y) + X, y = self._handle_nullable_types(X, y) + if X is not None: + X = X.ww.fillna(X.mean()) if y is None: raise ValueError("ARIMA Regressor requires y as input.") diff --git a/evalml/pipelines/components/estimators/regressors/exponential_smoothing_regressor.py b/evalml/pipelines/components/estimators/regressors/exponential_smoothing_regressor.py index a0a7220452..733023d3b3 100644 --- a/evalml/pipelines/components/estimators/regressors/exponential_smoothing_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/exponential_smoothing_regressor.py @@ -119,6 +119,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None): if y is None: raise ValueError("Exponential Smoothing Regressor requires y as input.") + X, y = self._handle_nullable_types(X, y) + y = self._remove_datetime(y) self._component_obj.fit(y=y) diff --git a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py index e3f05ccb8f..c968e4462b 100644 --- a/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py +++ b/evalml/pipelines/components/estimators/regressors/lightgbm_regressor.py @@ -11,7 +11,6 @@ from evalml.utils import ( SEED_BOUNDS, _rename_column_names_to_numeric, - downcast_int_nullable_to_double, import_or_raise, infer_feature_types, ) @@ -170,8 +169,8 @@ def fit(self, X, y=None): X_encoded = self._encode_categories(X, fit=True) if y is not None: y = infer_feature_types(y) - X_encoded = downcast_int_nullable_to_double(X_encoded) - self._component_obj.fit(X_encoded, y) + X_d, y_d = self._handle_nullable_types(X_encoded, y) + self._component_obj.fit(X_d, y_d) return self def predict(self, X): @@ -184,4 +183,5 @@ def predict(self, X): pd.Series: Predicted values. 
""" X_encoded = self._encode_categories(X) - return super().predict(X_encoded) + X_d, _ = self._handle_nullable_types(X_encoded) + return super().predict(X_d) diff --git a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py index 0105f16782..5f27e82a04 100644 --- a/evalml/pipelines/components/transformers/imputers/time_series_imputer.py +++ b/evalml/pipelines/components/transformers/imputers/time_series_imputer.py @@ -1,10 +1,19 @@ """Component that imputes missing data according to a specified timeseries-specific imputation strategy.""" + + import pandas as pd import woodwork as ww -from woodwork.logical_types import BooleanNullable, Double +from woodwork.logical_types import ( + BooleanNullable, + Double, +) from evalml.pipelines.components.transformers import Transformer from evalml.utils import infer_feature_types +from evalml.utils.nullable_type_utils import ( + _determine_fractional_type, + _determine_non_nullable_equivalent, +) class TimeSeriesImputer(Transformer): @@ -52,7 +61,7 @@ class TimeSeriesImputer(Transformer): # Incompatibility: https://github.com/alteryx/evalml/issues/4001 # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014 _integer_nullable_incompatibilities = ["X", "y"] - _boolean_nullable_incompatibilities = ["X", "y"] + _boolean_nullable_incompatibilities = ["y"] def __init__( self, @@ -155,35 +164,54 @@ def transform(self, X, y=None): if y is not None: y = infer_feature_types(y) + # This will change the logical type of BooleanNullable/IntegerNullable/AgeNullable columns with nans + # so we save the original schema to recreate it where possible after imputation + original_schema = X.ww.schema + X, y = self._handle_nullable_types(X, y) + X_not_all_null = X.ww.drop(self._all_null_cols) - X_schema = X_not_all_null.ww.schema - X_schema = X_schema.get_subset_schema( - subset_cols=X_schema._filter_cols( - exclude=["IntegerNullable", "BooleanNullable", "AgeNullable"], - ), + + # Because the TimeSeriesImputer is always used with the TimeSeriesRegularizer, + # many of the columns containing nans may have originally been non nullable logical types. 
+        # We will use the non nullable equivalents where possible
+        original_schema = original_schema.get_subset_schema(
+            list(X_not_all_null.columns),
         )
+        new_ltypes = {
+            col: _determine_non_nullable_equivalent(ltype)
+            for col, ltype in original_schema.logical_types.items()
+        }
 
         if self._forwards_cols is not None:
-            X_forward = X.ww[self._forwards_cols]
+            X_forward = X[self._forwards_cols]
             imputed = X_forward.pad()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_forward.columns] = imputed
 
         if self._backwards_cols is not None:
-            X_backward = X.ww[self._backwards_cols]
+            X_backward = X[self._backwards_cols]
             imputed = X_backward.bfill()
             imputed.pad(inplace=True)  # Fill in the last value, if missing
             X_not_all_null[X_backward.columns] = imputed
 
         if self._interpolate_cols is not None:
-            X_interpolate = X.ww[self._interpolate_cols]
-            # TODO: Revert when pandas introduces Float64 dtype
-            imputed = X_interpolate.astype(
-                float,
-            ).interpolate()  # Cast to float because Int64 not handled
+            X_interpolate = X_not_all_null[self._interpolate_cols]
+            imputed = X_interpolate.interpolate()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_interpolate.columns] = imputed
-        X_not_all_null.ww.init(schema=X_schema)
+
+        # Interpolate may add floating point values to integer data, so we
+        # have to update those logical types to a fractional type
+        int_cols_to_update = original_schema._filter_cols(
+            include=["IntegerNullable", "AgeNullable"],
+        )
+        new_int_ltypes = {
+            col: _determine_fractional_type(ltype)
+            for col, ltype in original_schema.logical_types.items()
+            if col in int_cols_to_update
+        }
+        new_ltypes.update(new_int_ltypes)
+        X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)
 
         y_imputed = pd.Series(y)
         if y is not None and len(y) > 0:
@@ -194,10 +222,10 @@ def transform(self, X, y=None):
                 y_imputed = y.bfill()
                 y_imputed.pad(inplace=True)
             elif self._impute_target == "interpolate":
-                # TODO: Revert when pandas introduces Float64 dtype
-                y_imputed = y.astype(float).interpolate()
+                y_imputed = y.interpolate()
                 y_imputed.bfill(inplace=True)
-            y_imputed = ww.init_series(y_imputed)
+            # Re-initialize woodwork with the downcast logical type
+            y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
 
         return X_not_all_null, y_imputed
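
Reviewer note: the fractional-type bookkeeping above exists because interpolation can introduce non-integer values into previously integer columns. A small illustration with throwaway data; by this point in ``transform``, ``_handle_nullable_types`` has already downcast IntegerNullable-with-nans columns to Double, so pandas ``interpolate`` runs on plain float64:

    import pandas as pd
    import woodwork as ww

    # Stand-in for a downcast IntegerNullable column that contained a nan.
    s = ww.init_series(pd.Series([1.0, None, 4.0]), logical_type="Double")
    print(s.interpolate().tolist())  # [1.0, 2.5, 4.0]: a fractional value appears

Restoring Integer for such a column would fail on the 2.5, which is why IntegerNullable maps to Double and AgeNullable to AgeFractional when the schema is rebuilt.
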
diff --git a/evalml/pipelines/components/transformers/samplers/base_sampler.py b/evalml/pipelines/components/transformers/samplers/base_sampler.py
index a569479f66..1b54a28423 100644
--- a/evalml/pipelines/components/transformers/samplers/base_sampler.py
+++ b/evalml/pipelines/components/transformers/samplers/base_sampler.py
@@ -2,8 +2,6 @@
 import copy
 from abc import abstractmethod
 
-from woodwork.logical_types import IntegerNullable
-
 from evalml.pipelines.components.transformers import Transformer
 from evalml.utils.woodwork_utils import infer_feature_types
 
@@ -36,7 +34,8 @@ def fit(self, X, y):
         """
         if y is None:
             raise ValueError("y cannot be None")
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        y_ww = infer_feature_types(y)
         self._initialize_sampler(X_ww, y_ww)
         return self
 
@@ -49,41 +48,7 @@ def _initialize_sampler(self, X, y):
             y (pd.Series): The target data.
         """
 
-    def _prepare_data(self, X, y):
-        """Transforms the input data to pandas data structure that our sampler can ingest.
-
-        Args:
-            X (pd.DataFrame): Training features.
-            y (pd.Series): Target.
-
-        Returns:
-            pd.DataFrame, pd.Series: Prepared X and y data as pandas types
-        """
-        X = infer_feature_types(X)
-        int_nullable_cols = X.ww.select(IntegerNullable).columns
-        if len(int_nullable_cols) > 0:
-            try:
-                X = X.astype(
-                    {
-                        null_col: int
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            except ValueError:
-                X = X.astype(
-                    {
-                        null_col: float
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            X.ww.init(schema=X.ww.schema)
-
-        if y is None:
-            raise ValueError("y cannot be None")
-        y = infer_feature_types(y)
-        return X, y
-
-    def transform(self, X, y=None):
+    def transform(self, X, y):
         """Transforms the input data by sampling the data.
 
         Args:
@@ -93,7 +58,8 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame, pd.Series: Transformed features and target.
         """
-        X, y = self._prepare_data(X, y)
+        X = infer_feature_types(X)
+        y = infer_feature_types(y)
 
         categorical_columns = X.ww.select("Categorical", return_schema=True).columns
         for col in categorical_columns:
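
Reviewer note for the next file: with ``_prepare_data`` removed, the Oversampler gains its own ``transform`` that downcasts for imblearn and then restores the saved woodwork schema. A rough end-to-end sketch with made-up, SMOTENC-compatible data, mirroring the updated oversampler test later in this diff:

    import pandas as pd
    import woodwork as ww  # registers the .ww accessor

    from evalml.pipelines.components import Oversampler

    X = pd.DataFrame(
        {
            "num": pd.Series(range(100), dtype="Int64"),
            "flag": pd.Series([True, False] * 50, dtype="boolean"),
        },
    )
    X.ww.init(logical_types={"num": "IntegerNullable", "flag": "BooleanNullable"})
    y = pd.Series([0] * 90 + [1] * 10)

    sampler = Oversampler(sampling_ratio=0.5)
    sampler.fit(X, y)
    X_t, y_t = sampler.transform(X, y)

    # transform downcasts internally, then re-initializes the output with the
    # original schema, so the nullable logical types survive the round trip.
    assert X_t.ww.schema == X.ww.schema
    assert len(X_t) > len(X)
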
""" sampler_class = self.sampler - _, y_pd = self._prepare_data(X, y) + y_pd = infer_feature_types(y) sampler_params = { k: v for k, v in self.parameters.items() diff --git a/evalml/pipelines/components/transformers/samplers/undersampler.py b/evalml/pipelines/components/transformers/samplers/undersampler.py index fbb8ab5291..1b01410b9e 100644 --- a/evalml/pipelines/components/transformers/samplers/undersampler.py +++ b/evalml/pipelines/components/transformers/samplers/undersampler.py @@ -95,7 +95,11 @@ def transform(self, X, y=None): Returns: pd.DataFrame, pd.Series: Transformed features and target. """ - X_ww, y_ww = self._prepare_data(X, y) + X_ww = infer_feature_types(X) + if y is None: + raise ValueError("y cannot be None") + y_ww = infer_feature_types(y) + index_df = pd.Series(y_ww.index) indices = self.fit_resample(X_ww, y_ww) diff --git a/evalml/tests/component_tests/test_arima_regressor.py b/evalml/tests/component_tests/test_arima_regressor.py index 1dd2017f1e..dc9f9022d1 100644 --- a/evalml/tests/component_tests/test_arima_regressor.py +++ b/evalml/tests/component_tests/test_arima_regressor.py @@ -4,6 +4,7 @@ import numpy as np import pandas as pd import pytest +import woodwork as ww from sktime.forecasting.arima import AutoARIMA as SKArima from sktime.forecasting.base import ForecastingHorizon @@ -476,11 +477,15 @@ def test_arima_regressor_can_forecast_arbitrary_dates(use_covariates, ts_data): ) -def test_arima_regressor_nullable_handling(): +@pytest.mark.parametrize( + "nullable_ltype", + ["IntegerNullable", "AgeNullable"], +) +def test_arima_regressor_with_nullable_types(nullable_ltype): X = pd.DataFrame() X["nums"] = pd.Series([i for i in range(100)], dtype="Int64") X.index = pd.date_range("1/1/21", periods=100) - X.ww.init(logical_types={"nums": "IntegerNullable"}) + X.ww.init(logical_types={"nums": nullable_ltype}) y = pd.Series([i for i in range(100)], dtype="Int64") y.index = pd.date_range("1/1/21", periods=100) @@ -489,6 +494,7 @@ def test_arima_regressor_nullable_handling(): X_test = X.ww.iloc[80:, :] y_train = y[:80] + y_train = ww.init_series(y_train, logical_type=nullable_ltype) arima_params = { "trend": None, diff --git a/evalml/tests/component_tests/test_exponential_smoothing_regressor.py b/evalml/tests/component_tests/test_exponential_smoothing_regressor.py index 5f0394e004..16b55cd460 100644 --- a/evalml/tests/component_tests/test_exponential_smoothing_regressor.py +++ b/evalml/tests/component_tests/test_exponential_smoothing_regressor.py @@ -202,7 +202,7 @@ def test_predict_no_X_in_fit( "nullable_y_ltype", ["IntegerNullable", "AgeNullable", "BooleanNullable"], ) -def test_handle_nullable_types( +def test_estimator_with_nullable_types( nullable_type_test_data, nullable_type_target, nullable_y_ltype, @@ -213,9 +213,9 @@ def test_handle_nullable_types( comp = ExponentialSmoothingRegressor() - X_d, y_d = comp._handle_nullable_types(X, y) - comp.fit(X_d, y_d) - comp.predict(X_d) + # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call + comp.fit(X.ww.copy(), y) + comp.predict(X.ww.copy()) @pytest.mark.parametrize( diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index eefdd4c84d..009ef427f4 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -332,9 +332,9 @@ def test_lgbm_preserves_schema_in_rename(mock_predict_proba, mock_predict, mock_ @pytest.mark.parametrize( "nullable_y_ltype", - 
["IntegerNullable", "AgeNullable"], + ["IntegerNullable", "AgeNullable", "BooleanNullable"], ) -def test_lgbm_handle_nullable_types( +def test_lgbm_with_nullable_types( nullable_type_test_data, nullable_type_target, nullable_y_ltype, @@ -345,10 +345,10 @@ def test_lgbm_handle_nullable_types( lgb = LightGBMClassifier() - X, y = lgb._handle_nullable_types(X, y) - lgb.fit(X, y) - preds = lgb.predict(X) - pred_probs = lgb.predict_proba(X) + # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call + lgb.fit(X.ww.copy(), y) + preds = lgb.predict(X.ww.copy()) + pred_probs = lgb.predict_proba(X.ww.copy()) assert not preds.isnull().any().any() assert not pred_probs.isnull().any().any() diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index fb4c00cd84..f178eef9a7 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -234,7 +234,7 @@ def test_lgbm_preserves_schema_in_rename(mock_predict, mock_fit): "nullable_y_ltype", ["IntegerNullable", "AgeNullable", "BooleanNullable"], ) -def test_lgbm_handle_nullable_types( +def test_lgbm_with_nullable_types( nullable_type_test_data, nullable_type_target, nullable_y_ltype, @@ -245,9 +245,9 @@ def test_lgbm_handle_nullable_types( lgb = LightGBMRegressor() - X, y = lgb._handle_nullable_types(X, y) - lgb.fit(X, y) - preds = lgb.predict(X) + # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call + lgb.fit(X.ww.copy(), y) + preds = lgb.predict(X.ww.copy()) assert not preds.isnull().any().any() diff --git a/evalml/tests/component_tests/test_oversampler.py b/evalml/tests/component_tests/test_oversampler.py index 7d810af436..c4a011a004 100644 --- a/evalml/tests/component_tests/test_oversampler.py +++ b/evalml/tests/component_tests/test_oversampler.py @@ -450,6 +450,7 @@ def test_oversampler_handle_nullable_types( X = nullable_type_test_data(has_nans=False) # Oversampler can only handle numeric and boolean columns X = X.ww.select(include=["numeric", "Boolean", "BooleanNullable", "category"]) + original_schema = X.ww.schema y = nullable_type_target(ltype=nullable_y_ltype, has_nans=False) oversampler = Oversampler(sampling_ratio=0.5) @@ -460,6 +461,9 @@ def test_oversampler_handle_nullable_types( assert len(X_t) > len(X) assert len(y_t) > len(y) + # Confirm the original types are maintained + assert original_schema == X_t.ww.schema + @pytest.mark.parametrize( "nullable_y_ltype", diff --git a/evalml/tests/component_tests/test_time_series_imputer.py b/evalml/tests/component_tests/test_time_series_imputer.py index 6a582c6025..c27adffdc3 100644 --- a/evalml/tests/component_tests/test_time_series_imputer.py +++ b/evalml/tests/component_tests/test_time_series_imputer.py @@ -2,7 +2,13 @@ import pandas as pd import pytest from pandas.testing import assert_frame_equal, assert_series_equal -from woodwork.logical_types import Boolean, BooleanNullable, IntegerNullable +from woodwork.logical_types import ( + AgeFractional, + Boolean, + BooleanNullable, + Double, + IntegerNullable, +) from evalml.pipelines.components import TimeSeriesImputer @@ -129,6 +135,7 @@ def test_categorical_only_input(imputer_test_data): ), "bool col with nan": pd.Series( [True, True, False, False, True] * 4, + dtype="bool", ), }, ) @@ -147,6 +154,7 @@ def test_categorical_only_input(imputer_test_data): ) expected["bool col with nan"] = pd.Series( [True, False, False, True, True] * 4, + dtype="bool", ) 
     imputer = TimeSeriesImputer(categorical_impute_strategy="backwards_fill")
@@ -159,7 +167,8 @@ def test_categorical_and_numeric_input(imputer_test_data):
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = TimeSeriesImputer()
     imputer.fit(X, y)
-    transformed, _ = imputer.transform(X, y)
+    # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call
+    transformed, _ = imputer.transform(X.ww.copy(), y)
     expected = pd.DataFrame(
         {
             "dates": pd.date_range("01-01-2022", periods=20),
@@ -197,7 +206,8 @@ def test_categorical_and_numeric_input(imputer_test_data):
         numeric_impute_strategy="forwards_fill",
         categorical_impute_strategy="forwards_fill",
     )
-    transformed, _ = imputer.fit_transform(X, y)
+    # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call
+    transformed, _ = imputer.fit_transform(X.ww.copy(), y)
     expected["float with nan"] = [0.3, 1.0, 1.0, -1.0, 0.0] * 4
     assert_frame_equal(transformed, expected, check_dtype=False)
 
@@ -562,26 +572,64 @@ def test_imputer_woodwork_custom_overrides_returned_by_components(
 
 
 @pytest.mark.parametrize(
-    "nullable_ltype",
-    ["BooleanNullable", "IntegerNullable", "AgeNullable"],
+    "nullable_y_ltype, expected_imputed_y_ltype",
+    [
+        ("BooleanNullable", Double),
+        ("IntegerNullable", Double),
+        ("AgeNullable", AgeFractional),
+    ],
+)
+@pytest.mark.parametrize(
+    "numeric_impute_strategy",
+    ["forwards_fill", "backwards_fill", "interpolate"],
 )
 def test_imputer_can_take_in_nullable_types(
     nullable_type_test_data,
     nullable_type_target,
-    nullable_ltype,
+    numeric_impute_strategy,
+    nullable_y_ltype,
+    expected_imputed_y_ltype,
 ):
-    y = nullable_type_target(ltype=nullable_ltype, has_nans=True)
+    y = nullable_type_target(ltype=nullable_y_ltype, has_nans=True)
     X = nullable_type_test_data(has_nans=True)
-    # Only numeric imputing has interpolate as an option
-    X = X.ww.select("numeric")
+    # Drop the fully null columns since they aren't relevant to the nullable type handling checks
+    X = X.ww.drop(["all nan", "all nan cat"])
 
-    imputer = TimeSeriesImputer(numeric_impute_strategy="interpolate")
-    imputer.fit(X, y)
-    X_imputed, y_imputed = imputer.transform(X, y)
+    cols_expected_to_change = X.ww.schema._filter_cols(
+        include=["IntegerNullable", "AgeNullable", "BooleanNullable"],
+    )
+    cols_expected_to_stay_the_same = X.ww.schema._filter_cols(
+        exclude=["IntegerNullable", "AgeNullable", "BooleanNullable"],
+    )
+
+    imputer = TimeSeriesImputer(
+        numeric_impute_strategy=numeric_impute_strategy,
+        target_impute_strategy="interpolate",
+    )
+    # Copy X to avoid X taking on any mutations from the internal _handle_nullable_types call
+    imputer.fit(X.ww.copy(), y)
+    X_imputed, y_imputed = imputer.transform(X.ww.copy(), y)
 
     assert not X_imputed.isnull().any().any()
     assert not y_imputed.isnull().any()
 
+    # Check that the types are as expected - when interpolate is used, we need fractional numeric ltypes
+    if numeric_impute_strategy == "interpolate":
+        expected_X_ltypes = {"AgeFractional", "Double", "Boolean"}
+    else:
+        expected_X_ltypes = {"Age", "Integer", "Boolean"}
+
+    assert X.ww.get_subset_schema(
+        cols_expected_to_stay_the_same,
+    ) == X_imputed.ww.get_subset_schema(cols_expected_to_stay_the_same)
+    assert {
+        str(ltype)
+        for col, ltype in X_imputed.ww.logical_types.items()
+        if col in cols_expected_to_change
+    } == expected_X_ltypes
+
+    assert isinstance(y_imputed.ww.logical_type, expected_imputed_y_ltype)
+
 
 @pytest.mark.parametrize(
     "categorical_impute_strategy",
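
Reviewer note: the repeated ``X.ww.copy()`` calls above guard against a side effect: ``_handle_nullable_types`` can downcast the frame it is handed, and the tests want the shared fixture left untouched. A minimal sketch of the pattern with illustrative data:

    import pandas as pd
    import woodwork as ww  # registers the .ww accessor

    X = pd.DataFrame({"nums": pd.Series([1, 2, None], dtype="Int64")})
    X.ww.init(logical_types={"nums": "IntegerNullable"})

    # ww.copy() deep-copies both the data and its woodwork schema, so in-place
    # downcasting inside fit/transform cannot leak back into this frame.
    X_input = X.ww.copy()
    assert str(X.ww.logical_types["nums"]) == "IntegerNullable"
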
diff --git a/evalml/tests/component_tests/test_undersampler.py b/evalml/tests/component_tests/test_undersampler.py
index af24597ef7..532b600ca9 100644
--- a/evalml/tests/component_tests/test_undersampler.py
+++ b/evalml/tests/component_tests/test_undersampler.py
@@ -151,3 +151,26 @@ def test_undersampler_sampling_dict_strings():
 
     assert len(new_X) == sum(expected_result.values())
     assert new_y.value_counts().to_dict() == expected_result
+
+
+@pytest.mark.parametrize(
+    "nullable_y_ltype",
+    ["IntegerNullable", "AgeNullable", "BooleanNullable"],
+)
+def test_undersampler_with_nullable_types(
+    nullable_type_test_data,
+    nullable_type_target,
+    nullable_y_ltype,
+):
+    X = nullable_type_test_data(has_nans=False)
+    # Undersampler can only handle numeric and boolean columns
+    X = X.ww.select(include=["numeric", "Boolean", "BooleanNullable"])
+    y = nullable_type_target(ltype=nullable_y_ltype, has_nans=False)
+
+    undersampler = Undersampler(sampling_ratio_dict={0: 1, 1: 0.5})
+    undersampler.fit(X, y)
+    X_t, y_t = undersampler.transform(X, y)
+
+    # Confirm undersampling happened by checking the length decreased
+    assert len(X_t) < len(X)
+    assert len(y_t) < len(y)
diff --git a/evalml/utils/nullable_type_utils.py b/evalml/utils/nullable_type_utils.py
index cd185a5f11..ada248b722 100644
--- a/evalml/utils/nullable_type_utils.py
+++ b/evalml/utils/nullable_type_utils.py
@@ -1,4 +1,11 @@
 import woodwork as ww
+from woodwork.logical_types import AgeNullable, BooleanNullable, IntegerNullable
+
+DOWNCAST_TYPE_DICT = {
+    "BooleanNullable": ("Boolean", "Categorical"),
+    "IntegerNullable": ("Integer", "Double"),
+    "AgeNullable": ("Age", "AgeFractional"),
+}
 
 
 def _downcast_nullable_X(X, handle_boolean_nullable=True, handle_integer_nullable=True):
@@ -76,10 +83,10 @@ def _get_incompatible_nullable_types(handle_boolean_nullable, handle_integer_nul
     """
     nullable_types_to_handle = []
     if handle_boolean_nullable:
-        nullable_types_to_handle.append(ww.logical_types.BooleanNullable)
+        nullable_types_to_handle.append(BooleanNullable)
     if handle_integer_nullable:
-        nullable_types_to_handle.append(ww.logical_types.IntegerNullable)
-        nullable_types_to_handle.append(ww.logical_types.AgeNullable)
+        nullable_types_to_handle.append(IntegerNullable)
+        nullable_types_to_handle.append(AgeNullable)
 
     return nullable_types_to_handle
 
@@ -97,14 +104,46 @@ def _determine_downcast_type(col):
     Returns:
         LogicalType string to be used to downcast incompatible nullable logical types.
     """
-    downcast_matches = {
-        "BooleanNullable": ("Boolean", "Categorical"),
-        "IntegerNullable": ("Integer", "Double"),
-        "AgeNullable": ("Age", "AgeFractional"),
-    }
-
-    no_nans_ltype, has_nans_ltype = downcast_matches[str(col.ww.logical_type)]
+    no_nans_ltype, has_nans_ltype = DOWNCAST_TYPE_DICT[str(col.ww.logical_type)]
 
     if col.isnull().any():
         return has_nans_ltype
     return no_nans_ltype
+
+
+def _determine_fractional_type(logical_type):
+    """Determines what logical type to use for integer data that has fractional values imputed.
+    - IntegerNullable becomes Double.
+    - AgeNullable becomes AgeFractional.
+    - All other logical types are returned unchanged.
+
+    Args:
+        logical_type (ww.LogicalType): The logical type whose fractional equivalent we are determining.
+            Should be either IntegerNullable or AgeNullable.
+
+    Returns:
+        LogicalType to be used after fractional values have been added to a previously integer column.
+ """ + fractional_ltype = None + if isinstance(logical_type, (IntegerNullable, AgeNullable)): + _, fractional_ltype = DOWNCAST_TYPE_DICT[str(logical_type)] + + return fractional_ltype or logical_type + + +def _determine_non_nullable_equivalent(logical_type): + """Determines the non nullable equivalent logical type to use for nullable types. These types cannot support null values. + - IntegerNullable becomes Integer. + - AgeNullable becomes Age. + - BooleanNullable becomes Boolean. + - All other logical types are returned unchanged. + + Args: + logical_type (ww.LogicalType): The logical type whose non nullable equivalent we are determining. + + Returns: + LogicalType to be used instead of nullable type when nans aren't present. + """ + non_nullable_ltype, _ = DOWNCAST_TYPE_DICT.get(str(logical_type), (None, None)) + + return non_nullable_ltype or logical_type