Use nullable type handling in components' fit, transform, and predict methods (#4046)

* Remove existing nullable type handling from oversampler and use _handle_nullable_types instead

* Add handle call for lgbm regressor and remove existing handling

* Add handle call for lgbm classifier

* temp broken exp smoothing tests

* lint fix

* add release note

* Fix broken tests by initting woodwork on y in lgbm classifier

* Update tests

* Call handle in arima

* Call handle from ts imputer; y ltype is the downcasted value

* remove unnecessary comments

* Fix time series guide

* lint fix

* Only call handle_nullable_types when necessary in methods

* Remove remaining unnecessary handle calls

* resolve remaining comments

* Add y ww init back into ts imputer to fix tests

* Copy X in testing nullable types to stop hiding potential incompatibilities in methods

* use X_d in lgbm predict proba

* remove nullable type handling after sklearn upgrade fixed incompatibilities

* use common util to determine type for time series imputed integers

* Add comments around why we copy X

* remove _prepare_data from samplers

* PR comments

* remove tests to check if handle method is called

* remove nullable types from imputed data because of regularizer

* fix typo

* fix docstrings

* fix codecov issues

* PR comments

* Revert "Fix time series guide"

This reverts commit 964622a.

* Return unchanged ltype in nullable type utils

* add back ts imputer incompatibility test

* use dict get return value

* call handle nullable types in oversampler and check schema equality
tamargrey authored and Tamar Grey committed Mar 27, 2023
1 parent 0c83183 commit abd16f7
Showing 17 changed files with 267 additions and 114 deletions.
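All of the diffs below follow one pattern: downcast pandas/woodwork nullable types right before handing data to a fit, transform, or predict call, instead of scattering ad-hoc casts through each component. As a rough orientation, here is a minimal standalone sketch of that downcasting idea; it is NOT evalml's actual `_handle_nullable_types`, which works on woodwork-initialized data and consults each component's `_integer_nullable_incompatibilities` / `_boolean_nullable_incompatibilities` flags (both visible in the TimeSeriesImputer diff below).

```python
# Minimal sketch of the downcasting pattern this commit applies (an
# illustration, not evalml's implementation).
from typing import Optional, Tuple

import pandas as pd


def handle_nullable_types(
    X: pd.DataFrame,
    y: Optional[pd.Series] = None,
) -> Tuple[pd.DataFrame, Optional[pd.Series]]:
    """Downcast pandas nullable dtypes to types most estimators tolerate."""
    X = X.copy()
    for col in X.columns:
        dtype = str(X[col].dtype)
        if dtype == "Int64":
            # int64 cannot hold pd.NA, so float64 is the safe target.
            X[col] = X[col].astype("float64")
        elif dtype == "boolean":
            # Nullable booleans downcast to bool when complete, float64 otherwise.
            target = "float64" if X[col].isna().any() else "bool"
            X[col] = X[col].astype(target)
    if y is not None and str(y.dtype) in {"Int64", "boolean"}:
        y = y.astype("float64")
    return X, y


# Fit-time call mirroring the `X_d, y_d = self._handle_nullable_types(X, y)`
# lines in the diffs below:
X = pd.DataFrame({"nums": pd.array([1, None, 3], dtype="Int64")})
y = pd.Series(pd.array([0, 1, 0], dtype="Int64"))
X_d, y_d = handle_nullable_types(X, y)
print(X_d.dtypes["nums"], y_d.dtype)  # float64 float64
```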
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -6,6 +6,7 @@ Release Notes
 * Changes
     * Calculated partial dependence grid values for integer data by rounding instead of truncating fractional values :pr:`4096`
     * Remove unnecessary logic from imputer components prior to nullable type handling :pr:`4038`
+    * Added calls to ``_handle_nullable_types`` in component fit, transform, and predict methods when needed :pr:`4046`
 * Documentation Changes
 * Testing Changes

@@ -189,8 +189,11 @@ def fit(self, X, y=None):
             self
         """
         X = infer_feature_types(X)
-        X_encoded = self._encode_categories(X, fit=True)
-        y_encoded = self._encode_labels(y)
+        if y is not None:
+            y = infer_feature_types(y)
+        X_d, y_d = self._handle_nullable_types(X, y)
+        X_encoded = self._encode_categories(X_d, fit=True)
+        y_encoded = self._encode_labels(y_d)
         self._component_obj.fit(X_encoded, y_encoded)
         return self

@@ -204,7 +207,8 @@ def predict(self, X):
             pd.DataFrame: Predicted values.
         """
         X_encoded = self._encode_categories(X)
-        predictions = super().predict(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        predictions = super().predict(X_d)
         if not self._label_encoder:
             return predictions
         predictions = self._label_encoder.inverse_transform(
@@ -222,4 +226,5 @@ def predict_proba(self, X):
             pd.DataFrame: Predicted probability values.
         """
         X_encoded = self._encode_categories(X)
-        return super().predict_proba(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        return super().predict_proba(X_d)
@@ -9,7 +9,6 @@
 from evalml.pipelines.components.estimators import Estimator
 from evalml.problem_types import ProblemTypes
 from evalml.utils import (
-    downcast_int_nullable_to_double,
     import_or_raise,
     infer_feature_types,
 )
@@ -213,10 +212,10 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         Raises:
             ValueError: If y was not passed in.
         """
-        if X is not None:
-            X = downcast_int_nullable_to_double(X)
-            X = X.fillna(X.mean())
         X, y = self._manage_woodwork(X, y)
+        X, y = self._handle_nullable_types(X, y)
+        if X is not None:
+            X = X.ww.fillna(X.mean())
         if y is None:
             raise ValueError("ARIMA Regressor requires y as input.")

@@ -119,6 +119,8 @@ def fit(self, X: pd.DataFrame, y: Optional[pd.Series] = None):
         if y is None:
             raise ValueError("Exponential Smoothing Regressor requires y as input.")
 
+        X, y = self._handle_nullable_types(X, y)
+
         y = self._remove_datetime(y)
 
         self._component_obj.fit(y=y)
@@ -11,7 +11,6 @@
 from evalml.utils import (
     SEED_BOUNDS,
     _rename_column_names_to_numeric,
-    downcast_int_nullable_to_double,
     import_or_raise,
     infer_feature_types,
 )
@@ -170,8 +169,8 @@ def fit(self, X, y=None):
         X_encoded = self._encode_categories(X, fit=True)
         if y is not None:
             y = infer_feature_types(y)
-        X_encoded = downcast_int_nullable_to_double(X_encoded)
-        self._component_obj.fit(X_encoded, y)
+        X_d, y_d = self._handle_nullable_types(X_encoded, y)
+        self._component_obj.fit(X_d, y_d)
         return self
 
     def predict(self, X):
@@ -184,4 +183,5 @@ def predict(self, X):
             pd.Series: Predicted values.
         """
         X_encoded = self._encode_categories(X)
-        return super().predict(X_encoded)
+        X_d, _ = self._handle_nullable_types(X_encoded)
+        return super().predict(X_d)
@@ -1,10 +1,19 @@
 """Component that imputes missing data according to a specified timeseries-specific imputation strategy."""
 
 
 import pandas as pd
 import woodwork as ww
-from woodwork.logical_types import BooleanNullable, Double
+from woodwork.logical_types import (
+    BooleanNullable,
+    Double,
+)
 
 from evalml.pipelines.components.transformers import Transformer
 from evalml.utils import infer_feature_types
+from evalml.utils.nullable_type_utils import (
+    _determine_fractional_type,
+    _determine_non_nullable_equivalent,
+)


class TimeSeriesImputer(Transformer):
@@ -52,7 +61,7 @@ class TimeSeriesImputer(Transformer):
     # Incompatibility: https://github.com/alteryx/evalml/issues/4001
     # TODO: Remove when support is added https://github.com/alteryx/evalml/issues/4014
     _integer_nullable_incompatibilities = ["X", "y"]
-    _boolean_nullable_incompatibilities = ["X", "y"]
+    _boolean_nullable_incompatibilities = ["y"]

def __init__(
self,
@@ -155,35 +164,54 @@ def transform(self, X, y=None):
         if y is not None:
             y = infer_feature_types(y)
 
+        # This will change the logical type of BooleanNullable/IntegerNullable/AgeNullable columns with nans
+        # so we save the original schema to recreate it where possible after imputation
+        original_schema = X.ww.schema
+        X, y = self._handle_nullable_types(X, y)
+
         X_not_all_null = X.ww.drop(self._all_null_cols)
-        X_schema = X_not_all_null.ww.schema
-        X_schema = X_schema.get_subset_schema(
-            subset_cols=X_schema._filter_cols(
-                exclude=["IntegerNullable", "BooleanNullable", "AgeNullable"],
-            ),
+
+        # Because the TimeSeriesImputer is always used with the TimeSeriesRegularizer,
+        # many of the columns containing nans may have originally been non nullable logical types.
+        # We will use the non nullable equivalents where possible
+        original_schema = original_schema.get_subset_schema(
+            list(X_not_all_null.columns),
         )
+        new_ltypes = {
+            col: _determine_non_nullable_equivalent(ltype)
+            for col, ltype in original_schema.logical_types.items()
+        }
 
         if self._forwards_cols is not None:
-            X_forward = X.ww[self._forwards_cols]
+            X_forward = X[self._forwards_cols]
             imputed = X_forward.pad()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_forward.columns] = imputed
 
         if self._backwards_cols is not None:
-            X_backward = X.ww[self._backwards_cols]
+            X_backward = X[self._backwards_cols]
             imputed = X_backward.bfill()
             imputed.pad(inplace=True)  # Fill in the last value, if missing
             X_not_all_null[X_backward.columns] = imputed
 
         if self._interpolate_cols is not None:
-            X_interpolate = X.ww[self._interpolate_cols]
-            # TODO: Revert when pandas introduces Float64 dtype
-            imputed = X_interpolate.astype(
-                float,
-            ).interpolate()  # Cast to float because Int64 not handled
+            X_interpolate = X_not_all_null[self._interpolate_cols]
+            imputed = X_interpolate.interpolate()
             imputed.bfill(inplace=True)  # Fill in the first value, if missing
             X_not_all_null[X_interpolate.columns] = imputed
-        X_not_all_null.ww.init(schema=X_schema)
+
+        # Interpolate may add floating point values to integer data, so we
+        # have to update those logical types to a fractional type
+        int_cols_to_update = original_schema._filter_cols(
+            include=["IntegerNullable", "AgeNullable"],
+        )
+        new_int_ltypes = {
+            col: _determine_fractional_type(ltype)
+            for col, ltype in original_schema.logical_types.items()
+            if col in int_cols_to_update
+        }
+        new_ltypes.update(new_int_ltypes)
+        X_not_all_null.ww.init(schema=original_schema, logical_types=new_ltypes)
 
         y_imputed = pd.Series(y)
         if y is not None and len(y) > 0:
@@ -194,10 +222,10 @@ def transform(self, X, y=None):
                 y_imputed = y.bfill()
                 y_imputed.pad(inplace=True)
             elif self._impute_target == "interpolate":
-                # TODO: Revert when pandas introduces Float64 dtype
-                y_imputed = y.astype(float).interpolate()
+                y_imputed = y.interpolate()
                 y_imputed.bfill(inplace=True)
-            y_imputed = ww.init_series(y_imputed)
+            # Re-initialize woodwork with the downcast logical type
+            y_imputed = ww.init_series(y_imputed, logical_type=y.ww.logical_type)
 
         return X_not_all_null, y_imputed

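The fractional-type bookkeeping above exists because interpolation can produce non-integer values in integer columns. A small illustration; the mapping shown is an assumption for the example, the real one lives in evalml's `_determine_fractional_type`:

```python
# Filling the gap in [1, <NA>, 4] yields 2.5, which no longer fits an
# integer logical type, so the column's ltype must become fractional.
import pandas as pd

s = pd.Series([1.0, None, 4.0])  # downcast view of an IntegerNullable column
print(s.interpolate().tolist())  # [1.0, 2.5, 4.0] -- fractional values appear

ASSUMED_FRACTIONAL_EQUIVALENTS = {  # hypothetical mapping, for illustration only
    "IntegerNullable": "Double",
    "AgeNullable": "AgeFractional",
}
```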
44 changes: 5 additions & 39 deletions evalml/pipelines/components/transformers/samplers/base_sampler.py
@@ -2,8 +2,6 @@
 import copy
 from abc import abstractmethod
 
-from woodwork.logical_types import IntegerNullable
-
 from evalml.pipelines.components.transformers import Transformer
 from evalml.utils.woodwork_utils import infer_feature_types

@@ -36,7 +34,8 @@ def fit(self, X, y):
         """
         if y is None:
            raise ValueError("y cannot be None")
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        y_ww = infer_feature_types(y)
         self._initialize_sampler(X_ww, y_ww)
         return self

@@ -49,41 +48,7 @@ def _initialize_sampler(self, X, y):
             y (pd.Series): The target data.
         """
 
-    def _prepare_data(self, X, y):
-        """Transforms the input data to pandas data structure that our sampler can ingest.
-
-        Args:
-            X (pd.DataFrame): Training features.
-            y (pd.Series): Target.
-
-        Returns:
-            pd.DataFrame, pd.Series: Prepared X and y data as pandas types
-        """
-        X = infer_feature_types(X)
-        int_nullable_cols = X.ww.select(IntegerNullable).columns
-        if len(int_nullable_cols) > 0:
-            try:
-                X = X.astype(
-                    {
-                        null_col: int
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            except ValueError:
-                X = X.astype(
-                    {
-                        null_col: float
-                        for null_col in X.ww.select(IntegerNullable).columns
-                    },
-                )
-            X.ww.init(schema=X.ww.schema)
-
-        if y is None:
-            raise ValueError("y cannot be None")
-        y = infer_feature_types(y)
-        return X, y
-
-    def transform(self, X, y=None):
+    def transform(self, X, y):
         """Transforms the input data by sampling the data.
 
         Args:
@@ -93,7 +58,8 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame, pd.Series: Transformed features and target.
         """
-        X, y = self._prepare_data(X, y)
+        X = infer_feature_types(X)
+        y = infer_feature_types(y)
 
         categorical_columns = X.ww.select("Categorical", return_schema=True).columns
         for col in categorical_columns:
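The deleted `_prepare_data` try/except guarded against a specific failure mode that `_handle_nullable_types` now centralizes. A minimal reproduction of that failure, for context:

```python
# Int64 columns that actually contain pd.NA cannot be cast to plain int,
# which is why the old code fell back to float on ValueError.
import pandas as pd

s = pd.Series([1, None, 3], dtype="Int64")
try:
    s.astype("int64")  # raises: int64 cannot represent missing values
except (TypeError, ValueError) as err:
    print(f"falling back to float64 ({err})")
    print(s.astype("float64"))  # [1.0, NaN, 3.0]
```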
40 changes: 34 additions & 6 deletions evalml/pipelines/components/transformers/samplers/oversampler.py
@@ -72,18 +72,46 @@ def fit(self, X, y):
         Returns:
             self
         """
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+
         sampler_name = self._get_best_oversampler(X_ww)
         self.sampler = self.sampler_options[sampler_name]
 
         # get categorical features first, if necessary
         if sampler_name == "SMOTENC":
-            self._get_categorical(X)
-        super().fit(X, y)
+            self._get_categorical(X_ww)
+        super().fit(X_ww, y_ww)
         return self
 
+    def transform(self, X, y=None):
+        """Transforms the input data by Oversampling the data.
+
+        Args:
+            X (pd.DataFrame): Training features.
+            y (pd.Series): Target.
+
+        Returns:
+            pd.DataFrame, pd.Series: Transformed features and target.
+        """
+        X_ww = infer_feature_types(X)
+        original_schema = X_ww.ww.schema
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+        X_d, y_d = self._handle_nullable_types(X_ww, y_ww)
+        X_t, y_t = super().transform(X_d, y_d)
+        X_t.ww.init(schema=original_schema)
+
+        return X_t, y_t
+
     def _get_best_oversampler(self, X):
-        cat_cols = X.ww.select(["category", "boolean"]).columns
+        cat_cols = X.ww.select(
+            ["category", "boolean", "BooleanNullable"],
+            return_schema=True,
+        ).columns
         if len(cat_cols) == X.shape[1]:
             return "SMOTEN"
         elif not len(cat_cols):
@@ -101,7 +129,7 @@ def _get_categorical(self, X):
         ]
         # Grab boolean columns, since SMOTE considers these categorical as well
         for i, val in enumerate(X.ww.types["Logical Type"].items()):
-            if str(val[1]) == "Boolean":
+            if str(val[1]) in {"Boolean", "BooleanNullable"}:
                 self.categorical_features.append(i)
         self._parameters["categorical_features"] = self.categorical_features

@@ -115,7 +143,7 @@ def _initialize_sampler(self, X, y):
             y (pd.Series): Target.
         """
         sampler_class = self.sampler
-        _, y_pd = self._prepare_data(X, y)
+        y_pd = infer_feature_types(y)
         sampler_params = {
             k: v
             for k, v in self.parameters.items()
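The new `Oversampler.transform` saves the woodwork schema, downcasts for the sampler, and restores the schema on the resampled output (the "check schema equality" commit above). A hedged sketch of that round trip; the slicing below is a toy stand-in for the sampler, not imblearn's API:

```python
import pandas as pd
import woodwork as ww

X = pd.DataFrame({"age": pd.array([25, 30, 35, 40], dtype="Int64")})
X.ww.init(logical_types={"age": "IntegerNullable"})
original_schema = X.ww.schema  # save before any downcasting

X_d = X.astype({"age": "float64"})  # stand-in for _handle_nullable_types
X_t = X_d.loc[[0, 1, 2, 3, 1]].reset_index(drop=True)  # stand-in for oversampling
X_t = X_t.astype({"age": "Int64"})  # dtype must match the saved schema again
X_t.ww.init(schema=original_schema)  # restore logical types, as the diff does
print(X_t.ww.schema.logical_types)
```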
@@ -95,7 +95,11 @@ def transform(self, X, y=None):
         Returns:
             pd.DataFrame, pd.Series: Transformed features and target.
         """
-        X_ww, y_ww = self._prepare_data(X, y)
+        X_ww = infer_feature_types(X)
+        if y is None:
+            raise ValueError("y cannot be None")
+        y_ww = infer_feature_types(y)
+
         index_df = pd.Series(y_ww.index)
         indices = self.fit_resample(X_ww, y_ww)

10 changes: 8 additions & 2 deletions evalml/tests/component_tests/test_arima_regressor.py
@@ -4,6 +4,7 @@
 import numpy as np
 import pandas as pd
 import pytest
+import woodwork as ww
 from sktime.forecasting.arima import AutoARIMA as SKArima
 from sktime.forecasting.base import ForecastingHorizon

@@ -476,11 +477,15 @@ def test_arima_regressor_can_forecast_arbitrary_dates(use_covariates, ts_data):
     )
 
 
-def test_arima_regressor_nullable_handling():
+@pytest.mark.parametrize(
+    "nullable_ltype",
+    ["IntegerNullable", "AgeNullable"],
+)
+def test_arima_regressor_with_nullable_types(nullable_ltype):
     X = pd.DataFrame()
     X["nums"] = pd.Series([i for i in range(100)], dtype="Int64")
     X.index = pd.date_range("1/1/21", periods=100)
-    X.ww.init(logical_types={"nums": "IntegerNullable"})
+    X.ww.init(logical_types={"nums": nullable_ltype})
 
     y = pd.Series([i for i in range(100)], dtype="Int64")
     y.index = pd.date_range("1/1/21", periods=100)
@@ -489,6 +494,7 @@ def test_arima_regressor_nullable_handling():
     X_test = X.ww.iloc[80:, :]
 
     y_train = y[:80]
+    y_train = ww.init_series(y_train, logical_type=nullable_ltype)
 
     arima_params = {
         "trend": None,
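For reference, constructing a nullable-typed target the way this parametrized test now does looks like the following (a usage sketch of the same `ww.init_series` call the diff adds):

```python
import pandas as pd
import woodwork as ww

y_train = pd.Series(range(80), dtype="Int64")
y_train = ww.init_series(y_train, logical_type="AgeNullable")
print(y_train.ww.logical_type)  # AgeNullable
```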