Skip to content

Commit

Permalink
Refactor imputer components to remove unnecessary logic (#4038)
Browse files Browse the repository at this point in the history
* Stop using woodwork describe to get nan info in time series imputer

* remove logic that's no longer needed and return if all bool dtype

* remove unnecessary logic from target imputer

* remove unused utils

* remove logic to convert dfs features to categorical logical type

* fix email featurizer test

* Revert changes to transform prim components for testing

* Revert "Revert changes to transform prim components for testing"

This reverts commit 57dda43.

* Fix bugs with ww not imputed and string categories

* Add release note

* Handle case where nans are present at transform in previously all bool dtype data

* Stop truncating ints in target imputer

* clean up

* Fix tests

* Keep ltype integer for most frequent impute type in target imputer

* refactor knn imputer to use new logic

* Fix list bug

* remove comment

* Update release note to mention nullable types

* remove outdated comment

* Convert all bool dfs to boolean nullable instead of refitting

* lint fix

* PR comments

* add second bool col to imputer fixture
  • Loading branch information
tamargrey authored and Tamar Grey committed Mar 17, 2023
1 parent 4e750b4 commit da6e3da
Show file tree
Hide file tree
Showing 21 changed files with 222 additions and 293 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Release Notes
* Enhancements
* Fixes
* Changes
* Remove unnecessary logic from imputer components prior to nullable type handling :pr:`4038`
* Documentation Changes
* Testing Changes

Expand Down
20 changes: 7 additions & 13 deletions evalml/pipelines/components/transformers/imputers/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.imputers import KNNImputer, SimpleImputer
from evalml.utils import downcast_nullable_types, infer_feature_types
from evalml.utils.gen_utils import is_categorical_actually_boolean


class Imputer(Transformer):
Expand Down Expand Up @@ -119,29 +118,24 @@ def fit(self, X, y=None):
)
numeric_cols = list(X.ww.select(["numeric"], return_schema=True).columns)

# TODO: Remove this when columns with True/False/NaN are inferred properly as BooleanNullable.
# If columns with boolean values and NaN are included with normal categorical columns, columns
# with object dtypes are attempted to be cast to float64 with scikit-learn 1.1. So we separate
# boolean and categorical into separate imputers.
for col in cat_cols:
if is_categorical_actually_boolean(X, col):
cat_cols.remove(col)
bool_cols.append(col)

nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

X_numerics = X[[col for col in numeric_cols if col not in self._all_null_cols]]
X_numerics = X.ww[
[col for col in numeric_cols if col not in self._all_null_cols]
]
if len(X_numerics.columns) > 0:
self._numeric_imputer.fit(X_numerics, y)
self._numeric_cols = X_numerics.columns

X_categorical = X[[col for col in cat_cols if col not in self._all_null_cols]]
X_categorical = X.ww[
[col for col in cat_cols if col not in self._all_null_cols]
]
if len(X_categorical.columns) > 0:
self._categorical_imputer.fit(X_categorical, y)
self._categorical_cols = X_categorical.columns

X_boolean = X[[col for col in bool_cols if col not in self._all_null_cols]]
X_boolean = X.ww[[col for col in bool_cols if col not in self._all_null_cols]]
if len(X_boolean.columns) > 0:
self._boolean_imputer.fit(X_boolean, y)
self._boolean_cols = X_boolean.columns
Expand Down
51 changes: 31 additions & 20 deletions evalml/pipelines/components/transformers/imputers/knn_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from sklearn.impute import KNNImputer as Sk_KNNImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.utils import drop_natural_language_columns
from evalml.utils import infer_feature_types


Expand Down Expand Up @@ -51,16 +50,27 @@ def fit(self, X, y=None):
"""
X = infer_feature_types(X)
X, _ = drop_natural_language_columns(X)

nan_ratio = X.isna().sum() / X.shape[0]

# Keep track of the different types of data in X
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
self._natural_language_cols = list(
X.ww.select("NaturalLanguage", return_schema=True).columns.keys(),
)

# Only impute data that is not natural language columns or fully null
self._cols_to_impute = [
col
for col in X.columns
if col not in self._natural_language_cols and col not in self._all_null_cols
]

# If the Dataframe only had natural language columns, do nothing.
if X.shape[1] == 0:
if not self._cols_to_impute:
return self

self._component_obj.fit(X, y)
self._component_obj.fit(X[self._cols_to_impute], y)
return self

def transform(self, X, y=None):
Expand All @@ -77,33 +87,34 @@ def transform(self, X, y=None):

not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
original_index = X.index
original_schema = X.ww.schema

# Drop natural language columns and transform the other columns
X_t, natural_language_cols = drop_natural_language_columns(X)
if X_t.shape[1] == 0:
return X
not_all_null_or_natural_language_cols = [
col for col in not_all_null_cols if col not in natural_language_cols
]
if not self._cols_to_impute:
return X.ww[not_all_null_cols]

X_t = self._component_obj.transform(X_t)
X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols)
X_t = self._component_obj.transform(X[self._cols_to_impute])
X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

X_schema = X.ww.schema
# Get Woodwork types for the imputed data
new_schema = original_schema.get_subset_schema(self._cols_to_impute)

X_int_nullable_cols = X_schema._filter_cols(include=["IntegerNullable"])
new_ltypes_for_nullable_cols = {col: "Double" for col in X_int_nullable_cols}
X_bool_nullable_cols = new_schema._filter_cols(include=["BooleanNullable"])
X_int_nullable_cols = new_schema._filter_cols(include=["IntegerNullable"])
new_ltypes_for_nullable_cols = {col: "Boolean" for col in X_bool_nullable_cols}
new_ltypes_for_nullable_cols.update(
{col: "Double" for col in X_int_nullable_cols},
)

# Add back in natural language columns, unchanged
if len(natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X[natural_language_cols]])
if len(self._natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]])

X_t.ww.init(
schema=X_schema,
schema=new_schema,
logical_types=new_ltypes_for_nullable_cols,
)

if not_all_null_or_natural_language_cols:
if self._cols_to_impute:
X_t.index = original_index

return X_t
Expand Down
87 changes: 38 additions & 49 deletions evalml/pipelines/components/transformers/imputers/simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@
from woodwork.logical_types import Double

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.utils import (
drop_natural_language_columns,
set_boolean_columns_to_integer,
)
from evalml.utils import infer_feature_types
from evalml.utils.gen_utils import is_categorical_actually_boolean


class SimpleImputer(Transformer):
Expand Down Expand Up @@ -69,41 +64,40 @@ def fit(self, X, y=None):
if set([lt.type_string for lt in X.ww.logical_types.values()]) == {
"boolean",
"categorical",
} and not all(
[
is_categorical_actually_boolean(X, col)
for col in X.ww.select("Categorical")
],
):
}:
raise ValueError(
"SimpleImputer cannot handle dataframes with both boolean and categorical features. Use Imputer instead.",
)

nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

X, _ = drop_natural_language_columns(X)

# Convert any boolean columns to IntegerNullable, but keep track of the columns so they can be converted back
self._boolean_cols = list(
# Keep track of the different types of data in X
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
self._natural_language_cols = list(
X.ww.select(
include=["Boolean", "BooleanNullable"],
"NaturalLanguage",
return_schema=True,
).columns,
)
# Make sure we're tracking Categorical columns that should be boolean as well
self._boolean_cols.extend(
[
col
for col in X.ww.select("Categorical")
if is_categorical_actually_boolean(X, col)
],
).columns.keys(),
)
X = set_boolean_columns_to_integer(X)

# If the Dataframe only had natural language columns, do nothing.
if X.shape[1] == 0:
# Only impute data that is not natural language columns or fully null
self._cols_to_impute = [
col
for col in X.columns
if col not in self._natural_language_cols and col not in self._all_null_cols
]

# If there are no columns to impute, return early
if not self._cols_to_impute:
return self

X = X[self._cols_to_impute]
if (X.dtypes == bool).all():
# Ensure that _component_obj still gets fit so that if any of the dtypes are different
# at transform, we've fit the component. This is needed because sklearn doesn't allow
# data with only bool dtype to be passed in.
X = X.astype("boolean")

self._component_obj.fit(X, y)
return self

Expand All @@ -119,27 +113,20 @@ def transform(self, X, y=None):
"""
X = infer_feature_types(X)
original_schema = X.ww.schema
X = set_boolean_columns_to_integer(X)

not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
original_index = X.index

# Drop natural language columns and transform the other columns
X_t, natural_language_cols = drop_natural_language_columns(X)
if X_t.shape[1] == 0:
return X
not_all_null_or_natural_language_cols = [
col for col in not_all_null_cols if col not in natural_language_cols
]
X_t = X[self._cols_to_impute]
not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
if not self._cols_to_impute or (X_t.dtypes == bool).all():
# If there are no columns to impute or all columns to impute are bool dtype,
# which will never have null values, return the original data without any fully null columns
return X.ww[not_all_null_cols]

X_t = self._component_obj.transform(X_t)
X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols)

new_schema = original_schema.get_subset_schema(X_t.columns)
X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

# Iterate through previously saved boolean columns and convert them back to boolean
for col in self._boolean_cols:
X_t[col] = X_t[col].astype(bool)
# Get Woodwork types for the imputed data
new_schema = original_schema.get_subset_schema(self._cols_to_impute)

# Convert Nullable Integers to Doubles for the "mean" and "median" strategies
if self.impute_strategy in ["mean", "median"]:
Expand All @@ -149,11 +136,13 @@ def transform(self, X, y=None):
new_schema.set_types({col: Double})
X_t.ww.init(schema=new_schema)

# Add back in natural language columns, unchanged
if len(natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X[natural_language_cols]])
# Add back in the unchanged original natural language columns that we want to keep
if len(self._natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]])
# reorder columns to match original
X_t = X_t.ww[[col for col in original_schema.columns if col in X_t.columns]]

if not_all_null_or_natural_language_cols:
if self._cols_to_impute:
X_t.index = original_index
return X_t

Expand Down
35 changes: 18 additions & 17 deletions evalml/pipelines/components/transformers/imputers/target_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer
from woodwork.logical_types import Categorical, Integer, IntegerNullable
from woodwork.logical_types import (
Boolean,
BooleanNullable,
Double,
Integer,
IntegerNullable,
)

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
Expand Down Expand Up @@ -93,9 +99,9 @@ def fit(self, X, y):
raise TypeError("Provided target full of nulls.")
y = y.to_frame()

# Convert all bool dtypes to category for fitting
# Return early if all the columns are bool dtype, which will never have null values
if (y.dtypes == bool).all():
y = y.astype("category")
return y

self._component_obj.fit(y)
return self
Expand All @@ -117,26 +123,21 @@ def transform(self, X, y):
y_ww = infer_feature_types(y)
y_df = y_ww.ww.to_frame()

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
# Return early if all the columns are bool dtype, which will never have null values
if (y_df.dtypes == bool).all():
return X, y_ww

transformed = self._component_obj.transform(y_df)
y_t = pd.Series(transformed[:, 0], index=y_ww.index)

# TODO: Fix this after WW adds inference of object type booleans to BooleanNullable
# Iterate through categorical columns that might have been boolean and convert them back to boolean
if {True, False}.issubset(set(y_t.unique())) and isinstance(
y_ww.ww.logical_type,
Categorical,
):
y_t = y_t.astype(bool)

new_logical_type = (
Integer
if isinstance(y_ww.ww.logical_type, IntegerNullable)
else y_ww.ww.logical_type
)
new_logical_type = y_ww.ww.logical_type
if isinstance(y_ww.ww.logical_type, IntegerNullable):
if self.parameters["impute_strategy"] in ["mean", "median"]:
new_logical_type = Double
else:
new_logical_type = Integer
elif isinstance(y_ww.ww.logical_type, BooleanNullable):
new_logical_type = Boolean

y_t = ww.init_series(
y_t,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def fit(self, X, y=None):
"""
X = infer_feature_types(X)

nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0]
nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

def _filter_cols(impute_strat, X):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,13 @@ def transform(self, X, y=None):
es = self._make_entity_set(X_ww)
features = ft.calculate_feature_matrix(features=self._features, entityset=es)

# Convert to object dtype so that pd.NA is converted to np.nan
# until sklearn imputer can handle pd.NA in release 1.1
# FT returns these as string types, currently there isn't much difference
# in terms of performance between object and string
# see https://pandas.pydata.org/docs/user_guide/text.html#text-data-types
# "Currently, the performance of object dtype arrays of strings
# "and arrays.StringArray are about the same."
ltypes = features.ww.logical_types
# CatBoost has an issue with categoricals with string categories:
# https://github.com/catboost/catboost/issues/1965
# Which will pop up if these categorical features are left with string categories,
# so convert them to object until the bug is fixed.
features = features.astype(object, copy=False)
features.index = X_ww.index
features.ww.init(logical_types={col_: "categorical" for col_ in features})
features.ww.init(logical_types=ltypes)

X_ww = X_ww.ww.drop(self._columns)
X_ww = ww.concat_columns([X_ww, features])
Expand Down
Loading

0 comments on commit da6e3da

Please sign in to comment.