Skip to content

Commit

Permalink
Refactor imputer components to remove unnecessary logic (#4038)
Browse files Browse the repository at this point in the history
* Stop using woodwork describe to get nan info in time series imputer

* remove logic that's no longer needed and return if all bool dtype

* remove unnecessary logic from target imputer

* remove unused utils

* remove logic to convert dfs features to categorical logical type

* fix email featurizer test

* Revert changes to transform prim components for testing

* Revert "Revert changes to transform prim components for testing"

This reverts commit 57dda43.

* Fix bugs with ww not imputed and string categories

* Add release note

* Handle case where nans are present at transform in previously all bool dtype data

* Stop truncating ints in target imputer

* clean up

* Fix tests

* Keep ltype integer for most frequent impute type in target imputer

* refactor knn imputer to use new logic

* Fix list bug

* remove comment

* Update release note to mention nullable types

* remove outdated comment

* Convert all bool dfs to boolean nullable instead of refitting

* lint fix

* PR comments

* add second bool col to imputer fixture
  • Loading branch information
tamargrey authored and Tamar Grey committed Mar 17, 2023
1 parent 4e750b4 commit da6e3da
Show file tree
Hide file tree
Showing 21 changed files with 222 additions and 293 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ Release Notes
* Enhancements
* Fixes
* Changes
* Remove unnecessary logic from imputer components prior to nullable type handling :pr:`4038`
* Documentation Changes
* Testing Changes

Expand Down
20 changes: 7 additions & 13 deletions evalml/pipelines/components/transformers/imputers/imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.transformers.imputers import KNNImputer, SimpleImputer
from evalml.utils import downcast_nullable_types, infer_feature_types
from evalml.utils.gen_utils import is_categorical_actually_boolean


class Imputer(Transformer):
Expand Down Expand Up @@ -119,29 +118,24 @@ def fit(self, X, y=None):
)
numeric_cols = list(X.ww.select(["numeric"], return_schema=True).columns)

# TODO: Remove this when columns with True/False/NaN are inferred properly as BooleanNullable.
# If columns with boolean values and NaN are included with normal categorical columns, columns
# with object dtypes are attempted to be cast to float64 with scikit-learn 1.1. So we separate
# boolean and categorical into separate imputers.
for col in cat_cols:
if is_categorical_actually_boolean(X, col):
cat_cols.remove(col)
bool_cols.append(col)

nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

X_numerics = X[[col for col in numeric_cols if col not in self._all_null_cols]]
X_numerics = X.ww[
[col for col in numeric_cols if col not in self._all_null_cols]
]
if len(X_numerics.columns) > 0:
self._numeric_imputer.fit(X_numerics, y)
self._numeric_cols = X_numerics.columns

X_categorical = X[[col for col in cat_cols if col not in self._all_null_cols]]
X_categorical = X.ww[
[col for col in cat_cols if col not in self._all_null_cols]
]
if len(X_categorical.columns) > 0:
self._categorical_imputer.fit(X_categorical, y)
self._categorical_cols = X_categorical.columns

X_boolean = X[[col for col in bool_cols if col not in self._all_null_cols]]
X_boolean = X.ww[[col for col in bool_cols if col not in self._all_null_cols]]
if len(X_boolean.columns) > 0:
self._boolean_imputer.fit(X_boolean, y)
self._boolean_cols = X_boolean.columns
Expand Down
51 changes: 31 additions & 20 deletions evalml/pipelines/components/transformers/imputers/knn_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,6 @@
from sklearn.impute import KNNImputer as Sk_KNNImputer

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.utils import drop_natural_language_columns
from evalml.utils import infer_feature_types


Expand Down Expand Up @@ -51,16 +50,27 @@ def fit(self, X, y=None):
"""
X = infer_feature_types(X)
X, _ = drop_natural_language_columns(X)

nan_ratio = X.isna().sum() / X.shape[0]

# Keep track of the different types of data in X
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
self._natural_language_cols = list(
X.ww.select("NaturalLanguage", return_schema=True).columns.keys(),
)

# Only impute data that is not natural language columns or fully null
self._cols_to_impute = [
col
for col in X.columns
if col not in self._natural_language_cols and col not in self._all_null_cols
]

# If the Dataframe only had natural language columns, do nothing.
if X.shape[1] == 0:
if not self._cols_to_impute:
return self

self._component_obj.fit(X, y)
self._component_obj.fit(X[self._cols_to_impute], y)
return self

def transform(self, X, y=None):
Expand All @@ -77,33 +87,34 @@ def transform(self, X, y=None):

not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
original_index = X.index
original_schema = X.ww.schema

# Drop natural language columns and transform the other columns
X_t, natural_language_cols = drop_natural_language_columns(X)
if X_t.shape[1] == 0:
return X
not_all_null_or_natural_language_cols = [
col for col in not_all_null_cols if col not in natural_language_cols
]
if not self._cols_to_impute:
return X.ww[not_all_null_cols]

X_t = self._component_obj.transform(X_t)
X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols)
X_t = self._component_obj.transform(X[self._cols_to_impute])
X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

X_schema = X.ww.schema
# Get Woodwork types for the imputed data
new_schema = original_schema.get_subset_schema(self._cols_to_impute)

X_int_nullable_cols = X_schema._filter_cols(include=["IntegerNullable"])
new_ltypes_for_nullable_cols = {col: "Double" for col in X_int_nullable_cols}
X_bool_nullable_cols = new_schema._filter_cols(include=["BooleanNullable"])
X_int_nullable_cols = new_schema._filter_cols(include=["IntegerNullable"])
new_ltypes_for_nullable_cols = {col: "Boolean" for col in X_bool_nullable_cols}
new_ltypes_for_nullable_cols.update(
{col: "Double" for col in X_int_nullable_cols},
)

# Add back in natural language columns, unchanged
if len(natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X[natural_language_cols]])
if len(self._natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]])

X_t.ww.init(
schema=X_schema,
schema=new_schema,
logical_types=new_ltypes_for_nullable_cols,
)

if not_all_null_or_natural_language_cols:
if self._cols_to_impute:
X_t.index = original_index

return X_t
Expand Down
87 changes: 38 additions & 49 deletions evalml/pipelines/components/transformers/imputers/simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,7 @@
from woodwork.logical_types import Double

from evalml.pipelines.components.transformers import Transformer
from evalml.pipelines.components.utils import (
drop_natural_language_columns,
set_boolean_columns_to_integer,
)
from evalml.utils import infer_feature_types
from evalml.utils.gen_utils import is_categorical_actually_boolean


class SimpleImputer(Transformer):
Expand Down Expand Up @@ -69,41 +64,40 @@ def fit(self, X, y=None):
if set([lt.type_string for lt in X.ww.logical_types.values()]) == {
"boolean",
"categorical",
} and not all(
[
is_categorical_actually_boolean(X, col)
for col in X.ww.select("Categorical")
],
):
}:
raise ValueError(
"SimpleImputer cannot handle dataframes with both boolean and categorical features. Use Imputer instead.",
)

nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

X, _ = drop_natural_language_columns(X)

# Convert any boolean columns to IntegerNullable, but keep track of the columns so they can be converted back
self._boolean_cols = list(
# Keep track of the different types of data in X
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()
self._natural_language_cols = list(
X.ww.select(
include=["Boolean", "BooleanNullable"],
"NaturalLanguage",
return_schema=True,
).columns,
)
# Make sure we're tracking Categorical columns that should be boolean as well
self._boolean_cols.extend(
[
col
for col in X.ww.select("Categorical")
if is_categorical_actually_boolean(X, col)
],
).columns.keys(),
)
X = set_boolean_columns_to_integer(X)

# If the Dataframe only had natural language columns, do nothing.
if X.shape[1] == 0:
# Only impute data that is not natural language columns or fully null
self._cols_to_impute = [
col
for col in X.columns
if col not in self._natural_language_cols and col not in self._all_null_cols
]

# If there are no columns to impute, return early
if not self._cols_to_impute:
return self

X = X[self._cols_to_impute]
if (X.dtypes == bool).all():
# Ensure that _component_obj still gets fit so that if any of the dtypes are different
# at transform, we've fit the component. This is needed because sklearn doesn't allow
# data with only bool dtype to be passed in.
X = X.astype("boolean")

self._component_obj.fit(X, y)
return self

Expand All @@ -119,27 +113,20 @@ def transform(self, X, y=None):
"""
X = infer_feature_types(X)
original_schema = X.ww.schema
X = set_boolean_columns_to_integer(X)

not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
original_index = X.index

# Drop natural language columns and transform the other columns
X_t, natural_language_cols = drop_natural_language_columns(X)
if X_t.shape[1] == 0:
return X
not_all_null_or_natural_language_cols = [
col for col in not_all_null_cols if col not in natural_language_cols
]
X_t = X[self._cols_to_impute]
not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
if not self._cols_to_impute or (X_t.dtypes == bool).all():
# If there are no columns to impute or all columns to impute are bool dtype,
# which will never have null values, return the original data without any fully null columns
return X.ww[not_all_null_cols]

X_t = self._component_obj.transform(X_t)
X_t = pd.DataFrame(X_t, columns=not_all_null_or_natural_language_cols)

new_schema = original_schema.get_subset_schema(X_t.columns)
X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

# Iterate through previously saved boolean columns and convert them back to boolean
for col in self._boolean_cols:
X_t[col] = X_t[col].astype(bool)
# Get Woodwork types for the imputed data
new_schema = original_schema.get_subset_schema(self._cols_to_impute)

# Convert Nullable Integers to Doubles for the "mean" and "median" strategies
if self.impute_strategy in ["mean", "median"]:
Expand All @@ -149,11 +136,13 @@ def transform(self, X, y=None):
new_schema.set_types({col: Double})
X_t.ww.init(schema=new_schema)

# Add back in natural language columns, unchanged
if len(natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X[natural_language_cols]])
# Add back in the unchanged original natural language columns that we want to keep
if len(self._natural_language_cols) > 0:
X_t = woodwork.concat_columns([X_t, X.ww[self._natural_language_cols]])
# reorder columns to match original
X_t = X_t.ww[[col for col in original_schema.columns if col in X_t.columns]]

if not_all_null_or_natural_language_cols:
if self._cols_to_impute:
X_t.index = original_index
return X_t

Expand Down
35 changes: 18 additions & 17 deletions evalml/pipelines/components/transformers/imputers/target_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,13 @@
import pandas as pd
import woodwork as ww
from sklearn.impute import SimpleImputer as SkImputer
from woodwork.logical_types import Categorical, Integer, IntegerNullable
from woodwork.logical_types import (
Boolean,
BooleanNullable,
Double,
Integer,
IntegerNullable,
)

from evalml.exceptions import ComponentNotYetFittedError
from evalml.pipelines.components import ComponentBaseMeta
Expand Down Expand Up @@ -93,9 +99,9 @@ def fit(self, X, y):
raise TypeError("Provided target full of nulls.")
y = y.to_frame()

# Convert all bool dtypes to category for fitting
# Return early if all the columns are bool dtype, which will never have null values
if (y.dtypes == bool).all():
y = y.astype("category")
return y

self._component_obj.fit(y)
return self
Expand All @@ -117,26 +123,21 @@ def transform(self, X, y):
y_ww = infer_feature_types(y)
y_df = y_ww.ww.to_frame()

# Return early since bool dtype doesn't support nans and sklearn errors if all cols are bool
# Return early if all the columns are bool dtype, which will never have null values
if (y_df.dtypes == bool).all():
return X, y_ww

transformed = self._component_obj.transform(y_df)
y_t = pd.Series(transformed[:, 0], index=y_ww.index)

# TODO: Fix this after WW adds inference of object type booleans to BooleanNullable
# Iterate through categorical columns that might have been boolean and convert them back to boolean
if {True, False}.issubset(set(y_t.unique())) and isinstance(
y_ww.ww.logical_type,
Categorical,
):
y_t = y_t.astype(bool)

new_logical_type = (
Integer
if isinstance(y_ww.ww.logical_type, IntegerNullable)
else y_ww.ww.logical_type
)
new_logical_type = y_ww.ww.logical_type
if isinstance(y_ww.ww.logical_type, IntegerNullable):
if self.parameters["impute_strategy"] in ["mean", "median"]:
new_logical_type = Double
else:
new_logical_type = Integer
elif isinstance(y_ww.ww.logical_type, BooleanNullable):
new_logical_type = Boolean

y_t = ww.init_series(
y_t,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ def fit(self, X, y=None):
"""
X = infer_feature_types(X)

nan_ratio = X.ww.describe().loc["nan_count"] / X.shape[0]
nan_ratio = X.isna().sum() / X.shape[0]
self._all_null_cols = nan_ratio[nan_ratio == 1].index.tolist()

def _filter_cols(impute_strat, X):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -73,16 +73,13 @@ def transform(self, X, y=None):
es = self._make_entity_set(X_ww)
features = ft.calculate_feature_matrix(features=self._features, entityset=es)

# Convert to object dtype so that pd.NA is converted to np.nan
# until sklearn imputer can handle pd.NA in release 1.1
# FT returns these as string types, currently there isn't much difference
# in terms of performance between object and string
# see https://pandas.pydata.org/docs/user_guide/text.html#text-data-types
# "Currently, the performance of object dtype arrays of strings
# "and arrays.StringArray are about the same."
ltypes = features.ww.logical_types
# CatBoost has an issue with categoricals with string categories:
# https://github.com/catboost/catboost/issues/1965
# Which will pop up if these categorical features are left with string categories,
# so convert them to object until the bug is fixed.
features = features.astype(object, copy=False)
features.index = X_ww.index
features.ww.init(logical_types={col_: "categorical" for col_ in features})
features.ww.init(logical_types=ltypes)

X_ww = X_ww.ww.drop(self._columns)
X_ww = ww.concat_columns([X_ww, features])
Expand Down
Loading

0 comments on commit da6e3da

Please sign in to comment.