Skip to content

Commit

Permalink
Convert all bool dfs to boolean nullable instead of refitting
Browse files Browse the repository at this point in the history
  • Loading branch information
Tamar Grey committed Mar 6, 2023
1 parent 600aea4 commit bcbe9e2
Show file tree
Hide file tree
Showing 2 changed files with 52 additions and 15 deletions.
26 changes: 12 additions & 14 deletions evalml/pipelines/components/transformers/imputers/simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -90,13 +90,14 @@ def fit(self, X, y=None):
# If there are no columns to impute, return early
if not self._cols_to_impute:
return self
elif (X.dtypes == bool).all():
self._fit_with_all_bool_dtypes = True
return self
else:
self._fit_with_all_bool_dtypes = False

self._component_obj.fit(X[self._cols_to_impute], y)
X = X[self._cols_to_impute]
if (X.dtypes == bool).all():
# Ensure that _component_obj still gets fit so that if any of the dtypes are different
# at transform, we've fit the component
X = X.astype("boolean")

self._component_obj.fit(X, y)
return self

def transform(self, X, y=None):
Expand All @@ -113,17 +114,14 @@ def transform(self, X, y=None):
original_schema = X.ww.schema
original_index = X.index

X_t = X[self._cols_to_impute]
not_all_null_cols = [col for col in X.columns if col not in self._all_null_cols]
if not self._cols_to_impute:
# If there are no columns to impute, return the original data without any fully null columns
if not self._cols_to_impute or (X_t.dtypes == bool).all():
# If there are no columns to impute or all columns to impute are bool dtype, which sklearn errors on,
# return the original data without any fully null columns
return X.ww[not_all_null_cols]
elif (X.dtypes == bool).all():
return X
# If the dtypes are not all bool but it was fit with all bool dtypes, we need to fit the _component_obj
elif self._fit_with_all_bool_dtypes:
self._component_obj.fit(X[self._cols_to_impute])

X_t = self._component_obj.transform(X[self._cols_to_impute])
X_t = self._component_obj.transform(X_t)
X_t = pd.DataFrame(X_t, columns=self._cols_to_impute)

# Get Woodwork types for the imputed data
Expand Down
41 changes: 40 additions & 1 deletion evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import pandas as pd
import pytest
import woodwork as ww
import woodwork.exceptions
from pandas.testing import assert_frame_equal
from woodwork.logical_types import (
Boolean,
Expand Down Expand Up @@ -628,7 +627,47 @@ def test_simple_imputer_boolean_nullable_valid_train_empty_test():
assert isinstance(X_t.ww.logical_types["a"], BooleanNullable)


def test_simple_imputer_all_bools_at_fit_and_transform():
    """Check that an all-bool frame survives fit and transform unchanged.

    Every column here has the bool dtype, which sklearn would error on, so
    the imputed output is expected to equal the input frame.
    """
    bool_columns = {
        "bools1": pd.Series([True, False, True, True] * 20),
        "bools2": pd.Series([True, False, True, False] * 20),
    }
    X = pd.DataFrame(bool_columns)
    X.ww.init(logical_types={name: "Boolean" for name in bool_columns})

    imputer = SimpleImputer()
    imputer.fit(X)
    transformed = imputer.transform(X)

    assert_frame_equal(X, transformed)


def test_simple_imputer_all_bools_at_fit_and_transform_with_all_null_and_nl_cols(
    imputer_test_data,
):
    """Check the all-bool case when all-null and natural language columns are present.

    The simple imputer doesn't pass all-null or natural language columns to
    sklearn, so the remaining columns here are all the bool dtype, which
    sklearn would error on. The output is expected to match the input with
    the "all nan" column dropped.
    """
    selected_columns = ["all nan", "bool col", "natural language col"]
    X = imputer_test_data.ww[selected_columns]
    # Snapshot the input before fitting so the expectation is built from untouched data.
    snapshot = X.ww.copy()

    imputer = SimpleImputer()
    imputer.fit(X)
    transformed = imputer.transform(X)

    expected = snapshot.ww.drop("all nan")
    assert_frame_equal(expected, transformed)


def test_simple_imputer_all_bools_at_fit_with_nans_at_transform():
"""Confirm that the simple imputer can handle data whose dtype is different at transform
when originally the data only had bool dtype columns."""
# X_train will be only bool dtypes so the _component_obj won't be fit
X_train = pd.DataFrame(
{
Expand Down

0 comments on commit bcbe9e2

Please sign in to comment.