Skip to content

Commit

Permalink
add second bool col to imputer fixture
Browse files Browse the repository at this point in the history
  • Loading branch information
Tamar Grey committed Mar 8, 2023
1 parent 98c0561 commit 68ef36d
Show file tree
Hide file tree
Showing 4 changed files with 15 additions and 35 deletions.
1 change: 1 addition & 0 deletions evalml/tests/component_tests/test_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,7 @@ def test_categorical_and_numeric_input(imputer_test_data):
"object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"),
"float col": [0.1, 1.0, 0.0, -2.0, 5.0] * 4,
"bool col": [True, False, False, True, True] * 4,
"bool col 2": [True, False, False, True, True] * 4,
"natural language col": pd.Series(
["cats are really great", "don't", "believe", "me?", "well..."] * 4,
dtype="string",
Expand Down
46 changes: 11 additions & 35 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -627,21 +627,10 @@ def test_simple_imputer_boolean_nullable_valid_train_empty_test():
assert isinstance(X_t.ww.logical_types["a"], BooleanNullable)


def test_simple_imputer_all_bools_at_fit_and_transform():
def test_simple_imputer_all_bools_at_fit_and_transform(imputer_test_data):
"""Confirms that the simple imputer can handle data with only the bool dtype
which sklearn would error on."""
X = pd.DataFrame(
{
"bools1": pd.Series([True, False, True, True] * 20),
"bools2": pd.Series([True, False, True, False] * 20),
},
)
X.ww.init(
logical_types={
"bools1": "Boolean",
"bools2": "Boolean",
},
)
X = imputer_test_data.ww.select("boolean")

imp = SimpleImputer(impute_strategy="most_frequent")
imp.fit(X)
Expand All @@ -656,7 +645,9 @@ def test_simple_imputer_all_bools_at_fit_and_transform_with_all_null_and_nl_cols
"""Confirm that the simple imputer, which doesn't pass all null or natural language columns
to sklearn works when the remaining columns are all the bool dtype, which sklearn would error on.
"""
X = imputer_test_data.ww[["all nan", "bool col", "natural language col"]]
X = imputer_test_data.ww[
["all nan", "bool col", "bool col 2", "natural language col"]
]
X_copy = X.ww.copy()

imp = SimpleImputer(impute_strategy="most_frequent")
Expand All @@ -666,37 +657,22 @@ def test_simple_imputer_all_bools_at_fit_and_transform_with_all_null_and_nl_cols
pd.testing.assert_frame_equal(X_copy.ww.drop("all nan"), X_imputed)


def test_simple_imputer_all_bools_at_fit_with_nans_at_transform():
def test_simple_imputer_all_bools_at_fit_with_nans_at_transform(imputer_test_data):
"""Confirm that the simple imputer can handle data whose dtype is different at transform
when originally the data only had bool dtype columns."""
# X_train will be only bool dtypes so the _component_obj won't be fit
X_train = pd.DataFrame(
{
"bools1": pd.Series([True, False, True, True] * 20),
"bools2": pd.Series([True, False, True, False] * 20),
},
)
X_train.ww.init(
logical_types={
"bools1": "Boolean",
"bools2": "Boolean",
},
)
X_train = imputer_test_data.ww.select("boolean")

imp = SimpleImputer(impute_strategy="most_frequent")
imp.fit(X_train)

# X_test will be BooleanNullable which will be a problem when _component_obj isn't fit
X_test = pd.DataFrame(
{
"bools1": pd.Series([True, False, pd.NA, True] * 20),
"bools2": pd.Series([True, pd.NA, True, False] * 20),
},
)
X_test = X_train.copy()
X_test.iloc[-1] = np.nan
X_test.ww.init(
logical_types={
"bools1": "BooleanNullable",
"bools2": "BooleanNullable",
"bool col": "BooleanNullable",
"bool col 2": "BooleanNullable",
},
)

Expand Down
1 change: 1 addition & 0 deletions evalml/tests/component_tests/test_time_series_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,6 +171,7 @@ def test_categorical_and_numeric_input(imputer_test_data):
"object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"),
"float col": [0.1, 1.0, 0.0, -2.0, 5.0] * 4,
"bool col": [True, False, False, True, True] * 4,
"bool col 2": [True, False, False, True, True] * 4,
"natural language col": pd.Series(
["cats are really great", "don't", "believe", "me?", "well..."] * 4,
dtype="string",
Expand Down
2 changes: 2 additions & 0 deletions evalml/tests/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -2227,6 +2227,7 @@ def X_no_nans():
"object col": ["b", "b", "a", "c", "d"] * 4,
"float col": [0.1, 1.0, 0.0, -2.0, 5.0] * 4,
"bool col": [True, False, False, True, True] * 4,
"bool col 2": [True, False, False, True, True] * 4,
"natural language col": pd.Series(
["cats are really great", "don't", "believe", "me?", "well..."] * 4,
dtype="string",
Expand All @@ -2241,6 +2242,7 @@ def X_no_nans():
"object col": "categorical",
"float col": "double",
"bool col": "boolean",
"bool col 2": "boolean",
"natural language col": "NaturalLanguage",
},
)
Expand Down

0 comments on commit 68ef36d

Please sign in to comment.