Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Updates to support Woodwork 0.5.1 #2610

Merged
merged 22 commits into from
Aug 12, 2021
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
7544dde
Fixed target imputer.
chukarsten Aug 5, 2021
84ab5dd
Fixed test_components.py
chukarsten Aug 5, 2021
9726e23
Fixed the invalidtarget datacheck to allow for the new Unknown type.
chukarsten Aug 6, 2021
23d0579
Updated the preprocessing components to not attempt to double drop Nu…
chukarsten Aug 6, 2021
1f2f699
Fill pd.NA with np.nan in reversion to numerical nans. Modified tests…
chukarsten Aug 9, 2021
371c58e
Lint.
chukarsten Aug 9, 2021
4f36996
Release.
chukarsten Aug 9, 2021
2cb0c12
Shuffled around the order of the DropColumns transformer in the tests…
chukarsten Aug 10, 2021
ad3270b
Changed the target imputer to just look for all nulls in the target a…
chukarsten Aug 10, 2021
d60b1b1
Set lower limit on WW to 0.5.1.
chukarsten Aug 10, 2021
5e0ced2
Pinned WW to 0.5.1
chukarsten Aug 10, 2021
b51a46c
Bumped the min core reqs to ww 0.5.1.
chukarsten Aug 10, 2021
17010a5
Addressed Nate's comments.
chukarsten Aug 10, 2021
aed9b6f
Added a test to address Freddy's concern and modified the reversion i…
chukarsten Aug 12, 2021
71d0fdc
Refactored the reversion of all-null Unknown columns into its own fun…
chukarsten Aug 12, 2021
2399fe5
Reverted the text featurizer and LSA tests.
chukarsten Aug 12, 2021
f757ba4
Reverted the additional DropColumn transformer change.
chukarsten Aug 12, 2021
13c589b
Fixed invalid target datacheck.
chukarsten Aug 12, 2021
2543a72
Update for explicit testing of infer_feature_types.
chukarsten Aug 12, 2021
dc0f82f
Addressed comments.
chukarsten Aug 12, 2021
115fb6b
Trigger build.
chukarsten Aug 12, 2021
4a4af75
Updated latest_dep_versions.txt
chukarsten Aug 12, 2021
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion core-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ psutil>=5.6.6
requirements-parser>=0.2.0
shap>=0.36.0
texttable>=1.6.2
woodwork==0.5.0
woodwork==0.5.1
dask>=2.12.0
featuretools>=0.26.1
nlp-primitives>=1.1.0
Expand Down
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ Release Notes
* Updated ``ComponentGraph`` ``_validate_component_dict`` logic to be stricter about input values :pr:`2599`
* Patched bug in ``xgboost`` estimators where predicting on a feature matrix of only booleans would throw an exception. :pr:`2602`
* Updated ``ARIMARegressor`` to use relative forecasting to predict values :pr:`2613`
* Updated to support Woodwork 0.5.1 :pr:`2610`
* Fixes
* Updated ``get_best_sampler_for_data`` to consider all non-numeric datatypes as categorical for SMOTE :pr:`2590`
* Fixed inconsistent test results from ``TargetDistributionDataCheck`` :pr:`2608`
Expand Down
2 changes: 1 addition & 1 deletion evalml/data_checks/invalid_targets_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,7 @@ def validate(self, X, y):

y = infer_feature_types(y)
is_supported_type = y.ww.logical_type.type_string in numeric_and_boolean_ww + [
ww.logical_types.Categorical.type_string
ww.logical_types.Categorical.type_string,
]
if not is_supported_type:
results["errors"].append(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -76,7 +76,10 @@ def fit(self, X, y):
"""
if y is None:
return self
y = infer_feature_types(y).to_frame()
y = infer_feature_types(y)
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Moved the exception for the target imputer to fit.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

nit, but couldn't you do

y = infer_feature_types(y).to_frame()
if all(y.isnull()):
        raise TypeError("Provided target full of nulls.")

just to shorten/simplify slightly?

if all(y.isnull()):
raise TypeError("Provided target full of nulls.")
y = y.to_frame()

# Convert all bool dtypes to category for fitting
if (y.dtypes == bool).all():
Expand Down Expand Up @@ -110,8 +113,6 @@ def transform(self, X, y):
)

transformed = self._component_obj.transform(y_df)
if transformed.shape[1] == 0:
raise RuntimeError("Transformed data is empty")
y_t = pd.Series(transformed[:, 0], index=y_ww.index)
return X, _retain_custom_types_and_initalize_woodwork(y_ww.ww.logical_type, y_t)

Expand Down
11 changes: 6 additions & 5 deletions evalml/pipelines/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,12 @@ def _get_preprocessing_components(
if len(all_null_cols) > 0:
pp_components.append(DropNullColumns)

index_and_unknown_columns = list(
X.ww.select(["index", "unknown"], return_schema=True).columns
)
if len(index_and_unknown_columns) > 0:
pp_components.append(DropColumns)

email_columns = list(X.ww.select("EmailAddress", return_schema=True).columns)
if len(email_columns) > 0:
pp_components.append(EmailFeaturizer)
Expand All @@ -105,11 +111,6 @@ def _get_preprocessing_components(
text_columns
):
pp_components.append(Imputer)
index_and_unknown_columns = list(
X.ww.select(["index", "unknown"], return_schema=True).columns
)
if len(index_and_unknown_columns) > 0:
pp_components.append(DropColumns)

datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)

Expand Down
2 changes: 1 addition & 1 deletion evalml/tests/component_tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -973,7 +973,7 @@ def fit(self, X, y):
return self

def predict(self, X):
series = pd.Series()
series = pd.Series(dtype="string")
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think this change was to accommodate the way empty series are now inferred. Woodwork complains if you don't do this.

series.ww.init()
return series

Expand Down
15 changes: 15 additions & 0 deletions evalml/tests/component_tests/test_simple_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -410,3 +410,18 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {
0: Double
}


def test_component_handles_pre_init_ww():
    """Check SimpleImputer on a DataFrame whose Woodwork schema is already set.

    The frame has one partially null column and one fully null column
    (post Woodwork 0.5.1). After ``fit_transform`` the fully null column
    must be gone and the partially null column must have its missing entry
    filled in.
    """
    frame = pd.DataFrame(
        {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]}
    )
    # Initialize Woodwork *before* handing the frame to the component.
    frame.ww.init()

    result = SimpleImputer().fit_transform(frame)

    assert "all_null" not in result.columns
    assert list(result["part_null"]) == [0, 1, 2, 0]
13 changes: 7 additions & 6 deletions evalml/tests/component_tests/test_target_imputer.py
Original file line number Diff line number Diff line change
Expand Up @@ -128,16 +128,17 @@ def test_target_imputer_boolean_dtype(data_type, make_data_type):
assert_series_equal(y_expected, y_t)


def test_target_imputer_fit_transform_all_nan_empty():
y = pd.Series([np.nan, np.nan])
@pytest.mark.parametrize("y", [[np.nan, np.nan], [pd.NA, pd.NA]])
def test_target_imputer_fit_transform_all_nan_empty(y):
y = pd.Series(y)

imputer = TargetImputer()
imputer.fit(None, y)
with pytest.raises(RuntimeError, match="Transformed data is empty"):
imputer.transform(None, y)

with pytest.raises(TypeError, match="Provided target full of nulls."):
imputer.fit(None, y)

imputer = TargetImputer()
with pytest.raises(RuntimeError, match="Transformed data is empty"):
with pytest.raises(TypeError, match="Provided target full of nulls."):
imputer.fit_transform(None, y)


Expand Down
18 changes: 16 additions & 2 deletions evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,12 +54,19 @@ def test_invalid_target_data_check_nan_error():
assert invalid_targets_check.validate(X, y=pd.Series([np.nan, np.nan, np.nan])) == {
"warnings": [],
"errors": [
DataCheckError(
chukarsten marked this conversation as resolved.
Show resolved Hide resolved
message="Target is unsupported Unknown type. Valid Woodwork "
"logical types include: integer, double, boolean",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
details={"unsupported_type": "unknown"},
).to_dict(),
DataCheckError(
message="Target is either empty or fully null.",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
details={},
).to_dict()
).to_dict(),
],
"actions": [],
}
Expand Down Expand Up @@ -787,12 +794,19 @@ def test_invalid_target_data_action_for_all_null(problem_type):
expected = {
"warnings": [],
"errors": [
DataCheckError(
message="Target is unsupported Unknown type. Valid Woodwork "
"logical types include: integer, double, boolean",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
details={"unsupported_type": "unknown"},
).to_dict(),
DataCheckError(
message="Target is either empty or fully null.",
data_check_name=invalid_targets_data_check_name,
message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
details={},
).to_dict()
).to_dict(),
],
"actions": [],
}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ psutil==5.6.6
requirements-parser==0.2.0
shap==0.36.0
texttable==1.6.2
woodwork==0.5.0
woodwork==0.5.1
dask==2.12.0
featuretools==0.26.1
nlp-primitives==1.1.0
Expand Down
14 changes: 5 additions & 9 deletions evalml/tests/pipeline_tests/test_pipeline_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,8 +64,8 @@ def _get_test_data_from_configuration(
"[email protected]",
"[email protected]",
"[email protected]",
"$titanic_data%&@hotmail.com",
"foo*EMAIL@email.org",
"[email protected]",
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I submitted this issue to Woodwork to cover these email addresses which slipped through the WW EmailAddress inference. @davesque since I saw you did the Email inference.

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@chukarsten Yeah, never seen email addresses like that before :). I think it's safe to delete them from test data to accommodate the woodwork update.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Agreed. I think alteryx/woodwork#1080 will help cover against users passing impossible email values by manually specifying the email type.

"fooEMAIL@email.org",
"[email protected]",
"[email protected]",
],
Expand Down Expand Up @@ -197,11 +197,7 @@ def test_make_pipeline(
if "text" in column_names and input_type == "ww"
else []
)
email_featurizer = (
[EmailFeaturizer]
if "email" in column_names and input_type == "ww"
else []
)
email_featurizer = [EmailFeaturizer] if "email" in column_names else []
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is the change required for Email inference in WW.

url_featurizer = (
[URLFeaturizer] if "url" in column_names and input_type == "ww" else []
)
Expand All @@ -213,7 +209,7 @@ def test_make_pipeline(
)
drop_col = (
[DropColumns]
if any(ltype in column_names for ltype in ["url", "email", "text"])
if any(ltype in column_names for ltype in ["url", "text"])
and input_type == "pd"
else []
)
Expand All @@ -223,8 +219,8 @@ def test_make_pipeline(
+ url_featurizer
+ drop_null
+ text_featurizer
+ imputer
+ drop_col
+ imputer
+ datetime
+ delayed_features
+ ohe
Expand Down
56 changes: 55 additions & 1 deletion evalml/tests/utils_tests/test_woodwork_utils.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,16 @@
from itertools import product

import numpy as np
import pandas as pd
import pytest
import woodwork as ww
from woodwork.logical_types import Categorical, Datetime, Ordinal
from woodwork.logical_types import (
Categorical,
Datetime,
Double,
Ordinal,
Unknown,
)

from evalml.utils import (
_convert_numeric_dataset_pandas,
Expand Down Expand Up @@ -257,3 +265,49 @@ def test_ordinal_retains_order_min():
)
ltypes = cat_subset.ww.logical_types
assert not hasattr(ltypes["categorical"], "encoding")


@pytest.mark.parametrize(
"null_col,already_inited",
product(
[
[None, None, None],
[np.nan, np.nan, np.nan],
[pd.NA, pd.NA, pd.NA],
["ax23n9ck23l", "1,28&*_%*&&xejc", "xnmvz@@Dcmeods-0"],
],
[True, False],
),
)
def test_infer_feature_types_NA_to_nan(null_col, already_inited):
"""A short test to make sure that columns with all null values
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

typo: columns

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

classic :)

get converted from woodwork Unknown logical type with string
physical type back to the original Double logical type with
float physical type. Other Unknown columns should remain unchanged."""

df = pd.DataFrame(
{
"unknown": null_col,
},
)

# Check that all null columns are inferred as Unknown type.
df.ww.init()
assert isinstance(df.ww.logical_types["unknown"], Unknown)
if all(df["unknown"].isnull()):
assert all([isinstance(x, type(pd.NA)) for x in df["unknown"]])
else:
assert all([isinstance(x, str) for x in df["unknown"]])

# Use infer_feature_types() to init the WW accessor and verify that all
# null columns are now Double logical types backed by np.nan and that other
# columns inferred as Unknown remain untouched.
del df.ww
if already_inited:
df.ww.init()
inferred_df = infer_feature_types(df)
if all(df["unknown"].isnull()):
assert isinstance(inferred_df.ww.logical_types["unknown"], Double)
assert all([isinstance(x, type(np.nan)) for x in inferred_df["unknown"]])
else:
assert all([isinstance(x, str) for x in df["unknown"]])
27 changes: 24 additions & 3 deletions evalml/utils/woodwork_utils.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
import numpy as np
import pandas as pd
import woodwork as ww
from woodwork.logical_types import Datetime, Ordinal
from woodwork.logical_types import Datetime, Ordinal, Unknown

from evalml.utils.gen_utils import is_all_numeric

Expand Down Expand Up @@ -64,6 +64,24 @@ def infer_feature_types(data, feature_types=None):

_raise_value_error_if_nullable_types_detected(data)

def convert_all_nan_unknown_to_double(data):
    """Re-type every all-``pd.NA`` ``Unknown`` column of a DataFrame as ``Double``.

    A column qualifies when every one of its values is an instance of
    ``type(pd.NA)`` (so ``np.nan`` and ``None`` do not count; a column with
    zero rows trivially qualifies) and its Woodwork logical type is
    ``Unknown``.

    Args:
        data: Any object. Only ``pd.DataFrame`` inputs are inspected; anything
            else is passed through untouched. A DataFrame is expected to have
            its Woodwork schema initialized whenever it contains an all-NA
            column, because the logical type is looked up for such columns.

    Returns:
        The same ``data`` object that was passed in. Qualifying columns are
        re-typed through ``data.ww.set_types`` (presumably in place, since the
        return value of ``set_types`` is not used).
    """
    if not isinstance(data, pd.DataFrame):
        return data

    na_type = type(pd.NA)

    def is_column_pd_na(col):
        # Generator (not a list) so all() stops at the first non-NA value
        # instead of materializing one boolean per row.
        return all(isinstance(x, na_type) for x in data[col])

    def is_column_unknown(col):
        return isinstance(data.ww.logical_types[col], Unknown)

    # The value check runs first so the Woodwork accessor is only touched for
    # columns that are actually all-NA.
    all_null_unk_cols = [
        col
        for col in data.columns
        if is_column_pd_na(col) and is_column_unknown(col)
    ]
    if all_null_unk_cols:
        # One schema update for all qualifying columns.
        data.ww.set_types({col: "Double" for col in all_null_unk_cols})
    return data

if data.ww.schema is not None:
if isinstance(data, pd.DataFrame) and not ww.is_schema_valid(
data, data.ww.schema
Expand All @@ -79,14 +97,17 @@ def infer_feature_types(data, feature_types=None):
ww_error = f"{ww_error}. Please initialize ww with df.ww.init() to get rid of this message."
raise ValueError(ww_error)
data.ww.init(schema=data.ww.schema)
return data
return convert_all_nan_unknown_to_double(data)

if isinstance(data, pd.Series):
if all([isinstance(x, type(pd.NA)) for x in data]):
chukarsten marked this conversation as resolved.
Show resolved Hide resolved
data = data.replace(pd.NA, np.nan)
feature_types = "Double"
return ww.init_series(data, logical_type=feature_types)
else:
ww_data = data.copy()
ww_data.ww.init(logical_types=feature_types)
return ww_data
return convert_all_nan_unknown_to_double(ww_data)


def _retain_custom_types_and_initalize_woodwork(
Expand Down