alteryx · chukarsten · Aug 12, 2021 · Aug 5, 2021 · Aug 5, 2021 · Aug 6, 2021
diff --git a/core-requirements.txt b/core-requirements.txt
@@ -11,7 +11,7 @@ psutil>=5.6.6
 requirements-parser>=0.2.0
 shap>=0.36.0
 texttable>=1.6.2
-woodwork==0.5.0
+woodwork==0.5.1
 dask>=2.12.0
 featuretools>=0.26.1
 nlp-primitives>=1.1.0

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -8,6 +8,7 @@ Release Notes
         * Updated ``ComponentGraph`` ``_validate_component_dict`` logic to be stricter about input values :pr:`2599`
         * Patched bug in ``xgboost`` estimators where predicting on a feature matrix of only booleans would throw an exception. :pr:`2602`
         * Updated ``ARIMARegressor`` to use relative forecasting to predict values :pr:`2613`
+        * Updated to support Woodwork 0.5.1 :pr:`2610`
     * Fixes
         * Updated ``get_best_sampler_for_data`` to consider all non-numeric datatypes as categorical for SMOTE :pr:`2590`
         * Fixed inconsistent test results from `TargetDistributionDataCheck` :pr:`2608`

diff --git a/evalml/data_checks/invalid_targets_data_check.py b/evalml/data_checks/invalid_targets_data_check.py
@@ -80,7 +80,7 @@ def validate(self, X, y):
 
         y = infer_feature_types(y)
         is_supported_type = y.ww.logical_type.type_string in numeric_and_boolean_ww + [
-            ww.logical_types.Categorical.type_string
+            ww.logical_types.Categorical.type_string,
         ]
         if not is_supported_type:
             results["errors"].append(

diff --git a/evalml/pipelines/components/transformers/imputers/target_imputer.py b/evalml/pipelines/components/transformers/imputers/target_imputer.py
@@ -76,7 +76,10 @@ def fit(self, X, y):
         """
         if y is None:
             return self
-        y = infer_feature_types(y).to_frame()
+        y = infer_feature_types(y)
+        if all(y.isnull()):
+            raise TypeError("Provided target full of nulls.")
+        y = y.to_frame()
 
         # Convert all bool dtypes to category for fitting
         if (y.dtypes == bool).all():
@@ -110,8 +113,6 @@ def transform(self, X, y):
             )
 
         transformed = self._component_obj.transform(y_df)
-        if transformed.shape[1] == 0:
-            raise RuntimeError("Transformed data is empty")
         y_t = pd.Series(transformed[:, 0], index=y_ww.index)
         return X, _retain_custom_types_and_initalize_woodwork(y_ww.ww.logical_type, y_t)
 

diff --git a/evalml/pipelines/utils.py b/evalml/pipelines/utils.py
@@ -79,6 +79,12 @@ def _get_preprocessing_components(
     if len(all_null_cols) > 0:
         pp_components.append(DropNullColumns)
 
+    index_and_unknown_columns = list(
+        X.ww.select(["index", "unknown"], return_schema=True).columns
+    )
+    if len(index_and_unknown_columns) > 0:
+        pp_components.append(DropColumns)
+
     email_columns = list(X.ww.select("EmailAddress", return_schema=True).columns)
     if len(email_columns) > 0:
         pp_components.append(EmailFeaturizer)
@@ -105,11 +111,6 @@ def _get_preprocessing_components(
         text_columns
     ):
         pp_components.append(Imputer)
-    index_and_unknown_columns = list(
-        X.ww.select(["index", "unknown"], return_schema=True).columns
-    )
-    if len(index_and_unknown_columns) > 0:
-        pp_components.append(DropColumns)
 
     datetime_cols = list(X.ww.select(["Datetime"], return_schema=True).columns)
 

diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py
@@ -973,7 +973,7 @@ def fit(self, X, y):
             return self
 
         def predict(self, X):
-            series = pd.Series()
+            series = pd.Series(dtype="string")
             series.ww.init()
             return series
 

diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
@@ -410,3 +410,18 @@ def test_simple_imputer_woodwork_custom_overrides_returned_by_components(
             assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {
                 0: Double
             }
+
+
+def test_component_handles_pre_init_ww():
+    """SimpleImputer should accept a DataFrame whose Woodwork accessor is
+    already initialized (post Woodwork 0.5.1) and still drop the fully
+    null column while imputing the partially null one."""
+    data = {"part_null": [0, 1, 2, None], "all_null": [None, None, None, None]}
+    pre_inited = pd.DataFrame(data)
+    pre_inited.ww.init()
+
+    imputer = SimpleImputer()
+    result = imputer.fit_transform(pre_inited)
+
+    assert "all_null" not in result.columns
+    assert list(result["part_null"]) == [0, 1, 2, 0]
diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py
@@ -128,16 +128,17 @@ def test_target_imputer_boolean_dtype(data_type, make_data_type):
     assert_series_equal(y_expected, y_t)
 
 
-def test_target_imputer_fit_transform_all_nan_empty():
-    y = pd.Series([np.nan, np.nan])
+@pytest.mark.parametrize("y", [[np.nan, np.nan], [pd.NA, pd.NA]])
+def test_target_imputer_fit_transform_all_nan_empty(y):
+    y = pd.Series(y)
 
     imputer = TargetImputer()
-    imputer.fit(None, y)
-    with pytest.raises(RuntimeError, match="Transformed data is empty"):
-        imputer.transform(None, y)
+
+    with pytest.raises(TypeError, match="Provided target full of nulls."):
+        imputer.fit(None, y)
 
     imputer = TargetImputer()
-    with pytest.raises(RuntimeError, match="Transformed data is empty"):
+    with pytest.raises(TypeError, match="Provided target full of nulls."):
         imputer.fit_transform(None, y)
 
 

diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
@@ -54,12 +54,19 @@ def test_invalid_target_data_check_nan_error():
     assert invalid_targets_check.validate(X, y=pd.Series([np.nan, np.nan, np.nan])) == {
         "warnings": [],
         "errors": [
+            DataCheckError(
+                message="Target is unsupported Unknown type. Valid Woodwork "
+                "logical types include: integer, double, boolean",
+                data_check_name=invalid_targets_data_check_name,
+                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
+                details={"unsupported_type": "unknown"},
+            ).to_dict(),
             DataCheckError(
                 message="Target is either empty or fully null.",
                 data_check_name=invalid_targets_data_check_name,
                 message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                 details={},
-            ).to_dict()
+            ).to_dict(),
         ],
         "actions": [],
     }
@@ -787,12 +794,19 @@ def test_invalid_target_data_action_for_all_null(problem_type):
     expected = {
         "warnings": [],
         "errors": [
+            DataCheckError(
+                message="Target is unsupported Unknown type. Valid Woodwork "
+                "logical types include: integer, double, boolean",
+                data_check_name=invalid_targets_data_check_name,
+                message_code=DataCheckMessageCode.TARGET_UNSUPPORTED_TYPE,
+                details={"unsupported_type": "unknown"},
+            ).to_dict(),
             DataCheckError(
                 message="Target is either empty or fully null.",
                 data_check_name=invalid_targets_data_check_name,
                 message_code=DataCheckMessageCode.TARGET_IS_EMPTY_OR_FULLY_NULL,
                 details={},
-            ).to_dict()
+            ).to_dict(),
         ],
         "actions": [],
     }

diff --git a/evalml/tests/dependency_update_check/minimum_core_requirements.txt b/evalml/tests/dependency_update_check/minimum_core_requirements.txt
@@ -11,7 +11,7 @@ psutil==5.6.6
 requirements-parser==0.2.0
 shap==0.36.0
 texttable==1.6.2
-woodwork==0.5.0
+woodwork==0.5.1
 dask==2.12.0
 featuretools==0.26.1
 nlp-primitives==1.1.0

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -64,8 +64,8 @@ def _get_test_data_from_configuration(
                     "[email protected]",
                     "[email protected]",
                     "[email protected]",
-                    "$titanic_data%&@hotmail.com",
-                    "foo*EMAIL@email.org",
+                    "[email protected]",
+                    "fooEMAIL@email.org",
                     "[email protected]",
                     "[email protected]",
                 ],
@@ -197,11 +197,7 @@ def test_make_pipeline(
                 if "text" in column_names and input_type == "ww"
                 else []
             )
-            email_featurizer = (
-                [EmailFeaturizer]
-                if "email" in column_names and input_type == "ww"
-                else []
-            )
+            email_featurizer = [EmailFeaturizer] if "email" in column_names else []
             url_featurizer = (
                 [URLFeaturizer] if "url" in column_names and input_type == "ww" else []
             )
@@ -213,7 +209,7 @@ def test_make_pipeline(
             )
             drop_col = (
                 [DropColumns]
-                if any(ltype in column_names for ltype in ["url", "email", "text"])
+                if any(ltype in column_names for ltype in ["url", "text"])
                 and input_type == "pd"
                 else []
             )
@@ -223,8 +219,8 @@ def test_make_pipeline(
                 + url_featurizer
                 + drop_null
                 + text_featurizer
-                + imputer
                 + drop_col
+                + imputer
                 + datetime
                 + delayed_features
                 + ohe

diff --git a/evalml/tests/utils_tests/test_woodwork_utils.py b/evalml/tests/utils_tests/test_woodwork_utils.py
@@ -1,8 +1,16 @@
+from itertools import product
+
 import numpy as np
 import pandas as pd
 import pytest
 import woodwork as ww
-from woodwork.logical_types import Categorical, Datetime, Ordinal
+from woodwork.logical_types import (
+    Categorical,
+    Datetime,
+    Double,
+    Ordinal,
+    Unknown,
+)
 
 from evalml.utils import (
     _convert_numeric_dataset_pandas,
@@ -257,3 +265,49 @@ def test_ordinal_retains_order_min():
     )
     ltypes = cat_subset.ww.logical_types
     assert not hasattr(ltypes["categorical"], "encoding")
+
+
+@pytest.mark.parametrize(
+    "null_col,already_inited",
+    product(
+        [
+            [None, None, None],
+            [np.nan, np.nan, np.nan],
+            [pd.NA, pd.NA, pd.NA],
+            ["ax23n9ck23l", "1,28&*_%*&&xejc", "xnmvz@@Dcmeods-0"],
+        ],
+        [True, False],
+    ),
+)
+def test_infer_feature_types_NA_to_nan(null_col, already_inited):
+    """A short test to make sure that columns with all null values
+    get converted from woodwork Unknown logical type with string
+    physical type back to the original Double logical type with
+    float physical type.  Other Unknown columns should remain unchanged."""
+
+    df = pd.DataFrame({"unknown": null_col})
+
+    # Check that all null columns are inferred as Unknown type.
+    df.ww.init()
+    assert isinstance(df.ww.logical_types["unknown"], Unknown)
+    if all(df["unknown"].isnull()):
+        assert all([isinstance(x, type(pd.NA)) for x in df["unknown"]])
+    else:
+        assert all([isinstance(x, str) for x in df["unknown"]])
+
+    # Use infer_feature_types() to init the WW accessor and verify that all
+    # null columns are now Double logical types backed by np.nan and that other
+    # columns inferred as Unknown remain untouched.
+    del df.ww
+    if already_inited:
+        df.ww.init()
+    inferred_df = infer_feature_types(df)
+    if all(df["unknown"].isnull()):
+        assert isinstance(inferred_df.ww.logical_types["unknown"], Double)
+        assert all([isinstance(x, type(np.nan)) for x in inferred_df["unknown"]])
+    else:
+        # Assert on the returned frame rather than the input, so the test
+        # really verifies that infer_feature_types left a non-null Unknown
+        # column untouched.
+        assert isinstance(inferred_df.ww.logical_types["unknown"], Unknown)
+        assert all([isinstance(x, str) for x in inferred_df["unknown"]])
diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
@@ -1,7 +1,7 @@
 import numpy as np
 import pandas as pd
 import woodwork as ww
-from woodwork.logical_types import Datetime, Ordinal
+from woodwork.logical_types import Datetime, Ordinal, Unknown
 
 from evalml.utils.gen_utils import is_all_numeric
 
@@ -64,6 +64,24 @@ def infer_feature_types(data, feature_types=None):
 
     _raise_value_error_if_nullable_types_detected(data)
 
+    def convert_all_nan_unknown_to_double(data):
+        """Set DataFrame columns that are Unknown and all pd.NA to Double."""
+        if not isinstance(data, pd.DataFrame):
+            return data  # Non-DataFrame input is returned unchanged.
+        logical_types = data.ww.logical_types
+
+        def is_all_null_unknown(col):
+            # Check the logical type first: it is a cheap lookup, while the
+            # pd.NA test walks every value of the column in Python.
+            if not isinstance(logical_types[col], Unknown):
+                return False
+            return all(isinstance(x, type(pd.NA)) for x in data[col])
+
+        all_null_unk_cols = [col for col in data.columns if is_all_null_unknown(col)]
+        if all_null_unk_cols:
+            data.ww.set_types({col: "Double" for col in all_null_unk_cols})
+        return data
+
     if data.ww.schema is not None:
         if isinstance(data, pd.DataFrame) and not ww.is_schema_valid(
             data, data.ww.schema
@@ -79,14 +97,17 @@ def infer_feature_types(data, feature_types=None):
                 ww_error = f"{ww_error}. Please initialize ww with df.ww.init() to get rid of this message."
             raise ValueError(ww_error)
         data.ww.init(schema=data.ww.schema)
-        return data
+        return convert_all_nan_unknown_to_double(data)
 
     if isinstance(data, pd.Series):
+        if all([isinstance(x, type(pd.NA)) for x in data]):
+            data = data.replace(pd.NA, np.nan)
+            feature_types = "Double"
         return ww.init_series(data, logical_type=feature_types)
     else:
         ww_data = data.copy()
         ww_data.ww.init(logical_types=feature_types)
-        return ww_data
+        return convert_all_nan_unknown_to_double(ww_data)
 
 
 def _retain_custom_types_and_initalize_woodwork(