alteryx · freddyaboulton · Mar 22, 2022 · Mar 18, 2022 · Mar 18, 2022 · Mar 21, 2022
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -6,6 +6,7 @@
         * Add support for oversampling in time series classification problems :pr:`3387`
     * Fixes
         * Fixed ``TimeSeriesFeaturizer`` to make it deterministic when creating and choosing columns :pr:`3384`
+        * Fixed bug where Email/URL features with missing values would cause the imputer to error out :pr:`3388`
     * Changes
         * Update maintainers to add Frank :pr:`3382`
         * Allow woodwork version 0.14.0 to be installed :pr:`3381`

diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py
@@ -73,12 +73,20 @@ def transform(self, X, y=None):
 
         es = self._make_entity_set(X_ww)
         features = ft.calculate_feature_matrix(features=self._features, entityset=es)
-        features.set_index(X_ww.index, inplace=True)
 
-        X_ww = X_ww.ww.drop(self._columns)
+        # Convert to object dtype so that pd.NA is converted to np.nan,
+        # until the sklearn imputer can handle pd.NA in release 1.1.
+        # FT returns these as string types; currently there isn't much difference
+        # in terms of performance between object and string.
+        # See https://pandas.pydata.org/docs/user_guide/text.html#text-data-types:
+        # "Currently, the performance of object dtype arrays of strings
+        # and arrays.StringArray are about the same."
+        features = features.astype(object, copy=False)
+        features.index = X_ww.index
         features.ww.init(logical_types={col_: "categorical" for col_ in features})
-        for col in features:
-            X_ww.ww[col] = features[col]
+
+        X_ww = X_ww.ww.drop(self._columns)
+        X_ww = ww.concat_columns([X_ww, features])
 
         return X_ww
 

diff --git a/evalml/tests/integration_tests/test_nullable_types.py b/evalml/tests/integration_tests/test_nullable_types.py
@@ -1,6 +1,9 @@
+import pandas as pd
 import pytest
 
 from evalml.automl import AutoMLSearch
+from evalml.pipelines import RegressionPipeline
+from evalml.pipelines.components import EmailFeaturizer, Imputer, URLFeaturizer
 from evalml.pipelines.components.transformers import ReplaceNullableTypes
 from evalml.problem_types import ProblemTypes, is_time_series
 
@@ -76,3 +79,29 @@ def test_nullable_types_builds_pipelines(
             assert all([ReplaceNullableTypes.name in pl for pl in pipelines])
     else:
         assert not any([ReplaceNullableTypes.name in pl for pl in pipelines])
+
+
+def test_imputer_can_impute_features_generated_from_null_email_url_features():
+    X = pd.DataFrame(
+        {
+            "email": ["[email protected]", "[email protected]", "[email protected]", None],
+            "url": ["evalml.org", "woodwork.gov", None, "compose.edu"],
+            "number": [1, None, 3, 4],
+            "another number": [7, 8, 9, 10],
+            "categorical": ["boo", "bar", "baz", "go"],
+        }
+    )
+    X.ww.init(
+        logical_types={
+            "email": "EmailAddress",
+            "url": "URL",
+            "categorical": "categorical",
+        }
+    )
+    y = pd.Series([1, 2, 1, 3])
+
+    pl = RegressionPipeline([EmailFeaturizer, URLFeaturizer, Imputer])
+
+    pl.fit(X, y)
+    X_t = pl.transform(X, y)
+    assert not X_t.isna().any(axis=None)