From 03e129d633415595caabed7b5b688fda67166756 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Tue, 24 Aug 2021 17:51:17 +0100
Subject: [PATCH 01/36] release notes

---
 docs/source/release_notes.rst | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index 26e820eb63..d56dede835 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,6 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
         * Removed SVM "linear" and "precomputed" kernel hyperparameter options, and improved default parameters :pr:`2651`
+        * Updated to support Woodwork 0.6.0 :pr:``
     * Fixes
     * Changes
         * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`

From 0dac5a043778135d0ba313c7269774742637801a Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Tue, 24 Aug 2021 17:57:33 +0100
Subject: [PATCH 02/36] Upgrade woodwork versions

---
 core-requirements.txt                                           | 2 +-
 docs/source/release_notes.rst                                   | 2 +-
 .../dependency_update_check/latest_dependency_versions.txt      | 2 +-
 .../tests/dependency_update_check/minimum_core_requirements.txt | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/core-requirements.txt b/core-requirements.txt
index e0942fe43d..7155fc8973 100644
--- a/core-requirements.txt
+++ b/core-requirements.txt
@@ -12,7 +12,7 @@ psutil>=5.6.6
 requirements-parser>=0.2.0
 shap>=0.36.0
 texttable>=1.6.2
-woodwork==0.5.1
+woodwork==0.6.0
 dask>=2.12.0
 featuretools>=0.26.1
 nlp-primitives>=1.1.0

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index d56dede835..58773bee92 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -3,7 +3,7 @@ Release Notes
 **Future Releases**
     * Enhancements
         * Removed SVM "linear" and "precomputed" kernel hyperparameter options, and improved default parameters :pr:`2651`
-        * Updated to support Woodwork 0.6.0 :pr:``
+        * Updated to support Woodwork 0.6.0 :pr:`2690`
     * Fixes
     * Changes
         * Removed ``LinearRegressor`` from the list of default ``AutoMLSearch`` estimators due to poor performance :pr:`2660`

diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
index ee85b1e26e..9fe6b759ae 100644
--- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt
+++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt
@@ -29,5 +29,5 @@ shap==0.39.0
 sktime==0.7.0
 statsmodels==0.12.2
 texttable==1.6.4
-woodwork==0.5.1
+woodwork==0.6.0
 xgboost==1.4.2

diff --git a/evalml/tests/dependency_update_check/minimum_core_requirements.txt b/evalml/tests/dependency_update_check/minimum_core_requirements.txt
index 528d357668..3fecab4430 100644
--- a/evalml/tests/dependency_update_check/minimum_core_requirements.txt
+++ b/evalml/tests/dependency_update_check/minimum_core_requirements.txt
@@ -12,7 +12,7 @@ psutil==5.6.6
 requirements-parser==0.2.0
 shap==0.36.0
 texttable==1.6.2
-woodwork==0.5.1
+woodwork==0.6.0
 dask==2.12.0
 featuretools==0.26.1
 nlp-primitives==1.1.0
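Note: this version bump drives every test change that follows. Woodwork 0.6.0 is stricter about string columns than 0.5.1: short object columns that used to infer as Categorical can now come through as Unknown, so the later patches either repeat their test data or pin logical types explicitly. A minimal sketch of the explicit-override pattern, assuming Woodwork 0.6.0 semantics (the column name is illustrative):

    import pandas as pd
    import woodwork as ww  # registers the .ww accessor

    X = pd.DataFrame({"currency": ["USD", "EUR", "USD", "GBP"]})
    # Pin the logical type instead of relying on inference.
    X.ww.init(logical_types={"currency": "categorical"})
    print(X.ww.logical_types)  # currency -> Categorical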
From cdcea2c2e30af024b9d46e95806460c6f2c95ec4 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Wed, 25 Aug 2021 13:17:04 +0100
Subject: [PATCH 03/36] extend length of columns to identify as categorical

---
 .../test_regression.py                    |  6 ++---
 .../pipeline_tests/test_pipeline_utils.py | 22 +++++++++----------
 2 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
index d11f1ad595..cd83f08b7d 100644
--- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
+++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py
@@ -1,3 +1,4 @@
+import numpy as np
 import pandas as pd
 import pytest
 
@@ -85,11 +86,10 @@ def test_woodwork_regression_pipeline(diabetes_local, linear_regression_pipeline
 
 def test_custom_indices():
     X = pd.DataFrame(
-        {"a": ["a", "b", "a", "a", "a", "c", "c", "c"], "b": [0, 1, 1, 1, 1, 1, 0, 1]}
+        {"a": ["a", "b", "a", "a", "a", "c", "c", "c"]*3, "b": [0, 1, 1, 1, 1, 1, 0, 1]*3}
     )
-    y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0], index=[7, 2, 1, 4, 5, 3, 6, 8])
+    y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0]*3, index=np.random.choice(24, 24, replace=False))
     x1, x2, y1, y2 = split_data(X, y, problem_type="regression")
-
     pipeline = RegressionPipeline(
         component_graph=["Imputer", "One Hot Encoder", "Linear Regressor"],
         parameters={},

diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py
index 84149a115e..3198dc9c2e 100644
--- a/evalml/tests/pipeline_tests/test_pipeline_utils.py
+++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py
@@ -47,10 +47,10 @@ def _get_test_data_from_configuration(
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan],
-            "numerical": range(7),
-            "categorical": ["a", "b", "a", "c", "c", "a", "b"],
-            "dates": pd.date_range("2000-02-03", periods=7, freq="W"),
+            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]*2,
+            "numerical": range(14),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b"]*2,
+            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
                 "why woodwork",
                 "text for a column, this should be a text column!!",
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
-            ],
+            ]*2,
             "email": [
                 "abalone_0@gmail.com",
                 "AbaloneRings@yahoo.com",
                 "abalone_2@abalone.com",
                 "$titanic_data$@hotmail.com",
                 "fooEMAIL@email.org",
                 "evalml@evalml.org",
                 "evalml@alteryx.org",
-            ],
+            ]*2,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
                 "https://twitter.com/AlteryxOSS",
                 "https://www.twitter.com/AlteryxOSS",
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
-            ],
+            ]*2,
         }
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1])
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1]*2)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1])
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1]*2)
     elif is_regression(problem_type):
         if lognormal_distribution:
-            y = pd.Series([1, 1, 1, 2, 3, 6, 9])
+            y = pd.Series([1, 1, 1, 2, 3, 6, 9]*2)
         else:
-            y = pd.Series([1, 2, 3, 3, 3, 4, 5])
+            y = pd.Series([1, 2, 3, 3, 3, 4, 5]*2)
 
     X = X_all[column_names]
     if input_type == "ww":
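Note: the `*2`/`*3` repetition above works because Woodwork's categorical inference is ratio-based: a string column only registers as Categorical when its unique values are few relative to its length (an assumption about 0.6.0's default thresholds; the exact cutoff may differ). A sketch of that behavior:

    import pandas as pd
    import woodwork as ww

    short = ww.init_series(pd.Series(["a", "b", "c"]))
    long = ww.init_series(pd.Series(["a", "b", "c"] * 5))
    # Few repeats: likely inferred as Unknown under 0.6.0;
    # many repeats: inferred as Categorical.
    print(short.ww.logical_type, long.ww.logical_type)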
From ab4d80e24cb2253f4697e944b705f0cf7cf081e5 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Wed, 25 Aug 2021 16:41:03 +0100
Subject: [PATCH 04/36] data checks updated

---
 evalml/tests/data_checks_tests/test_data_checks.py       |  2 +-
 .../data_checks_tests/test_id_columns_data_check.py      |  1 +
 .../test_invalid_targets_data_check.py                   |  1 +
 .../test_multicollinearity_data_check.py                 |  3 ++-
 .../test_target_leakage_data_check.py                    | 10 +---------
 evalml/utils/woodwork_utils.py                           |  2 +-
 6 files changed, 7 insertions(+), 12 deletions(-)

diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py
index 1c495ee5cb..529fc66d68 100644
--- a/evalml/tests/data_checks_tests/test_data_checks.py
+++ b/evalml/tests/data_checks_tests/test_data_checks.py
@@ -340,7 +340,7 @@ def test_default_data_checks_regression(input_type):
     X["nan_dt_col"][0] = None
     y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2])
     y_no_variance = pd.Series([5] * 5)
-    X.ww.init(logical_types={"natural_language_nan": "NaturalLanguage"})
+    X.ww.init(logical_types={"lots_of_null": "categorical", "natural_language_nan": "NaturalLanguage"})
     if input_type == "ww":
         y = ww.init_series(y)
         y_no_variance = ww.init_series(y_no_variance)

diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py
index eb550fb85f..f15a3c6f45 100644
--- a/evalml/tests/data_checks_tests/test_id_columns_data_check.py
+++ b/evalml/tests/data_checks_tests/test_id_columns_data_check.py
@@ -135,6 +135,7 @@ def test_id_columns_strings():
         "col_6": [0.1, 0.2, 0.3, 0.4],
     }
     X = pd.DataFrame.from_dict(X_dict)
+    X.ww.init(logical_types={"col_1_id": "categorical", "col_2": "categorical", "Id": "categorical", "col_5": "categorical"})
     id_cols_check = IDColumnsDataCheck(id_threshold=0.95)
     assert id_cols_check.validate(X) == {
         "warnings": [

diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
index ce5ebfb20c..b5d8453952 100644
--- a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
+++ b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py
@@ -231,6 +231,7 @@ def test_invalid_target_data_input_formats():
     # test Woodwork
     y = pd.Series([None, None, None, 0])
     X = pd.DataFrame({"col": range(len(y))})
+
     messages = invalid_targets_check.validate(X, y)
     assert messages == expected

diff --git a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
index b086ae74a2..23961985e4 100644
--- a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
+++ b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py
@@ -86,7 +86,8 @@ def test_multicollinearity_nonnumeric_cols(data_type, make_data_type):
             "col_6": [1, 1, 2, 3, 1],
         }
     )
-    X = make_data_type(data_type, X)
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical", "col_4": "categorical", "col_5": "categorical"})
+
     multi_check = MulticollinearityDataCheck(threshold=0.9)
     assert multi_check.validate(X) == {
         "warnings": [

diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
index 691e171dde..828a7e186d 100644
--- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
+++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py
@@ -247,6 +247,7 @@ def test_target_leakage_types():
     X["d"] = ~y
     X["e"] = [0, 0, 0, 0]
     y = y.astype(bool)
+    X.ww.init(logical_types={"a": "categorical"})
 
     expected = {
         "warnings": [
@@ -398,12 +399,6 @@ def test_target_leakage_regression():
             message_code=DataCheckMessageCode.TARGET_LEAKAGE,
             details={"column": "c"},
         ).to_dict(),
-        DataCheckWarning(
-            message="Column 'e' is 80.0% or more correlated with the target",
-            data_check_name=target_leakage_data_check_name,
-            message_code=DataCheckMessageCode.TARGET_LEAKAGE,
-            details={"column": "e"},
-        ).to_dict(),
     ],
     "errors": [],
     "actions": [
@@ -416,9 +411,6 @@ def test_target_leakage_regression():
         DataCheckAction(
             DataCheckActionCode.DROP_COL, metadata={"column": "c"}
         ).to_dict(),
-        DataCheckAction(
-            DataCheckActionCode.DROP_COL, metadata={"column": "e"}
-        ).to_dict(),
     ],
 }

diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 1adb2269cc..2b93493487 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -100,8 +100,8 @@ def is_column_unknown(data, col):
         return convert_all_nan_unknown_to_double(data)
 
     if isinstance(data, pd.Series):
+        data = data.replace(pd.NA, np.nan)
         if all(data.isna()):
-            data = data.replace(pd.NA, np.nan)
             feature_types = "Double"
         return ww.init_series(data, logical_type=feature_types)
     else:
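Note: the woodwork_utils.py hunk above moves the pd.NA -> np.nan swap ahead of the all-NaN check so it runs for every series, not only all-null ones (patch 07 below reverts it). In isolation, the guarded conversion looks roughly like this (a sketch, not the exact helper):

    import numpy as np
    import pandas as pd
    import woodwork as ww

    data = pd.Series([pd.NA, pd.NA], dtype="object")
    # Swap pandas' NA sentinel for np.nan before initializing as Double,
    # mirroring the reordered line in woodwork_utils.py.
    data = data.replace(pd.NA, np.nan)
    print(ww.init_series(data, logical_type="Double"))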
"c", np.nan], + "int with nan": [np.nan, 1, 0, 0, 1]*4, + "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0]*4, + "object with nan": ["b", "b", np.nan, "c", np.nan]*4, "bool col with nan": pd.Series( - [True, np.nan, False, np.nan, True], dtype="category" + [True, np.nan, False, np.nan, True]*4, dtype="category" ), - "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan], + "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]*4, "all nan cat": pd.Series( - [np.nan, np.nan, np.nan, np.nan, np.nan], dtype="category" + [np.nan, np.nan, np.nan, np.nan, np.nan]*4, dtype="category" ), } ) @@ -91,16 +91,16 @@ def test_numeric_only_input(imputer_test_data): X = imputer_test_data[ ["int col", "float col", "int with nan", "float with nan", "all nan"] ] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer(numeric_impute_strategy="median") imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int col": [0, 1, 2, 0, 3], - "float col": [0.0, 1.0, 0.0, -2.0, 5.0], - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.0], + "int col": [0, 1, 2, 0, 3]*4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -122,29 +122,29 @@ def test_categorical_only_input(imputer_test_data): "all nan cat", ] ] - y = pd.Series([0, 0, 1, 0, 1]) - imputer = Imputer() - imputer.fit(X, y) - transformed = imputer.transform(X, y) + y = pd.Series([0, 0, 1, 0, 1]*4) + expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "bool col": [True, False, False, True, True], + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "bool col": [True, False, False, True, True]*4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"], dtype="category" + ["0", "1", "0", "0", "3"]*4, dtype="category" ), - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True], dtype="category" + [True, True, False, True, True]*4, dtype="category" ), } ) imputer = Imputer() + imputer.fit(X, y) transformed = imputer.fit_transform(X, y) + assert_frame_equal(transformed, expected, check_dtype=False) @@ -157,20 +157,20 @@ def test_categorical_and_numeric_input(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "int col": [0, 1, 2, 0, 3], - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "float col": [0.0, 1.0, 0.0, -2.0, 5.0], - "bool col": [True, False, False, True, True], + "int col": [0, 1, 2, 0, 3]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, + "bool col": [True, False, False, True, True]*4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"], dtype="category" + ["0", "1", "0", "0", "3"]*4, dtype="category" ), - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0], - "float with nan": [0.0, 1.0, 0, -1.0, 0.0], - "object with nan": pd.Series(["b", "b", "b", "c", "b"], dtype="category"), + 
"int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, + "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True], dtype="category" + [True, True, False, True, True]*4, dtype="category" ), } ) @@ -183,7 +183,7 @@ def test_categorical_and_numeric_input(imputer_test_data): def test_drop_all_columns(imputer_test_data): X = imputer_test_data[["all nan cat", "all nan"]] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) X.ww.init() imputer = Imputer() imputer.fit(X, y) @@ -288,7 +288,7 @@ def test_imputer_fill_value(imputer_test_data): "bool col with nan", ] ] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -299,16 +299,16 @@ def test_imputer_fill_value(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with nan": [-1, 1, 0, 0, 1], + "int with nan": [-1, 1, 0, 0, 1]*4, "categorical with nan": pd.Series( - ["fill", "1", "fill", "0", "3"], dtype="category" + ["fill", "1", "0", "0", "3"]*4, dtype="category" ), - "float with nan": [0.0, 1.0, -1, -1.0, 0.0], + "float with nan": [0.0, 1.0, -1, -1.0, 0.0]*4, "object with nan": pd.Series( - ["b", "b", "fill", "c", "fill"], dtype="category" + ["b", "b", "fill", "c", "fill"]*4, dtype="category" ), "bool col with nan": pd.Series( - [True, "fill", False, "fill", True], dtype="category" + [True, "fill", False, "fill", True]*4, dtype="category" ), } ) @@ -326,7 +326,7 @@ def test_imputer_fill_value(imputer_test_data): def test_imputer_no_nans(imputer_test_data): X = imputer_test_data[["categorical col", "object col", "bool col"]] - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -338,10 +338,10 @@ def test_imputer_no_nans(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "three"], dtype="category" + ["zero", "one", "two", "zero", "two"]*4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"], dtype="category"), - "bool col": [True, False, False, True, True], + "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), + "bool col": [True, False, False, True, True]*4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -359,25 +359,25 @@ def test_imputer_no_nans(imputer_test_data): def test_imputer_with_none(): X = pd.DataFrame( { - "int with None": [1, 0, 5, None], - "float with None": [0.1, 0.0, 0.5, None], - "category with None": pd.Series(["b", "a", "a", None], dtype="category"), - "boolean with None": pd.Series([True, None, False, True]), - "object with None": ["b", "a", "a", None], - "all None": [None, None, None, None], + "int with None": [1, 0, 5, None]*4, + "float with None": [0.1, 0.0, 0.5, None]*4, + "category with None": pd.Series(["b", "a", "a", None]*4, dtype="category"), + "boolean with None": pd.Series([True, None, False, True]*4), + "object with None": ["b", "a", "a", None]*4, + "all None": [None, None, None, None]*4, } ) - y = pd.Series([0, 0, 1, 0, 1]) + y = pd.Series([0, 0, 1, 0, 1]*4) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with None": [1, 0, 5, 2], - "float with None": [0.1, 0.0, 0.5, 0.2], - "category with None": pd.Series(["b", "a", "a", "a"], 
dtype="category"), - "boolean with None": pd.Series([True, True, False, True], dtype="category"), - "object with None": pd.Series(["b", "a", "a", "a"], dtype="category"), + "int with None": [1, 0, 5, 2]*4, + "float with None": [0.1, 0.0, 0.5, 0.2]*4, + "category with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + "boolean with None": pd.Series([True, True, False, True]*4, dtype="category"), + "object with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), } ) assert_frame_equal(expected, transformed, check_dtype=False) @@ -403,9 +403,9 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): @pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_bool_dtype_object(data_type, make_data_type): - X = pd.DataFrame([True, np.nan, False, np.nan, True]) - y = pd.Series([1, 0, 0, 1, 0]) - X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category") + X = pd.DataFrame([True, np.nan, False, np.nan, True]*4) + y = pd.Series([1, 0, 0, 1, 0]*4) + X_expected_arr = pd.DataFrame([True, True, False, True, True]*4, dtype="category") X = make_data_type(data_type, X) y = make_data_type(data_type, y) imputer = Imputer() @@ -418,17 +418,17 @@ def test_imputer_bool_dtype_object(data_type, make_data_type): def test_imputer_multitype_with_one_bool(data_type, make_data_type): X_multi = pd.DataFrame( { - "bool with nan": pd.Series([True, np.nan, False, np.nan, False]), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + "bool with nan": pd.Series([True, np.nan, False, np.nan, False]*4), + "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) - y = pd.Series([1, 0, 0, 1, 0]) + y = pd.Series([1, 0, 0, 1, 0]*4) X_multi_expected_arr = pd.DataFrame( { "bool with nan": pd.Series( - [True, False, False, False, False], dtype="category" + [True, False, False, False, False]*4, dtype="category" ), - "bool no nan": pd.Series([False, False, False, False, True], dtype=bool), + "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) @@ -468,23 +468,23 @@ def test_imputer_int_preserved(): def test_imputer_bool_preserved(): - X = pd.DataFrame(pd.Series([True, False, True, np.nan])) + X = pd.DataFrame(pd.Series([True, False, True, np.nan]*4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, True], dtype="category")), + pd.DataFrame(pd.Series([True, False, True, True]*4, dtype="category")), ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { 0: Categorical } - X = pd.DataFrame(pd.Series([True, False, True, False])) + X = pd.DataFrame(pd.Series([True, False, True, False]*4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, False])), + pd.DataFrame(pd.Series([True, False, True, False]*4)), check_dtype=False, ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Boolean} From 7f1f92a80ce614a29f5b14f42a4a2985921e05e8 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Wed, 25 Aug 2021 19:14:31 +0100 Subject: [PATCH 06/36] lgbm updates --- evalml/tests/component_tests/test_lgbm_classifier.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py 
From 7f1f92a80ce614a29f5b14f42a4a2985921e05e8 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 11:30:57 +0100
Subject: [PATCH 06/36] lgbm updates

---
 evalml/tests/component_tests/test_lgbm_classifier.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index 1766024f0a..bd6dd0ba41 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -173,11 +173,11 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary):
 @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict")
 def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X = pd.DataFrame(
-        {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}
+        {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2}
     )
-    y = pd.Series([1, 1, 0, 0, 0, 1])
+    y = pd.Series([1, 1, 0, 0, 0, 1]*2)
     X_expected = pd.DataFrame(
-        {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}
+        {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2}
     )
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")

From eba1d4b9be7cbf65d171859c06cae1036eed891f Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 11:30:57 +0100
Subject: [PATCH 07/36] no message

---
 evalml/utils/woodwork_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/evalml/utils/woodwork_utils.py b/evalml/utils/woodwork_utils.py
index 2b93493487..1adb2269cc 100644
--- a/evalml/utils/woodwork_utils.py
+++ b/evalml/utils/woodwork_utils.py
@@ -100,8 +100,8 @@ def is_column_unknown(data, col):
         return convert_all_nan_unknown_to_double(data)
 
     if isinstance(data, pd.Series):
-        data = data.replace(pd.NA, np.nan)
         if all(data.isna()):
+            data = data.replace(pd.NA, np.nan)
             feature_types = "Double"
         return ww.init_series(data, logical_type=feature_types)
     else:
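Note: the X_expected frames in the LightGBM tests assert that each categorical feature is encoded as ordinal codes (0.0, 1.0, 2.0) carried in a category dtype. The mapping itself is plain pandas:

    import pandas as pd

    feature = pd.Series(["a", "a", "b", "b", "c", "c"] * 2)
    # Letters map to the ordinal codes asserted in X_expected.
    codes = feature.astype("category").cat.codes.astype("float64")
    print(codes.unique())  # [0. 1. 2.]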
From 22c79c16627a5576ea353a9a6a809c7a98cf265d Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 15:00:05 +0100
Subject: [PATCH 08/36] model understanding updates

---
 .../test_explainers.py                            |  7 ++++++-
 .../test_force_plots.py                           |  2 ++
 .../test_partial_dependence.py                    | 10 +++++-----
 .../test_permutation_importance.py                |  1 +
 evalml/tests/pipeline_tests/test_component_graph.py | 15 ++++++++++-----
 5 files changed, 24 insertions(+), 11 deletions(-)

diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
index ccd7b3ab3e..2405e2cabd 100644
--- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
+++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py
@@ -1263,6 +1263,7 @@ def transform_y_for_problem_type(problem_type, y):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -1304,7 +1305,7 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
 
-    X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
+    X.ww.set_types(logical_types={"provider": "NaturalLanguage", "currency": "categorical", "expiration_date": "categorical"})
     component_graph = [
         "Select Columns Transformer",
         "One Hot Encoder",
@@ -1361,6 +1362,7 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -1415,6 +1417,7 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
 @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases)
 def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = {
         "SelectNumeric": ["Select Columns Transformer", "X", "y"],
@@ -1474,6 +1477,7 @@ def test_categories_aggregated_but_not_those_that_are_dropped(
     pipeline_class, estimator, fraud_100
 ):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = [
         "Select Columns Transformer",
@@ -1517,6 +1521,7 @@ def test_categories_aggregated_when_some_are_dropped(
     pipeline_class, estimator, fraud_100
 ):
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
 
     component_graph = [
         "Select Columns Transformer",

diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
index 343a1ce97a..b08f8ccc26 100644
--- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
+++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py
@@ -217,6 +217,7 @@ def test_force_plot_regression(
 def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
     columns_to_select = ["datetime", "amount", "provider", "currency"]
+    X.ww.init(logical_types={"currency": "categorical"})
 
     pipeline = pipeline_class(
         component_graph=[
@@ -250,6 +251,7 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100):
 def test_categories_aggregated_text(pipeline_class, estimator, fraud_100):
     X, y = fraud_100
     columns_to_select = ["datetime", "amount", "provider", "currency"]
+    X.ww.init(logical_types={"currency": "categorical"})
 
     X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
     component_graph = [

diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index d057b496e1..6751b29a30 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -135,15 +135,15 @@ def test_partial_dependence_with_non_numeric_columns(
 ):
     X = pd.DataFrame(
         {
-            "numeric": [1, 2, 3, 0],
-            "also numeric": [2, 3, 4, 1],
-            "string": ["a", "b", "a", "c"],
-            "also string": ["c", "b", "a", "d"],
+            "numeric": [1, 2, 3, 0]*4,
+            "also numeric": [2, 3, 4, 1]*4,
+            "string": ["a", "b", "a", "c"]*4,
+            "also string": ["c", "b", "a", "c"]*4,
         }
     )
     if data_type == "ww":
         X.ww.init()
-    y = [0, 0.2, 1.4, 1]
+    y = [0, 0.2, 1.4, 1]*4
     pipeline = linear_regression_pipeline_class(
         parameters={"Linear Regressor": {"n_jobs": 1}}
     )

diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py
index 1833cf1b7a..dfbe0f1c35 100644
--- a/evalml/tests/model_understanding_tests/test_permutation_importance.py
+++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py
@@ -310,6 +310,7 @@ def test_fast_permutation_importance_matches_slow_output(
             "dependency not installed."
         )
     X, y = fraud_100
+    X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"})
     if pipeline_class == LinearPipelineWithTextFeatures:
         X.ww.set_types(logical_types={"provider": "NaturalLanguage"})
 

diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py
index 5f99549e2a..4246d0a028 100644
--- a/evalml/tests/pipeline_tests/test_component_graph.py
+++ b/evalml/tests/pipeline_tests/test_component_graph.py
@@ -753,6 +753,8 @@ def test_computation_input_custom_index(index, example_graph):
         index=index,
     )
     y = pd.Series([1, 2, 1, 2, 1])
+    X.ww.init(logical_types={"categories": "categorical"})
+
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate({})
     component_graph.fit(X, y)
@@ -881,6 +883,7 @@ def test_input_feature_names(example_graph):
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
+    X.ww.init(logical_types={"column_1": "categorical"})
 
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate(
@@ -945,7 +948,7 @@ def test_custom_input_feature_types(example_graph):
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
-    X = infer_feature_types(X, {"column_2": "categorical"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_2": "categorical"})
 
     component_graph = ComponentGraph(example_graph)
     component_graph.instantiate(
@@ -1015,7 +1018,7 @@ def test_component_graph_dataset_with_different_types():
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
 
     X = infer_feature_types(
-        X, {"column_2": "categorical", "column_5": "NaturalLanguage"}
+        X, {"column_1": "categorical", "column_2": "categorical", "column_5": "NaturalLanguage"}
     )
     component_graph = ComponentGraph(graph)
@@ -1181,7 +1184,7 @@ def test_component_graph_types_merge_mock(mock_rf_fit):
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
     # woodwork would infer this as boolean by default -- convert to a numeric type
-    X = infer_feature_types(X, {"column_3": "integer"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_3": "integer"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1263,7 +1266,7 @@ def transform(self, X, y=None):
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
 
     # woodwork would infer this as boolean by default -- convert to a numeric type
-    X.ww.init(semantic_tags={"address": "address"})
+    X.ww.init(logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1333,7 +1336,7 @@ def test_component_graph_types_merge():
     X["column_5"] = X["column_4"]
     X["column_6"] = [42.0] * len(X)
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
-    X = infer_feature_types(X, {"column_5": "NaturalLanguage"})
+    X = infer_feature_types(X, {"column_1": "categorical", "column_5": "NaturalLanguage"})
 
     component_graph = ComponentGraph(graph)
     # we don't have feature type selectors defined yet, so in order for the above graph to work we have to
@@ -1416,6 +1419,7 @@ def test_component_graph_dataset_with_target_imputer():
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, np.nan])
+    X = infer_feature_types(X, {"column_1": "categorical"})
     graph = {
         "Target Imputer": [TargetImputer, "X", "y"],
         "OneHot": [OneHotEncoder, "Target Imputer.x", "Target Imputer.y"],
@@ -1919,6 +1923,7 @@ def test_final_component_features_does_not_have_target():
         }
     )
     y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0])
+    X.ww.init(logical_types={"column_1": "categorical"})
 
     cg = ComponentGraph(
         {

From 7db57872b1153f8e8d20cd59d098332d176820b9 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Thu, 26 Aug 2021 20:43:20 +0100
Subject: [PATCH 09/36] imputer fixes

---
 .../component_tests/test_target_imputer.py | 20 +++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py
index 5a77c41db2..8a9ad5ce26 100644
--- a/evalml/tests/component_tests/test_target_imputer.py
+++ b/evalml/tests/component_tests/test_target_imputer.py
@@ -57,11 +57,11 @@ def test_target_imputer_mean():
         (None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])),
         (
             None,
-            pd.Series([np.nan, "a", "b"]),
-            pd.Series(["missing_value", "a", "b"]).astype("category"),
+            pd.Series([np.nan, "a", "b"]*5),
+            pd.Series(["missing_value", "a", "b"]*5).astype("category"),
         ),
         (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])),
-        (3, pd.Series([np.nan, "a", "b"]), pd.Series([3, "a", "b"]).astype("category")),
+        (3, pd.Series([np.nan, "a", "b"]*5), pd.Series([3, "a", "b"]*5).astype("category")),
     ],
 )
 def test_target_imputer_constant(fill_value, y, y_expected):
@@ -71,9 +71,9 @@ def test_target_imputer_constant(fill_value, y, y_expected):
 
 
 def test_target_imputer_most_frequent():
-    y = pd.Series([np.nan, "a", "b"])
+    y = pd.Series([np.nan, "a", "b"]*5)
     imputer = TargetImputer(impute_strategy="most_frequent")
-    y_expected = pd.Series(["a", "a", "b"]).astype("category")
+    y_expected = pd.Series(["a", "a", "b"]*5).astype("category")
     _, y_t = imputer.fit_transform(None, y)
     assert_series_equal(y_expected, y_t, check_dtype=False)
 
@@ -85,7 +85,7 @@ def test_target_imputer_most_frequent():
 
 
 def test_target_imputer_col_with_non_numeric_with_numeric_strategy():
-    y = pd.Series([np.nan, "a", "b"])
+    y = pd.Series([np.nan, "a", "b"]*5)
     imputer = TargetImputer(impute_strategy="mean")
     with pytest.raises(
         ValueError, match="Cannot use mean strategy with non-numeric data"
@@ -190,16 +190,16 @@ def test_target_imputer_with_none(y, y_expected):
     "y, y_expected",
     [
         (
-            pd.Series(["b", "a", "a", None], dtype="category"),
-            pd.Series(["b", "a", "a", "a"], dtype="category"),
+            pd.Series(["b", "a", "a", None]*4, dtype="category"),
+            pd.Series(["b", "a", "a", "a"]*4, dtype="category"),
         ),
         (
             pd.Series([True, None, False, True], dtype="category"),
             pd.Series([True, True, False, True], dtype="category"),
         ),
        (
-            pd.Series(["b", "a", "a", None]),
-            pd.Series(["b", "a", "a", "a"], dtype="category"),
+            pd.Series(["b", "a", "a", None]*4),
+            pd.Series(["b", "a", "a", "a"]*4, dtype="category"),
         ),
     ],
 )
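Note: as in the Imputer tests, the *4/*5 repetition keeps short object targets categorical under the new inference. The most-frequent strategy being exercised reduces to roughly this (a sketch):

    import pandas as pd

    y = pd.Series([None, "a", "b"] * 5)
    # "a" and "b" tie at five occurrences each; mode() returns them
    # sorted, so "a" wins -- matching the expected series in
    # test_target_imputer_most_frequent.
    print(y.fillna(y.mode()[0]).astype("category").head(3).tolist())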
"col_3": ["a", "a", "a", "a", "a"], } ) + X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(handle_missing="value") encoder.fit(X, y) @@ -116,13 +117,13 @@ def test_null_values_in_dataframe(): def test_cols(): X = pd.DataFrame( { - "col_1": [1, 2, 1, 1, 2], - "col_2": ["2", "1", "1", "1", "1"], - "col_3": ["a", "a", "a", "a", "a"], + "col_1": [1, 2, 1, 1, 2]*2, + "col_2": ["2", "1", "1", "1", "1"]*2, + "col_3": ["a", "a", "a", "a", "a"]*2, } ) X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"}) - y = pd.Series([0, 1, 1, 1, 0]) + y = pd.Series([0, 1, 1, 1, 0]*2) encoder = TargetEncoder(cols=[]) encoder.fit(X, y) X_t = encoder.transform(X) @@ -133,9 +134,9 @@ def test_cols(): X_t = encoder.transform(X) X_expected = pd.DataFrame( { - "col_1": pd.Series([1, 2, 1, 1, 2], dtype="int64"), - "col_2": [0.60000, 0.742886, 0.742886, 0.742886, 0.742886], - "col_3": pd.Series(["a", "a", "a", "a", "a"], dtype="category"), + "col_1": pd.Series([1, 2, 1, 1, 2]*2, dtype="int64"), + "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863]*2, + "col_3": pd.Series(["a", "a", "a", "a", "a"]*2, dtype="category"), } ) assert_frame_equal(X_expected, X_t, check_less_precise=True) @@ -157,6 +158,7 @@ def test_transform(): "col_3": ["a", "a", "a", "b", "a"], } ) + X.ww.init(logical_types={"col_2": "categorical", "col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder() encoder.fit(X, y) @@ -180,6 +182,7 @@ def test_smoothing(): "col_3": ["a", "a", "a", "a", "b"], } ) + X.ww.init(logical_types={"col_3": "categorical"}) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(smoothing=1) encoder.fit(X, y) diff --git a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py index b5d8453952..4a5d66929c 100644 --- a/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py +++ b/evalml/tests/data_checks_tests/test_invalid_targets_data_check.py @@ -236,14 +236,14 @@ def test_invalid_target_data_input_formats(): assert messages == expected # test list - y = [None, None, None, 0] + y = [np.nan, np.nan, np.nan, 0] X = pd.DataFrame({"col": range(len(y))}) messages = invalid_targets_check.validate(X, y) assert messages == expected # test np.array - y = np.array([None, None, None, 0]) + y = np.array([np.nan, np.nan, np.nan, 0]) X = pd.DataFrame({"col": range(len(y))}) messages = invalid_targets_check.validate(X, y) From 708ea7bf8194c5352c3434b84fe6c4169e4f77da Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Fri, 27 Aug 2021 12:22:16 +0100 Subject: [PATCH 11/36] one hot encoder updates --- .../component_tests/test_one_hot_encoder.py | 26 ++++++++++++++----- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index b5568980f1..24c0f0ba8e 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -166,6 +166,7 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) + X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -216,6 +217,7 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", "a", "a", "a"], } ) + 
From 708ea7bf8194c5352c3434b84fe6c4169e4f77da Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:22:16 +0100
Subject: [PATCH 11/36] one hot encoder updates

---
 .../component_tests/test_one_hot_encoder.py | 26 ++++++++++++++-----
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
index b5568980f1..24c0f0ba8e 100644
--- a/evalml/tests/component_tests/test_one_hot_encoder.py
+++ b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -166,6 +166,7 @@ def test_drop_first():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -216,6 +217,7 @@ def test_drop_binary_and_top_n_2():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=2, drop="if_binary")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -233,7 +235,7 @@ def test_handle_unknown():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(handle_unknown="error")
     encoder.fit(X)
     assert isinstance(encoder.transform(X), pd.DataFrame)
@@ -299,6 +301,7 @@ def test_categories():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
 
     categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]]
 
@@ -373,6 +376,7 @@ def test_more_top_n_unique_values():
             "col_4": [2, 0, 1, 3, 0, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
 
     random_seed = 2
 
@@ -521,21 +525,24 @@ def test_large_number_of_categories():
 @pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"])
 def test_data_types(data_type):
     if data_type == "list":
-        X = [["a"], ["b"], ["c"]]
+        X = [["a"], ["b"], ["c"]]*5
     elif data_type == "np":
-        X = np.array([["a"], ["b"], ["c"]])
+        X = np.array([["a"], ["b"], ["c"]]*5)
     elif data_type == "pd_no_index":
-        X = pd.DataFrame(["a", "b", "c"])
+        X = pd.DataFrame(["a", "b", "c"]*5)
     elif data_type == "pd_index":
-        X = pd.DataFrame(["a", "b", "c"], columns=["0"])
+        X = pd.DataFrame(["a", "b", "c"]*5, columns=["0"])
     elif data_type == "ww":
-        X = pd.DataFrame(["a", "b", "c"])
+        X = pd.DataFrame(["a", "b", "c"]*5)
         X.ww.init()
     encoder = OneHotEncoder()
     encoder.fit(X)
     X_t = encoder.transform(X)
     assert list(X_t.columns) == ["0_a", "0_b", "0_c"]
-    np.testing.assert_array_equal(X_t.to_numpy(), np.identity(3))
+    mask = np.identity(3)
+    for _ in range(4):
+        mask = np.vstack((mask, np.identity(3)))
+    np.testing.assert_array_equal(X_t.to_numpy(), mask)
 
 
 @pytest.mark.parametrize(
@@ -563,6 +570,7 @@ def test_ohe_categories():
     X = pd.DataFrame(
         {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}
     )
+    X.ww.init(logical_types={"col_2": "categorical"})
     ohe = OneHotEncoder(top_n=2)
     with pytest.raises(
         ComponentNotYetFittedError,
@@ -584,6 +592,7 @@ def test_ohe_get_feature_names():
     X = pd.DataFrame(
         {"col_1": ["a"] * 10, "col_2": ["a"] * 3 + ["b"] * 3 + ["c"] * 2 + ["d"] * 2}
     )
+    X.ww.init(logical_types={"col_2": "categorical"})
     ohe = OneHotEncoder(top_n=2)
     with pytest.raises(
         ComponentNotYetFittedError,
@@ -671,6 +680,7 @@ def check_df_equality(random_seed):
 
 def test_ohe_column_names_unique():
     df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]})
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"})
     df_transformed = OneHotEncoder().fit_transform(df)
     assert set(df_transformed.columns) == {"A_x_y", "A_x_y_1"}
 
@@ -685,6 +695,7 @@ def test_ohe_column_names_unique():
             "A_x_y": ["1", "y", "y"],
         }
     )
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"})
     df_transformed = OneHotEncoder().fit_transform(df)
     # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists
     # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists
@@ -693,6 +704,7 @@ def test_ohe_column_names_unique():
     df = pd.DataFrame(
         {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]}
     )
+    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"})
    df_transformed = OneHotEncoder().fit_transform(df)
     # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists
     # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists

From 63eb23a44f5e0a19d5623c69b8057a6a70779d81 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:32:11 +0100
Subject: [PATCH 12/36] more ohe

---
 evalml/tests/component_tests/test_one_hot_encoder.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py
index 24c0f0ba8e..076442342f 100644
--- a/evalml/tests/component_tests/test_one_hot_encoder.py
+++ b/evalml/tests/component_tests/test_one_hot_encoder.py
@@ -78,7 +78,7 @@ def test_null_values_in_dataframe():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     # Test NaN will be counted as a category if within the top_n
     encoder = OneHotEncoder(handle_missing="as_category")
     encoder.fit(X)
@@ -110,7 +110,7 @@ def test_null_values_in_dataframe():
             "col_4": [2, 0, 1, np.nan, 0],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     encoder = OneHotEncoder(top_n=2, handle_missing="as_category")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -183,6 +183,7 @@ def test_drop_binary():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -199,6 +200,7 @@ def test_drop_parameter_is_array():
             "col_3": ["a", "a", "a", "a", "a"],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error")
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -263,6 +265,7 @@ def test_no_top_n():
             "col_4": [2, 0, 1, 3, 0, 1, 2, 0, 2, 1, 2],
         }
     )
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     expected_col_names = set(["col_3_b", "col_4"])
     for val in X["col_1"]:
         expected_col_names.add("col_1_" + val)
@@ -345,7 +348,7 @@ def test_less_than_top_n_unique_values():
             "col_4": [2, 0, 1, 0, 0],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
     encoder = OneHotEncoder(top_n=5)
     encoder.fit(X)
     X_t = encoder.transform(X)
@@ -419,7 +422,7 @@ def test_more_top_n_unique_values_large():
             "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1],
         }
     )
-
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"})
     random_seed = 2
 
     encoder = OneHotEncoder(top_n=3, random_seed=random_seed)
@@ -455,6 +458,7 @@ def test_categorical_dtype():
         }
     )
     X["col_4"] = X["col_4"].astype("category")
+    X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical"})
 
     encoder = OneHotEncoder(top_n=5)
     encoder.fit(X)
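Note: these tests pin their string columns as categorical up front; with Woodwork 0.6.0 they would otherwise infer as Unknown and bypass the encoder's category handling (an assumption based on the pattern of these edits). The deduplicated-name behavior asserted above can be reproduced directly:

    import pandas as pd
    from evalml.pipelines.components import OneHotEncoder

    df = pd.DataFrame({"A": ["x_y"], "A_x": ["y"]})
    df.ww.init(logical_types={"A": "categorical", "A_x": "categorical"})
    # 'A_x' + 'y' collides with 'A' + 'x_y', so a suffix is appended.
    print(set(OneHotEncoder().fit_transform(df).columns))  # {'A_x_y', 'A_x_y_1'}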
From 86ca4524f1e0a8dd8fbf8aecb6b36bc204e9c036 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 12:54:28 +0100
Subject: [PATCH 13/36] segmentation fault

---
 evalml/tests/component_tests/test_lgbm_classifier.py | 2 ++
 evalml/tests/component_tests/test_lgbm_regressor.py  | 2 ++
 2 files changed, 4 insertions(+)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index bd6dd0ba41..5db4d88d82 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -175,6 +175,7 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X = pd.DataFrame(
         {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2}
     )
+    X.ww.init(logical_types={"feature_2": "categorical"})
     y = pd.Series([1, 1, 0, 0, 0, 1]*2)
     X_expected = pd.DataFrame(
         {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2}
@@ -182,6 +183,7 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")
 
     X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
+    X_subset.ww.init(logical_types={"feature_2": "categorical"})
     X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
     X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category")

diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py
index 1b20b23bd4..1390cd1e0d 100644
--- a/evalml/tests/component_tests/test_lgbm_regressor.py
+++ b/evalml/tests/component_tests/test_lgbm_regressor.py
@@ -123,6 +123,7 @@ def test_categorical_data_subset(mock_predict, X_y_regression):
     X = pd.DataFrame(
         {"feature_1": [0, 0, 1, 1, 0, 1], "feature_2": ["a", "a", "b", "b", "c", "c"]}
     )
+    X.ww.init(logical_types={"feature_2": "categorical"})
     y = pd.Series([1, 1, 0, 0, 0, 1])
     X_expected = pd.DataFrame(
         {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]}
@@ -130,6 +131,7 @@ def test_categorical_data_subset(mock_predict, X_y_regression):
     X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category")
 
     X_subset = pd.DataFrame({"feature_1": [1, 0], "feature_2": ["c", "a"]})
+    X_subset.ww.init(logical_types={"feature_2": "categorical"})
     X_expected_subset = pd.DataFrame({0: [1, 0], 1: [2.0, 0.0]})
     X_expected_subset.iloc[:, 1] = X_expected_subset.iloc[:, 1].astype("category")
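Note: "segmentation fault" here presumably refers to LightGBM crashing when the fit and predict frames disagree about category mappings; initializing both frames with the same Categorical logical type keeps the ordinal codes consistent. The alignment the subset tests rely on, in plain pandas:

    import pandas as pd

    X_fit = pd.DataFrame({"feature_2": ["a", "a", "b", "b", "c", "c"]})
    X_subset = pd.DataFrame({"feature_2": ["c", "a"]})
    # Reuse the categories seen at fit time so the subset maps to the
    # same codes (matching X_expected_subset's [2.0, 0.0]).
    cats = X_fit["feature_2"].astype("category").cat.categories
    aligned = X_subset["feature_2"].astype(pd.CategoricalDtype(categories=cats))
    print(aligned.cat.codes.tolist())  # [2, 0]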
From 7871c74c811953e70e7bcd2b761e3f94dbb41df6 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Fri, 27 Aug 2021 15:27:17 +0100
Subject: [PATCH 14/36] lgbm, per column, simple imputer

---
 evalml/tests/component_tests/test_lgbm_classifier.py  |  4 ++++
 .../tests/component_tests/test_per_column_imputer.py  |  5 +++--
 evalml/tests/component_tests/test_simple_imputer.py   | 12 ++++++++----
 3 files changed, 15 insertions(+), 6 deletions(-)

diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py
index 5db4d88d82..052211823c 100644
--- a/evalml/tests/component_tests/test_lgbm_classifier.py
+++ b/evalml/tests/component_tests/test_lgbm_classifier.py
@@ -205,7 +205,9 @@ def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary):
 def test_multiple_fit(mock_predict, mock_predict_proba):
     y = pd.Series([1] * 4)
     X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]})
+    X1_fit.ww.init(logical_types={"feature": "categorical"})
     X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]})
+    X1_predict.ww.init(logical_types={"feature": "categorical"})
     X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category")
 
     clf = LightGBMClassifier()
@@ -217,7 +219,9 @@ def test_multiple_fit(mock_predict, mock_predict_proba):
 
     # Check if it will fit a different dataset with new variable
     X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]})
+    X2_fit.ww.init(logical_types={"feature": "categorical"})
     X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]})
+    X2_predict.ww.init(logical_types={"feature": "categorical"})
     X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category")
 
     clf = LightGBMClassifier()

diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py
index 7dcdbdc690..88988b50e9 100644
--- a/evalml/tests/component_tests/test_per_column_imputer.py
+++ b/evalml/tests/component_tests/test_per_column_imputer.py
@@ -47,6 +47,7 @@ def test_all_strategies():
             "D": pd.Series(["a", "a", "b", np.nan]),
         }
     )
+    X.ww.init(logical_types={"D": "categorical"})
 
     X_expected = pd.DataFrame(
         {
@@ -91,7 +92,7 @@ def test_fit_transform():
 def test_non_numeric_errors(non_numeric_df):
     # test col with all strings
     X = non_numeric_df
-
+    X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"})
     # mean with all strings
     strategies = {"A": {"impute_strategy": "mean"}}
     with pytest.raises(
@@ -121,7 +122,7 @@ def test_non_numeric_errors(non_numeric_df):
 
 def test_non_numeric_valid(non_numeric_df):
     X = non_numeric_df
-
+    X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"})
     # most frequent with all strings
     strategies = {"C": {"impute_strategy": "most_frequent"}}
     transformer = PerColumnImputer(impute_strategies=strategies)

diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py
index 0298d97357..cdd83af90e 100644
--- a/evalml/tests/component_tests/test_simple_imputer.py
+++ b/evalml/tests/component_tests/test_simple_imputer.py
@@ -44,7 +44,7 @@ def test_simple_imputer_mean():
 def test_simple_imputer_constant():
     # test impute strategy is constant and fill value is not specified
     X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 3, 0]])
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="constant", fill_value=3)
     X_expected_arr = pd.DataFrame([[3, 0, 1, 3], ["a", 2, 3, 3], ["b", 2, 3, 0]])
     X_expected_arr = X_expected_arr.astype({0: "category"})
@@ -54,7 +54,7 @@ def test_simple_imputer_constant():
 
 def test_simple_imputer_most_frequent():
     X = pd.DataFrame([[np.nan, 0, 1, np.nan], ["a", 2, np.nan, 3], ["b", 2, 1, 0]])
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="most_frequent")
     X_expected_arr = pd.DataFrame([["a", 0, 1, 0], ["a", 2, 1, 3], ["b", 2, 1, 0]])
     X_expected_arr = X_expected_arr.astype({0: "category"})
@@ -67,7 +67,7 @@ def test_simple_imputer_col_with_non_numeric():
     X = pd.DataFrame(
         [["a", 0, 1, np.nan], ["b", 2, 3, 3], ["a", 2, 3, 1], [np.nan, 2, 3, 0]]
     )
-
+    X.ww.init(logical_types={0: "categorical", 1: "Double", 2: "Double", 3: "Double"})
     transformer = SimpleImputer(impute_strategy="mean")
     with pytest.raises(
         ValueError, match="Cannot use mean strategy with non-numeric data"
@@ -121,6 +121,7 @@ def test_simple_imputer_all_bool_return_original(data_type, make_data_type):
 @pytest.mark.parametrize("data_type", ["pd", "ww"])
 def test_simple_imputer_boolean_dtype(data_type, make_data_type):
     X = pd.DataFrame([True, np.nan, False, np.nan, True])
+    X.ww.init(logical_types={0: "categorical"})
     y = pd.Series([1, 0, 0, 1, 0])
     X_expected_arr = pd.DataFrame([True, True, False, True, True], dtype="category")
     X = make_data_type(data_type, X)
@@ -138,6 +139,7 @@ def test_simple_imputer_multitype_with_one_bool(data_type, make_data_type):
             "bool no nan": pd.Series([False, False, False, False, True], dtype=bool),
         }
     )
+    X_multi.ww.init(logical_types={"bool with nan": "categorical"})
     y = pd.Series([1, 0, 0, 1, 0])
     X_multi_expected_arr = pd.DataFrame(
         {
@@ -256,6 +258,7 @@ def test_simple_imputer_fill_value(data_type):
             ),
         }
     )
+    X.ww.init(logical_types={"categorical with nan": "categorical", "object with nan": "categorical"})
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value)
     imputer.fit(X, y)
@@ -321,6 +324,7 @@ def test_simple_imputer_with_none():
             "all None": [None, None, None, None],
         }
     )
+    X.ww.init(logical_types={"boolean with None": "categorical", "object with None": "categorical", "all None": "categorical"})
     y = pd.Series([0, 0, 1, 0, 1])
     imputer = SimpleImputer()
     imputer.fit(X, y)
@@ -343,7 +347,7 @@ def test_simple_imputer_supports_natural_language_constant():
         }
     )
     y = pd.Series([0, 0, 1, 0, 1])
-    X.ww.init(logical_types={"natural language col": "NaturalLanguage"})
+    X.ww.init(logical_types={"cat with None": "categorical", "natural language col": "NaturalLanguage"})
     imputer = SimpleImputer(impute_strategy="constant", fill_value="placeholder")
     imputer.fit(X, y)
     transformed = imputer.transform(X, y)

From c336f9d1a065578ddb4b99f0516d93a5cd2b0a45 Mon Sep 17 00:00:00 2001
From: Parthiv Naresh
Date: Mon, 30 Aug 2021 11:14:07 +0100
Subject: [PATCH 15/36] imputer and partial dependence

---
 evalml/tests/component_tests/test_imputer.py                    | 3 ++-
 evalml/tests/conftest.py                                        | 2 ++
 .../tests/model_understanding_tests/test_partial_dependence.py  | 2 ++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py
index cd75b4044a..28d4ebcd6e 100644
--- a/evalml/tests/component_tests/test_imputer.py
+++ b/evalml/tests/component_tests/test_imputer.py
@@ -259,9 +259,10 @@ def test_imputer_does_not_reset_index():
     X.loc[5, "input_val"] = np.nan
     X.loc[5, "input_cat"] = np.nan
     assert X.index.tolist() == list(range(10))
+    X.ww.init(logical_types={"input_cat": "categorical"})
 
     X.drop(0, inplace=True)
-    y = X.pop("target")
+    y = X.ww.pop("target")
     imputer = Imputer()
     imputer.fit(X, y=y)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index b80661d0bf..2572dc97fb 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -43,6 +43,7 @@
     handle_problem_types,
     is_regression,
 )
+from evalml.utils import infer_feature_types
 
 
 def pytest_configure(config):
@@ -741,6 +742,7 @@ def decision_tree_classification_pipeline_class(X_y_categorical_classification):
         }
     )
     X, y = X_y_categorical_classification
+    X.ww.init(logical_types={"Ticket": "categorical", "Cabin": "categorical"})
     pipeline.fit(X, y)
     return pipeline

diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py
index 6751b29a30..30b33783f9 100644
--- a/evalml/tests/model_understanding_tests/test_partial_dependence.py
+++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py
@@ -21,6 +21,7 @@
     RegressionPipeline,
 )
 from evalml.problem_types import ProblemTypes
+from evalml.utils import infer_feature_types
 
 
 @pytest.fixture
@@ -209,6 +210,7 @@ def test_partial_dependence_catboost(
             "also string": ["c", "b", "a"],
         }
     )
+    X.ww.init(logical_types={"string": "categorical", "also string": "categorical"})
     pipeline = pipeline_class(
         component_graph=["CatBoost Classifier"],
         parameters={"CatBoost Classifier": {"thread_count": 1}},
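Note: infer_feature_types, imported above, is evalml's wrapper around Woodwork initialization and accepts per-column overrides; it is how the model-understanding tests pin their categoricals. A quick usage sketch:

    import pandas as pd
    from evalml.utils import infer_feature_types

    X = pd.DataFrame({"string": ["a", "b", "c"] * 5})
    # Override inference for one column, leave the rest inferred.
    X = infer_feature_types(X, {"string": "categorical"})
    print(X.ww.logical_types)  # string -> Categorical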
From 1fd623133dd9b9fd57922428cf5b9c00e45aca6e Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:27:11 +0100 Subject: [PATCH 16/36] pip install scikit-learn --- .github/workflows/windows_unit_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index b5c0eba6f1..041827d3d6 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -66,6 +66,7 @@ jobs: conda activate curr_py python -m pip install --upgrade pip python -m pip install . + python -m pip install scikit-learn python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From b086a1628d35ab45d1c9a520a81a356fd6d3d9bf Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:34:54 +0100 Subject: [PATCH 17/36] install woodwork --- .github/workflows/windows_unit_tests.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index 041827d3d6..ccbe5231c6 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -67,6 +67,7 @@ jobs: python -m pip install --upgrade pip python -m pip install . python -m pip install scikit-learn + python -m pip install woodwork==0.5.1 python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From 370f3372dbbc152b1d6839059b6cabdf3c9dde26 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 11:41:47 +0100 Subject: [PATCH 18/36] Remove temporary scikit-learn and woodwork installs --- .github/workflows/windows_unit_tests.yml | 2 -- 1 file changed, 2 deletions(-) diff --git a/.github/workflows/windows_unit_tests.yml b/.github/workflows/windows_unit_tests.yml index ccbe5231c6..b5c0eba6f1 100644 --- a/.github/workflows/windows_unit_tests.yml +++ b/.github/workflows/windows_unit_tests.yml @@ -66,8 +66,6 @@ jobs: conda activate curr_py python -m pip install --upgrade pip python -m pip install .
- python -m pip install scikit-learn - python -m pip install woodwork==0.5.1 python -m pip install -r test-requirements.txt pip freeze - name: Run unit tests From 64868afa83c46dfce8847a21f310ba75bcb96089 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 12:14:29 +0100 Subject: [PATCH 19/36] test_explainers --- .../prediction_explanations_tests/test_explainers.py | 1 + 1 file changed, 1 insertion(+) diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 2405e2cabd..9f847ca582 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1627,6 +1627,7 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], From 6b330649ec6be61ce1f0325c58514246b5c994be Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 12:34:56 +0100 Subject: [PATCH 20/36] plotly update --- .../dependency_update_check/latest_dependency_versions.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 9fe6b759ae..47683d73c1 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -16,7 +16,7 @@ nlp-primitives==1.1.0 numba==0.53.0 numpy==1.21.2 pandas==1.3.2 -plotly==5.2.2 +plotly==5.3.0 pmdarima==1.8.0 psutil==5.8.0 pyzmq==22.2.1 From d56a5caca34aecfb7a9daa292cdbc8223c4cdaa3 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:10:19 +0100 Subject: [PATCH 21/36] partial dependence --- .../test_partial_dependence.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 30b33783f9..8af3f12b70 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -185,11 +185,11 @@ def test_partial_dependence_catboost( if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - y_small = ["a", "b", "a"] + y_small = ["a", "b", "a"]*5 pipeline_class = BinaryClassificationPipeline else: X, y = X_y_multi - y_small = ["a", "b", "c"] + y_small = ["a", "b", "c"]*5 pipeline_class = MulticlassClassificationPipeline pipeline = pipeline_class( @@ -204,13 +204,12 @@ def test_partial_dependence_catboost( # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence X = pd.DataFrame( { - "numeric": [1, 2, 3], - "also numeric": [2, 3, 4], - "string": ["a", "b", "c"], - "also string": ["c", "b", "a"], + "numeric": [1, 2, 3]*5, + "also numeric": [2, 3, 4]*5, + "string": ["a", "b", "c"]*5, + "also string": ["c", "b", "a"]*5, } ) - X.ww.init(logical_types={"string": "categorical", "also string": "categorical"}) pipeline = pipeline_class( component_graph=["CatBoost Classifier"], parameters={"CatBoost Classifier": 
{"thread_count": 1}}, From 77374a9795f1ffc0df8a26c648468a4770dc6367 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:14:53 +0100 Subject: [PATCH 22/36] lint fixes --- evalml/tests/component_tests/test_imputer.py | 150 ++++++++++-------- .../component_tests/test_lgbm_classifier.py | 9 +- .../component_tests/test_one_hot_encoder.py | 82 ++++++++-- .../test_per_column_imputer.py | 18 ++- .../component_tests/test_simple_imputer.py | 22 ++- .../component_tests/test_target_encoder.py | 22 ++- .../component_tests/test_target_imputer.py | 24 +-- .../data_checks_tests/test_data_checks.py | 7 +- .../test_id_columns_data_check.py | 9 +- .../test_multicollinearity_data_check.py | 10 +- .../test_explainers.py | 32 +++- .../test_partial_dependence.py | 22 +-- .../test_permutation_importance.py | 4 +- .../test_regression.py | 9 +- .../pipeline_tests/test_component_graph.py | 15 +- .../pipeline_tests/test_pipeline_utils.py | 22 +-- 16 files changed, 311 insertions(+), 146 deletions(-) diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 28d4ebcd6e..608f62c02b 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -19,24 +19,24 @@ def imputer_test_data(): return pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "int col": [0, 1, 2, 0, 3]*4, - "object col": ["b", "b", "a", "c", "d"]*4, - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "bool col": [True, False, False, True, True]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "object col": ["b", "b", "a", "c", "d"] * 4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - [np.nan, "1", "0", "0", "3"]*4, dtype="category" + [np.nan, "1", "0", "0", "3"] * 4, dtype="category" ), - "int with nan": [np.nan, 1, 0, 0, 1]*4, - "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0]*4, - "object with nan": ["b", "b", np.nan, "c", np.nan]*4, + "int with nan": [np.nan, 1, 0, 0, 1] * 4, + "float with nan": [0.0, 1.0, np.nan, -1.0, 0.0] * 4, + "object with nan": ["b", "b", np.nan, "c", np.nan] * 4, "bool col with nan": pd.Series( - [True, np.nan, False, np.nan, True]*4, dtype="category" + [True, np.nan, False, np.nan, True] * 4, dtype="category" ), - "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan]*4, + "all nan": [np.nan, np.nan, np.nan, np.nan, np.nan] * 4, "all nan cat": pd.Series( - [np.nan, np.nan, np.nan, np.nan, np.nan]*4, dtype="category" + [np.nan, np.nan, np.nan, np.nan, np.nan] * 4, dtype="category" ), } ) @@ -91,16 +91,16 @@ def test_numeric_only_input(imputer_test_data): X = imputer_test_data[ ["int col", "float col", "int with nan", "float with nan", "all nan"] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer(numeric_impute_strategy="median") imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int col": [0, 1, 2, 0, 3]*4, - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, - "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0] * 4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0] * 4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -122,21 +122,23 @@ def 
test_categorical_only_input(imputer_test_data): "all nan cat", ] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "bool col": [True, False, False, True, True]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"]*4, dtype="category" + ["0", "1", "0", "0", "3"] * 4, dtype="category" + ), + "object with nan": pd.Series( + ["b", "b", "b", "c", "b"] * 4, dtype="category" ), - "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True]*4, dtype="category" + [True, True, False, True, True] * 4, dtype="category" ), } ) @@ -157,20 +159,22 @@ def test_categorical_and_numeric_input(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "int col": [0, 1, 2, 0, 3]*4, - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "float col": [0.0, 1.0, 0.0, -2.0, 5.0]*4, - "bool col": [True, False, False, True, True]*4, + "int col": [0, 1, 2, 0, 3] * 4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "float col": [0.0, 1.0, 0.0, -2.0, 5.0] * 4, + "bool col": [True, False, False, True, True] * 4, "categorical with nan": pd.Series( - ["0", "1", "0", "0", "3"]*4, dtype="category" + ["0", "1", "0", "0", "3"] * 4, dtype="category" + ), + "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0] * 4, + "float with nan": [0.0, 1.0, 0, -1.0, 0.0] * 4, + "object with nan": pd.Series( + ["b", "b", "b", "c", "b"] * 4, dtype="category" ), - "int with nan": [0.5, 1.0, 0.0, 0.0, 1.0]*4, - "float with nan": [0.0, 1.0, 0, -1.0, 0.0]*4, - "object with nan": pd.Series(["b", "b", "b", "c", "b"]*4, dtype="category"), "bool col with nan": pd.Series( - [True, True, False, True, True]*4, dtype="category" + [True, True, False, True, True] * 4, dtype="category" ), } ) @@ -183,7 +187,7 @@ def test_categorical_and_numeric_input(imputer_test_data): def test_drop_all_columns(imputer_test_data): X = imputer_test_data[["all nan cat", "all nan"]] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) X.ww.init() imputer = Imputer() imputer.fit(X, y) @@ -289,7 +293,7 @@ def test_imputer_fill_value(imputer_test_data): "bool col with nan", ] ] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -300,16 +304,16 @@ def test_imputer_fill_value(imputer_test_data): transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with nan": [-1, 1, 0, 0, 1]*4, + "int with nan": [-1, 1, 0, 0, 1] * 4, "categorical with nan": pd.Series( - ["fill", "1", "0", "0", "3"]*4, dtype="category" + ["fill", "1", "0", "0", "3"] * 4, dtype="category" ), - "float with nan": [0.0, 1.0, -1, -1.0, 0.0]*4, + "float with nan": [0.0, 1.0, -1, -1.0, 0.0] * 4, "object with nan": pd.Series( - ["b", "b", "fill", "c", "fill"]*4, dtype="category" + ["b", "b", "fill", "c", "fill"] * 4, dtype="category" ), "bool col with nan": pd.Series( - [True, "fill", False, 
"fill", True]*4, dtype="category" + [True, "fill", False, "fill", True] * 4, dtype="category" ), } ) @@ -327,7 +331,7 @@ def test_imputer_fill_value(imputer_test_data): def test_imputer_no_nans(imputer_test_data): X = imputer_test_data[["categorical col", "object col", "bool col"]] - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer( categorical_impute_strategy="constant", numeric_impute_strategy="constant", @@ -339,10 +343,10 @@ def test_imputer_no_nans(imputer_test_data): expected = pd.DataFrame( { "categorical col": pd.Series( - ["zero", "one", "two", "zero", "two"]*4, dtype="category" + ["zero", "one", "two", "zero", "two"] * 4, dtype="category" ), - "object col": pd.Series(["b", "b", "a", "c", "d"]*4, dtype="category"), - "bool col": [True, False, False, True, True]*4, + "object col": pd.Series(["b", "b", "a", "c", "d"] * 4, dtype="category"), + "bool col": [True, False, False, True, True] * 4, } ) assert_frame_equal(transformed, expected, check_dtype=False) @@ -360,25 +364,29 @@ def test_imputer_no_nans(imputer_test_data): def test_imputer_with_none(): X = pd.DataFrame( { - "int with None": [1, 0, 5, None]*4, - "float with None": [0.1, 0.0, 0.5, None]*4, - "category with None": pd.Series(["b", "a", "a", None]*4, dtype="category"), - "boolean with None": pd.Series([True, None, False, True]*4), - "object with None": ["b", "a", "a", None]*4, - "all None": [None, None, None, None]*4, + "int with None": [1, 0, 5, None] * 4, + "float with None": [0.1, 0.0, 0.5, None] * 4, + "category with None": pd.Series( + ["b", "a", "a", None] * 4, dtype="category" + ), + "boolean with None": pd.Series([True, None, False, True] * 4), + "object with None": ["b", "a", "a", None] * 4, + "all None": [None, None, None, None] * 4, } ) - y = pd.Series([0, 0, 1, 0, 1]*4) + y = pd.Series([0, 0, 1, 0, 1] * 4) imputer = Imputer() imputer.fit(X, y) transformed = imputer.transform(X, y) expected = pd.DataFrame( { - "int with None": [1, 0, 5, 2]*4, - "float with None": [0.1, 0.0, 0.5, 0.2]*4, - "category with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), - "boolean with None": pd.Series([True, True, False, True]*4, dtype="category"), - "object with None": pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + "int with None": [1, 0, 5, 2] * 4, + "float with None": [0.1, 0.0, 0.5, 0.2] * 4, + "category with None": pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), + "boolean with None": pd.Series( + [True, True, False, True] * 4, dtype="category" + ), + "object with None": pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), } ) assert_frame_equal(expected, transformed, check_dtype=False) @@ -404,9 +412,9 @@ def test_imputer_all_bool_return_original(data_type, make_data_type): @pytest.mark.parametrize("data_type", ["pd", "ww"]) def test_imputer_bool_dtype_object(data_type, make_data_type): - X = pd.DataFrame([True, np.nan, False, np.nan, True]*4) - y = pd.Series([1, 0, 0, 1, 0]*4) - X_expected_arr = pd.DataFrame([True, True, False, True, True]*4, dtype="category") + X = pd.DataFrame([True, np.nan, False, np.nan, True] * 4) + y = pd.Series([1, 0, 0, 1, 0] * 4) + X_expected_arr = pd.DataFrame([True, True, False, True, True] * 4, dtype="category") X = make_data_type(data_type, X) y = make_data_type(data_type, y) imputer = Imputer() @@ -419,17 +427,21 @@ def test_imputer_bool_dtype_object(data_type, make_data_type): def test_imputer_multitype_with_one_bool(data_type, make_data_type): X_multi = pd.DataFrame( { - "bool with nan": pd.Series([True, np.nan, False, 
np.nan, False]*4), - "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), + "bool with nan": pd.Series([True, np.nan, False, np.nan, False] * 4), + "bool no nan": pd.Series( + [False, False, False, False, True] * 4, dtype=bool + ), } ) - y = pd.Series([1, 0, 0, 1, 0]*4) + y = pd.Series([1, 0, 0, 1, 0] * 4) X_multi_expected_arr = pd.DataFrame( { "bool with nan": pd.Series( - [True, False, False, False, False]*4, dtype="category" + [True, False, False, False, False] * 4, dtype="category" + ), + "bool no nan": pd.Series( + [False, False, False, False, True] * 4, dtype=bool ), - "bool no nan": pd.Series([False, False, False, False, True]*4, dtype=bool), } ) @@ -469,23 +481,23 @@ def test_imputer_int_preserved(): def test_imputer_bool_preserved(): - X = pd.DataFrame(pd.Series([True, False, True, np.nan]*4)) + X = pd.DataFrame(pd.Series([True, False, True, np.nan] * 4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, True]*4, dtype="category")), + pd.DataFrame(pd.Series([True, False, True, True] * 4, dtype="category")), ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == { 0: Categorical } - X = pd.DataFrame(pd.Series([True, False, True, False]*4)) + X = pd.DataFrame(pd.Series([True, False, True, False] * 4)) imputer = Imputer(categorical_impute_strategy="most_frequent") transformed = imputer.fit_transform(X) pd.testing.assert_frame_equal( transformed, - pd.DataFrame(pd.Series([True, False, True, False]*4)), + pd.DataFrame(pd.Series([True, False, True, False] * 4)), check_dtype=False, ) assert {k: type(v) for k, v in transformed.ww.logical_types.items()} == {0: Boolean} diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 052211823c..2a35d0ba7c 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -173,12 +173,15 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): @patch("evalml.pipelines.components.estimators.estimator.Estimator.predict") def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): X = pd.DataFrame( - {"feature_1": [0, 0, 1, 1, 0, 1]*2, "feature_2": ["a", "a", "b", "b", "c", "c"]*2} + { + "feature_1": [0, 0, 1, 1, 0, 1] * 2, + "feature_2": ["a", "a", "b", "b", "c", "c"] * 2, + } ) X.ww.init(logical_types={"feature_2": "categorical"}) - y = pd.Series([1, 1, 0, 0, 0, 1]*2) + y = pd.Series([1, 1, 0, 0, 0, 1] * 2) X_expected = pd.DataFrame( - {0: [0, 0, 1, 1, 0, 1]*2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]*2} + {0: [0, 0, 1, 1, 0, 1] * 2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0] * 2} ) X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 076442342f..36a5cad415 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -166,7 +166,13 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = 
encoder.transform(X) @@ -183,7 +189,13 @@ def test_drop_binary(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -200,7 +212,13 @@ def test_drop_parameter_is_array(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -219,7 +237,13 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(top_n=2, drop="if_binary") encoder.fit(X) X_t = encoder.transform(X) @@ -237,7 +261,13 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) encoder = OneHotEncoder(handle_unknown="error") encoder.fit(X) assert isinstance(encoder.transform(X), pd.DataFrame) @@ -304,7 +334,13 @@ def test_categories(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] @@ -379,7 +415,13 @@ def test_more_top_n_unique_values(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) random_seed = 2 @@ -422,7 +464,13 @@ def test_more_top_n_unique_values_large(): "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) random_seed = 2 encoder = OneHotEncoder(top_n=3, random_seed=random_seed) @@ -529,15 +577,15 @@ def test_large_number_of_categories(): @pytest.mark.parametrize("data_type", ["list", "np", "pd_no_index", "pd_index", "ww"]) def test_data_types(data_type): if data_type == "list": - X = [["a"], ["b"], ["c"]]*5 + X = [["a"], ["b"], ["c"]] * 5 elif data_type == "np": - X = np.array([["a"], ["b"], ["c"]]*5) + X = np.array([["a"], ["b"], ["c"]] * 5) elif data_type == "pd_no_index": - X = pd.DataFrame(["a", "b", "c"]*5) + X = pd.DataFrame(["a", "b", "c"] * 5) elif data_type == "pd_index": - X = pd.DataFrame(["a", "b", "c"]*5, columns=["0"]) + X = pd.DataFrame(["a", "b", "c"] * 5, columns=["0"]) elif data_type == "ww": - X = pd.DataFrame(["a", "b", "c"]*5) + X = pd.DataFrame(["a", "b", "c"] * 5) X.ww.init() encoder = OneHotEncoder() 
encoder.fit(X) @@ -699,7 +747,9 @@ def test_ohe_column_names_unique(): "A_x_y": ["1", "y", "y"], } ) - df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"}) + df.ww.init( + logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"} + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category 1 in A_x_y gets mapped to A_x_y_1_1 because A_x_y_1 already exists @@ -708,7 +758,9 @@ def test_ohe_column_names_unique(): df = pd.DataFrame( {"A": ["x_y", "z", "a"], "A_x": ["y_1", "y", "b"], "A_x_y": ["1", "y", "c"]} ) - df.ww.init(logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"}) + df.ww.init( + logical_types={"A": "categorical", "A_x": "categorical", "A_x_y": "categorical"} + ) df_transformed = OneHotEncoder().fit_transform(df) # category y in A_x gets mapped to A_x_y_1 because A_x_y already exists # category y_1 in A_x gets mapped to A_x_y_1_1 because A_x_y_1 already exists diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py index 88988b50e9..f97390f5d3 100644 --- a/evalml/tests/component_tests/test_per_column_imputer.py +++ b/evalml/tests/component_tests/test_per_column_imputer.py @@ -92,7 +92,14 @@ def test_fit_transform(): def test_non_numeric_errors(non_numeric_df): # test col with all strings X = non_numeric_df - X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"}) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) # mean with all strings strategies = {"A": {"impute_strategy": "mean"}} with pytest.raises( @@ -122,7 +129,14 @@ def test_non_numeric_errors(non_numeric_df): def test_non_numeric_valid(non_numeric_df): X = non_numeric_df - X.ww.init(logical_types={"A": "categorical", "B": "categorical", "C": "categorical", "D": "categorical"}) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) # most frequent with all strings strategies = {"C": {"impute_strategy": "most_frequent"}} transformer = PerColumnImputer(impute_strategies=strategies) diff --git a/evalml/tests/component_tests/test_simple_imputer.py b/evalml/tests/component_tests/test_simple_imputer.py index cdd83af90e..7d0de8c978 100644 --- a/evalml/tests/component_tests/test_simple_imputer.py +++ b/evalml/tests/component_tests/test_simple_imputer.py @@ -258,7 +258,12 @@ def test_simple_imputer_fill_value(data_type): ), } ) - X.ww.init(logical_types={"categorical with nan": "categorical", "object with nan": "categorical"}) + X.ww.init( + logical_types={ + "categorical with nan": "categorical", + "object with nan": "categorical", + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer(impute_strategy="constant", fill_value=fill_value) imputer.fit(X, y) @@ -324,7 +329,13 @@ def test_simple_imputer_with_none(): "all None": [None, None, None, None], } ) - X.ww.init(logical_types={"boolean with None": "categorical", "object with None": "categorical", "all None": "categorical"}) + X.ww.init( + logical_types={ + "boolean with None": "categorical", + "object with None": "categorical", + "all None": "categorical", + } + ) y = pd.Series([0, 0, 1, 0, 1]) imputer = SimpleImputer() imputer.fit(X, y) @@ -347,7 +358,12 @@ def test_simple_imputer_supports_natural_language_constant(): } ) y = pd.Series([0, 0, 
1, 0, 1]) - X.ww.init(logical_types={"cat with None": "categorical", "natural language col": "NaturalLanguage"}) + X.ww.init( + logical_types={ + "cat with None": "categorical", + "natural language col": "NaturalLanguage", + } + ) imputer = SimpleImputer(impute_strategy="constant", fill_value="placeholder") imputer.fit(X, y) transformed = imputer.transform(X, y) diff --git a/evalml/tests/component_tests/test_target_encoder.py b/evalml/tests/component_tests/test_target_encoder.py index fd7c5315e1..9d497a8c61 100644 --- a/evalml/tests/component_tests/test_target_encoder.py +++ b/evalml/tests/component_tests/test_target_encoder.py @@ -70,7 +70,13 @@ def test_null_values_in_dataframe(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) y = pd.Series([0, 1, 1, 1, 0]) encoder = TargetEncoder(handle_missing="value") encoder.fit(X, y) @@ -117,13 +123,13 @@ def test_null_values_in_dataframe(): def test_cols(): X = pd.DataFrame( { - "col_1": [1, 2, 1, 1, 2]*2, - "col_2": ["2", "1", "1", "1", "1"]*2, - "col_3": ["a", "a", "a", "a", "a"]*2, + "col_1": [1, 2, 1, 1, 2] * 2, + "col_2": ["2", "1", "1", "1", "1"] * 2, + "col_3": ["a", "a", "a", "a", "a"] * 2, } ) X_expected = X.astype({"col_1": "int64", "col_2": "category", "col_3": "category"}) - y = pd.Series([0, 1, 1, 1, 0]*2) + y = pd.Series([0, 1, 1, 1, 0] * 2) encoder = TargetEncoder(cols=[]) encoder.fit(X, y) X_t = encoder.transform(X) @@ -134,9 +140,9 @@ def test_cols(): X_t = encoder.transform(X) X_expected = pd.DataFrame( { - "col_1": pd.Series([1, 2, 1, 1, 2]*2, dtype="int64"), - "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863]*2, - "col_3": pd.Series(["a", "a", "a", "a", "a"]*2, dtype="category"), + "col_1": pd.Series([1, 2, 1, 1, 2] * 2, dtype="int64"), + "col_2": [0.161365, 0.749863, 0.749863, 0.749863, 0.749863] * 2, + "col_3": pd.Series(["a", "a", "a", "a", "a"] * 2, dtype="category"), } ) assert_frame_equal(X_expected, X_t, check_less_precise=True) diff --git a/evalml/tests/component_tests/test_target_imputer.py b/evalml/tests/component_tests/test_target_imputer.py index 8a9ad5ce26..9bdf615b66 100644 --- a/evalml/tests/component_tests/test_target_imputer.py +++ b/evalml/tests/component_tests/test_target_imputer.py @@ -57,11 +57,15 @@ def test_target_imputer_mean(): (None, pd.Series([np.nan, 0, 5]), pd.Series([0, 0, 5])), ( None, - pd.Series([np.nan, "a", "b"]*5), - pd.Series(["missing_value", "a", "b"]*5).astype("category"), + pd.Series([np.nan, "a", "b"] * 5), + pd.Series(["missing_value", "a", "b"] * 5).astype("category"), ), (3, pd.Series([np.nan, 0, 5]), pd.Series([3, 0, 5])), - (3, pd.Series([np.nan, "a", "b"]*5), pd.Series([3, "a", "b"]*5).astype("category")), + ( + 3, + pd.Series([np.nan, "a", "b"] * 5), + pd.Series([3, "a", "b"] * 5).astype("category"), + ), ], ) def test_target_imputer_constant(fill_value, y, y_expected): @@ -71,9 +75,9 @@ def test_target_imputer_constant(fill_value, y, y_expected): def test_target_imputer_most_frequent(): - y = pd.Series([np.nan, "a", "b"]*5) + y = pd.Series([np.nan, "a", "b"] * 5) imputer = TargetImputer(impute_strategy="most_frequent") - y_expected = pd.Series(["a", "a", "b"]*5).astype("category") + y_expected = pd.Series(["a", "a", "b"] * 5).astype("category") _, y_t = imputer.fit_transform(None, y) assert_series_equal(y_expected, y_t, check_dtype=False) @@ -85,7 +89,7 @@ def 
test_target_imputer_most_frequent(): def test_target_imputer_col_with_non_numeric_with_numeric_strategy(): - y = pd.Series([np.nan, "a", "b"]*5) + y = pd.Series([np.nan, "a", "b"] * 5) imputer = TargetImputer(impute_strategy="mean") with pytest.raises( ValueError, match="Cannot use mean strategy with non-numeric data" @@ -190,16 +194,16 @@ def test_target_imputer_with_none(y, y_expected): "y, y_expected", [ ( - pd.Series(["b", "a", "a", None]*4, dtype="category"), - pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + pd.Series(["b", "a", "a", None] * 4, dtype="category"), + pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), ), ( pd.Series([True, None, False, True], dtype="category"), pd.Series([True, True, False, True], dtype="category"), ), ( - pd.Series(["b", "a", "a", None]*4), - pd.Series(["b", "a", "a", "a"]*4, dtype="category"), + pd.Series(["b", "a", "a", None] * 4), + pd.Series(["b", "a", "a", "a"] * 4, dtype="category"), ), ], ) diff --git a/evalml/tests/data_checks_tests/test_data_checks.py b/evalml/tests/data_checks_tests/test_data_checks.py index 529fc66d68..fc1179cc81 100644 --- a/evalml/tests/data_checks_tests/test_data_checks.py +++ b/evalml/tests/data_checks_tests/test_data_checks.py @@ -340,7 +340,12 @@ def test_default_data_checks_regression(input_type): X["nan_dt_col"][0] = None y = pd.Series([0.3, 100.0, np.nan, 1.0, 0.2]) y_no_variance = pd.Series([5] * 5) - X.ww.init(logical_types={"lots_of_null": "categorical", "natural_language_nan": "NaturalLanguage"}) + X.ww.init( + logical_types={ + "lots_of_null": "categorical", + "natural_language_nan": "NaturalLanguage", + } + ) if input_type == "ww": y = ww.init_series(y) y_no_variance = ww.init_series(y_no_variance) diff --git a/evalml/tests/data_checks_tests/test_id_columns_data_check.py b/evalml/tests/data_checks_tests/test_id_columns_data_check.py index f15a3c6f45..af5195e335 100644 --- a/evalml/tests/data_checks_tests/test_id_columns_data_check.py +++ b/evalml/tests/data_checks_tests/test_id_columns_data_check.py @@ -135,7 +135,14 @@ def test_id_columns_strings(): "col_6": [0.1, 0.2, 0.3, 0.4], } X = pd.DataFrame.from_dict(X_dict) - X.ww.init(logical_types={"col_1_id": "categorical", "col_2": "categorical", "Id": "categorical", "col_5": "categorical"}) + X.ww.init( + logical_types={ + "col_1_id": "categorical", + "col_2": "categorical", + "Id": "categorical", + "col_5": "categorical", + } + ) id_cols_check = IDColumnsDataCheck(id_threshold=0.95) assert id_cols_check.validate(X) == { "warnings": [ diff --git a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py index 23961985e4..208ffe9be3 100644 --- a/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py +++ b/evalml/tests/data_checks_tests/test_multicollinearity_data_check.py @@ -86,7 +86,15 @@ def test_multicollinearity_nonnumeric_cols(data_type, make_data_type): "col_6": [1, 1, 2, 3, 1], } ) - X.ww.init(logical_types={"col_1": "categorical", "col_2": "categorical", "col_3": "categorical", "col_4": "categorical", "col_5": "categorical"}) + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + "col_4": "categorical", + "col_5": "categorical", + } + ) multi_check = MulticollinearityDataCheck(threshold=0.9) assert multi_check.validate(X) == { diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py 
b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 9f847ca582..6664e630d5 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1263,7 +1263,9 @@ def transform_y_for_problem_type(problem_type, y): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = pipeline_class( component_graph=[ @@ -1305,7 +1307,13 @@ def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.set_types(logical_types={"provider": "NaturalLanguage", "currency": "categorical", "expiration_date": "categorical"}) + X.ww.set_types( + logical_types={ + "provider": "NaturalLanguage", + "currency": "categorical", + "expiration_date": "categorical", + } + ) component_graph = [ "Select Columns Transformer", "One Hot Encoder", @@ -1362,7 +1370,9 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = pipeline_class( component_graph=[ @@ -1417,7 +1427,9 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = { "SelectNumeric": ["Select Columns Transformer", "X", "y"], @@ -1477,7 +1489,9 @@ def test_categories_aggregated_but_not_those_that_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = [ "Select Columns Transformer", @@ -1521,7 +1535,9 @@ def test_categories_aggregated_when_some_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) component_graph = [ "Select Columns Transformer", @@ -1627,7 +1643,9 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], diff --git 
a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 8af3f12b70..f045dbc776 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -136,15 +136,15 @@ def test_partial_dependence_with_non_numeric_columns( ): X = pd.DataFrame( { - "numeric": [1, 2, 3, 0]*4, - "also numeric": [2, 3, 4, 1]*4, - "string": ["a", "b", "a", "c"]*4, - "also string": ["c", "b", "a", "c"]*4, + "numeric": [1, 2, 3, 0] * 4, + "also numeric": [2, 3, 4, 1] * 4, + "string": ["a", "b", "a", "c"] * 4, + "also string": ["c", "b", "a", "c"] * 4, } ) if data_type == "ww": X.ww.init() - y = [0, 0.2, 1.4, 1]*4 + y = [0, 0.2, 1.4, 1] * 4 pipeline = linear_regression_pipeline_class( parameters={"Linear Regressor": {"n_jobs": 1}} ) @@ -185,11 +185,11 @@ def test_partial_dependence_catboost( if problem_type == ProblemTypes.BINARY: X, y = X_y_binary - y_small = ["a", "b", "a"]*5 + y_small = ["a", "b", "a"] * 5 pipeline_class = BinaryClassificationPipeline else: X, y = X_y_multi - y_small = ["a", "b", "c"]*5 + y_small = ["a", "b", "c"] * 5 pipeline_class = MulticlassClassificationPipeline pipeline = pipeline_class( @@ -204,10 +204,10 @@ def test_partial_dependence_catboost( # test that CatBoost can natively handle non-numerical columns as feature passed to partial_dependence X = pd.DataFrame( { - "numeric": [1, 2, 3]*5, - "also numeric": [2, 3, 4]*5, - "string": ["a", "b", "c"]*5, - "also string": ["c", "b", "a"]*5, + "numeric": [1, 2, 3] * 5, + "also numeric": [2, 3, 4] * 5, + "string": ["a", "b", "c"] * 5, + "also string": ["c", "b", "a"] * 5, } ) pipeline = pipeline_class( diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index dfbe0f1c35..35ebcd9266 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -310,7 +310,9 @@ def test_fast_permutation_importance_matches_slow_output( "dependency not installed." 
) X, y = fraud_100 - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) if pipeline_class == LinearPipelineWithTextFeatures: X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) diff --git a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py index cd83f08b7d..88fcb2c6ac 100644 --- a/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py +++ b/evalml/tests/pipeline_tests/regression_pipeline_tests/test_regression.py @@ -86,9 +86,14 @@ def test_woodwork_regression_pipeline(diabetes_local, linear_regression_pipeline def test_custom_indices(): X = pd.DataFrame( - {"a": ["a", "b", "a", "a", "a", "c", "c", "c"]*3, "b": [0, 1, 1, 1, 1, 1, 0, 1]*3} + { + "a": ["a", "b", "a", "a", "a", "c", "c", "c"] * 3, + "b": [0, 1, 1, 1, 1, 1, 0, 1] * 3, + } + ) + y = pd.Series( + [0, 0, 0, 1, 0, 1, 0, 0] * 3, index=np.random.choice(24, 24, replace=False) ) - y = pd.Series([0, 0, 0, 1, 0, 1, 0, 0]*3, index=np.random.choice(24, 24, replace=False)) x1, x2, y1, y2 = split_data(X, y, problem_type="regression") pipeline = RegressionPipeline( component_graph=["Imputer", "One Hot Encoder", "Linear Regressor"], diff --git a/evalml/tests/pipeline_tests/test_component_graph.py b/evalml/tests/pipeline_tests/test_component_graph.py index 4246d0a028..253e212db3 100644 --- a/evalml/tests/pipeline_tests/test_component_graph.py +++ b/evalml/tests/pipeline_tests/test_component_graph.py @@ -1018,7 +1018,12 @@ def test_component_graph_dataset_with_different_types(): y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) X = infer_feature_types( - X, {"column_1": "categorical", "column_2": "categorical", "column_5": "NaturalLanguage"} + X, + { + "column_1": "categorical", + "column_2": "categorical", + "column_5": "NaturalLanguage", + }, ) component_graph = ComponentGraph(graph) @@ -1266,7 +1271,9 @@ def transform(self, X, y=None): y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) # woodwork would infer this as boolean by default -- convert to a numeric type - X.ww.init(logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"}) + X.ww.init( + logical_types={"column_1": "categorical"}, semantic_tags={"address": "address"} + ) component_graph = ComponentGraph(graph) # we don't have feature type selectors defined yet, so in order for the above graph to work we have to @@ -1336,7 +1343,9 @@ def test_component_graph_types_merge(): X["column_5"] = X["column_4"] X["column_6"] = [42.0] * len(X) y = pd.Series([1, 0, 1, 0, 1, 1, 0, 0, 0]) - X = infer_feature_types(X, {"column_1": "categorical", "column_5": "NaturalLanguage"}) + X = infer_feature_types( + X, {"column_1": "categorical", "column_5": "NaturalLanguage"} + ) component_graph = ComponentGraph(graph) # we don't have feature type selectors defined yet, so in order for the above graph to work we have to diff --git a/evalml/tests/pipeline_tests/test_pipeline_utils.py b/evalml/tests/pipeline_tests/test_pipeline_utils.py index 0574a30e58..eaf0a448e4 100644 --- a/evalml/tests/pipeline_tests/test_pipeline_utils.py +++ b/evalml/tests/pipeline_tests/test_pipeline_utils.py @@ -48,9 +48,10 @@ def _get_test_data_from_configuration( ): X_all = pd.DataFrame( { - "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]*2, + "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan] + * 2, "numerical": range(14), - 
"categorical": ["a", "b", "a", "b", "b", "a", "b"]*2, + "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2, "dates": pd.date_range("2000-02-03", periods=14, freq="W"), "text": [ "this is a string", @@ -60,7 +61,8 @@ def _get_test_data_from_configuration( "cats are gr8", "hello world", "evalml is gr8", - ]*2, + ] + * 2, "email": [ "abalone_0@gmail.com", "AbaloneRings@yahoo.com", @@ -69,7 +71,8 @@ def _get_test_data_from_configuration( "fooEMAIL@email.org", "evalml@evalml.org", "evalml@alteryx.org", - ]*2, + ] + * 2, "url": [ "https://evalml.alteryx.com/en/stable/", "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html", @@ -78,17 +81,18 @@ def _get_test_data_from_configuration( "https://www.evalml.alteryx.com/en/stable/demos/text_input.html", "https://github.com/alteryx/evalml", "https://github.com/alteryx/featuretools", - ]*2, + ] + * 2, } ) - y = pd.Series([0, 0, 1, 0, 0, 1, 1]*2) + y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2) if problem_type == ProblemTypes.MULTICLASS: - y = pd.Series([0, 2, 1, 2, 0, 2, 1]*2) + y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2) elif is_regression(problem_type): if lognormal_distribution: - y = pd.Series([1, 1, 1, 2, 3, 6, 9]*2) + y = pd.Series([1, 1, 1, 2, 3, 6, 9] * 2) else: - y = pd.Series([1, 2, 3, 3, 3, 4, 5]*2) + y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2) X = X_all[column_names] if input_type == "ww": From 644fa65d1ffdad5284fa7343b53581d972b57b75 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:55:30 +0100 Subject: [PATCH 23/36] lgbm, partial dep, permutation importance --- evalml/tests/component_tests/test_lgbm_regressor.py | 4 ++++ .../model_understanding_tests/test_partial_dependence.py | 1 + .../model_understanding_tests/test_permutation_importance.py | 1 + 3 files changed, 6 insertions(+) diff --git a/evalml/tests/component_tests/test_lgbm_regressor.py b/evalml/tests/component_tests/test_lgbm_regressor.py index 1390cd1e0d..e6beda8bdb 100644 --- a/evalml/tests/component_tests/test_lgbm_regressor.py +++ b/evalml/tests/component_tests/test_lgbm_regressor.py @@ -150,6 +150,8 @@ def test_multiple_fit(mock_predict): X1_fit = pd.DataFrame({"feature": ["a", "b", "c", "c"]}) X1_predict = pd.DataFrame({"feature": ["a", "a", "b", "c"]}) X1_predict_expected = pd.DataFrame({0: [0.0, 0.0, 1.0, 2.0]}, dtype="category") + X1_fit.ww.init(logical_types={"feature": "categorical"}) + X1_predict.ww.init(logical_types={"feature": "categorical"}) clf = LightGBMRegressor() clf.fit(X1_fit, y) @@ -160,6 +162,8 @@ def test_multiple_fit(mock_predict): X2_fit = pd.DataFrame({"feature": ["c", "b", "a", "d"]}) X2_predict = pd.DataFrame({"feature": ["d", "c", "b", "a"]}) X2_predict_expected = pd.DataFrame({0: [3.0, 2.0, 1.0, 0.0]}, dtype="category") + X2_fit.ww.init(logical_types={"feature": "categorical"}) + X2_predict.ww.init(logical_types={"feature": "categorical"}) clf = LightGBMRegressor() clf.fit(X2_fit, y) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index f045dbc776..3e95e3261c 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1139,6 +1139,7 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git 
a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 35ebcd9266..4cf8bbd319 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -626,6 +626,7 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) + X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From a3c5766d684d14fa22885427a798b74dcb96be19 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 13:55:58 +0100 Subject: [PATCH 24/36] lint fixes --- .../model_understanding_tests/test_partial_dependence.py | 4 +++- .../model_understanding_tests/test_permutation_importance.py | 4 +++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index 3e95e3261c..a51638fae4 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1139,7 +1139,9 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index 4cf8bbd319..09e673662d 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -626,7 +626,9 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) - X.ww.init(logical_types={"currency": "categorical", "expiration_date": "categorical"}) + X.ww.init( + logical_types={"currency": "categorical", "expiration_date": "categorical"} + ) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From b9cdc83a97305b2c277ed317e2d63a243685e66b Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 17:06:14 +0100 Subject: [PATCH 25/36] delayed features --- .../test_delayed_features_transformer.py | 61 ++++++------------- 1 file changed, 17 insertions(+), 44 deletions(-) diff --git a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 925556c100..584f17ca0e 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -86,6 +86,8 @@ def test_delayed_feature_extractor_maxdelay3_gap1( answer["feature"] = X.feature.astype("int64") if not encode_y_as_str: answer["target_delay_0"] = y_answer.astype("int64") + else: + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=3, gap=1).fit_transform(X=X, y=y) @@ -130,6 +132,8 @@ def test_delayed_feature_extractor_maxdelay5_gap1( "target_delay_5": y_answer.shift(5), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") assert_frame_equal( @@ 
-173,6 +177,8 @@ def test_delayed_feature_extractor_maxdelay3_gap7( "target_delay_3": y_answer.shift(3), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: answer["feature"] = X.feature.astype("int64") assert_frame_equal( @@ -193,49 +199,6 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) -@pytest.mark.parametrize("encode_X_as_str", [True, False]) -@pytest.mark.parametrize("encode_y_as_str", [True, False]) -def test_delayed_feature_extractor_numpy( - encode_X_as_str, encode_y_as_str, delayed_features_data -): - X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings( - X, y, encode_X_as_str, encode_y_as_str - ) - X_np = X.values - y_np = y.values - answer = pd.DataFrame( - { - 0: X.feature, - "0_delay_1": X_answer.feature.shift(1), - "0_delay_2": X_answer.feature.shift(2), - "0_delay_3": X_answer.feature.shift(3), - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - } - ) - if not encode_X_as_str: - answer[0] = X.feature.astype("int64") - assert_frame_equal( - answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np) - ) - - answer_only_y = pd.DataFrame( - { - "target_delay_0": y_answer.astype("int64"), - "target_delay_1": y_answer.shift(1), - "target_delay_2": y_answer.shift(2), - "target_delay_3": y_answer.shift(3), - } - ) - assert_frame_equal( - answer_only_y, - DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np), - ) - - @pytest.mark.parametrize( "delay_features,delay_target", [(False, True), (True, False), (False, False)] ) @@ -264,6 +227,8 @@ def test_lagged_feature_extractor_delay_features_delay_target( "target_delay_3": y_answer.shift(3), } ) + if encode_y_as_str: + y = y.astype("category") if not encode_X_as_str: all_delays["feature"] = X.feature.astype("int64") if not delay_features: @@ -307,7 +272,8 @@ def test_lagged_feature_extractor_delay_target( "target_delay_3": y_answer.shift(3), } ) - + if encode_y_as_str: + y = y.astype("category") transformer = DelayedFeatureTransformer( max_delay=3, gap=1, delay_features=delay_features, delay_target=delay_target ) @@ -372,6 +338,8 @@ def test_delay_feature_transformer_supports_custom_index( X = make_data_type(data_type, X) y = make_data_type(data_type, y) + if encode_y_as_str: + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X, y) @@ -407,6 +375,7 @@ def test_delay_feature_transformer_multiple_categorical_columns(delayed_features "target_delay_1": y_answer.shift(1), } ) + y = y.astype("category") assert_frame_equal( answer, DelayedFeatureTransformer(max_delay=1, gap=11).fit_transform(X, y) ) @@ -469,9 +438,13 @@ def test_delay_feature_transformer_woodwork_custom_overrides_returned_by_compone dft.fit(X, y) transformed = dft.transform(X, y) assert isinstance(transformed, pd.DataFrame) + + if logical_type == Boolean: + transformed.ww.init(logical_types={"0_delay_1": "categorical"}) transformed_logical_types = { k: type(v) for k, v in transformed.ww.logical_types.items() } + if logical_type in [Integer, Double, Categorical]: assert transformed_logical_types == { 0: logical_type, From eb0cca3705262723560a1cb65d0dce6b6cdd7cc1 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 18:49:08 +0100 Subject: [PATCH 26/36] email featurizer fix --- .../preprocessing/transform_primitive_components.py | 9 +-------- evalml/tests/conftest.py | 1 - 
.../model_understanding_tests/test_partial_dependence.py | 1 - 3 files changed, 1 insertion(+), 10 deletions(-) diff --git a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py index 17a2356bc3..ac93c18806 100644 --- a/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py +++ b/evalml/pipelines/components/transformers/preprocessing/transform_primitive_components.py @@ -72,20 +72,13 @@ def transform(self, X, y=None): es = self._make_entity_set(X_ww) features = ft.calculate_feature_matrix(features=self._features, entityset=es) - features.set_index(X_ww.index, inplace=True) X_ww = X_ww.ww.drop(self._columns) + features.ww.init(logical_types={col_: "categorical" for col_ in features}) for col in features: X_ww.ww[col] = features[col] - all_created_columns = self._get_feature_provenance().values() - to_categorical = { - col: "Categorical" - for feature_list in all_created_columns - for col in feature_list - } - X_ww.ww.set_types(to_categorical) return X_ww @staticmethod diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index 2572dc97fb..c1a96c1fd4 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -43,7 +43,6 @@ handle_problem_types, is_regression, ) -from evalml.utils import infer_feature_types def pytest_configure(config): diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py index a51638fae4..68d1574bc3 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -21,7 +21,6 @@ RegressionPipeline, ) from evalml.problem_types import ProblemTypes -from evalml.utils import infer_feature_types @pytest.fixture From 3fb872e26ad929ab9bfe6b7087df03b53b19baf2 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Mon, 30 Aug 2021 19:23:29 +0100 Subject: [PATCH 27/36] per column imputer --- .../tests/component_tests/test_per_column_imputer.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/evalml/tests/component_tests/test_per_column_imputer.py b/evalml/tests/component_tests/test_per_column_imputer.py index f97390f5d3..6d163a53f8 100644 --- a/evalml/tests/component_tests/test_per_column_imputer.py +++ b/evalml/tests/component_tests/test_per_column_imputer.py @@ -128,7 +128,7 @@ def test_non_numeric_errors(non_numeric_df): def test_non_numeric_valid(non_numeric_df): - X = non_numeric_df + X = non_numeric_df.copy() X.ww.init( logical_types={ "A": "categorical", @@ -153,10 +153,19 @@ def test_non_numeric_valid(non_numeric_df): X_t = transformer.fit_transform(X) assert_frame_equal(X_expected, X_t) + X = non_numeric_df.copy() # constant with all strings strategies = {"D": {"impute_strategy": "constant", "fill_value": 100}} transformer = PerColumnImputer(impute_strategies=strategies) + X.ww.init( + logical_types={ + "A": "categorical", + "B": "categorical", + "C": "categorical", + "D": "categorical", + } + ) X_expected = pd.DataFrame( [ ["a", "a", "a", "a"], From 8130180d499a90ed6be8e461e2ecc2027ce1fe72 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 13:49:35 +0100 Subject: [PATCH 28/36] change fraud100 --- evalml/tests/conftest.py | 9 ++++++++- .../test_explainers.py | 20 ------------------- .../test_partial_dependence.py | 3 --- .../test_permutation_importance.py | 6 ------ 4 
files changed, 8 insertions(+), 30 deletions(-) diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py index fe1d0f4ef5..f51df67529 100644 --- a/evalml/tests/conftest.py +++ b/evalml/tests/conftest.py @@ -990,7 +990,14 @@ def fraud_local(): @pytest.fixture def fraud_100(): X, y = load_fraud_local(n_rows=100) - X.ww.set_types(logical_types={"provider": "Categorical", "region": "Categorical"}) + X.ww.set_types( + logical_types={ + "provider": "Categorical", + "region": "Categorical", + "currency": "categorical", + "expiration_date": "categorical", + } + ) return X, y diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py index 9098b795dd..8d651427e1 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_explainers.py @@ -1263,9 +1263,6 @@ def transform_y_for_problem_type(problem_type, y): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_linear_pipeline(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = pipeline_class( component_graph=[ @@ -1310,8 +1307,6 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): X.ww.set_types( logical_types={ "provider": "NaturalLanguage", - "currency": "categorical", - "expiration_date": "categorical", } ) component_graph = [ @@ -1370,9 +1365,6 @@ def test_categories_aggregated_text(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = pipeline_class( component_graph=[ @@ -1427,9 +1419,6 @@ def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): @pytest.mark.parametrize("pipeline_class,estimator", pipeline_test_cases) def test_categories_aggregated_pca_dag(pipeline_class, estimator, fraud_100): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = { "SelectNumeric": ["Select Columns Transformer", "X", "y"], @@ -1489,9 +1478,6 @@ def test_categories_aggregated_but_not_those_that_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = [ "Select Columns Transformer", @@ -1535,9 +1521,6 @@ def test_categories_aggregated_when_some_are_dropped( pipeline_class, estimator, fraud_100 ): X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) component_graph = [ "Select Columns Transformer", @@ -1643,9 +1626,6 @@ def test_explain_predictions_oversampler(estimator, fraud_100): reason="Skipping test because imbalanced-learn not installed", ) X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline = BinaryClassificationPipeline( component_graph={ "Imputer": ["Imputer", "X", "y"], diff --git a/evalml/tests/model_understanding_tests/test_partial_dependence.py b/evalml/tests/model_understanding_tests/test_partial_dependence.py 
index 68d1574bc3..1942a8e64a 100644 --- a/evalml/tests/model_understanding_tests/test_partial_dependence.py +++ b/evalml/tests/model_understanding_tests/test_partial_dependence.py @@ -1138,9 +1138,6 @@ def test_partial_dependence_respect_grid_resolution(fraud_100): "Random Forest Classifier", ] ) - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pl.fit(X, y) dep = partial_dependence(pl, X, features="amount", grid_resolution=5) diff --git a/evalml/tests/model_understanding_tests/test_permutation_importance.py b/evalml/tests/model_understanding_tests/test_permutation_importance.py index b210ec22f2..f4e31d561b 100644 --- a/evalml/tests/model_understanding_tests/test_permutation_importance.py +++ b/evalml/tests/model_understanding_tests/test_permutation_importance.py @@ -310,9 +310,6 @@ def test_fast_permutation_importance_matches_slow_output( "dependency not installed." ) X, y = fraud_100 - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) if pipeline_class == LinearPipelineWithTextFeatures: X.ww.set_types(logical_types={"provider": "NaturalLanguage"}) @@ -626,9 +623,6 @@ def test_permutation_importance_oversampler(fraud_100): ], } ) - X.ww.init( - logical_types={"currency": "categorical", "expiration_date": "categorical"} - ) pipeline.fit(X=X, y=y) pipeline.predict(X) importance = calculate_permutation_importance( From 3b68cab0d72175d5920d932cade2cf3df2cd13de Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 15:18:36 +0100 Subject: [PATCH 29/36] permutation importance --- evalml/model_understanding/permutation_importance.py | 1 + evalml/pipelines/components/transformers/column_selectors.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/evalml/model_understanding/permutation_importance.py b/evalml/model_understanding/permutation_importance.py index a2420e63f9..e49053f212 100644 --- a/evalml/model_understanding/permutation_importance.py +++ b/evalml/model_understanding/permutation_importance.py @@ -293,6 +293,7 @@ def _shuffle_and_score_helper( col = X_permuted.iloc[shuffling_idx, col_idx] col.index = X_permuted.index X_permuted.iloc[:, col_idx] = col + X_permuted.ww.init(schema=X_features.ww.schema) if is_fast: feature_score = scorer(pipeline, X_permuted, X_features, y, objective) else: diff --git a/evalml/pipelines/components/transformers/column_selectors.py b/evalml/pipelines/components/transformers/column_selectors.py index 67a2bb66f5..52b46121a8 100644 --- a/evalml/pipelines/components/transformers/column_selectors.py +++ b/evalml/pipelines/components/transformers/column_selectors.py @@ -31,7 +31,7 @@ def _check_input_for_columns(self, X): missing_cols = set(cols) - set(column_names) if missing_cols: - raise ValueError("Columns of type {column_types} not found in input data.") + raise ValueError(f"Columns of type {missing_cols} not found in input data.") @abstractmethod def _modify_columns(self, cols, X, y=None): From 9128d9c8ae0f6fa88b117f46c31ee02f59cd4b88 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 15:34:40 +0100 Subject: [PATCH 30/36] model_understanding docs update --- docs/source/user_guide/model_understanding.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/model_understanding.ipynb b/docs/source/user_guide/model_understanding.ipynb index 570351bf67..ad5b910c8e 100644 --- a/docs/source/user_guide/model_understanding.ipynb +++ b/docs/source/user_guide/model_understanding.ipynb @@ 
-154,7 +154,8 @@ "outputs": [], "source": [ "X_fraud, y_fraud = evalml.demos.load_fraud(100, verbose=False)\n", - "X_fraud.ww.init(logical_types={\"provider\": \"Categorical\", 'region': \"Categorical\"})\n", + "X_fraud.ww.init(logical_types={\"provider\": \"Categorical\", 'region': \"Categorical\",\n", + " \"currency\": \"Categorical\", \"expiration_date\": \"Categorical\"})\n", "\n", "fraud_pipeline = BinaryClassificationPipeline([\"DateTime Featurization Component\",\"One Hot Encoder\", \"Random Forest Classifier\"])\n", "fraud_pipeline.fit(X_fraud, y_fraud)\n", From 6fcf20524255c91e04388960c1d40d15cf2ee673 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 16:16:00 +0100 Subject: [PATCH 31/36] data check update --- .../test_target_leakage_data_check.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py index 828a7e186d..9ffb4c3e83 100644 --- a/evalml/tests/data_checks_tests/test_target_leakage_data_check.py +++ b/evalml/tests/data_checks_tests/test_target_leakage_data_check.py @@ -378,6 +378,7 @@ def test_target_leakage_regression(): X["c"] = y / 10 X["d"] = [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0] X["e"] = ["a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m", "n", "o"] + X.ww.init(logical_types={"e": "categorical"}) expected = { "warnings": [ @@ -399,6 +400,12 @@ def test_target_leakage_regression(): message_code=DataCheckMessageCode.TARGET_LEAKAGE, details={"column": "c"}, ).to_dict(), + DataCheckWarning( + message="Column 'e' is 80.0% or more correlated with the target", + data_check_name=target_leakage_data_check_name, + message_code=DataCheckMessageCode.TARGET_LEAKAGE, + details={"column": "e"}, + ).to_dict(), ], "errors": [], "actions": [ @@ -411,6 +418,9 @@ def test_target_leakage_regression(): DataCheckAction( DataCheckActionCode.DROP_COL, metadata={"column": "c"} ).to_dict(), + DataCheckAction( + DataCheckActionCode.DROP_COL, metadata={"column": "e"} + ).to_dict(), ], } From caefd12e6110c23ad976ed98bb507d7ef26b392c Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 16:32:36 +0100 Subject: [PATCH 32/36] update objectives --- docs/source/user_guide/objectives.ipynb | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/source/user_guide/objectives.ipynb b/docs/source/user_guide/objectives.ipynb index c78ed15f02..b79dc9f6ce 100644 --- a/docs/source/user_guide/objectives.ipynb +++ b/docs/source/user_guide/objectives.ipynb @@ -69,7 +69,8 @@ "from evalml.objectives import F1\n", "\n", "X, y = load_fraud(n_rows=100)\n", - "X.ww.init(logical_types={\"provider\": \"Categorical\", \"region\": \"Categorical\"})\n", + "X.ww.init(logical_types={\"provider\": \"Categorical\", \"region\": \"Categorical\",\n", + " \"currency\": \"Categorical\", \"expiration_date\": \"Categorical\"})\n", "objective = F1()\n", "pipeline = BinaryClassificationPipeline(component_graph=['Simple Imputer', 'DateTime Featurization Component', 'One Hot Encoder', 'Random Forest Classifier'])\n", "pipeline.fit(X, y)\n", From a562c67ce9acdd001b235ab8de7e70de5dfc40d6 Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 20:59:56 +0100 Subject: [PATCH 33/36] test updates --- .../test_delayed_features_transformer.py | 40 +++++++++++++++++++ .../component_tests/test_lgbm_classifier.py | 8 ++-- 2 files changed, 44 insertions(+), 4 deletions(-) diff --git 
a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 584f17ca0e..2321a98693 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -199,6 +199,46 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) +def test_delayed_feature_extractor_numpy( + delayed_features_data +): + X, y = delayed_features_data + X, X_answer, y, y_answer = encode_X_y_as_strings( + X, y, False, False + ) + X_np = X.values + y_np = y.values + answer = pd.DataFrame( + { + 0: X.feature, + "0_delay_1": X_answer.feature.shift(1), + "0_delay_2": X_answer.feature.shift(2), + "0_delay_3": X_answer.feature.shift(3), + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + + assert_frame_equal( + answer, DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X_np, y_np) + ) + + answer_only_y = pd.DataFrame( + { + "target_delay_0": y_answer.astype("int64"), + "target_delay_1": y_answer.shift(1), + "target_delay_2": y_answer.shift(2), + "target_delay_3": y_answer.shift(3), + } + ) + assert_frame_equal( + answer_only_y, + DelayedFeatureTransformer(max_delay=3, gap=7).fit_transform(X=None, y=y_np), + ) + + @pytest.mark.parametrize( "delay_features,delay_target", [(False, True), (True, False), (False, False)] ) diff --git a/evalml/tests/component_tests/test_lgbm_classifier.py b/evalml/tests/component_tests/test_lgbm_classifier.py index 2a35d0ba7c..5784029a94 100644 --- a/evalml/tests/component_tests/test_lgbm_classifier.py +++ b/evalml/tests/component_tests/test_lgbm_classifier.py @@ -174,14 +174,14 @@ def test_correct_args(mock_predict, mock_predict_proba, X_y_binary): def test_categorical_data_subset(mock_predict, mock_predict_proba, X_y_binary): X = pd.DataFrame( { - "feature_1": [0, 0, 1, 1, 0, 1] * 2, - "feature_2": ["a", "a", "b", "b", "c", "c"] * 2, + "feature_1": [0, 0, 1, 1, 0, 1], + "feature_2": ["a", "a", "b", "b", "c", "c"], } ) X.ww.init(logical_types={"feature_2": "categorical"}) - y = pd.Series([1, 1, 0, 0, 0, 1] * 2) + y = pd.Series([1, 1, 0, 0, 0, 1]) X_expected = pd.DataFrame( - {0: [0, 0, 1, 1, 0, 1] * 2, 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0] * 2} + {0: [0, 0, 1, 1, 0, 1], 1: [0.0, 0.0, 1.0, 1.0, 2.0, 2.0]} ) X_expected.iloc[:, 1] = X_expected.iloc[:, 1].astype("category") From 482a0d6fcfb3d9a884562b9bb9b34c6b0ab746bc Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 21:25:16 +0100 Subject: [PATCH 34/36] more updates --- .../component_tests/test_delayed_features_transformer.py | 8 ++------ .../prediction_explanations_tests/test_force_plots.py | 1 - 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/evalml/tests/component_tests/test_delayed_features_transformer.py b/evalml/tests/component_tests/test_delayed_features_transformer.py index 2321a98693..b4245a3a8a 100644 --- a/evalml/tests/component_tests/test_delayed_features_transformer.py +++ b/evalml/tests/component_tests/test_delayed_features_transformer.py @@ -199,13 +199,9 @@ def test_delayed_feature_extractor_maxdelay3_gap7( ) -def test_delayed_feature_extractor_numpy( - delayed_features_data -): +def test_delayed_feature_extractor_numpy(delayed_features_data): X, y = delayed_features_data - X, X_answer, y, y_answer = encode_X_y_as_strings( - X, y, False, False - ) + X, X_answer, y, y_answer = encode_X_y_as_strings(X, y, False, 
False) X_np = X.values y_np = y.values answer = pd.DataFrame( diff --git a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py index b08f8ccc26..48b864a616 100644 --- a/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py +++ b/evalml/tests/model_understanding_tests/prediction_explanations_tests/test_force_plots.py @@ -217,7 +217,6 @@ def test_force_plot_regression( def test_categories_aggregated_date_ohe(pipeline_class, estimator, fraud_100): X, y = fraud_100 columns_to_select = ["datetime", "amount", "provider", "currency"] - X.ww.init(logical_types={"currency": "categorical"}) pipeline = pipeline_class( component_graph=[ From 3a353ea1bc664cd212deddeea1d9376237767dfe Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 22:04:34 +0100 Subject: [PATCH 35/36] featuretools upgrade --- evalml/tests/component_tests/test_imputer.py | 1 - .../component_tests/test_one_hot_encoder.py | 76 +++++-------------- .../latest_dependency_versions.txt | 2 +- 3 files changed, 21 insertions(+), 58 deletions(-) diff --git a/evalml/tests/component_tests/test_imputer.py b/evalml/tests/component_tests/test_imputer.py index 608f62c02b..7024e05807 100644 --- a/evalml/tests/component_tests/test_imputer.py +++ b/evalml/tests/component_tests/test_imputer.py @@ -144,7 +144,6 @@ def test_categorical_only_input(imputer_test_data): ) imputer = Imputer() - imputer.fit(X, y) transformed = imputer.fit_transform(X, y) assert_frame_equal(transformed, expected, check_dtype=False) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 36a5cad415..9f5ba9c1e5 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -17,6 +17,17 @@ from evalml.utils import get_random_seed, infer_feature_types +def set_first_three_columns_to_categorical(X): + X.ww.init( + logical_types={ + "col_1": "categorical", + "col_2": "categorical", + "col_3": "categorical", + } + ) + return X + + def test_init(): parameters = { "top_n": 10, @@ -166,13 +177,7 @@ def test_drop_first(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop="first", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -181,6 +186,7 @@ def test_drop_first(): assert col_names == expected_col_names + def test_drop_binary(): X = pd.DataFrame( { @@ -189,13 +195,7 @@ def test_drop_binary(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop="if_binary", handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -212,13 +212,7 @@ def test_drop_parameter_is_array(): "col_3": ["a", "a", "a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=None, drop=["b", "c", "a"], handle_unknown="error") encoder.fit(X) X_t = encoder.transform(X) @@ -237,13 +231,7 @@ def test_drop_binary_and_top_n_2(): "col_3": ["a", "a", 
"a", "a", "a"], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(top_n=2, drop="if_binary") encoder.fit(X) X_t = encoder.transform(X) @@ -261,13 +249,7 @@ def test_handle_unknown(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) encoder = OneHotEncoder(handle_unknown="error") encoder.fit(X) assert isinstance(encoder.transform(X), pd.DataFrame) @@ -334,13 +316,7 @@ def test_categories(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) categories = [["a", "b", "c", "d"], ["a", "b", "c"], ["a", "b"]] @@ -415,13 +391,7 @@ def test_more_top_n_unique_values(): "col_4": [2, 0, 1, 3, 0, 1, 2], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) random_seed = 2 @@ -464,13 +434,7 @@ def test_more_top_n_unique_values_large(): "col_4": [2, 0, 1, 3, 0, 1, 2, 4, 1], } ) - X.ww.init( - logical_types={ - "col_1": "categorical", - "col_2": "categorical", - "col_3": "categorical", - } - ) + X = set_first_three_columns_to_categorical(X) random_seed = 2 encoder = OneHotEncoder(top_n=3, random_seed=random_seed) diff --git a/evalml/tests/dependency_update_check/latest_dependency_versions.txt b/evalml/tests/dependency_update_check/latest_dependency_versions.txt index 47683d73c1..14a285469e 100644 --- a/evalml/tests/dependency_update_check/latest_dependency_versions.txt +++ b/evalml/tests/dependency_update_check/latest_dependency_versions.txt @@ -3,7 +3,7 @@ click==8.0.1 cloudpickle==1.6.0 colorama==0.4.4 dask==2021.8.1 -featuretools==0.26.2 +featuretools==0.27.0 graphviz==0.17 imbalanced-learn==0.8.0 ipywidgets==7.6.3 From 61c91eee8b2c925ee663512dbf8477d093b4b36f Mon Sep 17 00:00:00 2001 From: Parthiv Naresh Date: Tue, 31 Aug 2021 22:08:50 +0100 Subject: [PATCH 36/36] lint fix --- evalml/tests/component_tests/test_one_hot_encoder.py | 1 - 1 file changed, 1 deletion(-) diff --git a/evalml/tests/component_tests/test_one_hot_encoder.py b/evalml/tests/component_tests/test_one_hot_encoder.py index 9f5ba9c1e5..f382bcd257 100644 --- a/evalml/tests/component_tests/test_one_hot_encoder.py +++ b/evalml/tests/component_tests/test_one_hot_encoder.py @@ -186,7 +186,6 @@ def test_drop_first(): assert col_names == expected_col_names - def test_drop_binary(): X = pd.DataFrame( {