Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statically set woodwork typing in tests #3697

Merged
merged 25 commits into main from 3651_ww_hardening
Sep 16, 2022
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e22954b
Remove unnecessary ww init
eccabay Aug 30, 2022
a0c38d9
Replace ts_data with get_ts_X_y in all cases
eccabay Aug 31, 2022
b7598d8
Explicitly set ww types in get_ts_X_y
eccabay Aug 31, 2022
30acb05
Update imputer_test_data to set ww types
eccabay Sep 1, 2022
50768db
downcast_nullable_types works for dataframe and series
eccabay Sep 1, 2022
f1cbddf
Rename get_ts_X_y and cols for ease of use
eccabay Sep 1, 2022
39f2db2
Update time series featurizer tests for ww and have ts featurizer exp…
eccabay Sep 2, 2022
9a24f19
Update data check tests to explicitly set typing
eccabay Sep 6, 2022
9b74832
Update X_y_binary/multi/regression to be ww instead of numpy
eccabay Sep 6, 2022
df08067
Update X_y_categorical_classification/regression to init ww
eccabay Sep 7, 2022
008967f
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 7, 2022
927cf52
Update release notes
eccabay Sep 7, 2022
aa88b20
Add test for downcast_nullable_types
eccabay Sep 7, 2022
c5e16f7
Fix downcast_nullable_types and test
eccabay Sep 7, 2022
2063785
lint fix
eccabay Sep 7, 2022
6f6defc
Small updates to reduce merge conflicts with ww 0.18.0 upgrade
eccabay Sep 9, 2022
c6a1c47
Fix a few missing changes
eccabay Sep 9, 2022
c35252f
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 13, 2022
c26e508
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 13, 2022
e7134e8
lint fix
eccabay Sep 13, 2022
6ca9142
more lint
eccabay Sep 13, 2022
eaad897
Merge branch 'main' into 3651_ww_hardening
chukarsten Sep 15, 2022
fd108a3
Merge branch 'main' into 3651_ww_hardening
chukarsten Sep 16, 2022
8b7cc18
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 16, 2022
56eb074
Update downcast_nullable_types for series consistency
eccabay Sep 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ Release Notes
* Removed multiple samplers occurring in pipelines generated by ``DefaultAlgorithm`` :pr:`3696`
* Fix search order changing when using ``DefaultAlgorithm`` :pr:`3704`
* Changes
* Added support for using ``downcast_nullable_types`` with Series as well as DataFrames :pr:`3697`
* Documentation Changes
* Testing Changes
* Updated pytest fixtures and brittle test files to explicitly set woodwork typing information :pr:`3697`

.. warning::

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import woodwork as ww

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
from evalml.utils import downcast_nullable_types, infer_feature_types


class TimeSeriesImputer(Transformer):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def _compute_rolling_transforms(self, X, y, original_features):
if y is not None and "numeric" in y.ww.semantic_tags:
data[f"target_rolling_mean"] = rolling_mean(y.index, y)
data.index = X.index
data.ww.init()
data.ww.init(
logical_types={col: "Double" for col in data.columns},
)
return data

def _compute_delays(self, X_ww, y):
Expand Down Expand Up @@ -258,7 +260,7 @@ def _compute_delays(self, X_ww, y):
# Features created from categorical columns should no longer be categorical
lagged_features = pd.DataFrame(lagged_features)
lagged_features.ww.init(
logical_types={col: "Double" for col in cols_derived_from_categoricals},
logical_types={col: "Double" for col in lagged_features.columns},
Comment on lines -261 to +263
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not confident in the correctness of this change — I'm unsure how we handle all types here, and whether everything does in fact become a Double. If anyone knows better about this, please let me know.

)
lagged_features.index = X_ww.index
return ww.concat_columns([X_ww, lagged_features])
Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def transform_all_but_final(self, X, y=None, X_train=None, y_train=None):
y_train,
)
features = super().transform_all_but_final(padded_features, padded_target)
features_holdout = features.iloc[-len(y) :]
features_holdout = features.ww.iloc[-len(y) :]
return features_holdout

def predict_in_sample(self, X, y, X_train, y_train, objective=None):
Expand Down
25 changes: 5 additions & 20 deletions evalml/tests/automl_tests/parallel_tests/test_automl_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,29 +285,14 @@ def test_automl_closes_engines(engine_str, X_y_binary_cls):
def test_score_pipelines_passes_X_train_y_train(
problem_type,
engine_str,
X_y_binary,
X_y_regression,
X_y_multi,
AutoMLTestEnv,
ts_data_binary,
ts_data_multi,
X_y_based_on_pipeline_or_problem_type,
ts_data,
AutoMLTestEnv,
):
if is_binary(problem_type):
if is_time_series(problem_type):
X, y = ts_data_binary
else:
X, y = X_y_binary
elif is_multiclass(problem_type):
if is_time_series(problem_type):
X, y = ts_data_multi
else:
X, y = X_y_multi
if is_time_series(problem_type):
X, _, y = ts_data(problem_type=problem_type)
else:
if is_time_series(problem_type):
X, y = ts_data
else:
X, y = X_y_regression
X, y = X_y_based_on_pipeline_or_problem_type(problem_type)

half = X.shape[0] // 2
X_train, y_train = pd.DataFrame(X[:half]), pd.Series(y[:half])
Expand Down
74 changes: 30 additions & 44 deletions evalml/tests/automl_tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1944,15 +1944,13 @@ def test_percent_better_than_baseline_in_rankings(
dummy_classifier_estimator_class,
dummy_regressor_estimator_class,
dummy_time_series_regressor_estimator_class,
ts_data_binary,
ts_data,
X_y_multi,
):
if not objective.is_defined_for_problem_type(problem_type_value):
pytest.skip("Skipping because objective is not defined for problem type")

X, y = ts_data_binary
if problem_type_value == ProblemTypes.MULTICLASS:
X, y = X_y_multi
X, _, y = ts_data(problem_type=problem_type_value)

estimator = {
ProblemTypes.BINARY: dummy_classifier_estimator_class,
Expand Down Expand Up @@ -2031,12 +2029,7 @@ class Pipeline2(DummyPipeline):
max_iterations=3,
objective=objective,
additional_objectives=[],
problem_configuration={
"time_index": "date",
"gap": 0,
"max_delay": 0,
"forecast_horizon": 2,
},
problem_configuration=pipeline_parameters["pipeline"],
train_best_pipeline=False,
n_jobs=1,
)
Expand Down Expand Up @@ -2134,9 +2127,9 @@ def test_percent_better_than_baseline_computed_for_all_objectives(
dummy_classifier_estimator_class,
dummy_regressor_estimator_class,
dummy_time_series_regressor_estimator_class,
ts_data_binary,
ts_data,
):
X, y = ts_data_binary
X, _, y = ts_data(problem_type=problem_type)

problem_type_enum = handle_problem_types(problem_type)

Expand Down Expand Up @@ -2270,7 +2263,7 @@ def fit(self, *args, **kwargs):


def test_time_series_regression_with_parameters(ts_data):
X, y = ts_data
X, _, y = ts_data()
X.index.name = "date"
problem_configuration = {
"time_index": "date",
Expand Down Expand Up @@ -2879,8 +2872,7 @@ def test_automl_woodwork_user_types_preserved(


def test_automl_validates_problem_configuration(ts_data):
_, y = ts_data
X = pd.DataFrame(pd.date_range("2020-10-01", "2020-10-31"), columns=["Date"])
X, _, y = ts_data()
assert (
AutoMLSearch(X_train=X, y_train=y, problem_type="binary").problem_configuration
== {}
Expand Down Expand Up @@ -2937,14 +2929,14 @@ def test_automl_validates_problem_configuration(ts_data):
y_train=y,
problem_type="time series regression",
problem_configuration={
"time_index": "Date",
"time_index": "date",
"max_delay": 2,
"gap": 3,
"forecast_horizon": 2,
},
).problem_configuration
assert problem_config == {
"time_index": "Date",
"time_index": "date",
"max_delay": 2,
"gap": 3,
"forecast_horizon": 2,
Expand Down Expand Up @@ -3076,7 +3068,7 @@ def test_automl_rerun(AutoMLTestEnv, X_y_binary, caplog):

def test_timeseries_baseline_init_with_correct_gap_max_delay(AutoMLTestEnv, ts_data):

X, y = ts_data
X, _, y = ts_data()
automl = AutoMLSearch(
X_train=X,
y_train=y,
Expand Down Expand Up @@ -4035,9 +4027,13 @@ def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_typ
baseline.fit(X_train, y_train)

expected_predictions = y.shift(1)[4:]
expected_predictions = expected_predictions.astype("int64")
expected_predictions = expected_predictions
if problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
expected_predictions = pd.Series(expected_predictions, name="target_delay_1")
expected_predictions = pd.Series(
expected_predictions,
name="target_delay_1",
dtype="int64",
)

preds = baseline.predict(X_validation, None, X_train, y_train)
pd.testing.assert_series_equal(expected_predictions, preds)
Expand Down Expand Up @@ -4133,10 +4129,8 @@ def test_automl_thresholding_train_pipelines(mock_objective, threshold, X_y_bina
def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog):
caplog.clear()
X, y = X_y_binary
X = pd.DataFrame(X)
for col in columns:
X[col] = pd.Series(range(len(X)))
X.ww.init()
X.ww[col] = pd.Series(range(len(X)))
X.ww.set_types({col: "Unknown" for col in columns})
automl = AutoMLSearch(
X_train=X,
Expand Down Expand Up @@ -4534,26 +4528,18 @@ def test_baseline_pipeline_properly_initalized(
@pytest.mark.parametrize(
"problem_type",
[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.TIME_SERIES_MULTICLASS,
ProblemTypes.TIME_SERIES_BINARY,
"time series regression",
"time series multiclass",
"time series binary",
],
)
def test_automl_passes_known_in_advance_pipeline_parameters_to_all_pipelines(
problem_type,
ts_data_binary,
ts_data_multi,
ts_data,
AutoMLTestEnv,
):
if problem_type == ProblemTypes.TIME_SERIES_MULTICLASS:
X, y = ts_data_multi
elif problem_type == ProblemTypes.TIME_SERIES_BINARY:
X, y = ts_data_binary
else:
X, y = ts_data
Comment on lines -4560 to -4565
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice refactoring 👍

X, _, y = ts_data(problem_type=problem_type)

X.ww.init()
X.ww["email"] = pd.Series(["[email protected]"] * X.shape[0], index=X.index)
X.ww["category"] = pd.Series(["a"] * X.shape[0], index=X.index)
X.ww.set_types({"email": "EmailAddress", "category": "Categorical"})
Expand Down Expand Up @@ -4588,7 +4574,7 @@ def test_automl_passes_known_in_advance_pipeline_parameters_to_all_pipelines(
lambda d: d["Not Known In Advance Pipeline - Select Columns Transformer"][
"columns"
]
== ["features", "date"],
== ["feature", "date"],
).all()


Expand Down Expand Up @@ -4633,10 +4619,10 @@ def test_cv_validation_scores(


def test_cv_validation_scores_time_series(
ts_data_binary,
ts_data,
AutoMLTestEnv,
):
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
problem_configuration = {
"time_index": "date",
"gap": 0,
Expand Down Expand Up @@ -4678,7 +4664,7 @@ def test_search_parameters_held_automl(
algorithm,
batches,
X_y_binary,
ts_data_binary,
ts_data,
):
if problem_type == "binary":
X, y = X_y_binary
Expand All @@ -4695,7 +4681,7 @@ def test_search_parameters_held_automl(
},
}
else:
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
problem_configuration = {
"time_index": "date",
"gap": 0,
Expand Down Expand Up @@ -4781,9 +4767,9 @@ def test_automl_accepts_features(
AutoMLTestEnv,
):
X, y = X_y_binary
X_pd = pd.DataFrame(X)
X_pd.columns = X_pd.columns.astype(str)
X_transform = X_pd.iloc[len(X) // 3 :]
X = pd.DataFrame(X) # Drop ww information since setting column types fails
X.columns = X.columns.astype(str)
X_transform = X.iloc[len(X) // 3 :]

if features == "with_features_provided":
es = ft.EntitySet()
Expand Down Expand Up @@ -4839,7 +4825,7 @@ def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
is_using_windows,
):

X, y = ts_data
X, _, y = ts_data()

env = AutoMLTestEnv("time series regression")
automl = AutoMLSearch(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
AutoMLTestEnv,
):
X, y = X_y_multi
X = pd.DataFrame(X, columns=[f"Column_{i}" for i in range(20)])
X.ww.columns = [f"Column_{i}" for i in range(20)]

component_graph_ = None
search_parameters_ = {}
Expand Down
16 changes: 6 additions & 10 deletions evalml/tests/automl_tests/test_automl_search_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,12 +733,11 @@ def test_max_time(X_y_binary):
)
def test_automl_supports_time_series_classification(
problem_type,
ts_data_binary,
ts_data_multi,
ts_data,
AutoMLTestEnv,
):
X, _, y = ts_data(problem_type=problem_type)
if problem_type == ProblemTypes.TIME_SERIES_BINARY:
X, y = ts_data_binary
baseline = TimeSeriesBinaryClassificationPipeline(
component_graph=["Time Series Baseline Estimator"],
parameters={
Expand All @@ -757,9 +756,7 @@ def test_automl_supports_time_series_classification(
},
)
score_return_value = {"Log Loss Binary": 0.2}
problem_type = "time series binary"
else:
X, y = ts_data_multi
baseline = TimeSeriesMulticlassClassificationPipeline(
component_graph=["Time Series Baseline Estimator"],
parameters={
Expand All @@ -778,7 +775,6 @@ def test_automl_supports_time_series_classification(
},
)
score_return_value = {"Log Loss Multiclass": 0.25}
problem_type = "time series multiclass"

configuration = {
"time_index": "date",
Expand Down Expand Up @@ -819,12 +815,12 @@ def test_automl_time_series_classification_threshold(
mock_split_data,
optimize,
objective,
ts_data_binary,
ts_data,
AutoMLTestEnv,
):
X, y = ts_data_binary
score_return_value = {objective: 0.4}
problem_type = "time series binary"
X, _, y = ts_data(problem_type=problem_type)

configuration = {
"time_index": "date",
Expand Down Expand Up @@ -1197,7 +1193,7 @@ def test_time_series_pipeline_parameter_warnings(
search_parameters,
set_values,
AutoMLTestEnv,
ts_data_binary,
ts_data,
):
search_parameters.update(
{
Expand All @@ -1209,7 +1205,7 @@ def test_time_series_pipeline_parameter_warnings(
},
},
)
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
configuration = {
"time_index": "date",
"gap": 0,
Expand Down
3 changes: 1 addition & 2 deletions evalml/tests/automl_tests/test_automl_search_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,7 @@ def test_log_metrics_only_passed_directly(X_y_regression):

@pytest.mark.parametrize("freq", ["D", "MS"])
def test_automl_supports_time_series_regression(freq, AutoMLTestEnv, ts_data):
X, y = ts_data
X["date"] = pd.date_range(start="1/1/2018", periods=31, freq=freq)
X, _, y = ts_data(freq=freq)

configuration = {
"time_index": "date",
Expand Down
Loading