Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Statically set woodwork typing in tests #3697

Merged
merged 25 commits into main from 3651_ww_hardening
Sep 16, 2022
Merged
Show file tree
Hide file tree
Changes from 19 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
e22954b
Remove unnecessary ww init
eccabay Aug 30, 2022
a0c38d9
Replace ts_data with get_ts_X_y in all cases
eccabay Aug 31, 2022
b7598d8
Explicitly set ww types in get_ts_X_y
eccabay Aug 31, 2022
30acb05
Update imputer_test_data to set ww types
eccabay Sep 1, 2022
50768db
downcast_nullable_types works for dataframe and series
eccabay Sep 1, 2022
f1cbddf
Rename get_ts_X_y and cols for ease of use
eccabay Sep 1, 2022
39f2db2
Update time series featurizer tests for ww and have ts featurizer exp…
eccabay Sep 2, 2022
9a24f19
Update data check tests to explicitly set typing
eccabay Sep 6, 2022
9b74832
Update X_y_binary/multi/regression to be ww instead of numpy
eccabay Sep 6, 2022
df08067
Update X_y_categorical_classification/regression to init ww
eccabay Sep 7, 2022
008967f
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 7, 2022
927cf52
Update release notes
eccabay Sep 7, 2022
aa88b20
Add test for downcast_nullable_types
eccabay Sep 7, 2022
c5e16f7
Fix downcast_nullable_types and test
eccabay Sep 7, 2022
2063785
lint fix
eccabay Sep 7, 2022
6f6defc
Small updates to reduce merge conflicts with ww 0.18.0 upgrade
eccabay Sep 9, 2022
c6a1c47
Fix a few missing changes
eccabay Sep 9, 2022
c35252f
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 13, 2022
c26e508
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 13, 2022
e7134e8
lint fix
eccabay Sep 13, 2022
6ca9142
more lint
eccabay Sep 13, 2022
eaad897
Merge branch 'main' into 3651_ww_hardening
chukarsten Sep 15, 2022
fd108a3
Merge branch 'main' into 3651_ww_hardening
chukarsten Sep 16, 2022
8b7cc18
Merge branch 'main' into 3651_ww_hardening
eccabay Sep 16, 2022
56eb074
Update downcast_nullable_types for series consistency
eccabay Sep 16, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,10 @@ Release Notes
* Removed multiple samplers occurring in pipelines generated by ``DefaultAlgorithm`` :pr:`3696`
* Fix search order changing when using ``DefaultAlgorithm`` :pr:`3704`
* Changes
* Added support for using ``downcast_nullable_types`` with Series as well as DataFrames :pr:`3697`
* Documentation Changes
* Testing Changes
* Updated pytest fixtures and brittle test files to explicitly set woodwork typing information :pr:`3697`

.. warning::

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@
import woodwork as ww

from evalml.pipelines.components.transformers import Transformer
from evalml.utils import infer_feature_types
from evalml.utils import downcast_nullable_types, infer_feature_types


class TimeSeriesImputer(Transformer):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,9 @@ def _compute_rolling_transforms(self, X, y, original_features):
if y is not None and "numeric" in y.ww.semantic_tags:
data[f"target_rolling_mean"] = rolling_mean(y.index, y)
data.index = X.index
data.ww.init()
data.ww.init(
logical_types={col: "Double" for col in data.columns},
)
return data

def _compute_delays(self, X_ww, y):
Expand Down Expand Up @@ -258,7 +260,7 @@ def _compute_delays(self, X_ww, y):
# Features created from categorical columns should no longer be categorical
lagged_features = pd.DataFrame(lagged_features)
lagged_features.ww.init(
logical_types={col: "Double" for col in cols_derived_from_categoricals},
logical_types={col: "Double" for col in lagged_features.columns},
Comment on lines -261 to +263
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm not confident in the correctness of this change — I'm unsure how we handle all types here, and whether everything does in fact become a Double. If anyone knows better about this, please let me know.

)
lagged_features.index = X_ww.index
return ww.concat_columns([X_ww, lagged_features])
Expand Down
2 changes: 1 addition & 1 deletion evalml/pipelines/time_series_pipeline_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def transform_all_but_final(self, X, y=None, X_train=None, y_train=None):
y_train,
)
features = super().transform_all_but_final(padded_features, padded_target)
features_holdout = features.iloc[-len(y) :]
features_holdout = features.ww.iloc[-len(y) :]
return features_holdout

def predict_in_sample(self, X, y, X_train, y_train, objective=None):
Expand Down
25 changes: 5 additions & 20 deletions evalml/tests/automl_tests/parallel_tests/test_automl_dask.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,29 +285,14 @@ def test_automl_closes_engines(engine_str, X_y_binary_cls):
def test_score_pipelines_passes_X_train_y_train(
problem_type,
engine_str,
X_y_binary,
X_y_regression,
X_y_multi,
AutoMLTestEnv,
ts_data_binary,
ts_data_multi,
X_y_based_on_pipeline_or_problem_type,
ts_data,
AutoMLTestEnv,
):
if is_binary(problem_type):
if is_time_series(problem_type):
X, y = ts_data_binary
else:
X, y = X_y_binary
elif is_multiclass(problem_type):
if is_time_series(problem_type):
X, y = ts_data_multi
else:
X, y = X_y_multi
if is_time_series(problem_type):
X, _, y = ts_data(problem_type=problem_type)
else:
if is_time_series(problem_type):
X, y = ts_data
else:
X, y = X_y_regression
X, y = X_y_based_on_pipeline_or_problem_type(problem_type)

half = X.shape[0] // 2
X_train, y_train = pd.DataFrame(X[:half]), pd.Series(y[:half])
Expand Down
74 changes: 30 additions & 44 deletions evalml/tests/automl_tests/test_automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -1944,15 +1944,13 @@ def test_percent_better_than_baseline_in_rankings(
dummy_classifier_estimator_class,
dummy_regressor_estimator_class,
dummy_time_series_regressor_estimator_class,
ts_data_binary,
ts_data,
X_y_multi,
):
if not objective.is_defined_for_problem_type(problem_type_value):
pytest.skip("Skipping because objective is not defined for problem type")

X, y = ts_data_binary
if problem_type_value == ProblemTypes.MULTICLASS:
X, y = X_y_multi
X, _, y = ts_data(problem_type=problem_type_value)

estimator = {
ProblemTypes.BINARY: dummy_classifier_estimator_class,
Expand Down Expand Up @@ -2031,12 +2029,7 @@ class Pipeline2(DummyPipeline):
max_iterations=3,
objective=objective,
additional_objectives=[],
problem_configuration={
"time_index": "date",
"gap": 0,
"max_delay": 0,
"forecast_horizon": 2,
},
problem_configuration=pipeline_parameters["pipeline"],
train_best_pipeline=False,
n_jobs=1,
)
Expand Down Expand Up @@ -2134,9 +2127,9 @@ def test_percent_better_than_baseline_computed_for_all_objectives(
dummy_classifier_estimator_class,
dummy_regressor_estimator_class,
dummy_time_series_regressor_estimator_class,
ts_data_binary,
ts_data,
):
X, y = ts_data_binary
X, _, y = ts_data(problem_type=problem_type)

problem_type_enum = handle_problem_types(problem_type)

Expand Down Expand Up @@ -2270,7 +2263,7 @@ def fit(self, *args, **kwargs):


def test_time_series_regression_with_parameters(ts_data):
X, y = ts_data
X, _, y = ts_data()
X.index.name = "date"
problem_configuration = {
"time_index": "date",
Expand Down Expand Up @@ -2879,8 +2872,7 @@ def test_automl_woodwork_user_types_preserved(


def test_automl_validates_problem_configuration(ts_data):
_, y = ts_data
X = pd.DataFrame(pd.date_range("2020-10-01", "2020-10-31"), columns=["Date"])
X, _, y = ts_data()
assert (
AutoMLSearch(X_train=X, y_train=y, problem_type="binary").problem_configuration
== {}
Expand Down Expand Up @@ -2937,14 +2929,14 @@ def test_automl_validates_problem_configuration(ts_data):
y_train=y,
problem_type="time series regression",
problem_configuration={
"time_index": "Date",
"time_index": "date",
"max_delay": 2,
"gap": 3,
"forecast_horizon": 2,
},
).problem_configuration
assert problem_config == {
"time_index": "Date",
"time_index": "date",
"max_delay": 2,
"gap": 3,
"forecast_horizon": 2,
Expand Down Expand Up @@ -3076,7 +3068,7 @@ def test_automl_rerun(AutoMLTestEnv, X_y_binary, caplog):

def test_timeseries_baseline_init_with_correct_gap_max_delay(AutoMLTestEnv, ts_data):

X, y = ts_data
X, _, y = ts_data()
automl = AutoMLSearch(
X_train=X,
y_train=y,
Expand Down Expand Up @@ -4035,9 +4027,13 @@ def test_automl_baseline_pipeline_predictions_and_scores_time_series(problem_typ
baseline.fit(X_train, y_train)

expected_predictions = y.shift(1)[4:]
expected_predictions = expected_predictions.astype("int64")
expected_predictions = expected_predictions
if problem_type != ProblemTypes.TIME_SERIES_REGRESSION:
expected_predictions = pd.Series(expected_predictions, name="target_delay_1")
expected_predictions = pd.Series(
expected_predictions,
name="target_delay_1",
dtype="int64",
)

preds = baseline.predict(X_validation, None, X_train, y_train)
pd.testing.assert_series_equal(expected_predictions, preds)
Expand Down Expand Up @@ -4133,10 +4129,8 @@ def test_automl_thresholding_train_pipelines(mock_objective, threshold, X_y_bina
def test_automl_drop_unknown_columns(columns, AutoMLTestEnv, X_y_binary, caplog):
caplog.clear()
X, y = X_y_binary
X = pd.DataFrame(X)
for col in columns:
X[col] = pd.Series(range(len(X)))
X.ww.init()
X.ww[col] = pd.Series(range(len(X)))
X.ww.set_types({col: "Unknown" for col in columns})
automl = AutoMLSearch(
X_train=X,
Expand Down Expand Up @@ -4534,26 +4528,18 @@ def test_baseline_pipeline_properly_initalized(
@pytest.mark.parametrize(
"problem_type",
[
ProblemTypes.TIME_SERIES_REGRESSION,
ProblemTypes.TIME_SERIES_MULTICLASS,
ProblemTypes.TIME_SERIES_BINARY,
"time series regression",
"time series multiclass",
"time series binary",
],
)
def test_automl_passes_known_in_advance_pipeline_parameters_to_all_pipelines(
problem_type,
ts_data_binary,
ts_data_multi,
ts_data,
AutoMLTestEnv,
):
if problem_type == ProblemTypes.TIME_SERIES_MULTICLASS:
X, y = ts_data_multi
elif problem_type == ProblemTypes.TIME_SERIES_BINARY:
X, y = ts_data_binary
else:
X, y = ts_data
Comment on lines -4560 to -4565
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice refactoring 👍

X, _, y = ts_data(problem_type=problem_type)

X.ww.init()
X.ww["email"] = pd.Series(["[email protected]"] * X.shape[0], index=X.index)
X.ww["category"] = pd.Series(["a"] * X.shape[0], index=X.index)
X.ww.set_types({"email": "EmailAddress", "category": "Categorical"})
Expand Down Expand Up @@ -4588,7 +4574,7 @@ def test_automl_passes_known_in_advance_pipeline_parameters_to_all_pipelines(
lambda d: d["Not Known In Advance Pipeline - Select Columns Transformer"][
"columns"
]
== ["features", "date"],
== ["feature", "date"],
).all()


Expand Down Expand Up @@ -4633,10 +4619,10 @@ def test_cv_validation_scores(


def test_cv_validation_scores_time_series(
ts_data_binary,
ts_data,
AutoMLTestEnv,
):
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
problem_configuration = {
"time_index": "date",
"gap": 0,
Expand Down Expand Up @@ -4678,7 +4664,7 @@ def test_search_parameters_held_automl(
algorithm,
batches,
X_y_binary,
ts_data_binary,
ts_data,
):
if problem_type == "binary":
X, y = X_y_binary
Expand All @@ -4695,7 +4681,7 @@ def test_search_parameters_held_automl(
},
}
else:
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
problem_configuration = {
"time_index": "date",
"gap": 0,
Expand Down Expand Up @@ -4781,9 +4767,9 @@ def test_automl_accepts_features(
AutoMLTestEnv,
):
X, y = X_y_binary
X_pd = pd.DataFrame(X)
X_pd.columns = X_pd.columns.astype(str)
X_transform = X_pd.iloc[len(X) // 3 :]
X = pd.DataFrame(X) # Drop ww information since setting column types fails
X.columns = X.columns.astype(str)
X_transform = X.iloc[len(X) // 3 :]

if features == "with_features_provided":
es = ft.EntitySet()
Expand Down Expand Up @@ -4839,7 +4825,7 @@ def test_automl_with_iterative_algorithm_puts_ts_estimators_first(
is_using_windows,
):

X, y = ts_data
X, _, y = ts_data()

env = AutoMLTestEnv("time series regression")
automl = AutoMLSearch(
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -497,7 +497,7 @@ def test_pipeline_custom_hyperparameters_make_pipeline(
AutoMLTestEnv,
):
X, y = X_y_multi
X = pd.DataFrame(X, columns=[f"Column_{i}" for i in range(20)])
X.ww.columns = [f"Column_{i}" for i in range(20)]

component_graph_ = None
search_parameters_ = {}
Expand Down
16 changes: 6 additions & 10 deletions evalml/tests/automl_tests/test_automl_search_classification.py
Original file line number Diff line number Diff line change
Expand Up @@ -733,12 +733,11 @@ def test_max_time(X_y_binary):
)
def test_automl_supports_time_series_classification(
problem_type,
ts_data_binary,
ts_data_multi,
ts_data,
AutoMLTestEnv,
):
X, _, y = ts_data(problem_type=problem_type)
if problem_type == ProblemTypes.TIME_SERIES_BINARY:
X, y = ts_data_binary
baseline = TimeSeriesBinaryClassificationPipeline(
component_graph=["Time Series Baseline Estimator"],
parameters={
Expand All @@ -757,9 +756,7 @@ def test_automl_supports_time_series_classification(
},
)
score_return_value = {"Log Loss Binary": 0.2}
problem_type = "time series binary"
else:
X, y = ts_data_multi
baseline = TimeSeriesMulticlassClassificationPipeline(
component_graph=["Time Series Baseline Estimator"],
parameters={
Expand All @@ -778,7 +775,6 @@ def test_automl_supports_time_series_classification(
},
)
score_return_value = {"Log Loss Multiclass": 0.25}
problem_type = "time series multiclass"

configuration = {
"time_index": "date",
Expand Down Expand Up @@ -819,12 +815,12 @@ def test_automl_time_series_classification_threshold(
mock_split_data,
optimize,
objective,
ts_data_binary,
ts_data,
AutoMLTestEnv,
):
X, y = ts_data_binary
score_return_value = {objective: 0.4}
problem_type = "time series binary"
X, _, y = ts_data(problem_type=problem_type)

configuration = {
"time_index": "date",
Expand Down Expand Up @@ -1197,7 +1193,7 @@ def test_time_series_pipeline_parameter_warnings(
search_parameters,
set_values,
AutoMLTestEnv,
ts_data_binary,
ts_data,
):
search_parameters.update(
{
Expand All @@ -1209,7 +1205,7 @@ def test_time_series_pipeline_parameter_warnings(
},
},
)
X, y = ts_data_binary
X, _, y = ts_data(problem_type="time series binary")
configuration = {
"time_index": "date",
"gap": 0,
Expand Down
3 changes: 1 addition & 2 deletions evalml/tests/automl_tests/test_automl_search_regression.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,8 +218,7 @@ def test_log_metrics_only_passed_directly(X_y_regression):

@pytest.mark.parametrize("freq", ["D", "MS"])
def test_automl_supports_time_series_regression(freq, AutoMLTestEnv, ts_data):
X, y = ts_data
X["date"] = pd.date_range(start="1/1/2018", periods=31, freq=freq)
X, _, y = ts_data(freq=freq)

configuration = {
"time_index": "date",
Expand Down
Loading