From b20598641b689e45251e6e810858711b3d1f26c3 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Tue, 9 Aug 2022 14:56:18 -0400
Subject: [PATCH 1/4] Update split_data with new default

---
 evalml/preprocessing/utils.py                  | 40 +++++---
 .../preprocessing_tests/test_split_data.py     | 94 +++++++++++++++++++
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index dba31c5d49..a5022afa12 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -47,7 +47,7 @@ def split_data(
     y,
     problem_type,
     problem_configuration=None,
-    test_size=0.2,
+    test_size=None,
     random_seed=0,
 ):
     """Split data into train and test sets.
@@ -58,7 +58,8 @@ def split_data(
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
         problem_configuration (dict): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
-        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
+        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
+            (10%) for timeseries problems.
         random_seed (int): Seed for the random number generator. Defaults to 0.
 
     Returns:
@@ -95,24 +96,35 @@ def split_data(
 
     data_splitter = None
     if is_time_series(problem_type):
+        if test_size is None:
+            test_size = 0.1
+            if (
+                problem_configuration is not None
+                and "forecast_horizon" in problem_configuration
+            ):
+                fh_pct = problem_configuration["forecast_horizon"] / len(X)
+                test_size = max(test_size, fh_pct)
         data_splitter = TrainingValidationSplit(
             test_size=test_size,
             shuffle=False,
             stratify=None,
             random_seed=random_seed,
         )
-    elif is_regression(problem_type):
-        data_splitter = ShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
-    elif is_classification(problem_type):
-        data_splitter = StratifiedShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
+    else:
+        if test_size is None:
+            test_size = 0.2
+        if is_regression(problem_type):
+            data_splitter = ShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
+        elif is_classification(problem_type):
+            data_splitter = StratifiedShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
 
     train, test = next(data_splitter.split(X, y))
 
diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
index 98d7caceae..40418c45cb 100644
--- a/evalml/tests/preprocessing_tests/test_split_data.py
+++ b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -59,3 +59,97 @@ def test_split_data(
             y = pd.Series(y)
         pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
         pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
+@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
+def test_split_data_defaults(
+    problem_type,
+    data_type,
+    X_y_binary,
+    X_y_multi,
+    X_y_regression,
+    make_data_type,
+):
+    if is_binary(problem_type):
+        X, y = X_y_binary
+    if is_multiclass(problem_type):
+        X, y = X_y_multi
+    if is_regression(problem_type):
+        X, y = X_y_regression
+    problem_configuration = None
+    if is_time_series(problem_type):
+        problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
+
+    X = make_data_type(data_type, X)
+    y = make_data_type(data_type, y)
+
+    if is_time_series(problem_type):
+        test_pct = 0.1
+    else:
+        test_pct = 0.2
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type=problem_type,
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    if is_time_series(problem_type):
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+            y = pd.Series(y)
+        pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
+        pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+def test_split_data_ts(X_y_regression):
+    X, y = X_y_regression
+
+    # Test with no forecast horizon limitation
+    test_pct = 0.1
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": 5,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    # Test with a forecast horizon limitation
+    fh = 25
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": fh,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    test_size = fh
+    train_size = len(X) - fh
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
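A quick usage sketch of the intent behind PATCH 1 (this is not part of the patch; the data, column names, and expected lengths below are made up and assume the behavior shown in the diff above):

    import pandas as pd
    from evalml.preprocessing import split_data

    X = pd.DataFrame(
        {
            "feature": range(100),
            "date": pd.date_range("2021-01-01", periods=100),
        },
    )
    y = pd.Series(range(100))

    # Non-timeseries problems keep the old 80/20 default when test_size is omitted.
    X_train, X_test, y_train, y_test = split_data(X, y, problem_type="regression")
    assert len(X_test) == 20

    # Timeseries problems now default to a 90/10 split...
    ts_config = {"gap": 0, "max_delay": 5, "forecast_horizon": 2, "time_index": "date"}
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        problem_type="time series regression",
        problem_configuration=ts_config,
    )
    assert len(X_test) == 10

    # ...unless the configured forecast_horizon needs more rows, in which case
    # the test set grows to cover at least one horizon (25 rows here).
    ts_config["forecast_horizon"] = 25
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        problem_type="time series regression",
        problem_configuration=ts_config,
    )
    assert len(X_test) == 25

Note that callers who pass an explicit test_size skip all of the new defaulting logic, so pipelines that pin the split percentage should see no change.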
From 96b9d2ef757ad199ec2d506972ee216f75c02497 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Tue, 9 Aug 2022 15:00:03 -0400
Subject: [PATCH 2/4] Update release notes

---
 docs/source/release_notes.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index db9c84c67c..adfc6561cb 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -2,12 +2,13 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
-        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
+        * Added ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
     * Fixes
     * Changes
+        * Reduced the default test size in ``split_data`` to 0.1 for timeseries problems :pr:`3650`
     * Documentation Changes
         * Updated broken link checker to exclude stackoverflow domain :pr:`3633`
-        * Add instructions to add new users to evalml-core-feedstock :pr:`3636`
+        * Added instructions to add new users to evalml-core-feedstock :pr:`3636`
     * Testing Changes
 
 .. warning::
From 46b232e19c7b208c3a68506671930146cbe775c2 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Fri, 12 Aug 2022 14:34:49 -0400
Subject: [PATCH 3/4] Test updates from PR comments

---
 evalml/tests/conftest.py                       | 67 ++++++++++++-----
 .../preprocessing_tests/test_split_data.py     | 63 ++++++-----------
 2 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 3e0c67aed8..f3300cb45c 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -96,17 +96,42 @@ def _get_test_data_from_configuration(
     problem_type,
     column_names=None,
     nullable_target=False,
+    scale=2,
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
-            * 2,
-            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "bool_null": [True, None, False, True, False, None, True] * 2,
-            "numerical": range(14),
-            "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2,
-            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
+            "all_null": [
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+            * scale,
+            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "bool_null": [
+                True,
+                None,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                False,
+                True,
+            ]
+            * scale,
+            "numerical": range(10 * scale),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"]
+            * scale,
+            "dates": pd.date_range("2000-02-03", periods=10 * scale, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
@@ -115,8 +140,11 @@ def _get_test_data_from_configuration(
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
+                "more strings",
+                "here we go",
+                "wheeeee!!!",
             ]
-            * 2,
+            * scale,
             "email": [
                 "abalone_0@gmail.com",
                 "AbaloneRings@yahoo.com",
@@ -125,8 +153,11 @@ def _get_test_data_from_configuration(
                 "fooEMAIL@email.org",
                 "evalml@evalml.org",
                 "evalml@alteryx.org",
+                "woodwork@alteryx.org",
+                "featuretools@alteryx.org",
+                "compose@alteryx.org",
             ]
-            * 2,
+            * scale,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
@@ -135,8 +166,11 @@ def _get_test_data_from_configuration(
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
+                "https://github.com/alteryx/woodwork",
+                "https://github.com/alteryx/compose",
+                "https://woodwork.alteryx.com/en/stable/",
             ]
-            * 2,
+            * scale,
             "ip": [
                 "0.0.0.0",
                 "1.1.1.101",
@@ -145,15 +179,18 @@ def _get_test_data_from_configuration(
                 "101.1.1.1",
                 "192.168.1.1",
                 "255.255.255.255",
+                "2.1.1.101",
+                "2.1.101.1",
+                "2.101.1.1",
             ]
-            * 2,
+            * scale,
         },
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2)
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 0] * scale)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2)
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1, 2, 1, 0] * scale)
     elif is_regression(problem_type):
-        y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2)
+        y = pd.Series([1, 2, 3, 3, 3, 4, 5, 5, 6, 6] * scale)
     if nullable_target:
         y.iloc[2] = None
     if input_type == "ww":
diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
index 40418c45cb..8c34810a34 100644
--- a/evalml/tests/preprocessing_tests/test_split_data.py
+++ b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -63,28 +63,17 @@ def test_split_data(
 
 @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
 @pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
-def test_split_data_defaults(
-    problem_type,
-    data_type,
-    X_y_binary,
-    X_y_multi,
-    X_y_regression,
-    make_data_type,
-):
-    if is_binary(problem_type):
-        X, y = X_y_binary
-    if is_multiclass(problem_type):
-        X, y = X_y_multi
-    if is_regression(problem_type):
-        X, y = X_y_regression
+def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
+    X, y = get_test_data_from_configuration(
+        data_type,
+        problem_type,
+        column_names=["numerical"],
+        scale=10,
+    )
+
     problem_configuration = None
     if is_time_series(problem_type):
         problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
-
-    X = make_data_type(data_type, X)
-    y = make_data_type(data_type, y)
-
-    if is_time_series(problem_type):
         test_pct = 0.1
     else:
         test_pct = 0.2
@@ -109,32 +98,20 @@ def test_split_data_defaults(
         pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
 
 
-def test_split_data_ts(X_y_regression):
+@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
+def test_split_data_ts(test, X_y_regression):
     X, y = X_y_regression
 
-    # Test with no forecast horizon limitation
-    test_pct = 0.1
-    problem_configuration = {
-        "gap": 1,
-        "max_delay": 7,
-        "forecast_horizon": 5,
-        "time_index": "ts_data",
-    }
-    X_train, X_test, y_train, y_test = split_data(
-        X,
-        y,
-        problem_type="time series regression",
-        problem_configuration=problem_configuration,
-    )
-    test_size = len(X) * test_pct
-    train_size = len(X) - test_size
-    assert len(X_train) == train_size
-    assert len(X_test) == test_size
-    assert len(y_train) == train_size
-    assert len(y_test) == test_size
+    if test == "no_fh_limitation":
+        test_pct = 0.1
+        fh = 5
+        test_size = len(X) * test_pct
+        train_size = len(X) - test_size
+    elif test == "fh_limitation":
+        fh = 25
+        test_size = fh
+        train_size = len(X) - fh
 
-    # Test with a forecast horizon limitation
-    fh = 25
     problem_configuration = {
         "gap": 1,
         "max_delay": 7,
         "forecast_horizon": fh,
         "time_index": "ts_data",
     }
     X_train, X_test, y_train, y_test = split_data(
         X,
         y,
         problem_type="time series regression",
         problem_configuration=problem_configuration,
     )
-    test_size = fh
-    train_size = len(X) - fh
     assert len(X_train) == train_size
     assert len(X_test) == test_size
     assert len(y_train) == train_size
     assert len(y_test) == test_size
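The conftest change in PATCH 3 boils down to one idea: every hard-coded column in the fixture now has 10 base values, and the new scale argument repeats them, so a request like scale=10 yields 100 rows, enough for the 90/10 and 80/20 default splits to produce whole-number test-set sizes. A simplified stand-in for the pattern (not the real fixture) might look like:

    import numpy as np
    import pandas as pd

    def make_scaled_frame(scale=2):
        # Each base list has exactly 10 entries; repeating it `scale` times
        # gives a frame with 10 * scale rows.
        return pd.DataFrame(
            {
                "numerical": range(10 * scale),
                "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"] * scale,
                "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
            },
        )

    frame = make_scaled_frame(scale=10)
    assert len(frame) == 100

test_split_data_defaults then requests scale=10 so that len(X) * 0.1 and len(X) * 0.2 are whole numbers and the length assertions hold for every problem type.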
From 874f81aa7d62237f0b85f810a1b10e3a873abd9b Mon Sep 17 00:00:00 2001
From: Karsten Chu
Date: Mon, 15 Aug 2022 13:48:28 -0400
Subject: [PATCH 4/4] Added the ability of the conftest fixture to generate numpy.

---
 evalml/tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 48f30c25fe..b16c60a5eb 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -197,6 +197,10 @@ def _get_test_data_from_configuration(
             y = ww.init_series(y, logical_type="integer_nullable")
     X = X_all[column_names]
 
+    if input_type == "np":
+        X = X.to_numpy()
+        y = y.to_numpy()
+
     if input_type == "ww":
         logical_types = {}
         if "text" in column_names:
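For completeness, the new "np" branch in PATCH 4 is just a conversion step at the end of the fixture; a minimal illustration of the behavior it adds (simplified, with made-up data) is:

    import pandas as pd

    X = pd.DataFrame({"numerical": range(5)})
    y = pd.Series([0, 1, 0, 1, 0])

    input_type = "np"
    if input_type == "np":
        # pandas objects become plain numpy arrays for tests parametrized with "np"
        X = X.to_numpy()
        y = y.to_numpy()

    assert X.shape == (5, 1)
    assert y.shape == (5,)

This keeps the data_type parametrization in test_split_data_defaults ("np", "pd", "ww") working with a single fixture instead of a separate numpy conversion helper.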