From b20598641b689e45251e6e810858711b3d1f26c3 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Tue, 9 Aug 2022 14:56:18 -0400
Subject: [PATCH 1/4] Update split_data with new default

---
 evalml/preprocessing/utils.py                  | 40 +++++---
 .../preprocessing_tests/test_split_data.py     | 94 +++++++++++++++++++
 2 files changed, 120 insertions(+), 14 deletions(-)

diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py
index dba31c5d49..a5022afa12 100644
--- a/evalml/preprocessing/utils.py
+++ b/evalml/preprocessing/utils.py
@@ -47,7 +47,7 @@ def split_data(
     y,
     problem_type,
     problem_configuration=None,
-    test_size=0.2,
+    test_size=None,
     random_seed=0,
 ):
     """Split data into train and test sets.
@@ -58,7 +58,8 @@ def split_data(
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
         problem_configuration (dict): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
-        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
+        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
+            (10%) for timeseries problems.
         random_seed (int): Seed for the random number generator. Defaults to 0.
 
     Returns:
@@ -95,24 +96,35 @@ def split_data(
 
     data_splitter = None
     if is_time_series(problem_type):
+        if test_size is None:
+            test_size = 0.1
+            if (
+                problem_configuration is not None
+                and "forecast_horizon" in problem_configuration
+            ):
+                fh_pct = problem_configuration["forecast_horizon"] / len(X)
+                test_size = max(test_size, fh_pct)
         data_splitter = TrainingValidationSplit(
             test_size=test_size,
             shuffle=False,
             stratify=None,
             random_seed=random_seed,
         )
-    elif is_regression(problem_type):
-        data_splitter = ShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
-    elif is_classification(problem_type):
-        data_splitter = StratifiedShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
+    else:
+        if test_size is None:
+            test_size = 0.2
+        if is_regression(problem_type):
+            data_splitter = ShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
+        elif is_classification(problem_type):
+            data_splitter = StratifiedShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
 
     train, test = next(data_splitter.split(X, y))
 
diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
index 98d7caceae..40418c45cb 100644
--- a/evalml/tests/preprocessing_tests/test_split_data.py
+++ b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -59,3 +59,97 @@ def test_split_data(
             y = pd.Series(y)
         pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
         pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
+@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
+def test_split_data_defaults(
+    problem_type,
+    data_type,
+    X_y_binary,
+    X_y_multi,
+    X_y_regression,
+    make_data_type,
+):
+    if is_binary(problem_type):
+        X, y = X_y_binary
+    if is_multiclass(problem_type):
+        X, y = X_y_multi
+    if is_regression(problem_type):
+        X, y = X_y_regression
+    problem_configuration = None
+    if is_time_series(problem_type):
+        problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
+
+    X = make_data_type(data_type, X)
+    y = make_data_type(data_type, y)
+
+    if is_time_series(problem_type):
+        test_pct = 0.1
+    else:
+        test_pct = 0.2
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type=problem_type,
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    if is_time_series(problem_type):
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+            y = pd.Series(y)
+        pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
+        pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+def test_split_data_ts(X_y_regression):
+    X, y = X_y_regression
+
+    # Test with no forecast horizon limitation
+    test_pct = 0.1
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": 5,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    # Test with a forecast horizon limitation
+    fh = 25
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": fh,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    test_size = fh
+    train_size = len(X) - fh
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
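A quick usage sketch of the intent behind PATCH 1 (this is not part of the patch; the data, column names, and expected lengths below are made up and assume the behavior shown in the diff above):

    import pandas as pd
    from evalml.preprocessing import split_data

    X = pd.DataFrame(
        {
            "feature": range(100),
            "date": pd.date_range("2021-01-01", periods=100),
        },
    )
    y = pd.Series(range(100))

    # Non-timeseries problems keep the old 80/20 default when test_size is omitted.
    X_train, X_test, y_train, y_test = split_data(X, y, problem_type="regression")
    assert len(X_test) == 20

    # Timeseries problems now default to a 90/10 split...
    ts_config = {"gap": 0, "max_delay": 5, "forecast_horizon": 2, "time_index": "date"}
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        problem_type="time series regression",
        problem_configuration=ts_config,
    )
    assert len(X_test) == 10

    # ...unless the configured forecast_horizon needs more rows, in which case
    # the test set grows to cover at least one horizon (25 rows here).
    ts_config["forecast_horizon"] = 25
    X_train, X_test, y_train, y_test = split_data(
        X,
        y,
        problem_type="time series regression",
        problem_configuration=ts_config,
    )
    assert len(X_test) == 25

Note that callers who pass an explicit test_size skip all of the new defaulting logic, so pipelines that pin the split percentage should see no change.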
From 96b9d2ef757ad199ec2d506972ee216f75c02497 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Tue, 9 Aug 2022 15:00:03 -0400
Subject: [PATCH 2/4] Update release notes

---
 docs/source/release_notes.rst | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
index db9c84c67c..adfc6561cb 100644
--- a/docs/source/release_notes.rst
+++ b/docs/source/release_notes.rst
@@ -2,12 +2,13 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
-        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
+        * Added ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
     * Fixes
     * Changes
+        * Reduced the default test size in ``split_data`` to 0.1 for timeseries problems :pr:`3650`
     * Documentation Changes
         * Updated broken link checker to exclude stackoverflow domain :pr:`3633`
-        * Add instructions to add new users to evalml-core-feedstock :pr:`3636`
+        * Added instructions to add new users to evalml-core-feedstock :pr:`3636`
     * Testing Changes
 
 .. warning::
From 46b232e19c7b208c3a68506671930146cbe775c2 Mon Sep 17 00:00:00 2001
From: Becca McBrayer
Date: Fri, 12 Aug 2022 14:34:49 -0400
Subject: [PATCH 3/4] Test updates from PR comments

---
 evalml/tests/conftest.py                       | 67 ++++++++++++-----
 .../preprocessing_tests/test_split_data.py     | 63 ++++++-----------
 2 files changed, 71 insertions(+), 59 deletions(-)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 3e0c67aed8..f3300cb45c 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -96,17 +96,42 @@ def _get_test_data_from_configuration(
     problem_type,
     column_names=None,
     nullable_target=False,
+    scale=2,
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
-            * 2,
-            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "bool_null": [True, None, False, True, False, None, True] * 2,
-            "numerical": range(14),
-            "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2,
-            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
+            "all_null": [
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+            * scale,
+            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "bool_null": [
+                True,
+                None,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                False,
+                True,
+            ]
+            * scale,
+            "numerical": range(10 * scale),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"]
+            * scale,
+            "dates": pd.date_range("2000-02-03", periods=10 * scale, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
@@ -115,8 +140,11 @@ def _get_test_data_from_configuration(
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
+                "more strings",
+                "here we go",
+                "wheeeee!!!",
             ]
-            * 2,
+            * scale,
             "email": [
                 "abalone_0@gmail.com",
                 "AbaloneRings@yahoo.com",
@@ -125,8 +153,11 @@ def _get_test_data_from_configuration(
                 "fooEMAIL@email.org",
                 "evalml@evalml.org",
                 "evalml@alteryx.org",
+                "woodwork@alteryx.org",
+                "featuretools@alteryx.org",
+                "compose@alteryx.org",
             ]
-            * 2,
+            * scale,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
@@ -135,8 +166,11 @@ def _get_test_data_from_configuration(
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
+                "https://github.com/alteryx/woodwork",
+                "https://github.com/alteryx/compose",
+                "https://woodwork.alteryx.com/en/stable/",
             ]
-            * 2,
+            * scale,
             "ip": [
                 "0.0.0.0",
                 "1.1.1.101",
@@ -145,15 +179,18 @@ def _get_test_data_from_configuration(
                 "101.1.1.1",
                 "192.168.1.1",
                 "255.255.255.255",
+                "2.1.1.101",
+                "2.1.101.1",
+                "2.101.1.1",
             ]
-            * 2,
+            * scale,
         },
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2)
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 0] * scale)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2)
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1, 2, 1, 0] * scale)
     elif is_regression(problem_type):
-        y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2)
+        y = pd.Series([1, 2, 3, 3, 3, 4, 5, 5, 6, 6] * scale)
     if nullable_target:
         y.iloc[2] = None
     if input_type == "ww":
diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py
index 40418c45cb..8c34810a34 100644
--- a/evalml/tests/preprocessing_tests/test_split_data.py
+++ b/evalml/tests/preprocessing_tests/test_split_data.py
@@ -63,28 +63,17 @@ def test_split_data(
 
 @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
 @pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
-def test_split_data_defaults(
-    problem_type,
-    data_type,
-    X_y_binary,
-    X_y_multi,
-    X_y_regression,
-    make_data_type,
-):
-    if is_binary(problem_type):
-        X, y = X_y_binary
-    if is_multiclass(problem_type):
-        X, y = X_y_multi
-    if is_regression(problem_type):
-        X, y = X_y_regression
+def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
+    X, y = get_test_data_from_configuration(
+        data_type,
+        problem_type,
+        column_names=["numerical"],
+        scale=10,
+    )
+
     problem_configuration = None
     if is_time_series(problem_type):
         problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
-
-    X = make_data_type(data_type, X)
-    y = make_data_type(data_type, y)
-
-    if is_time_series(problem_type):
         test_pct = 0.1
     else:
         test_pct = 0.2
@@ -109,32 +98,20 @@ def test_split_data_defaults(
         pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
 
 
-def test_split_data_ts(X_y_regression):
+@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
+def test_split_data_ts(test, X_y_regression):
     X, y = X_y_regression
 
-    # Test with no forecast horizon limitation
-    test_pct = 0.1
-    problem_configuration = {
-        "gap": 1,
-        "max_delay": 7,
-        "forecast_horizon": 5,
-        "time_index": "ts_data",
-    }
-    X_train, X_test, y_train, y_test = split_data(
-        X,
-        y,
-        problem_type="time series regression",
-        problem_configuration=problem_configuration,
-    )
-    test_size = len(X) * test_pct
-    train_size = len(X) - test_size
-    assert len(X_train) == train_size
-    assert len(X_test) == test_size
-    assert len(y_train) == train_size
-    assert len(y_test) == test_size
+    if test == "no_fh_limitation":
+        test_pct = 0.1
+        fh = 5
+        test_size = len(X) * test_pct
+        train_size = len(X) - test_size
+    elif test == "fh_limitation":
+        fh = 25
+        test_size = fh
+        train_size = len(X) - fh
 
-    # Test with a forecast horizon limitation
-    fh = 25
     problem_configuration = {
         "gap": 1,
         "max_delay": 7,
         "forecast_horizon": fh,
         "time_index": "ts_data",
     }
     X_train, X_test, y_train, y_test = split_data(
         X,
         y,
         problem_type="time series regression",
         problem_configuration=problem_configuration,
     )
-    test_size = fh
-    train_size = len(X) - fh
     assert len(X_train) == train_size
     assert len(X_test) == test_size
     assert len(y_train) == train_size
     assert len(y_test) == test_size
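The conftest change in PATCH 3 boils down to one idea: every hard-coded column in the fixture now has 10 base values, and the new scale argument repeats them, so a request like scale=10 yields 100 rows, enough for the 90/10 and 80/20 default splits to produce whole-number test-set sizes. A simplified stand-in for the pattern (not the real fixture) might look like:

    import numpy as np
    import pandas as pd

    def make_scaled_frame(scale=2):
        # Each base list has exactly 10 entries; repeating it `scale` times
        # gives a frame with 10 * scale rows.
        return pd.DataFrame(
            {
                "numerical": range(10 * scale),
                "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"] * scale,
                "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
            },
        )

    frame = make_scaled_frame(scale=10)
    assert len(frame) == 100

test_split_data_defaults then requests scale=10 so that len(X) * 0.1 and len(X) * 0.2 are whole numbers and the length assertions hold for every problem type.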
From 874f81aa7d62237f0b85f810a1b10e3a873abd9b Mon Sep 17 00:00:00 2001
From: Karsten Chu
Date: Mon, 15 Aug 2022 13:48:28 -0400
Subject: [PATCH 4/4] Added the ability of the conftest fixture to generate numpy.

---
 evalml/tests/conftest.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/evalml/tests/conftest.py b/evalml/tests/conftest.py
index 48f30c25fe..b16c60a5eb 100644
--- a/evalml/tests/conftest.py
+++ b/evalml/tests/conftest.py
@@ -197,6 +197,10 @@ def _get_test_data_from_configuration(
             y = ww.init_series(y, logical_type="integer_nullable")
     X = X_all[column_names]
 
+    if input_type == "np":
+        X = X.to_numpy()
+        y = y.to_numpy()
+
     if input_type == "ww":
         logical_types = {}
         if "text" in column_names:
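For completeness, the new "np" branch in PATCH 4 is just a conversion step at the end of the fixture; a minimal illustration of the behavior it adds (simplified, with made-up data) is:

    import pandas as pd

    X = pd.DataFrame({"numerical": range(5)})
    y = pd.Series([0, 1, 0, 1, 0])

    input_type = "np"
    if input_type == "np":
        # pandas objects become plain numpy arrays for tests parametrized with "np"
        X = X.to_numpy()
        y = y.to_numpy()

    assert X.shape == (5, 1)
    assert y.shape == (5,)

This keeps the data_type parametrization in test_split_data_defaults ("np", "pd", "ww") working with a single fixture instead of a separate numpy conversion helper.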