Update split_data with new default for timeseries #3650

Merged: 8 commits, merged Aug 15, 2022

Changes from all commits
11 changes: 6 additions & 5 deletions docs/source/release_notes.rst
@@ -2,18 +2,19 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
-        * Add CI testing environment in Mac for install workflow :pr:`3646`
         * Updated to run with Woodwork >= 0.17.2 :pr:`3626`
-        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
-        * Add ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Added ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
+        * Added ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Add CI testing environment in Mac for install workflow :pr:`3646`
     * Fixes
         * Reverted the Woodwork 0.17.x compatibility work due to performance regression :pr:`3664`
     * Changes
-        * Disable holdout set in AutoML search by default :pr:`3659`
+        * Disabled holdout set in AutoML search by default :pr:`3659`
+        * Pinned ``sktime`` at >=0.7.0,<0.13.1 due to slowdowns with time series modeling :pr:`3658`
+        * Reduced the default test size in ``split_data`` to 0.1 for time series problems :pr:`3650`
     * Documentation Changes
         * Updated broken link checker to exclude stackoverflow domain :pr:`3633`
-        * Add instructions to add new users to evalml-core-feedstock :pr:`3636`
+        * Added instructions to add new users to evalml-core-feedstock :pr:`3636`
     * Testing Changes

 .. warning::

[Review comment from a Contributor on the "Added ``exclude_featurizers``" line: "Make sure you send a picture of this to @angela97lin"]
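For callers, the headline entry above is the new ``split_data`` default for time series problems. A minimal before/after sketch, assuming a synthetic 100-row frame so both percentages land on whole rows; exact counts depend on how the splitter rounds:

import pandas as pd
from evalml.preprocessing import split_data

X = pd.DataFrame(
    {
        "dates": pd.date_range("2021-01-01", periods=100, freq="D"),
        "feature": range(100),
    }
)
y = pd.Series(range(100))

# Non-time-series problems keep the 20% default test set.
_, X_test, _, _ = split_data(X, y, problem_type="regression")
assert len(X_test) == 20

# Time series problems now hold out 10% by default (was 20% before this PR).
_, X_test, _, _ = split_data(
    X,
    y,
    problem_type="time series regression",
    problem_configuration={"gap": 1, "max_delay": 7, "time_index": "dates"},
)
assert len(X_test) == 10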
40 changes: 26 additions & 14 deletions evalml/preprocessing/utils.py
@@ -47,7 +47,7 @@ def split_data(
     y,
     problem_type,
     problem_configuration=None,
-    test_size=0.2,
+    test_size=None,
     random_seed=0,
 ):
     """Split data into train and test sets.
@@ -58,7 +58,8 @@ def split_data(
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
         problem_configuration (dict): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
-        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
+        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
+            (10%) for timeseries problems.
         random_seed (int): Seed for the random number generator. Defaults to 0.

     Returns:
@@ -95,24 +96,35 @@ def split_data(

     data_splitter = None
     if is_time_series(problem_type):
+        if test_size is None:
+            test_size = 0.1
+        if (
+            problem_configuration is not None
+            and "forecast_horizon" in problem_configuration
+        ):
+            fh_pct = problem_configuration["forecast_horizon"] / len(X)
+            test_size = max(test_size, fh_pct)
         data_splitter = TrainingValidationSplit(
             test_size=test_size,
             shuffle=False,
             stratify=None,
             random_seed=random_seed,
         )
-    elif is_regression(problem_type):
-        data_splitter = ShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
-    elif is_classification(problem_type):
-        data_splitter = StratifiedShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
+    else:
+        if test_size is None:
+            test_size = 0.2
+        if is_regression(problem_type):
+            data_splitter = ShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
+        elif is_classification(problem_type):
+            data_splitter = StratifiedShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )

     train, test = next(data_splitter.split(X, y))
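One detail worth noting in the hunk above: when ``forecast_horizon`` is present in ``problem_configuration``, the test fraction is floored at ``forecast_horizon / len(X)``, so the holdout always covers at least one full horizon, and the floor applies even when ``test_size`` is passed explicitly. A hedged sketch with a hypothetical 100-row frame:

import pandas as pd
from evalml.preprocessing import split_data

X = pd.DataFrame(
    {
        "dates": pd.date_range("2021-01-01", periods=100, freq="D"),
        "feature": range(100),
    }
)
y = pd.Series(range(100))

_, X_test, _, _ = split_data(
    X,
    y,
    problem_type="time series regression",
    problem_configuration={
        "gap": 1,
        "max_delay": 7,
        "forecast_horizon": 25,  # 25 / 100 = 0.25 beats the 0.1 default
        "time_index": "dates",
    },
)
# test_size resolves to max(0.1, 25 / 100) = 0.25, i.e. 25 rows held out.
assert len(X_test) == 25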
71 changes: 56 additions & 15 deletions evalml/tests/conftest.py
@@ -96,17 +96,42 @@ def _get_test_data_from_configuration(
     problem_type,
     column_names=None,
     nullable_target=False,
+    scale=2,
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
-            * 2,
-            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "bool_null": [True, None, False, True, False, None, True] * 2,
-            "numerical": range(14),
-            "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2,
-            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
+            "all_null": [
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+            * scale,
+            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "bool_null": [
+                True,
+                None,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                False,
+                True,
+            ]
+            * scale,
+            "numerical": range(10 * scale),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"]
+            * scale,
+            "dates": pd.date_range("2000-02-03", periods=10 * scale, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
@@ -115,8 +140,11 @@ def _get_test_data_from_configuration(
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
+                "more strings",
+                "here we go",
+                "wheeeee!!!",
             ]
-            * 2,
+            * scale,
             "email": [
                 "[email protected]",
                 "[email protected]",
@@ -125,8 +153,11 @@ def _get_test_data_from_configuration(
                 "[email protected]",
                 "[email protected]",
                 "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
             ]
-            * 2,
+            * scale,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
@@ -135,8 +166,11 @@ def _get_test_data_from_configuration(
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
+                "https://github.com/alteryx/woodwork",
+                "https://github.com/alteryx/compose",
+                "https://woodwork.alteryx.com/en/stable/",
             ]
-            * 2,
+            * scale,
             "ip": [
                 "0.0.0.0",
                 "1.1.1.101",
@@ -145,21 +179,28 @@ def _get_test_data_from_configuration(
                 "101.1.1.1",
                 "192.168.1.1",
                 "255.255.255.255",
+                "2.1.1.101",
+                "2.1.101.1",
+                "2.101.1.1",
             ]
-            * 2,
+            * scale,
         },
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2)
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 0] * scale)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2)
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1, 2, 1, 0] * scale)
     elif is_regression(problem_type):
-        y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2)
+        y = pd.Series([1, 2, 3, 3, 3, 4, 5, 5, 6, 6] * scale)
     if nullable_target:
         y.iloc[2] = None
         if input_type == "ww":
             y = ww.init_series(y, logical_type="integer_nullable")
     X = X_all[column_names]

+    if input_type == "np":
+        X = X.to_numpy()
+        y = y.to_numpy()
+
     if input_type == "ww":
         logical_types = {}
         if "text" in column_names:
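A sketch of how the widened fixture is meant to be consumed; the call shape mirrors ``test_split_data_defaults`` below, while the test name and column choice here are hypothetical:

from evalml.problem_types import ProblemTypes


def test_fixture_scale_sketch(get_test_data_from_configuration):
    # With scale=10, each 10-element base column is tiled 10 times,
    # giving 100 rows, enough for the 10%/20% defaults to produce
    # whole-numbered train/test sizes.
    X, y = get_test_data_from_configuration(
        "pd",  # input_type: one of "np", "pd", or "ww"
        ProblemTypes.BINARY,
        column_names=["numerical", "categorical"],
        scale=10,
    )
    assert len(X) == 100
    assert len(y) == 100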
69 changes: 69 additions & 0 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -59,3 +59,72 @@ def test_split_data(
         y = pd.Series(y)
     pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
     pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
+@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
+def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
+    X, y = get_test_data_from_configuration(
+        data_type,
+        problem_type,
+        column_names=["numerical"],
+        scale=10,
+    )
+
+    problem_configuration = None
+    if is_time_series(problem_type):
+        problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
+        test_pct = 0.1
+    else:
+        test_pct = 0.2
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type=problem_type,
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    if is_time_series(problem_type):
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+            y = pd.Series(y)
+        pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
+        pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
+def test_split_data_ts(test, X_y_regression):
+    X, y = X_y_regression
+
+    if test == "no_fh_limitation":
+        test_pct = 0.1
+        fh = 5
+        test_size = len(X) * test_pct
+        train_size = len(X) - test_size
+    elif test == "fh_limitation":
+        fh = 25
+        test_size = fh
+        train_size = len(X) - fh
+
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": fh,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
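The arithmetic behind the two parametrizations, spelled out as a standalone sketch (this assumes ``X_y_regression`` yields its usual 100 rows; ``resolved_test_size`` is a hypothetical mirror of the logic in ``split_data``, not an evalml API):

def resolved_test_size(n_rows, forecast_horizon, default=0.1):
    # Mirrors the floor split_data applies for time series problems.
    return max(default, forecast_horizon / n_rows)

# "no_fh_limitation": 5 / 100 = 0.05 < 0.1, so the 10% default wins: 10 test rows.
assert round(resolved_test_size(100, 5) * 100) == 10

# "fh_limitation": 25 / 100 = 0.25 > 0.1, so the horizon dictates: 25 test rows.
assert round(resolved_test_size(100, 25) * 100) == 25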