Update split_data with new default for timeseries #3650

Merged: 8 commits, merged Aug 15, 2022

Changes from all commits
11 changes: 6 additions & 5 deletions docs/source/release_notes.rst
@@ -2,18 +2,19 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
-        * Add CI testing environment in Mac for install workflow :pr:`3646`
         * Updated to run with Woodwork >= 0.17.2 :pr:`3626`
-        * Add ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
-        * Add ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Added ``exclude_featurizers`` parameter to ``AutoMLSearch`` to specify featurizers that should be excluded from all pipelines :pr:`3631`
+        * Added ``fit_transform`` method to pipelines and component graphs :pr:`3640`
+        * Add CI testing environment in Mac for install workflow :pr:`3646`
     * Fixes
         * Reverted the Woodwork 0.17.x compatibility work due to performance regression :pr:`3664`
     * Changes
-        * Disable holdout set in AutoML search by default :pr:`3659`
+        * Disabled holdout set in AutoML search by default :pr:`3659`
+        * Pinned ``sktime`` at >=0.7.0,<0.13.1 due to slowdowns with time series modeling :pr:`3658`
+        * Reduced the default test size in ``split_data`` to 0.1 for time series problems :pr:`3650`
     * Documentation Changes
         * Updated broken link checker to exclude stackoverflow domain :pr:`3633`
-        * Add instructions to add new users to evalml-core-feedstock :pr:`3636`
+        * Added instructions to add new users to evalml-core-feedstock :pr:`3636`
     * Testing Changes

 .. warning::

[Review comment from a Contributor on the "Added ``exclude_featurizers``" line: "Make sure you send a picture of this to @angela97lin"]
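For callers, the headline entry above is the new ``split_data`` default for time series problems. A minimal before/after sketch, assuming a synthetic 100-row frame so both percentages land on whole rows; exact counts depend on how the splitter rounds:

import pandas as pd
from evalml.preprocessing import split_data

X = pd.DataFrame(
    {
        "dates": pd.date_range("2021-01-01", periods=100, freq="D"),
        "feature": range(100),
    }
)
y = pd.Series(range(100))

# Non-time-series problems keep the 20% default test set.
_, X_test, _, _ = split_data(X, y, problem_type="regression")
assert len(X_test) == 20

# Time series problems now hold out 10% by default (was 20% before this PR).
_, X_test, _, _ = split_data(
    X,
    y,
    problem_type="time series regression",
    problem_configuration={"gap": 1, "max_delay": 7, "time_index": "dates"},
)
assert len(X_test) == 10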
40 changes: 26 additions & 14 deletions evalml/preprocessing/utils.py
@@ -47,7 +47,7 @@ def split_data(
     y,
     problem_type,
     problem_configuration=None,
-    test_size=0.2,
+    test_size=None,
     random_seed=0,
 ):
     """Split data into train and test sets.
@@ -58,7 +58,8 @@ def split_data(
         problem_type (str or ProblemTypes): type of supervised learning problem. see evalml.problem_types.problemtype.all_problem_types for a full list.
         problem_configuration (dict): Additional parameters needed to configure the search. For example,
             in time series problems, values should be passed in for the time_index, gap, and max_delay variables.
-        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%).
+        test_size (float): What percentage of data points should be included in the test set. Defaults to 0.2 (20%) for non-timeseries problems and 0.1
+            (10%) for timeseries problems.
         random_seed (int): Seed for the random number generator. Defaults to 0.

     Returns:
@@ -95,24 +96,35 @@ def split_data(

     data_splitter = None
     if is_time_series(problem_type):
+        if test_size is None:
+            test_size = 0.1
+        if (
+            problem_configuration is not None
+            and "forecast_horizon" in problem_configuration
+        ):
+            fh_pct = problem_configuration["forecast_horizon"] / len(X)
+            test_size = max(test_size, fh_pct)
         data_splitter = TrainingValidationSplit(
             test_size=test_size,
             shuffle=False,
             stratify=None,
             random_seed=random_seed,
         )
-    elif is_regression(problem_type):
-        data_splitter = ShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
-    elif is_classification(problem_type):
-        data_splitter = StratifiedShuffleSplit(
-            n_splits=1,
-            test_size=test_size,
-            random_state=random_seed,
-        )
+    else:
+        if test_size is None:
+            test_size = 0.2
+        if is_regression(problem_type):
+            data_splitter = ShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )
+        elif is_classification(problem_type):
+            data_splitter = StratifiedShuffleSplit(
+                n_splits=1,
+                test_size=test_size,
+                random_state=random_seed,
+            )

     train, test = next(data_splitter.split(X, y))
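One detail worth noting in the hunk above: when ``forecast_horizon`` is present in ``problem_configuration``, the test fraction is floored at ``forecast_horizon / len(X)``, so the holdout always covers at least one full horizon, and the floor applies even when ``test_size`` is passed explicitly. A hedged sketch with a hypothetical 100-row frame:

import pandas as pd
from evalml.preprocessing import split_data

X = pd.DataFrame(
    {
        "dates": pd.date_range("2021-01-01", periods=100, freq="D"),
        "feature": range(100),
    }
)
y = pd.Series(range(100))

_, X_test, _, _ = split_data(
    X,
    y,
    problem_type="time series regression",
    problem_configuration={
        "gap": 1,
        "max_delay": 7,
        "forecast_horizon": 25,  # 25 / 100 = 0.25 beats the 0.1 default
        "time_index": "dates",
    },
)
# test_size resolves to max(0.1, 25 / 100) = 0.25, i.e. 25 rows held out.
assert len(X_test) == 25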
71 changes: 56 additions & 15 deletions evalml/tests/conftest.py
@@ -96,17 +96,42 @@ def _get_test_data_from_configuration(
     problem_type,
     column_names=None,
     nullable_target=False,
+    scale=2,
 ):
     X_all = pd.DataFrame(
         {
-            "all_null": [np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan]
-            * 2,
-            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6] * 2,
-            "bool_null": [True, None, False, True, False, None, True] * 2,
-            "numerical": range(14),
-            "categorical": ["a", "b", "a", "b", "b", "a", "b"] * 2,
-            "dates": pd.date_range("2000-02-03", periods=14, freq="W"),
+            "all_null": [
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+                np.nan,
+            ]
+            * scale,
+            "int_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "age_null": [0, 1, 2, np.nan, 4, np.nan, 6, 7, 8, 9] * scale,
+            "bool_null": [
+                True,
+                None,
+                False,
+                True,
+                False,
+                None,
+                True,
+                True,
+                False,
+                True,
+            ]
+            * scale,
+            "numerical": range(10 * scale),
+            "categorical": ["a", "b", "a", "b", "b", "a", "b", "a", "a", "b"]
+            * scale,
+            "dates": pd.date_range("2000-02-03", periods=10 * scale, freq="W"),
             "text": [
                 "this is a string",
                 "this is another string",
@@ -115,8 +140,11 @@ def _get_test_data_from_configuration(
                 "cats are gr8",
                 "hello world",
                 "evalml is gr8",
+                "more strings",
+                "here we go",
+                "wheeeee!!!",
             ]
-            * 2,
+            * scale,
             "email": [
                 "[email protected]",
                 "[email protected]",
@@ -125,8 +153,11 @@ def _get_test_data_from_configuration(
                 "[email protected]",
                 "[email protected]",
                 "[email protected]",
+                "[email protected]",
+                "[email protected]",
+                "[email protected]",
             ]
-            * 2,
+            * scale,
             "url": [
                 "https://evalml.alteryx.com/en/stable/",
                 "https://woodwork.alteryx.com/en/stable/guides/statistical_insights.html",
@@ -135,8 +166,11 @@ def _get_test_data_from_configuration(
                 "https://www.evalml.alteryx.com/en/stable/demos/text_input.html",
                 "https://github.com/alteryx/evalml",
                 "https://github.com/alteryx/featuretools",
+                "https://github.com/alteryx/woodwork",
+                "https://github.com/alteryx/compose",
+                "https://woodwork.alteryx.com/en/stable/",
             ]
-            * 2,
+            * scale,
             "ip": [
                 "0.0.0.0",
                 "1.1.1.101",
@@ -145,21 +179,28 @@ def _get_test_data_from_configuration(
                 "101.1.1.1",
                 "192.168.1.1",
                 "255.255.255.255",
+                "2.1.1.101",
+                "2.1.101.1",
+                "2.101.1.1",
             ]
-            * 2,
+            * scale,
         },
     )
-    y = pd.Series([0, 0, 1, 0, 0, 1, 1] * 2)
+    y = pd.Series([0, 0, 1, 0, 0, 1, 1, 0, 1, 0] * scale)
     if problem_type == ProblemTypes.MULTICLASS:
-        y = pd.Series([0, 2, 1, 2, 0, 2, 1] * 2)
+        y = pd.Series([0, 2, 1, 2, 0, 2, 1, 2, 1, 0] * scale)
     elif is_regression(problem_type):
-        y = pd.Series([1, 2, 3, 3, 3, 4, 5] * 2)
+        y = pd.Series([1, 2, 3, 3, 3, 4, 5, 5, 6, 6] * scale)
     if nullable_target:
         y.iloc[2] = None
         if input_type == "ww":
             y = ww.init_series(y, logical_type="integer_nullable")
     X = X_all[column_names]

+    if input_type == "np":
+        X = X.to_numpy()
+        y = y.to_numpy()
+
     if input_type == "ww":
         logical_types = {}
         if "text" in column_names:
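A sketch of how the widened fixture is meant to be consumed; the call shape mirrors ``test_split_data_defaults`` below, while the test name and column choice here are hypothetical:

from evalml.problem_types import ProblemTypes


def test_fixture_scale_sketch(get_test_data_from_configuration):
    # With scale=10, each 10-element base column is tiled 10 times,
    # giving 100 rows, enough for the 10%/20% defaults to produce
    # whole-numbered train/test sizes.
    X, y = get_test_data_from_configuration(
        "pd",  # input_type: one of "np", "pd", or "ww"
        ProblemTypes.BINARY,
        column_names=["numerical", "categorical"],
        scale=10,
    )
    assert len(X) == 100
    assert len(y) == 100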
69 changes: 69 additions & 0 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -59,3 +59,72 @@ def test_split_data(
         y = pd.Series(y)
     pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
     pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
+@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
+def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
+    X, y = get_test_data_from_configuration(
+        data_type,
+        problem_type,
+        column_names=["numerical"],
+        scale=10,
+    )
+
+    problem_configuration = None
+    if is_time_series(problem_type):
+        problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "ts_data"}
+        test_pct = 0.1
+    else:
+        test_pct = 0.2
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type=problem_type,
+        problem_configuration=problem_configuration,
+    )
+    test_size = len(X) * test_pct
+    train_size = len(X) - test_size
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
+
+    if is_time_series(problem_type):
+        if not isinstance(X, pd.DataFrame):
+            X = pd.DataFrame(X)
+            y = pd.Series(y)
+        pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
+        pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
+
+
+@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
+def test_split_data_ts(test, X_y_regression):
+    X, y = X_y_regression
+
+    if test == "no_fh_limitation":
+        test_pct = 0.1
+        fh = 5
+        test_size = len(X) * test_pct
+        train_size = len(X) - test_size
+    elif test == "fh_limitation":
+        fh = 25
+        test_size = fh
+        train_size = len(X) - fh
+
+    problem_configuration = {
+        "gap": 1,
+        "max_delay": 7,
+        "forecast_horizon": fh,
+        "time_index": "ts_data",
+    }
+    X_train, X_test, y_train, y_test = split_data(
+        X,
+        y,
+        problem_type="time series regression",
+        problem_configuration=problem_configuration,
+    )
+    assert len(X_train) == train_size
+    assert len(X_test) == test_size
+    assert len(y_train) == train_size
+    assert len(y_test) == test_size
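The arithmetic behind the two parametrizations, spelled out as a standalone sketch (this assumes ``X_y_regression`` yields its usual 100 rows; ``resolved_test_size`` is a hypothetical mirror of the logic in ``split_data``, not an evalml API):

def resolved_test_size(n_rows, forecast_horizon, default=0.1):
    # Mirrors the floor split_data applies for time series problems.
    return max(default, forecast_horizon / n_rows)

# "no_fh_limitation": 5 / 100 = 0.05 < 0.1, so the 10% default wins: 10 test rows.
assert round(resolved_test_size(100, 5) * 100) == 10

# "fh_limitation": 25 / 100 = 0.25 > 0.1, so the horizon dictates: 25 test rows.
assert round(resolved_test_size(100, 25) * 100) == 25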