Update split_data to call split_multiseries_data #4312

Merged: 4 commits merged on Sep 19, 2023
Changes from all commits
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
@@ -10,6 +10,7 @@ Release Notes
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Changes
* Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
* Documentation Changes
* Removed LightGBM's excessive amount of warnings :pr:`4308`
* Testing Changes
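A minimal usage sketch of the behavior described in the release-notes entry above (not part of the PR). The toy two-series frame and its column names (date, series_id, feature) are invented for illustration; the split_data call itself mirrors the updated tests later in this diff, and holdout sizing is left to split_multiseries_data's defaults.

    import pandas as pd
    from evalml.preprocessing import split_data

    # Stacked multiseries data: one row per (date, series) pair.
    dates = pd.date_range("2023-01-01", periods=10).tolist()
    X = pd.DataFrame(
        {
            "date": dates * 2,
            "series_id": ["a"] * 10 + ["b"] * 10,
            "feature": range(20),
        },
    )
    y = pd.Series(range(20), name="target")

    # With stacked data and a multiseries problem type, split_data now delegates
    # to split_multiseries_data instead of splitting the stacked frame directly.
    X_train, X_holdout, y_train, y_holdout = split_data(
        X,
        y,
        problem_type="multiseries time series regression",
        problem_configuration={"time_index": "date", "series_id": "series_id"},
    )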
31 changes: 30 additions & 1 deletion evalml/preprocessing/utils.py
@@ -4,7 +4,12 @@

from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries
from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import is_classification, is_regression, is_time_series
from evalml.problem_types import (
is_classification,
is_multiseries,
is_regression,
is_time_series,
)
from evalml.utils import infer_feature_types


@@ -118,6 +123,9 @@ def split_data(
Returns:
pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.

Raises:
ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems.

Examples:
>>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"])
>>> y = pd.Series([8, 9, 10, 11, 12, 13])
Expand All @@ -144,6 +152,27 @@ def split_data(
1 9
dtype: int64
"""
if is_multiseries(problem_type) and isinstance(y, pd.Series):
if problem_configuration is None:
raise ValueError(
"split_data requires problem_configuration for multiseries problems",
)
series_id = problem_configuration.get("series_id")
time_index = problem_configuration.get("time_index")
if series_id is None or time_index is None:
raise ValueError(
"split_data needs both series_id and time_index values in the problem_configuration to split multiseries data",
)
return split_multiseries_data(
X,
y,
series_id,
time_index,
problem_configuration=problem_configuration,
test_size=test_size,
random_seed=random_seed,
)

X = infer_feature_types(X)
y = infer_feature_types(y)

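For reference, a short sketch of the two error paths guarded by the validation added above (not part of the PR). The stacked placeholder frame and its column names are illustrative; both failures correspond to the ValueErrors raised in this hunk.

    import pandas as pd
    from evalml.preprocessing import split_data

    # Tiny stacked placeholders; the column names are illustrative.
    X = pd.DataFrame(
        {
            "date": pd.date_range("2023-01-01", periods=6).tolist() * 2,
            "series_id": ["a"] * 6 + ["b"] * 6,
        },
    )
    y = pd.Series(range(12))

    try:
        # No problem_configuration at all -> first ValueError above.
        split_data(X, y, problem_type="multiseries time series regression")
    except ValueError as err:
        print(err)

    try:
        # problem_configuration present but missing series_id -> second ValueError above.
        split_data(
            X,
            y,
            problem_type="multiseries time series regression",
            problem_configuration={"time_index": "date"},
        )
    except ValueError as err:
        print(err)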
87 changes: 75 additions & 12 deletions evalml/tests/preprocessing_tests/test_split_data.py
@@ -6,6 +6,7 @@
ProblemTypes,
is_binary,
is_multiclass,
is_multiseries,
is_regression,
is_time_series,
)
@@ -19,6 +20,7 @@ def test_split_data(
X_y_binary,
X_y_multi,
X_y_regression,
multiseries_ts_data_unstacked,
make_data_type,
):
if is_binary(problem_type):
@@ -30,6 +32,8 @@
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked

X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
@@ -50,17 +54,28 @@
assert len(y_test) == test_size
assert isinstance(X_train, pd.DataFrame)
assert isinstance(X_test, pd.DataFrame)
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
if not is_multiseries(problem_type):
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
else:
assert isinstance(y_train, pd.DataFrame)
assert isinstance(y_test, pd.DataFrame)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_frame_equal(y_test, y[int(train_size) :], check_dtype=False)

if is_time_series(problem_type):
if is_time_series(problem_type) and not is_multiseries(problem_type):
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)


@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
def test_split_data_defaults(
problem_type,
data_type,
get_test_data_from_configuration,
multiseries_ts_data_unstacked,
):
X, y = get_test_data_from_configuration(
data_type,
problem_type,
@@ -71,6 +86,8 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked
test_pct = 0.1
else:
test_pct = 0.2
@@ -92,7 +109,18 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu
X = pd.DataFrame(X)
y = pd.Series(y)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
if not is_multiseries(problem_type):
pd.testing.assert_series_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)
else:
pd.testing.assert_frame_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)


@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
@@ -127,8 +155,33 @@ def test_split_data_ts(test, X_y_regression):
assert len(y_test) == test_size


def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked):
X, y = multiseries_ts_data_stacked
with pytest.raises(
ValueError,
match="requires problem_configuration for multiseries",
):
split_data(X, y, problem_type="multiseries time series regression")

with pytest.raises(
ValueError,
match="needs both series_id and time_index values in the problem_configuration",
):
split_data(
X,
y,
problem_type="multiseries time series regression",
problem_configuration={"time_index": "date"},
)


@pytest.mark.parametrize("no_features", [True, False])
def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"])
def test_split_multiseries_data(
no_features,
splitting_function,
multiseries_ts_data_stacked,
):
X, y = multiseries_ts_data_stacked

if no_features:
Expand All @@ -137,12 +190,22 @@ def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
X_train_expected, X_holdout_expected = X[:-10], X[-10:]
y_train_expected, y_holdout_expected = y[:-10], y[-10:]

X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)
# Results should be identical whether split_multiseries_data is called through
# split_data or directly
if splitting_function == "split_data":
X_train, X_holdout, y_train, y_holdout = split_data(
X,
y,
problem_type="multiseries time series regression",
problem_configuration={"time_index": "date", "series_id": "series_id"},
)
else:
X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)

pd.testing.assert_frame_equal(
X_train.sort_index(axis=1),