Skip to content

Commit

Permalink
Merge branch 'main' into 4313_pandas_version
Browse files Browse the repository at this point in the history
  • Loading branch information
eccabay authored Sep 19, 2023
2 parents c205f45 + cf6bc94 commit ab9b09d
Show file tree
Hide file tree
Showing 5 changed files with 109 additions and 13 deletions.
2 changes: 2 additions & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,9 @@ Release Notes
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Changes
* Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
* Documentation Changes
* Suppressed LightGBM's excessive warnings :pr:`4308`
* Testing Changes

.. warning::
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,6 +108,7 @@ def __init__(
"n_jobs": n_jobs,
"bagging_freq": bagging_freq,
"bagging_fraction": bagging_fraction,
"verbose": -1,
}
parameters.update(kwargs)
lg_parameters = copy.copy(parameters)
Expand Down
31 changes: 30 additions & 1 deletion evalml/preprocessing/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,12 @@

from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries
from evalml.preprocessing.data_splitters import TrainingValidationSplit
from evalml.problem_types import is_classification, is_regression, is_time_series
from evalml.problem_types import (
is_classification,
is_multiseries,
is_regression,
is_time_series,
)
from evalml.utils import infer_feature_types


Expand Down Expand Up @@ -118,6 +123,9 @@ def split_data(
Returns:
pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets.
Raises:
ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems.
Examples:
>>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"])
>>> y = pd.Series([8, 9, 10, 11, 12, 13])
Expand All @@ -144,6 +152,27 @@ def split_data(
1 9
dtype: int64
"""
if is_multiseries(problem_type) and isinstance(y, pd.Series):
if problem_configuration is None:
raise ValueError(
"split_data requires problem_configuration for multiseries problems",
)
series_id = problem_configuration.get("series_id")
time_index = problem_configuration.get("time_index")
if series_id is None or time_index is None:
raise ValueError(
"split_data needs both series_id and time_index values in the problem_configuration to split multiseries data",
)
return split_multiseries_data(
X,
y,
series_id,
time_index,
problem_configuration=problem_configuration,
test_size=test_size,
random_seed=random_seed,
)

X = infer_feature_types(X)
y = infer_feature_types(y)

Expand Down
1 change: 1 addition & 0 deletions evalml/tests/component_tests/test_components.py
Original file line number Diff line number Diff line change
Expand Up @@ -492,6 +492,7 @@ def test_describe_component():
"n_jobs": -1,
"bagging_fraction": 0.9,
"bagging_freq": 0,
"verbose": -1,
},
}
assert lg_regressor.describe(return_dict=True) == {
Expand Down
87 changes: 75 additions & 12 deletions evalml/tests/preprocessing_tests/test_split_data.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
ProblemTypes,
is_binary,
is_multiclass,
is_multiseries,
is_regression,
is_time_series,
)
Expand All @@ -19,6 +20,7 @@ def test_split_data(
X_y_binary,
X_y_multi,
X_y_regression,
multiseries_ts_data_unstacked,
make_data_type,
):
if is_binary(problem_type):
Expand All @@ -30,6 +32,8 @@ def test_split_data(
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked

X = make_data_type(data_type, X)
y = make_data_type(data_type, y)
Expand All @@ -50,17 +54,28 @@ def test_split_data(
assert len(y_test) == test_size
assert isinstance(X_train, pd.DataFrame)
assert isinstance(X_test, pd.DataFrame)
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
if not is_multiseries(problem_type):
assert isinstance(y_train, pd.Series)
assert isinstance(y_test, pd.Series)
else:
assert isinstance(y_train, pd.DataFrame)
assert isinstance(y_test, pd.DataFrame)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_frame_equal(y_test, y[int(train_size) :], check_dtype=False)

if is_time_series(problem_type):
if is_time_series(problem_type) and not is_multiseries(problem_type):
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)


@pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types)
@pytest.mark.parametrize("data_type", ["np", "pd", "ww"])
def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration):
def test_split_data_defaults(
problem_type,
data_type,
get_test_data_from_configuration,
multiseries_ts_data_unstacked,
):
X, y = get_test_data_from_configuration(
data_type,
problem_type,
Expand All @@ -71,6 +86,8 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu
problem_configuration = None
if is_time_series(problem_type):
problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"}
if is_multiseries(problem_type):
X, y = multiseries_ts_data_unstacked
test_pct = 0.1
else:
test_pct = 0.2
Expand All @@ -92,7 +109,18 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu
X = pd.DataFrame(X)
y = pd.Series(y)
pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False)
pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False)
if not is_multiseries(problem_type):
pd.testing.assert_series_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)
else:
pd.testing.assert_frame_equal(
y_test,
y[int(train_size) :],
check_dtype=False,
)


@pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"])
Expand Down Expand Up @@ -127,8 +155,33 @@ def test_split_data_ts(test, X_y_regression):
assert len(y_test) == test_size


def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked):
    """Check that split_data raises ValueError for incomplete multiseries configs.

    Two cases are exercised with stacked multiseries data:
    no ``problem_configuration`` at all, and a ``problem_configuration``
    that supplies ``time_index`` but omits ``series_id``.
    """
    X, y = multiseries_ts_data_stacked
    multiseries_problem = "multiseries time series regression"

    # Case 1: the problem_configuration argument is not passed.
    expect_missing_config = pytest.raises(
        ValueError,
        match="requires problem_configuration for multiseries",
    )
    with expect_missing_config:
        split_data(X, y, problem_type=multiseries_problem)

    # Case 2: the configuration carries a time_index but no series_id.
    incomplete_configuration = {"time_index": "date"}
    expect_missing_keys = pytest.raises(
        ValueError,
        match="needs both series_id and time_index values in the problem_configuration",
    )
    with expect_missing_keys:
        split_data(
            X,
            y,
            problem_type=multiseries_problem,
            problem_configuration=incomplete_configuration,
        )


@pytest.mark.parametrize("no_features", [True, False])
def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"])
def test_split_multiseries_data(
no_features,
splitting_function,
multiseries_ts_data_stacked,
):
X, y = multiseries_ts_data_stacked

if no_features:
Expand All @@ -137,12 +190,22 @@ def test_split_multiseries_data(no_features, multiseries_ts_data_stacked):
X_train_expected, X_holdout_expected = X[:-10], X[-10:]
y_train_expected, y_holdout_expected = y[:-10], y[-10:]

X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)
# Results should be identical whether split_multiseries_data is called through
# split_data or directly
if splitting_function == "split_data":
X_train, X_holdout, y_train, y_holdout = split_data(
X,
y,
problem_type="multiseries time series regression",
problem_configuration={"time_index": "date", "series_id": "series_id"},
)
else:
X_train, X_holdout, y_train, y_holdout = split_multiseries_data(
X,
y,
"series_id",
"date",
)

pd.testing.assert_frame_equal(
X_train.sort_index(axis=1),
Expand Down

0 comments on commit ab9b09d

Please sign in to comment.