From 90b0e5c317429a34fc2e7cf99a6592c1b3118772 Mon Sep 17 00:00:00 2001 From: Michael Fu Date: Tue, 19 Sep 2023 09:37:00 -0700 Subject: [PATCH 1/2] Make LightGBM not verbose (#4308) * Made lightgbm not verbose --- docs/source/release_notes.rst | 1 + .../components/estimators/classifiers/lightgbm_classifier.py | 1 + evalml/tests/component_tests/test_components.py | 1 + 3 files changed, 3 insertions(+) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 40174fd6df..54c565a9da 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -11,6 +11,7 @@ Release Notes * Fixes * Changes * Documentation Changes + * Removed LightGBM's excessive amount of warnings :pr:`4308` * Testing Changes .. warning:: diff --git a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py index 66a66be176..04190f901a 100644 --- a/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py +++ b/evalml/pipelines/components/estimators/classifiers/lightgbm_classifier.py @@ -108,6 +108,7 @@ def __init__( "n_jobs": n_jobs, "bagging_freq": bagging_freq, "bagging_fraction": bagging_fraction, + "verbose": -1, } parameters.update(kwargs) lg_parameters = copy.copy(parameters) diff --git a/evalml/tests/component_tests/test_components.py b/evalml/tests/component_tests/test_components.py index 0d8650eb71..90ec22f5c4 100644 --- a/evalml/tests/component_tests/test_components.py +++ b/evalml/tests/component_tests/test_components.py @@ -492,6 +492,7 @@ def test_describe_component(): "n_jobs": -1, "bagging_fraction": 0.9, "bagging_freq": 0, + "verbose": -1, }, } assert lg_regressor.describe(return_dict=True) == { From cf6bc94a766730fdbed1006c97366ccf437879aa Mon Sep 17 00:00:00 2001 From: Becca McBrayer Date: Tue, 19 Sep 2023 13:12:51 -0400 Subject: [PATCH 2/2] Update split_data to call split_multiseries_data (#4312) * Update split_data to call split_multiseries_data --- docs/source/release_notes.rst | 1 + evalml/preprocessing/utils.py | 31 ++++++- .../preprocessing_tests/test_split_data.py | 87 ++++++++++++++++--- 3 files changed, 106 insertions(+), 13 deletions(-) diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst index 54c565a9da..efcfb028dc 100644 --- a/docs/source/release_notes.rst +++ b/docs/source/release_notes.rst @@ -10,6 +10,7 @@ Release Notes * Extended TimeSeriesRegularizer to support multiseries :pr:`4303` * Fixes * Changes + * Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312` * Documentation Changes * Removed LightGBM's excessive amount of warnings :pr:`4308` * Testing Changes diff --git a/evalml/preprocessing/utils.py b/evalml/preprocessing/utils.py index 6e7c203611..dc17e75ee8 100644 --- a/evalml/preprocessing/utils.py +++ b/evalml/preprocessing/utils.py @@ -4,7 +4,12 @@ from evalml.pipelines.utils import stack_data, stack_X, unstack_multiseries from evalml.preprocessing.data_splitters import TrainingValidationSplit -from evalml.problem_types import is_classification, is_regression, is_time_series +from evalml.problem_types import ( + is_classification, + is_multiseries, + is_regression, + is_time_series, +) from evalml.utils import infer_feature_types @@ -118,6 +123,9 @@ def split_data( Returns: pd.DataFrame, pd.DataFrame, pd.Series, pd.Series: Feature and target data each split into train and test sets. + Raises: + ValueError: If the problem_configuration is missing or does not contain both a time_index and series_id for multiseries problems. + Examples: >>> X = pd.DataFrame([1, 2, 3, 4, 5, 6], columns=["First"]) >>> y = pd.Series([8, 9, 10, 11, 12, 13]) @@ -144,6 +152,27 @@ def split_data( 1 9 dtype: int64 """ + if is_multiseries(problem_type) and isinstance(y, pd.Series): + if problem_configuration is None: + raise ValueError( + "split_data requires problem_configuration for multiseries problems", + ) + series_id = problem_configuration.get("series_id") + time_index = problem_configuration.get("time_index") + if series_id is None or time_index is None: + raise ValueError( + "split_data needs both series_id and time_index values in the problem_configuration to split multiseries data", + ) + return split_multiseries_data( + X, + y, + series_id, + time_index, + problem_configuration=problem_configuration, + test_size=test_size, + random_seed=random_seed, + ) + X = infer_feature_types(X) y = infer_feature_types(y) diff --git a/evalml/tests/preprocessing_tests/test_split_data.py b/evalml/tests/preprocessing_tests/test_split_data.py index cbb8c941ed..f5e494d57c 100644 --- a/evalml/tests/preprocessing_tests/test_split_data.py +++ b/evalml/tests/preprocessing_tests/test_split_data.py @@ -6,6 +6,7 @@ ProblemTypes, is_binary, is_multiclass, + is_multiseries, is_regression, is_time_series, ) @@ -19,6 +20,7 @@ def test_split_data( X_y_binary, X_y_multi, X_y_regression, + multiseries_ts_data_unstacked, make_data_type, ): if is_binary(problem_type): @@ -30,6 +32,8 @@ def test_split_data( problem_configuration = None if is_time_series(problem_type): problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} + if is_multiseries(problem_type): + X, y = multiseries_ts_data_unstacked X = make_data_type(data_type, X) y = make_data_type(data_type, y) @@ -50,17 +54,28 @@ def test_split_data( assert len(y_test) == test_size assert isinstance(X_train, pd.DataFrame) assert isinstance(X_test, pd.DataFrame) - assert isinstance(y_train, pd.Series) - assert isinstance(y_test, pd.Series) + if not is_multiseries(problem_type): + assert isinstance(y_train, pd.Series) + assert isinstance(y_test, pd.Series) + else: + assert isinstance(y_train, pd.DataFrame) + assert isinstance(y_test, pd.DataFrame) + pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) + pd.testing.assert_frame_equal(y_test, y[int(train_size) :], check_dtype=False) - if is_time_series(problem_type): + if is_time_series(problem_type) and not is_multiseries(problem_type): pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) @pytest.mark.parametrize("problem_type", ProblemTypes.all_problem_types) @pytest.mark.parametrize("data_type", ["np", "pd", "ww"]) -def test_split_data_defaults(problem_type, data_type, get_test_data_from_configuration): +def test_split_data_defaults( + problem_type, + data_type, + get_test_data_from_configuration, + multiseries_ts_data_unstacked, +): X, y = get_test_data_from_configuration( data_type, problem_type, @@ -71,6 +86,8 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu problem_configuration = None if is_time_series(problem_type): problem_configuration = {"gap": 1, "max_delay": 7, "time_index": "date"} + if is_multiseries(problem_type): + X, y = multiseries_ts_data_unstacked test_pct = 0.1 else: test_pct = 0.2 @@ -92,7 +109,18 @@ def test_split_data_defaults(problem_type, data_type, get_test_data_from_configu X = pd.DataFrame(X) y = pd.Series(y) pd.testing.assert_frame_equal(X_test, X[int(train_size) :], check_dtype=False) - pd.testing.assert_series_equal(y_test, y[int(train_size) :], check_dtype=False) + if not is_multiseries(problem_type): + pd.testing.assert_series_equal( + y_test, + y[int(train_size) :], + check_dtype=False, + ) + else: + pd.testing.assert_frame_equal( + y_test, + y[int(train_size) :], + check_dtype=False, + ) @pytest.mark.parametrize("test", ["fh_limitation", "no_fh_limitation"]) @@ -127,8 +155,33 @@ def test_split_data_ts(test, X_y_regression): assert len(y_test) == test_size +def test_split_data_calls_multiseries_error(multiseries_ts_data_stacked): + X, y = multiseries_ts_data_stacked + with pytest.raises( + ValueError, + match="requires problem_configuration for multiseries", + ): + split_data(X, y, problem_type="multiseries time series regression") + + with pytest.raises( + ValueError, + match="needs both series_id and time_index values in the problem_configuration", + ): + split_data( + X, + y, + problem_type="multiseries time series regression", + problem_configuration={"time_index": "date"}, + ) + + @pytest.mark.parametrize("no_features", [True, False]) -def test_split_multiseries_data(no_features, multiseries_ts_data_stacked): +@pytest.mark.parametrize("splitting_function", ["split_data", "split_multiseries_data"]) +def test_split_multiseries_data( + no_features, + splitting_function, + multiseries_ts_data_stacked, +): X, y = multiseries_ts_data_stacked if no_features: @@ -137,12 +190,22 @@ def test_split_multiseries_data(no_features, multiseries_ts_data_stacked): X_train_expected, X_holdout_expected = X[:-10], X[-10:] y_train_expected, y_holdout_expected = y[:-10], y[-10:] - X_train, X_holdout, y_train, y_holdout = split_multiseries_data( - X, - y, - "series_id", - "date", - ) + # Results should be identical whether split_multiseries_data is called through + # split_data or directly + if splitting_function == "split_data": + X_train, X_holdout, y_train, y_holdout = split_data( + X, + y, + problem_type="multiseries time series regression", + problem_configuration={"time_index": "date", "series_id": "series_id"}, + ) + else: + X_train, X_holdout, y_train, y_holdout = split_multiseries_data( + X, + y, + "series_id", + "date", + ) pd.testing.assert_frame_equal( X_train.sort_index(axis=1),