Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fixed forecast period generation function for multiseries #4320

Merged
merged 9 commits into from
Sep 29, 2023
Merged
Show file tree
Hide file tree
Changes from 7 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Release Notes
* Extended DateTimeFormatCheck data check to support multiseries :pr:`4300`
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Fixed forecast period generation function for multiseries :pr:`4320`
* Changes
* Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
* Pinned pandas version under 2.1.0 :pr:`4315`
Expand Down
37 changes: 37 additions & 0 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Pipeline base class for time series regression problems."""
import pandas as pd
from woodwork.statistics_utils import infer_frequency

from evalml.pipelines.time_series_regression_pipeline import (
Expand Down Expand Up @@ -137,3 +138,39 @@ def predict_in_sample(
# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
return stacked_predictions

def get_forecast_period(self, X):
    """Generates all possible forecasting time points based on latest data point in X.

    For the multiseries case, each time stamp is duplicated for each unique value in `X`'s `series_id` column.

    Args:
        X (pd.DataFrame, np.ndarray): Data the pipeline was trained on of shape [n_samples_train, n_features].

    Raises:
        ValueError: If pipeline is not trained.

    Returns:
        pd.DataFrame: Dataframe containing a column with datetime periods from `gap` to `forecast_horizon + gap`
        per unique `series_id` value.
    """
    # Parent class produces the forecast dates for a single series.
    dates = super().get_forecast_period(X)
    dates.name = self.time_index
    series_id_values = X[self.series_id].unique()

    # Cross join duplicates every forecast date once per series id,
    # mirroring the stacked multiseries layout.
    new_period_df = dates.to_frame().merge(
        pd.Series(series_id_values, name=self.series_id),
        how="cross",
    )

    # Generate a new numeric index. The start is shifted by gap * n_series so
    # the stacked rows line up after the end of the stacked input data.
    # NOTE(review): assumes the parent's index already starts at the correct
    # single-series offset — confirm against TimeSeriesRegressionPipeline.
    num_idx = pd.Series(
        range(
            dates.index[0] + (self.gap * len(series_id_values)),
            dates.index[0]
            + (self.gap * len(series_id_values))
            + len(new_period_df),
        ),
    )
    new_period_df.index = num_idx
    return new_period_df
82 changes: 63 additions & 19 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,39 +205,83 @@ def get_prediction_intervals(
Raises:
MethodPropertyNotFoundError: If the estimator does not support Time Series Regression as a problem type.
"""
X_no_datetime, y_no_datetime = self._drop_time_index(X, y)
estimator_input = self.transform_all_but_final(
X_no_datetime,
y_no_datetime,
X_train=X_train,
y_train=y_train,
)
has_stl = STLDecomposer.name in list(
self.component_graph.component_instances.keys(),
)
if coverage is None:
coverage = [0.95]

if self.estimator.model_family in self.NO_PREDS_PI_ESTIMATORS and has_stl:
if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries

X, y = unstack_multiseries(
X,
y,
self.series_id,
self.time_index,
self.input_target_name,
)

X_no_datetime, y_no_datetime = self._drop_time_index(X, y)

estimator_input = self.transform_all_but_final(
X_no_datetime,
y_no_datetime,
X_train=X_train,
y_train=y_train,
)
pred_intervals = self.estimator.get_prediction_intervals(
X=estimator_input,
y=y,
coverage=coverage,
)
trans_pred_intervals = {}
intervals_labels = list(list(pred_intervals.values())[0].keys())
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I had to go into the debugger and play with the code myself to figure out what this line was doing 😅 A much simpler way:

intervals_labels = pred_intervals[0].keys()

That may need to be cast to a list for later on, I didn't test fully, but either way it's more readable

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm, this doesn't work since pred_intervals is a dict and this would pull the value at key 0 rather than the first value of the dictionary. Is there a better way to pull the first value?

Copy link
Contributor

@eccabay eccabay Sep 29, 2023

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah oops, I see what I missed. I don't know of a better way to manipulate the dictionaries, but we could also just do intervals_labels = pd.DataFrame(pred_intervals).index 😂

interval_series_pred_intervals = {
interval: {} for interval in intervals_labels
}
residuals = self.estimator.predict(
estimator_input,
) # Get residual values
trend_pred_intervals = self.get_component(
"STL Decomposer",
).get_trend_prediction_intervals(y, coverage=coverage)
for key, orig_pi_values in pred_intervals.items():
trans_pred_intervals[key] = pd.Series(
(orig_pi_values.values - residuals.values)
+ trend_pred_intervals[key].values
+ y.values,
index=orig_pi_values.index,
)
)
trans_pred_intervals = {}
if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
trend_pred_intervals = self.get_component(
"STL Decomposer",
).get_trend_prediction_intervals(y_no_datetime, coverage=coverage)
christopherbunn marked this conversation as resolved.
Show resolved Hide resolved
for series_id, intervals in pred_intervals.items():
for key, orig_pi_values in intervals.items():
series_id_target_name = (
self.input_target_name + "_" + str(series_id)
)
interval_series_pred_intervals[key][
series_id_target_name
] = pd.Series(
(orig_pi_values.values - residuals[series_id].values)
+ trend_pred_intervals[series_id_target_name][key].values
+ y[series_id_target_name].values,
index=orig_pi_values.index,
)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This is a lot of repeated code with the other logical branch, which is going to make life very hard for us if we ever need to update this code. Could you abstract it out into a local helper function?

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Something like

def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
    return_intervals = {}
    for key, orig_pi_values in intervals.items():
        return_intervals[key] = pd.Series(
            (orig_pi_values.values - residuals.values)
            + trend_pred_intervals[key].values
            + y.values,
            index=orig_pi_values.index
        )
    return return_intervals

if is_multiseries(problem_type):
    for series_id, series_intervals in pred_intervals.items():
        series_id_target_name = self.input_target_name + "_" + str(series_id)
        interval_series_pred_intervals[series_id_target_name] = _get_series_intervals(
            series_intervals,
            residuals[series_id],
            trend_pred_intervals[series_id_target_name],
            y[series_id_target_name]
        )
else:
    trans_pred_intervals = _get_series_intervals(pred_intervals, residuals, trend_pred_intervals, y)

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The code I suggested does make a change to the dictionary structure for the multiseries case, which you'll have to let me know if it works or not - I swapped the intervals with series ids, to give us {series_1: {0.75_lower: <>, 0.75_upper: <>, ...}, series_2: {...}...} instead of {0.75_lower: {series_1: <>, series_2: <>, ...}, ...}
Personally, I think this would make it easier to get per-series prediction intervals, but you'll have to let me know if it's too much effort to swap things around at this point. We could also completely overhaul the data structure for this to be something actually 2D like a dataframe instead of nested dictionaries, but that might just be tech debt for the future.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I ended up using your implementation but I tweaked it slightly. I still kept the original dictionary structure since it makes stacking each prediction interval in the end slightly easier. Let me know what you think!

for interval in intervals_labels:
series_id_df = pd.DataFrame(
interval_series_pred_intervals[interval],
)
stacked_pred_interval = stack_data(
data=series_id_df,
series_id_name=self.series_id,
)
trans_pred_intervals[interval] = stacked_pred_interval

else:
trend_pred_intervals = self.get_component(
"STL Decomposer",
).get_trend_prediction_intervals(y, coverage=coverage)
for key, orig_pi_values in pred_intervals.items():
trans_pred_intervals[key] = pd.Series(
(orig_pi_values.values - residuals.values)
+ trend_pred_intervals[key].values
+ y.values,
index=orig_pi_values.index,
)
return trans_pred_intervals
else:
future_vals = self.predict(
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest
from pandas._testing import assert_series_equal

from evalml.pipelines import MultiseriesRegressionPipeline
from evalml.pipelines.utils import unstack_multiseries
from evalml.preprocessing import split_multiseries_data


Expand Down Expand Up @@ -157,3 +162,191 @@ def test_multiseries_pipeline_predict(
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
@pytest.mark.parametrize("numeric_idx", [True, False])
def test_time_series_get_forecast_period(
    forecast_horizon,
    gap,
    numeric_idx,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Forecast period for multiseries data has one row per (date, series id) pair."""
    X, y = multiseries_ts_data_stacked
    if numeric_idx:
        X = X.reset_index(drop=True)

    # Propagate the parametrized horizon and gap into every component that needs them.
    for component in ("pipeline", "Time Series Featurizer", "Baseline Multiseries"):
        pipeline_parameters[component]["forecast_horizon"] = forecast_horizon
        pipeline_parameters[component]["gap"] = gap

    clf = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)

    # Requesting the forecast period before fitting must raise.
    with pytest.raises(
        ValueError,
        match="Pipeline must be fitted before getting forecast.",
    ):
        clf.get_forecast_period(X)

    clf.fit(X, y)
    result = clf.get_forecast_period(X)

    len_unique_series_id = X["series_id"].nunique()
    offset = gap * len_unique_series_id

    # One forecast row per series per horizon step.
    assert result.shape[0] == forecast_horizon * len_unique_series_id
    # Numeric index continues from the end of the stacked input, shifted by the gap.
    expected_index = range(
        len(X) + offset,
        len(X) + offset + (forecast_horizon * len_unique_series_id),
    )
    assert all(result.index == expected_index)
    # First forecast date is one step (plus gap) past the last observed date.
    expected_first_date = X.iloc[-1]["date"] + np.timedelta64(1 + gap, clf.frequency)
    assert result.iloc[0]["date"] == expected_first_date
    assert np.issubdtype(result.dtypes["date"], np.datetime64)
    assert list(result.columns) == ["date", "series_id"]


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
def test_time_series_get_forecast_predictions(
    forecast_horizon,
    gap,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Forecast predictions must equal predicting directly on the validation slice.

    Args:
        forecast_horizon: Number of periods to forecast per series.
        gap: Number of periods between training data and the forecast window.
        multiseries_ts_data_stacked: Fixture providing stacked multiseries X, y.
        component_graph: Fixture providing the pipeline's component graph.
        pipeline_parameters: Fixture providing baseline pipeline parameters.
    """
    X, y = multiseries_ts_data_stacked

    # Derive the series count from the data instead of hard-coding the fixture's
    # current value (previously a literal 5), so the test stays correct if the
    # fixture changes.
    n_series = X["series_id"].nunique()

    X_train, y_train = X.iloc[:25], y.iloc[:25]
    # Stacked data interleaves series, so each time step spans n_series rows;
    # the validation window therefore starts gap * n_series rows after training.
    start = 25 + (gap * n_series)
    X_validation = X.iloc[start : start + (forecast_horizon * n_series)]

    pipeline_parameters["pipeline"]["forecast_horizon"] = forecast_horizon
    pipeline_parameters["Time Series Featurizer"]["forecast_horizon"] = forecast_horizon
    pipeline_parameters["Baseline Multiseries"]["forecast_horizon"] = forecast_horizon
    pipeline_parameters["pipeline"]["gap"] = gap
    pipeline_parameters["Time Series Featurizer"]["gap"] = gap
    pipeline_parameters["Baseline Multiseries"]["gap"] = gap

    clf = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)

    clf.fit(X_train, y_train)
    forecast_preds = clf.get_forecast_predictions(X=X_train, y=y_train)
    X_val_preds = clf.predict(X_validation, X_train=X_train, y_train=y_train)
    assert_series_equal(forecast_preds, X_val_preds)


@pytest.mark.parametrize("set_coverage", [True, False])
@pytest.mark.parametrize("add_decomposer", [True, False])
@pytest.mark.parametrize("ts_native_estimator", [True, False])
def test_time_series_pipeline_get_prediction_intervals(
    ts_native_estimator,
    add_decomposer,
    set_coverage,
    multiseries_ts_data_stacked,
):
    """Prediction intervals are ordered (lower < upper) and nested across coverages.

    Checks that for each requested coverage the lower bound is strictly below the
    upper bound, and that narrower coverages produce intervals contained within
    broader ones.
    """
    X, y = multiseries_ts_data_stacked
    # Replace the fixture target with random noise so interval widths are nontrivial.
    y = pd.Series(np.random.rand(100), name="target")
    component_graph = {
        "Regressor": [
            # NOTE(review): both branches of this ternary are identical, so the
            # ts_native_estimator parametrization currently has no effect — confirm
            # whether a non-native estimator was intended for the False branch.
            "VARMAX Regressor" if ts_native_estimator else "VARMAX Regressor",
            "X" if not add_decomposer else "STL Decomposer.x",
            "y" if not add_decomposer else "STL Decomposer.y",
        ],
    }
    if add_decomposer:
        # Insert the decomposer upstream of the regressor.
        component_graph.update(
            {
                "STL Decomposer": [
                    "STL Decomposer",
                    "X",
                    "y",
                ],
            },
        )

    pipeline_parameters = {
        "pipeline": {
            "time_index": "date",
            "max_delay": 10,
            "forecast_horizon": 7,
            "gap": 0,
            "series_id": "series_id",
        },
    }

    pipeline = MultiseriesRegressionPipeline(
        component_graph=component_graph,
        parameters=pipeline_parameters,
    )
    X_train, y_train = X[:65], y[:65]
    X_validation, y_validation = X[65:], y[65:]
    # Unstack the training data to build the mocked STL transform output
    # (13 unstacked rows from 65 stacked rows across 5 series).
    mock_X, _ = unstack_multiseries(
        X_train,
        y_train,
        series_id="series_id",
        time_index="date",
        target_name="target",
    )
    mock_transform_return_value = (
        mock_X,
        pd.DataFrame(np.random.rand(13, 5)),
    )
    # Mock the STL transform so fitting does not depend on real decomposition.
    with patch(
        "evalml.pipelines.components.transformers.preprocessing.stl_decomposer.STLDecomposer.transform",
        MagicMock(return_value=mock_transform_return_value),
    ):
        pipeline.fit(X_train, y_train)

    coverage = [0.75, 0.85, 0.95] if set_coverage else None

    pl_intervals = pipeline.get_prediction_intervals(
        X=X_validation,
        y=y_validation,
        X_train=X_train,
        y_train=y_train,
        coverage=coverage,
    )

    # When coverage was left as None, the pipeline defaults to 95%.
    if set_coverage is False:
        coverage = [0.95]

    if set_coverage:
        # Nesting property: a narrower coverage's bounds lie inside a broader one's.
        pairs = [(0.75, 0.85), (0.85, 0.95)]
        for pair in pairs:
            assert all(
                [
                    narrower >= broader
                    for narrower, broader in zip(
                        pl_intervals[f"{pair[0]}_lower"],
                        pl_intervals[f"{pair[1]}_lower"],
                    )
                ],
            )
            assert all(
                [
                    narrower <= broader
                    for narrower, broader in zip(
                        pl_intervals[f"{pair[0]}_upper"],
                        pl_intervals[f"{pair[1]}_upper"],
                    )
                ],
            )
    # Ordering property: every lower bound is strictly below its upper bound.
    for cover_value in coverage:
        assert all(
            [
                lower < upper
                for lower, upper in zip(
                    pl_intervals[f"{cover_value}_lower"],
                    pl_intervals[f"{cover_value}_upper"],
                )
            ],
        )