Skip to content

Commit

Permalink
Fixed forecast period generation function for multiseries (#4320)
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn authored Sep 29, 2023
1 parent 5c3e832 commit da17fae
Show file tree
Hide file tree
Showing 4 changed files with 306 additions and 17 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@ Release Notes
* Extended DateTimeFormatCheck data check to support multiseries :pr:`4300`
* Extended TimeSeriesRegularizer to support multiseries :pr:`4303`
* Fixes
* Fixed forecast period generation function for multiseries :pr:`4320`
* Changes
* Updated ``split_data`` to call ``split_multiseries_data`` when passed stacked multiseries data :pr:`4312`
* Pinned pandas version under 2.1.0 :pr:`4315`
Expand Down
32 changes: 32 additions & 0 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Pipeline base class for time series regression problems."""
import pandas as pd
from woodwork.statistics_utils import infer_frequency

from evalml.pipelines.time_series_regression_pipeline import (
Expand Down Expand Up @@ -137,3 +138,34 @@ def predict_in_sample(
# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
return stacked_predictions

def get_forecast_period(self, X):
    """Generate every forecastable time point following the latest datetime in `X`.

    In the multiseries setting each forecast timestamp is repeated once per
    unique value in `X`'s `series_id` column. The input must be in stacked
    form so that the unique periods are produced correctly.

    Args:
        X (pd.DataFrame, np.ndarray): Stacked data the pipeline was trained on
            of shape [n_samples_train * n_series_ids, n_features].

    Raises:
        ValueError: If pipeline is not trained.

    Returns:
        pd.DataFrame: Dataframe containing a column with datetime periods from
            `gap` to `forecast_horizon + gap` per unique `series_id` value.
    """
    # The base class yields the datetime periods for a single series.
    forecast_dates = super().get_forecast_period(X)
    forecast_dates.name = self.time_index
    unique_series_ids = X[self.series_id].unique()

    # A cross join duplicates every forecast timestamp once per series id.
    period_frame = forecast_dates.to_frame().merge(
        pd.Series(unique_series_ids, name=self.series_id),
        how="cross",
    )

    # Re-number the index so it continues past the stacked input,
    # shifting by the gap once for every series.
    first_index = forecast_dates.index[0] + (self.gap * len(unique_series_ids))
    period_frame.index = pd.Series(
        range(first_index, first_index + len(period_frame)),
    )
    return period_frame
97 changes: 80 additions & 17 deletions evalml/pipelines/time_series_regression_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
from evalml.model_family import ModelFamily
from evalml.pipelines.components import STLDecomposer
from evalml.pipelines.time_series_pipeline_base import TimeSeriesPipelineBase
from evalml.problem_types import ProblemTypes
from evalml.problem_types import ProblemTypes, is_multiseries
from evalml.utils.woodwork_utils import infer_feature_types


Expand Down Expand Up @@ -205,40 +205,103 @@ def get_prediction_intervals(
Raises:
MethodPropertyNotFoundError: If the estimator does not support Time Series Regression as a problem type.
"""
X_no_datetime, y_no_datetime = self._drop_time_index(X, y)
estimator_input = self.transform_all_but_final(
X_no_datetime,
y_no_datetime,
X_train=X_train,
y_train=y_train,
)
has_stl = STLDecomposer.name in list(
self.component_graph.component_instances.keys(),
)
if coverage is None:
coverage = [0.95]

if self.estimator.model_family in self.NO_PREDS_PI_ESTIMATORS and has_stl:

def _get_series_intervals(intervals, residuals, trend_pred_intervals, y):
return_intervals = {}
for key, orig_pi_values in intervals.items():
return_intervals[key] = pd.Series(
(orig_pi_values.values - residuals.values)
+ trend_pred_intervals[key].values
+ y.values,
index=orig_pi_values.index,
)
return return_intervals

if self.problem_type == ProblemTypes.MULTISERIES_TIME_SERIES_REGRESSION:
from evalml.pipelines.utils import stack_data, unstack_multiseries

X, y = unstack_multiseries(
X,
y,
self.series_id,
self.time_index,
self.input_target_name,
)

X_no_datetime, y_no_datetime = self._drop_time_index(X, y)

estimator_input = self.transform_all_but_final(
X_no_datetime,
y_no_datetime,
X_train=X_train,
y_train=y_train,
)
pred_intervals = self.estimator.get_prediction_intervals(
X=estimator_input,
y=y,
coverage=coverage,
)
trans_pred_intervals = {}
residuals = self.estimator.predict(
estimator_input,
) # Get residual values
)
transformed_pred_intervals = {}
trend_pred_intervals = self.get_component(
"STL Decomposer",
).get_trend_prediction_intervals(y, coverage=coverage)
for key, orig_pi_values in pred_intervals.items():
trans_pred_intervals[key] = pd.Series(
(orig_pi_values.values - residuals.values)
+ trend_pred_intervals[key].values
+ y.values,
index=orig_pi_values.index,

if is_multiseries(self.problem_type):
# Coverage label is label for each prediction interval limit(e.g. "0.95_lower")
coverage_labels = list(list(pred_intervals.values())[0].keys())

# Store prediction interval data in {coverage_label: {series_id: bound_value}}
interval_series_pred_intervals = {
coverage_label: {} for coverage_label in coverage_labels
}

# `pred_intervals` are in {series_id: {coverage_label: bound_value}} form
for series_id, series_intervals in pred_intervals.items():
series_id_target_name = (
self.input_target_name + "_" + str(series_id)
)
series_id_prediction_intervals = _get_series_intervals(
series_intervals,
residuals[series_id],
trend_pred_intervals[series_id_target_name],
y[series_id_target_name],
)
# Store `series_id_prediction_intervals` data in `interval_series_pred_intervals` format
for (
coverage_label,
bound_value,
) in series_id_prediction_intervals.items():
interval_series_pred_intervals[coverage_label][
series_id_target_name
] = bound_value
# Stack bound data for each coverage label so each bound has a single pd.Series
for coverage_label in coverage_labels:
series_id_interval_df = pd.DataFrame(
interval_series_pred_intervals[coverage_label],
)
stacked_pred_interval = stack_data(
data=series_id_interval_df,
series_id_name=self.series_id,
)
transformed_pred_intervals[coverage_label] = stacked_pred_interval
else:
transformed_pred_intervals = _get_series_intervals(
pred_intervals,
residuals,
trend_pred_intervals,
y,
)
return trans_pred_intervals
return transformed_pred_intervals
else:
future_vals = self.predict(
X=X,
Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,12 @@
from unittest.mock import MagicMock, patch

import numpy as np
import pandas as pd
import pytest
from pandas._testing import assert_series_equal

from evalml.pipelines import MultiseriesRegressionPipeline
from evalml.pipelines.utils import unstack_multiseries
from evalml.preprocessing import split_multiseries_data


Expand Down Expand Up @@ -157,3 +162,191 @@ def test_multiseries_pipeline_predict(
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
@pytest.mark.parametrize("numeric_idx", [True, False])
def test_time_series_get_forecast_period(
    forecast_horizon,
    gap,
    numeric_idx,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Forecast periods should cover horizon * n_series rows with a continued index."""
    X, y = multiseries_ts_data_stacked
    if numeric_idx:
        X = X.reset_index(drop=True)

    # Keep horizon/gap consistent across the pipeline and its components.
    for name in ("pipeline", "Time Series Featurizer", "Baseline Multiseries"):
        pipeline_parameters[name]["forecast_horizon"] = forecast_horizon
        pipeline_parameters[name]["gap"] = gap

    clf = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)

    # Calling before fitting must raise.
    with pytest.raises(
        ValueError,
        match="Pipeline must be fitted before getting forecast.",
    ):
        clf.get_forecast_period(X)

    clf.fit(X, y)
    result = clf.get_forecast_period(X)

    n_series = len(X["series_id"].unique())
    expected_start = len(X) + (gap * n_series)
    expected_len = forecast_horizon * n_series

    assert result.shape[0] == expected_len
    assert all(result.index == range(expected_start, expected_start + expected_len))
    # First forecast date sits 1 + gap frequency units after the last input date.
    assert result.iloc[0]["date"] == X.iloc[-1]["date"] + np.timedelta64(
        1 + gap,
        clf.frequency,
    )
    assert np.issubdtype(result.dtypes["date"], np.datetime64)
    assert list(result.columns) == ["date", "series_id"]


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
def test_time_series_get_forecast_predictions(
    forecast_horizon,
    gap,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Forecast predictions should match direct prediction on the validation slice."""
    X, y = multiseries_ts_data_stacked

    X_train, y_train = X.iloc[:25], y.iloc[:25]
    # 5 series ids -> the validation window holds forecast_horizon rows per series,
    # offset by the gap.
    validation_start = 25 + (gap * 5)
    X_validation = X.iloc[validation_start : validation_start + (forecast_horizon * 5)]

    for name in ("pipeline", "Time Series Featurizer", "Baseline Multiseries"):
        pipeline_parameters[name]["forecast_horizon"] = forecast_horizon
        pipeline_parameters[name]["gap"] = gap

    clf = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)
    clf.fit(X_train, y_train)

    forecast_preds = clf.get_forecast_predictions(X=X_train, y=y_train)
    X_val_preds = clf.predict(X_validation, X_train=X_train, y_train=y_train)
    assert_series_equal(forecast_preds, X_val_preds)


@pytest.mark.parametrize("set_coverage", [True, False])
@pytest.mark.parametrize("add_decomposer", [True, False])
@pytest.mark.parametrize("ts_native_estimator", [True, False])
def test_time_series_pipeline_get_prediction_intervals(
    ts_native_estimator,
    add_decomposer,
    set_coverage,
    multiseries_ts_data_stacked,
):
    """Prediction intervals are ordered (lower < upper) and nested across coverages."""
    X, y = multiseries_ts_data_stacked
    y = pd.Series(np.random.rand(100), name="target")
    # VARMAX is the only multiseries estimator available, so the
    # `ts_native_estimator` flag does not change the regressor used.
    # (The previous `"VARMAX Regressor" if ts_native_estimator else
    # "VARMAX Regressor"` ternary was a no-op and has been removed.)
    component_graph = {
        "Regressor": [
            "VARMAX Regressor",
            "X" if not add_decomposer else "STL Decomposer.x",
            "y" if not add_decomposer else "STL Decomposer.y",
        ],
    }
    if add_decomposer:
        component_graph.update(
            {
                "STL Decomposer": [
                    "STL Decomposer",
                    "X",
                    "y",
                ],
            },
        )

    pipeline_parameters = {
        "pipeline": {
            "time_index": "date",
            "max_delay": 10,
            "forecast_horizon": 7,
            "gap": 0,
            "series_id": "series_id",
        },
    }

    pipeline = MultiseriesRegressionPipeline(
        component_graph=component_graph,
        parameters=pipeline_parameters,
    )
    X_train, y_train = X[:65], y[:65]
    X_validation, y_validation = X[65:], y[65:]
    mock_X, _ = unstack_multiseries(
        X_train,
        y_train,
        series_id="series_id",
        time_index="date",
        target_name="target",
    )
    mock_transform_return_value = (
        mock_X,
        pd.DataFrame(np.random.rand(13, 5)),
    )
    # Patch the decomposer transform so fitting does not depend on STL internals.
    with patch(
        "evalml.pipelines.components.transformers.preprocessing.stl_decomposer.STLDecomposer.transform",
        MagicMock(return_value=mock_transform_return_value),
    ):
        pipeline.fit(X_train, y_train)

    coverage = [0.75, 0.85, 0.95] if set_coverage else None

    pl_intervals = pipeline.get_prediction_intervals(
        X=X_validation,
        y=y_validation,
        X_train=X_train,
        y_train=y_train,
        coverage=coverage,
    )

    if not set_coverage:
        # `get_prediction_intervals` defaults to a single 95% interval.
        coverage = [0.95]

    if set_coverage:
        # Narrower coverage intervals must nest inside broader ones:
        # higher lower bounds, lower upper bounds.
        for narrower_cov, broader_cov in [(0.75, 0.85), (0.85, 0.95)]:
            assert all(
                narrower >= broader
                for narrower, broader in zip(
                    pl_intervals[f"{narrower_cov}_lower"],
                    pl_intervals[f"{broader_cov}_lower"],
                )
            )
            assert all(
                narrower <= broader
                for narrower, broader in zip(
                    pl_intervals[f"{narrower_cov}_upper"],
                    pl_intervals[f"{broader_cov}_upper"],
                )
            )
    # Every lower bound must fall strictly below its matching upper bound.
    for cover_value in coverage:
        assert all(
            lower < upper
            for lower, upper in zip(
                pl_intervals[f"{cover_value}_lower"],
                pl_intervals[f"{cover_value}_upper"],
            )
        )

0 comments on commit da17fae

Please sign in to comment.