Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
christopherbunn committed Sep 22, 2023
1 parent cfe7a2d commit 4efa089
Show file tree
Hide file tree
Showing 2 changed files with 119 additions and 0 deletions.
37 changes: 37 additions & 0 deletions evalml/pipelines/multiseries_regression_pipeline.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
"""Pipeline base class for time series regression problems."""
import pandas as pd
from woodwork.statistics_utils import infer_frequency

from evalml.pipelines.time_series_regression_pipeline import (
Expand Down Expand Up @@ -137,3 +138,39 @@ def predict_in_sample(
# Index will start at the unstacked index, so we need to reset it to the original index
stacked_predictions.index = X.index
return stacked_predictions

def get_forecast_period(self, X):
    """Generate the forecasting time points for every series in stacked multiseries data.

    The forecast dates come from the parent pipeline's ``get_forecast_period`` and are
    then cross-joined with the unique values of ``X``'s ``series_id`` column, so each
    time stamp appears once per series.

    Args:
        X (pd.DataFrame): Data the pipeline was trained on of shape [n_samples_train, n_features].
            It is indexed by the ``series_id`` column name, so it must contain that column.

    Raises:
        ValueError: If pipeline is not trained.

    Returns:
        pd.DataFrame: Dataframe containing a column with datetime periods from `gap` to `forecast_horizon + gap`
            per unique `series_id` value.
    """
    forecast_dates = super().get_forecast_period(X)
    # Name the series so the merged frame's date column carries the time index name.
    forecast_dates.name = self.time_index
    unique_series = X[self.series_id].unique()

    # Cross join: every forecast date is paired with every series id.
    series_column = pd.Series(unique_series, name=self.series_id)
    forecast_df = forecast_dates.to_frame().merge(series_column, how="cross")

    # Generate new numeric index: shift the parent's first label by the gap rows
    # of every series, then number the stacked rows consecutively from there.
    first_label = forecast_dates.index[0] + (self.gap * len(unique_series))
    forecast_df.index = pd.Series(
        range(first_label, first_label + len(forecast_df)),
    )
    return forecast_df
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import numpy as np
import pandas as pd
import pytest
from pandas._testing import assert_series_equal

from evalml.pipelines import MultiseriesRegressionPipeline
from evalml.preprocessing import split_multiseries_data
Expand Down Expand Up @@ -157,3 +159,83 @@ def test_multiseries_pipeline_predict(
dtype="float64",
)
pd.testing.assert_series_equal(y_pred, expected)


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
@pytest.mark.parametrize("numeric_idx", [True, False])
def test_time_series_get_forecast_period(
    forecast_horizon,
    gap,
    numeric_idx,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Check the shape, index, dates and columns of the multiseries forecast period."""
    X, y = multiseries_ts_data_stacked
    if numeric_idx:
        X = X.reset_index(drop=True)

    # Apply the parametrized horizon and gap to every parameter group that uses them.
    overrides = {"forecast_horizon": forecast_horizon, "gap": gap}
    for group in ("pipeline", "Time Series Featurizer", "Baseline Multiseries"):
        for param_name, param_value in overrides.items():
            pipeline_parameters[group][param_name] = param_value

    pipeline = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)

    # An unfitted pipeline must refuse to produce a forecast period.
    with pytest.raises(
        ValueError,
        match="Pipeline must be fitted before getting forecast.",
    ):
        pipeline.get_forecast_period(X)

    pipeline.fit(X, y)
    forecast_period = pipeline.get_forecast_period(X)

    n_series = len(X["series_id"].unique())
    n_forecast_rows = forecast_horizon * n_series

    assert forecast_period.shape[0] == n_forecast_rows

    # The index continues after the training rows, skipping the gap rows of every series.
    first_expected_label = len(X) + (gap * n_series)
    expected_index = range(
        first_expected_label,
        first_expected_label + n_forecast_rows,
    )
    assert all(forecast_period.index == expected_index)

    # First forecast date is (1 + gap) periods after the last training date.
    date_offset = np.timedelta64(1 + gap, pipeline.frequency)
    assert forecast_period.iloc[0]["date"] == X.iloc[-1]["date"] + date_offset

    assert np.issubdtype(forecast_period.dtypes["date"], np.datetime64)
    assert list(forecast_period.columns) == ["date", "series_id"]


@pytest.mark.parametrize("forecast_horizon,gap", [[3, 0], [5, 2], [2, 5]])
def test_time_series_get_forecast_predictions(
    forecast_horizon,
    gap,
    multiseries_ts_data_stacked,
    component_graph,
    pipeline_parameters,
):
    """Check that forecast predictions equal predictions made on the matching validation rows."""
    X, y = multiseries_ts_data_stacked

    # Offsets are expressed in stacked rows; the factor of 5 scales gap and horizon
    # to rows (presumably the number of series in the fixture -- confirm against it).
    train_rows = 25
    validation_start = train_rows + (gap * 5)
    validation_stop = validation_start + (forecast_horizon * 5)

    X_train = X.iloc[:train_rows]
    y_train = y.iloc[:train_rows]
    X_validation = X.iloc[validation_start:validation_stop]

    # Apply the parametrized horizon and gap to every parameter group that uses them.
    overrides = {"forecast_horizon": forecast_horizon, "gap": gap}
    for group in ("pipeline", "Time Series Featurizer", "Baseline Multiseries"):
        for param_name, param_value in overrides.items():
            pipeline_parameters[group][param_name] = param_value

    pipeline = MultiseriesRegressionPipeline(component_graph, pipeline_parameters)
    pipeline.fit(X_train, y_train)

    forecast_preds = pipeline.get_forecast_predictions(X=X_train, y=y_train)
    validation_preds = pipeline.predict(
        X_validation,
        X_train=X_train,
        y_train=y_train,
    )
    assert_series_equal(forecast_preds, validation_preds)

0 comments on commit 4efa089

Please sign in to comment.