Skip to content

Commit

Permalink
Add the sp parameter to ARIMA models (#3597)
Browse files Browse the repository at this point in the history
* Move component_obj init to fit, add sp param

* New tests for sp parameter

* Return default sp to 1
  • Loading branch information
eccabay authored Jul 8, 2022
1 parent 26896e9 commit a303470
Show file tree
Hide file tree
Showing 3 changed files with 124 additions and 18 deletions.
1 change: 1 addition & 0 deletions docs/source/release_notes.rst
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@ Release Notes
-------------
**Future Releases**
* Enhancements
* Added the option to set the ``sp`` parameter for ARIMA models :pr:`3597`
* Fixes
* Fixed iterative graphs not appearing in documentation :pr:`3592`
* Updated the `load_diabetes()` method to account for scikit-learn 1.1.1 changes to the dataset :pr:`3591`
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,9 @@ class ARIMARegressor(Estimator):
max_d (int): Maximum Differencing degree. Defaults to 2.
max_q (int): Maximum Moving Average order. Defaults to 5.
seasonal (boolean): Whether to fit a seasonal model to ARIMA. Defaults to True.
sp (int or str): Period for seasonal differencing, specifically the number of periods in each season. If "detect", this
model will automatically detect this parameter (provided the time series has a standard frequency) and will fall
back to 1 (no seasonality) if it cannot be detected. Defaults to 1.
n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1.
random_seed (int): Seed for the random number generator. Defaults to 0.
"""
Expand Down Expand Up @@ -65,6 +68,7 @@ def __init__(
max_d=2,
max_q=5,
seasonal=True,
sp=1,
n_jobs=-1,
random_seed=0,
maxiter=10,
Expand All @@ -83,7 +87,6 @@ def __init__(
"maxiter": maxiter,
"n_jobs": n_jobs,
}

parameters.update(kwargs)

arima_model_msg = (
Expand All @@ -93,9 +96,11 @@ def __init__(
"sktime.forecasting.arima", error_msg=arima_model_msg
)
arima_model = sktime_arima.AutoARIMA(**parameters)

parameters["use_covariates"] = use_covariates
parameters["time_index"] = time_index

self.sp = sp
self.use_covariates = use_covariates

super().__init__(
Expand Down Expand Up @@ -129,6 +134,24 @@ def _set_forecast(self, X):
fh_ = ForecastingHorizon([i + 1 for i in range(len(X))], is_relative=True)
return fh_

def _get_sp(self, X):
if X is None:
return 1
freq_mappings = {
"D": 7,
"M": 12,
"Q": 4,
}
time_index = self._parameters.get("time_index", None)
sp = self.sp
if sp == "detect":
inferred_freqs = X.ww.infer_temporal_frequencies()
freq = inferred_freqs.get(time_index, None)
sp = 1
if freq is not None:
sp = freq_mappings.get(freq[:1], 1)
return sp

def fit(self, X, y=None):
"""Fits ARIMA regressor to data.
Expand All @@ -140,14 +163,17 @@ def fit(self, X, y=None):
self
Raises:
ValueError: If X was passed to `fit` but not passed in `predict`.
ValueError: If y was not passed in.
"""
if X is not None:
X = X.fillna(X.mean())
X, y = self._manage_woodwork(X, y)
if y is None:
raise ValueError("ARIMA Regressor requires y as input.")

sp = self._get_sp(X)
self._component_obj.sp = sp

X = self._remove_datetime(X, features=True)
if X is not None:
X.ww.set_types(
Expand All @@ -158,6 +184,7 @@ def fit(self, X, y=None):
)
y = self._remove_datetime(y)
X, y = self._match_indices(X, y)

if X is not None and not X.empty and self.use_covariates:
self._component_obj.fit(y=y, X=X)
else:
Expand Down
110 changes: 94 additions & 16 deletions evalml/tests/component_tests/test_arima_regressor.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from unittest.mock import MagicMock, patch
import math
from unittest.mock import patch

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -145,6 +146,45 @@ def test_set_forecast(get_ts_X_y):
assert fh_.is_relative


def test_get_sp():
    """Check `_get_sp` for detected, undetectable, missing-index, None and explicit cases."""

    def build_dates_frame(freq):
        # 500-row frame with a single datetime column at the requested frequency.
        frame = pd.DataFrame(
            {"dates": pd.date_range("2021-01-01", periods=500, freq=freq)}
        )
        frame.ww.init()
        return frame

    # (frequency of the data, ARIMARegressor kwargs, expected seasonal period)
    scenarios = [
        # Daily data is detected as a weekly season
        ("D", {"time_index": "dates", "sp": "detect"}, 7),
        # Monthly data is detected as a yearly season
        ("M", {"time_index": "dates", "sp": "detect"}, 12),
        # Testing the case where an unknown frequency is passed
        ("2D", {"time_index": "dates", "sp": "detect"}, 1),
        # Testing the case where there is no time index given
        ("M", {"sp": "detect"}, 1),
        # Testing the case where sp is given and does not match the frequency
        ("M", {"time_index": "dates", "sp": 2}, 2),
    ]
    for freq, init_kwargs, expected_sp in scenarios:
        X = build_dates_frame(freq)
        regressor = ARIMARegressor(**init_kwargs)
        assert regressor._get_sp(X) == expected_sp

    # Testing the case where X is None
    clf_noindex = ARIMARegressor(sp="detect")
    assert clf_noindex._get_sp(None) == 1


def test_feature_importance(ts_data):
X, y = ts_data
clf = ARIMARegressor()
Expand Down Expand Up @@ -243,6 +283,27 @@ def test_fit_predict_sk_failure(
assert y_pred.index.equals(X_test.index)


def test_arima_sp_changes_result():
    """Fitting the same target with different detected `sp` values must give different predictions."""
    y = pd.Series(list(map(math.sin, range(200))))

    def fit_and_predict(freq):
        # Build a date-only feature frame at `freq`, then fit and predict on it.
        X = pd.DataFrame(
            {"dates": pd.date_range("2021-01-01", periods=200, freq=freq)}
        )
        X.ww.init()
        model = ARIMARegressor(time_index="dates", sp="detect")
        model.fit(X, y)
        return model, model.predict(X)

    clf_day, pred_d = fit_and_predict("D")
    clf_quarter, pred_q = fit_and_predict("Q")

    assert clf_day._component_obj.sp == 7
    assert clf_quarter._component_obj.sp == 4
    # The two forecasts are expected to differ, so the equality check must fail.
    with pytest.raises(AssertionError):
        pd.testing.assert_series_equal(pred_d, pred_q)


@pytest.mark.parametrize("freq_num", ["1", "2"])
@pytest.mark.parametrize("freq_str", ["T", "M", "Y"])
def test_different_time_units_out_of_sample(
Expand All @@ -265,12 +326,14 @@ def test_different_time_units_out_of_sample(
m_clf = ARIMARegressor(d=None)
m_clf.fit(X=X[:15], y=y[:15])
y_pred = m_clf.predict(X=X[15:])
assert m_clf._component_obj.d is None

assert (y_pred_sk.values == y_pred.values).all()
assert y_pred.index.equals(X[15:].index)


def test_arima_supports_boolean_features():
@patch("sktime.forecasting.arima.AutoARIMA.fit")
def test_arima_supports_boolean_features(mock_fit):
X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=10)})
X.ww.init()
X.ww["bool_1"] = (
Expand All @@ -286,34 +349,49 @@ def test_arima_supports_boolean_features():
y = pd.Series(range(10))

ar = ARIMARegressor(time_index="dates")

ar._component_obj = MagicMock()
ar.fit(X, y)

pd.testing.assert_series_equal(
ar._component_obj.fit.call_args[1]["X"]["bool_1"], X["bool_1"].astype(float)
mock_fit.call_args[1]["X"]["bool_1"], X["bool_1"].astype(float)
)
pd.testing.assert_series_equal(
ar._component_obj.fit.call_args[1]["X"]["bool_2"], X["bool_2"].astype(float)
mock_fit.call_args[1]["X"]["bool_2"], X["bool_2"].astype(float)
)


def test_arima_boolean_features_no_error():
    """Fit and predict on real (non-mocked) ARIMA with boolean features and check for NaN-free output."""
    n_rows = 100
    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=n_rows)})
    X.ww.init()

    # Two boolean columns that differ only in the sampling seed.
    for column_name, seed in (("bool_1", 0), ("bool_2", 1)):
        sampled = pd.Series([True, False]).sample(
            n=n_rows, replace=True, random_state=seed
        )
        X.ww[column_name] = sampled.reset_index(drop=True)

    y = pd.Series(range(n_rows))

    # Test that non-mocked predict does not error or produce NaNs
    ar = ARIMARegressor(time_index="dates")
    ar.fit(X, y)
    preds = ar.predict(X)
    assert not preds.isna().any()


def test_arima_regressor_respects_use_covariates(ts_data):
@patch("sktime.forecasting.arima.AutoARIMA.fit")
@patch("sktime.forecasting.arima.AutoARIMA.predict")
def test_arima_regressor_respects_use_covariates(mock_predict, mock_fit, ts_data):
X, y = ts_data
X_train, y_train = X.iloc[:25], y.iloc[:25]
X_test, _ = X.iloc[25:], y.iloc[25:]
clf = ARIMARegressor(use_covariates=False)
with patch.object(clf, "_component_obj") as mock_obj:
clf.fit(X_train, y_train)
clf.predict(X_test)
mock_obj.fit.assert_called_once()
assert "X" not in mock_obj.fit.call_args.kwargs
assert "y" in mock_obj.fit.call_args.kwargs
mock_obj.predict.assert_called_once()
assert "X" not in mock_obj.predict.call_args.kwargs

clf.fit(X_train, y_train)
clf.predict(X_test)
mock_fit.assert_called_once()
assert "X" not in mock_fit.call_args.kwargs
assert "y" in mock_fit.call_args.kwargs
mock_predict.assert_called_once()
assert "X" not in mock_predict.call_args.kwargs

0 comments on commit a303470

Please sign in to comment.