Add the sp parameter to ARIMA models (#3597)

* Move component_obj init to fit, add sp param
* New tests for sp parameter
* Return default sp to 1
alteryx · Jul 8, 2022 · a303470 · a303470
1 parent 26896e9
commit a303470
Show file tree

Hide file tree

Showing 3 changed files with 124 additions and 18 deletions.
diff --git a/docs/source/release_notes.rst b/docs/source/release_notes.rst
@@ -2,6 +2,7 @@ Release Notes
 -------------
 **Future Releases**
     * Enhancements
+        * Added the option to set the ``sp`` parameter for ARIMA models :pr:`3597`
     * Fixes
         * Fixed iterative graphs not appearing in documentation :pr:`3592`
         * Updated the `load_diabetes()` method to account for scikit-learn 1.1.1 changes to the dataset :pr:`3591`

diff --git a/evalml/pipelines/components/estimators/regressors/arima_regressor.py b/evalml/pipelines/components/estimators/regressors/arima_regressor.py
@@ -26,6 +26,9 @@ class ARIMARegressor(Estimator):
         max_d (int): Maximum Differencing degree. Defaults to 2.
         max_q (int): Maximum Moving Average order. Defaults to 5.
         seasonal (boolean): Whether to fit a seasonal model to ARIMA. Defaults to True.
+        sp (int or str): Period for seasonal differencing, i.e. the number of periods in each season. If "detect", the
+            model infers this value from the frequency of the ``time_index`` column (daily, monthly, or quarterly data)
+            and falls back to 1 (no seasonality) if it cannot be detected. Defaults to 1.
         n_jobs (int or None): Non-negative integer describing level of parallelism used for pipelines. Defaults to -1.
         random_seed (int): Seed for the random number generator. Defaults to 0.
     """
@@ -65,6 +68,7 @@ def __init__(
         max_d=2,
         max_q=5,
         seasonal=True,
+        sp=1,
         n_jobs=-1,
         random_seed=0,
         maxiter=10,
@@ -83,7 +87,6 @@ def __init__(
             "maxiter": maxiter,
             "n_jobs": n_jobs,
         }
-
         parameters.update(kwargs)
 
         arima_model_msg = (
@@ -93,9 +96,11 @@ def __init__(
             "sktime.forecasting.arima", error_msg=arima_model_msg
         )
         arima_model = sktime_arima.AutoARIMA(**parameters)
+
         parameters["use_covariates"] = use_covariates
         parameters["time_index"] = time_index
 
+        self.sp = sp
         self.use_covariates = use_covariates
 
         super().__init__(
@@ -129,6 +134,24 @@ def _set_forecast(self, X):
         fh_ = ForecastingHorizon([i + 1 for i in range(len(X))], is_relative=True)
         return fh_
 
+    def _get_sp(self, X):
+        if X is None:
+            return 1
+        freq_mappings = {
+            "D": 7,
+            "M": 12,
+            "Q": 4,
+        }
+        time_index = self._parameters.get("time_index", None)
+        sp = self.sp
+        if sp == "detect":
+            inferred_freqs = X.ww.infer_temporal_frequencies()
+            freq = inferred_freqs.get(time_index, None)
+            sp = 1
+            if freq is not None:
+                sp = freq_mappings.get(freq[:1], 1)
+        return sp
+
     def fit(self, X, y=None):
         """Fits ARIMA regressor to data.
 
@@ -140,14 +163,17 @@ def fit(self, X, y=None):
             self
 
         Raises:
-            ValueError: If X was passed to `fit` but not passed in `predict`.
+            ValueError: If y was not passed in.
         """
         if X is not None:
             X = X.fillna(X.mean())
         X, y = self._manage_woodwork(X, y)
         if y is None:
             raise ValueError("ARIMA Regressor requires y as input.")
 
+        sp = self._get_sp(X)
+        self._component_obj.sp = sp
+
         X = self._remove_datetime(X, features=True)
         if X is not None:
             X.ww.set_types(
@@ -158,6 +184,7 @@ def fit(self, X, y=None):
             )
         y = self._remove_datetime(y)
         X, y = self._match_indices(X, y)
+
         if X is not None and not X.empty and self.use_covariates:
             self._component_obj.fit(y=y, X=X)
         else:

diff --git a/evalml/tests/component_tests/test_arima_regressor.py b/evalml/tests/component_tests/test_arima_regressor.py
@@ -1,4 +1,5 @@
-from unittest.mock import MagicMock, patch
+import math
+from unittest.mock import patch
 
 import numpy as np
 import pandas as pd
@@ -145,6 +146,45 @@ def test_set_forecast(get_ts_X_y):
     assert fh_.is_relative
 
 
+def test_get_sp():
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=500, freq="D")})
+    X.ww.init()
+    clf_day = ARIMARegressor(time_index="dates", sp="detect")
+    sp_ = clf_day._get_sp(X)
+    assert sp_ == 7
+
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=500, freq="M")})
+    X.ww.init()
+    clf_month = ARIMARegressor(time_index="dates", sp="detect")
+    sp_ = clf_month._get_sp(X)
+    assert sp_ == 12
+
+    # Testing the case where an unknown frequency is passed
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=500, freq="2D")})
+    X.ww.init()
+    clf_month = ARIMARegressor(time_index="dates", sp="detect")
+    sp_ = clf_month._get_sp(X)
+    assert sp_ == 1
+
+    # Testing the case where there is no time index given
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=500, freq="M")})
+    X.ww.init()
+    clf_noindex = ARIMARegressor(sp="detect")
+    sp_ = clf_noindex._get_sp(X)
+    assert sp_ == 1
+
+    # Testing the case where X is None
+    sp_ = clf_noindex._get_sp(None)
+    assert sp_ == 1
+
+    # Testing the case where sp is given and does not match the frequency
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=500, freq="M")})
+    X.ww.init()
+    clf_month = ARIMARegressor(time_index="dates", sp=2)
+    sp_ = clf_month._get_sp(X)
+    assert sp_ == 2
+
+
 def test_feature_importance(ts_data):
     X, y = ts_data
     clf = ARIMARegressor()
@@ -243,6 +283,27 @@ def test_fit_predict_sk_failure(
     assert y_pred.index.equals(X_test.index)
 
 
+def test_arima_sp_changes_result():
+    y = pd.Series([math.sin(i) for i in range(200)])
+
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=200, freq="D")})
+    X.ww.init()
+    clf_day = ARIMARegressor(time_index="dates", sp="detect")
+    clf_day.fit(X, y)
+    pred_d = clf_day.predict(X)
+
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=200, freq="Q")})
+    X.ww.init()
+    clf_quarter = ARIMARegressor(time_index="dates", sp="detect")
+    clf_quarter.fit(X, y)
+    pred_q = clf_quarter.predict(X)
+
+    assert clf_day._component_obj.sp == 7
+    assert clf_quarter._component_obj.sp == 4
+    with pytest.raises(AssertionError):
+        pd.testing.assert_series_equal(pred_d, pred_q)
+
+
 @pytest.mark.parametrize("freq_num", ["1", "2"])
 @pytest.mark.parametrize("freq_str", ["T", "M", "Y"])
 def test_different_time_units_out_of_sample(
@@ -265,12 +326,14 @@ def test_different_time_units_out_of_sample(
     m_clf = ARIMARegressor(d=None)
     m_clf.fit(X=X[:15], y=y[:15])
     y_pred = m_clf.predict(X=X[15:])
+    assert m_clf._component_obj.d is None
 
     assert (y_pred_sk.values == y_pred.values).all()
     assert y_pred.index.equals(X[15:].index)
 
 
-def test_arima_supports_boolean_features():
+@patch("sktime.forecasting.arima.AutoARIMA.fit")
+def test_arima_supports_boolean_features(mock_fit):
     X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=10)})
     X.ww.init()
     X.ww["bool_1"] = (
@@ -286,34 +349,49 @@ def test_arima_supports_boolean_features():
     y = pd.Series(range(10))
 
     ar = ARIMARegressor(time_index="dates")
-
-    ar._component_obj = MagicMock()
     ar.fit(X, y)
 
     pd.testing.assert_series_equal(
-        ar._component_obj.fit.call_args[1]["X"]["bool_1"], X["bool_1"].astype(float)
+        mock_fit.call_args[1]["X"]["bool_1"], X["bool_1"].astype(float)
     )
     pd.testing.assert_series_equal(
-        ar._component_obj.fit.call_args[1]["X"]["bool_2"], X["bool_2"].astype(float)
+        mock_fit.call_args[1]["X"]["bool_2"], X["bool_2"].astype(float)
+    )
+
+
+def test_arima_boolean_features_no_error():
+    X = pd.DataFrame({"dates": pd.date_range("2021-01-01", periods=100)})
+    X.ww.init()
+    X.ww["bool_1"] = (
+        pd.Series([True, False])
+        .sample(n=100, replace=True, random_state=0)
+        .reset_index(drop=True)
+    )
+    X.ww["bool_2"] = (
+        pd.Series([True, False])
+        .sample(n=100, replace=True, random_state=1)
+        .reset_index(drop=True)
     )
+    y = pd.Series(range(100))
 
-    # Test that non-mocked predict does not error or produce NaNs
     ar = ARIMARegressor(time_index="dates")
     ar.fit(X, y)
     preds = ar.predict(X)
     assert not preds.isna().any()
 
 
-def test_arima_regressor_respects_use_covariates(ts_data):
+@patch("sktime.forecasting.arima.AutoARIMA.fit")
+@patch("sktime.forecasting.arima.AutoARIMA.predict")
+def test_arima_regressor_respects_use_covariates(mock_predict, mock_fit, ts_data):
     X, y = ts_data
     X_train, y_train = X.iloc[:25], y.iloc[:25]
     X_test, _ = X.iloc[25:], y.iloc[25:]
     clf = ARIMARegressor(use_covariates=False)
-    with patch.object(clf, "_component_obj") as mock_obj:
-        clf.fit(X_train, y_train)
-        clf.predict(X_test)
-        mock_obj.fit.assert_called_once()
-        assert "X" not in mock_obj.fit.call_args.kwargs
-        assert "y" in mock_obj.fit.call_args.kwargs
-        mock_obj.predict.assert_called_once()
-        assert "X" not in mock_obj.predict.call_args.kwargs
+
+    clf.fit(X_train, y_train)
+    clf.predict(X_test)
+    mock_fit.assert_called_once()
+    assert "X" not in mock_fit.call_args.kwargs
+    assert "y" in mock_fit.call_args.kwargs
+    mock_predict.assert_called_once()
+    assert "X" not in mock_predict.call_args.kwargs