diff --git a/CHANGELOG.md b/CHANGELOG.md index 53efb2488..0d85b0463 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -10,7 +10,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ### Added - - -- +- `DeadlineMovingAverageModel` ([#827](https://github.com/tinkoff-ai/etna/pull/827)) - `DirectEnsemble` ([#824](https://github.com/tinkoff-ai/etna/pull/824)) - - @@ -494,4 +494,4 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Distribution plot - Anomalies (Outliers) plot - Backtest (CrossValidation) plot - - Forecast plot + - Forecast plot \ No newline at end of file diff --git a/etna/models/__init__.py b/etna/models/__init__.py index 73d43aacb..17a804329 100644 --- a/etna/models/__init__.py +++ b/etna/models/__init__.py @@ -8,6 +8,7 @@ from etna.models.catboost import CatBoostModelPerSegment from etna.models.catboost import CatBoostMultiSegmentModel from etna.models.catboost import CatBoostPerSegmentModel +from etna.models.deadline_ma import DeadlineMovingAverageModel from etna.models.holt_winters import HoltModel from etna.models.holt_winters import HoltWintersModel from etna.models.holt_winters import SimpleExpSmoothingModel diff --git a/etna/models/deadline_ma.py b/etna/models/deadline_ma.py new file mode 100644 index 000000000..eae606869 --- /dev/null +++ b/etna/models/deadline_ma.py @@ -0,0 +1,169 @@ +import warnings +from enum import Enum +from typing import Dict +from typing import List + +import numpy as np +import pandas as pd + +from etna.models.base import PerSegmentModel + + +class SeasonalityMode(Enum): + """Enum for seasonality mode for DeadlineMovingAverageModel.""" + + month = "month" + year = "year" + + @classmethod + def _missing_(cls, value): + raise NotImplementedError( + f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} seasonality allowed" + ) + + +class _DeadlineMovingAverageModel: + """Moving average model that uses exact previous dates to predict.""" + + def __init__(self, window: int = 3, seasonality: str = "month"): + """ + Initialize deadline moving average model. + + Length of remembered tail of series is equal to the number of ``window`` months or years, depending on the ``seasonality``. + + Parameters + ---------- + window: int + Number of values taken for forecast for each point. + seasonality: str + Only allowed monthly or annual seasonality. + """ + self.name = "target" + self.window = window + self.seasonality = SeasonalityMode(seasonality) + self.freqs_available = {"H", "D"} + + def fit(self, df: pd.DataFrame, regressors: List[str]) -> "_DeadlineMovingAverageModel": + """ + Fit DeadlineMovingAverageModel model. + + Parameters + ---------- + df: pd.DataFrame + Data to fit on + regressors: + List of the columns with regressors(ignored in this model) + + Raises + ------ + ValueError + If freq of dataframe is not supported + ValueError + If series is too short for chosen shift value + + Returns + ------- + : + Fitted model + """ + freq = pd.infer_freq(df["timestamp"]) + if freq not in self.freqs_available: + raise ValueError(f"{freq} is not supported! Use daily or hourly frequency!") + + if set(df.columns) != {"timestamp", "target"}: + warnings.warn( + message=f"{type(self).__name__} does not work with any exogenous series or features. " + f"It uses only target series for predict/\n " + ) + targets = df["target"] + timestamps = df["timestamp"] + + if self.seasonality == SeasonalityMode.month: + first_index = timestamps.iloc[-1] - pd.DateOffset(months=self.window) + + elif self.seasonality == SeasonalityMode.year: + first_index = timestamps.iloc[-1] - pd.DateOffset(years=self.window) + + if first_index < timestamps.iloc[0]: + raise ValueError( + "Given series is too short for chosen shift value. Try lower shift value, or give" "longer series." + ) + + self.series = targets.loc[timestamps >= first_index] + self.timestamps = timestamps.loc[timestamps >= first_index] + self.shift = len(self.series) + + return self + + def predict(self, df: pd.DataFrame) -> np.ndarray: + """ + Compute predictions from a DeadlineMovingAverageModel. + + Parameters + ---------- + df: pd.DataFrame + Used only for getting the horizon of forecast and timestamps. + + Returns + ------- + : + Array with predictions. + """ + timestamps = df["timestamp"] + index = pd.date_range(start=self.timestamps.iloc[0], end=timestamps.iloc[-1]) + res = np.append(self.series.values, np.zeros(len(df))) + res = pd.DataFrame(res) + res.index = index + for i in range(len(self.series), len(res)): + for w in range(1, self.window + 1): + if self.seasonality == SeasonalityMode.month: + prev_date = res.index[i] - pd.DateOffset(months=w) + + elif self.seasonality == SeasonalityMode.year: + prev_date = res.index[i] - pd.DateOffset(years=w) + if prev_date <= self.timestamps.iloc[-1]: + res.loc[index[i]] += self.series.loc[self.timestamps == prev_date].values + else: + res.loc[index[i]] += res.loc[prev_date].values + + res.loc[index[i]] = res.loc[index[i]] / self.window + + res = res.values.reshape( + len(res), + ) + + return res[-len(df) :] + + +class DeadlineMovingAverageModel(PerSegmentModel): + """Moving average model that uses exact previous dates to predict.""" + + def __init__(self, window: int = 3, seasonality: str = "month"): + """ + Initialize deadline moving average model. + + Parameters + ---------- + window: int + Number of values taken for forecast for each point. + seasonality: str + Only allowed monthly or annual seasonality. + """ + self.window = window + self.seasonality = seasonality + super(DeadlineMovingAverageModel, self).__init__( + base_model=_DeadlineMovingAverageModel(window=window, seasonality=seasonality) + ) + + def get_model(self) -> Dict[str, "DeadlineMovingAverageModel"]: + """Get internal model. + + Returns + ------- + : + Internal model + """ + return self._get_model() + + +__all__ = ["DeadlineMovingAverageModel"] diff --git a/tests/test_models/test_simple_models.py b/tests/test_models/test_simple_models.py index 86489500c..a9e0fd80b 100644 --- a/tests/test_models/test_simple_models.py +++ b/tests/test_models/test_simple_models.py @@ -2,6 +2,10 @@ import pandas as pd import pytest +from etna.datasets import TSDataset +from etna.metrics import MAE +from etna.models.deadline_ma import DeadlineMovingAverageModel +from etna.models.deadline_ma import _DeadlineMovingAverageModel from etna.models.moving_average import MovingAverageModel from etna.models.naive import NaiveModel from etna.models.seasonal_ma import SeasonalMovingAverageModel @@ -9,6 +13,28 @@ from etna.pipeline import Pipeline +@pytest.fixture() +def df(): + """Generate dataset with simple values without any noise""" + history = 140 + + df1 = pd.DataFrame() + df1["target"] = np.arange(history) + df1["segment"] = "A" + df1["timestamp"] = pd.date_range(start="2020-01-01", periods=history) + + df2 = pd.DataFrame() + df2["target"] = [0, 2, 4, 6, 8, 10, 12] * 20 + df2["segment"] = "B" + df2["timestamp"] = pd.date_range(start="2020-01-01", periods=history) + + df = pd.concat([df1, df2]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="1d") + + return tsds + + @pytest.mark.parametrize("model", [SeasonalMovingAverageModel, NaiveModel, MovingAverageModel]) def test_simple_model_forecaster_run(simple_df, model): sma_model = model() @@ -20,6 +46,17 @@ def test_simple_model_forecaster_run(simple_df, model): assert len(res) == 14 +@pytest.mark.parametrize("model", [DeadlineMovingAverageModel]) +def test_deadline_model_forecaster_run(simple_df, model): + model = model(window=1) + model.fit(simple_df) + future_ts = simple_df.make_future(future_steps=7) + res = model.forecast(future_ts) + res = res.to_pandas(flatten=True) + assert not res.isnull().values.any() + assert len(res) == 14 + + def test_seasonal_moving_average_forecaster_correct(simple_df): model = SeasonalMovingAverageModel(window=3, seasonality=7) model.fit(simple_df) @@ -97,13 +134,78 @@ def test_moving_average_forecaster_correct(simple_df): assert np.all(res.values == answer.values) +def test_deadline_moving_average_forecaster_correct(df): + model = DeadlineMovingAverageModel(window=3, seasonality="month") + model.fit(df) + future_ts = df.make_future(future_steps=20) + res = model.forecast(future_ts) + res = res.to_pandas(flatten=True)[["target", "segment", "timestamp"]] + + df1 = pd.DataFrame() + df1["target"] = np.array( + [ + 79 + 2 / 3, + 80 + 2 / 3, + 81 + 2 / 3, + 82 + 2 / 3, + 83 + 2 / 3, + 84 + 2 / 3, + 85 + 2 / 3, + 86 + 2 / 3, + 87 + 2 / 3, + 88 + 2 / 3, + 89 + 1 / 3, + 89 + 2 / 3, + 90 + 2 / 3, + 91 + 2 / 3, + 92 + 2 / 3, + 93 + 2 / 3, + 94.0 + 2 / 3, + 95.0 + 2 / 3, + 96.0 + 2 / 3, + 97 + 2 / 3, + ] + ) + df1["segment"] = "A" + df1["timestamp"] = pd.date_range(start="2020-05-20", periods=20) + + df2 = pd.DataFrame() + df2["target"] = np.array( + [ + 5.0 + 1 / 3, + 7.0 + 1 / 3, + 4.0 + 2 / 3, + 6.0 + 2 / 3, + 8.0 + 2 / 3, + 6.0, + 3.0 + 1 / 3, + 5.0 + 1 / 3, + 7.0 + 1 / 3, + 4.0 + 2 / 3, + 6.0, + 6.0 + 2 / 3, + 4.0, + 6.0, + 8.0, + 5.0 + 1 / 3, + 7.0 + 1 / 3, + 4.0 + 2 / 3, + 6.0 + 2 / 3, + 4.0, + ] + ) + df2["segment"] = "B" + df2["timestamp"] = pd.date_range(start="2020-05-20", periods=20) + + answer = pd.concat([df2, df1], axis=0, ignore_index=True) + res = res.sort_values(by=["segment", "timestamp"]) + answer = answer.sort_values(by=["segment", "timestamp"]) + assert np.all(res.values == answer.values) + + @pytest.mark.parametrize( "etna_model_class", - ( - SeasonalMovingAverageModel, - MovingAverageModel, - NaiveModel, - ), + (SeasonalMovingAverageModel, MovingAverageModel, NaiveModel, DeadlineMovingAverageModel), ) def test_get_model_before_training(etna_model_class): """Check that get_model method throws an error if per-segment model is not fitted yet.""" @@ -118,6 +220,7 @@ def test_get_model_before_training(etna_model_class): (NaiveModel, _SeasonalMovingAverageModel), (SeasonalMovingAverageModel, _SeasonalMovingAverageModel), (MovingAverageModel, _SeasonalMovingAverageModel), + (DeadlineMovingAverageModel, _DeadlineMovingAverageModel), ), ) def test_get_model_after_training(example_tsds, etna_model_class, expected_class): @@ -128,3 +231,145 @@ def test_get_model_after_training(example_tsds, etna_model_class, expected_class assert isinstance(models_dict, dict) for segment in example_tsds.segments: assert isinstance(models_dict[segment], expected_class) + + +@pytest.fixture +def big_ts() -> TSDataset: + np.random.seed(42) + periods = 1000 + df1 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df1["segment"] = "segment_1" + df1["target"] = np.random.uniform(10, 20, size=periods) + + df2 = pd.DataFrame({"timestamp": pd.date_range("2020-01-01", periods=periods)}) + df2["segment"] = "segment_2" + df2["target"] = np.random.uniform(-15, 5, size=periods) + + df = pd.concat([df1, df2]).reset_index(drop=True) + df = TSDataset.to_dataset(df) + tsds = TSDataset(df, freq="D") + + return tsds + + +def test_pipeline_with_deadline_model(big_ts): + model = DeadlineMovingAverageModel(window=5, seasonality="month") + pipeline = Pipeline(model=model, horizon=200) + metrics, forecast, _ = pipeline.backtest(ts=big_ts, metrics=[MAE()], n_folds=3) + assert not forecast.isnull().values.any() + + +@pytest.fixture() +def two_month_ts(): + history = 61 + + df1 = pd.DataFrame() + df1["target"] = np.arange(history) + df1["segment"] = "A" + df1["timestamp"] = pd.date_range(start="2020-01-01", periods=history) + + df = TSDataset.to_dataset(df1) + tsds = TSDataset(df, freq="1d") + return tsds + + +def test_deadline_model_correct_with_big_horizons(two_month_ts): + model = DeadlineMovingAverageModel(window=2, seasonality="month") + model.fit(two_month_ts) + future_ts = two_month_ts.make_future(future_steps=90) + res = model.forecast(future_ts) + expected = np.array( + [ + [16.5], + [17.5], + [18.5], + [19.5], + [20.5], + [21.5], + [22.5], + [23.5], + [24.5], + [25.5], + [26.5], + [27.5], + [28.5], + [29.5], + [30.5], + [31.5], + [32.5], + [33.5], + [34.5], + [35.5], + [36.5], + [37.5], + [38.5], + [39.5], + [40.5], + [41.5], + [42.5], + [43.5], + [44.0], + [44.5], + [45.5], + [24.25], + [25.25], + [26.25], + [27.25], + [28.25], + [29.25], + [30.25], + [31.25], + [32.25], + [33.25], + [34.25], + [35.25], + [36.25], + [37.25], + [38.25], + [39.25], + [40.25], + [41.25], + [42.25], + [43.25], + [44.25], + [45.25], + [46.25], + [47.25], + [48.25], + [49.25], + [50.25], + [51.25], + [51.5], + [52.75], + [20.375], + [21.375], + [22.375], + [23.375], + [24.375], + [25.375], + [26.375], + [27.375], + [28.375], + [29.375], + [30.375], + [31.375], + [32.375], + [33.375], + [34.375], + [35.375], + [36.375], + [37.375], + [38.375], + [39.375], + [40.375], + [41.375], + [42.375], + [43.375], + [44.375], + [45.375], + [46.375], + [47.375], + [47.75], + ] + ) + assert np.all(res.df.values == expected)