
Regularization search #1001

Merged (6 commits) on Nov 2, 2022
Changes from 3 commits
2 changes: 1 addition & 1 deletion CHANGELOG.md
@@ -14,7 +14,7 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
- Add `plot_change_points_interactive` ([#988](https://github.com/tinkoff-ai/etna/pull/988))
- Add `experimental` module with `TimeSeriesBinaryClassifier` and `PredictabilityAnalyzer` ([#985](https://github.com/tinkoff-ai/etna/pull/985))
- Inference track results: add `predict` method to pipelines, teach some models to work with context, change hierarchy of base models, update notebook examples ([#979](https://github.com/tinkoff-ai/etna/pull/979))
-
- Add `get_ruptures_regularization` in `experimental` module ([#1001](https://github.com/tinkoff-ai/etna/pull/1001))
Contributor:

Maybe "in" -> "into"?

-
### Changed
-
1 change: 1 addition & 0 deletions etna/experimental/change_points/__init__.py
@@ -0,0 +1 @@
from etna.experimental.change_points.regularization_search import get_ruptures_regularization
178 changes: 178 additions & 0 deletions etna/experimental/change_points/regularization_search.py
@@ -0,0 +1,178 @@
from enum import Enum
from typing import Dict
from typing import Tuple
from typing import Union

import numpy as np
import pandas as pd
from ruptures.base import BaseEstimator
from ruptures.costs import CostLinear

from etna.datasets import TSDataset


class OptimizationMode(str, Enum):
"""Enum for different optimization modes."""

pen = "pen"
epsilon = "epsilon"

@classmethod
def _missing_(cls, value):
raise NotImplementedError(
f"{value} is not a valid {cls.__name__}. Only {', '.join([repr(m.value) for m in cls])} modes allowed"
)


def _get_n_bkps(series: pd.Series, change_point_model: BaseEstimator, **model_predict_params) -> int:
"""Get number of change points, detected with given params.
Parameters
----------
series:
series to detect change points
change_point_model:
model to get trend change points

Returns
-------
:
number of change points
"""
signal = series.to_numpy()
if isinstance(change_point_model.cost, CostLinear):
signal = signal.reshape((-1, 1))

change_point_model.fit(signal=signal)

change_points_indices = change_point_model.predict(**model_predict_params)[:-1]
return len(change_points_indices)
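
For context, a minimal sketch (not part of this diff; it uses only the public ruptures API on a made-up series) of what this helper counts:

```python
import numpy as np
import pandas as pd
from ruptures import Binseg

# Made-up series with a single obvious level shift at index 50.
series = pd.Series(np.r_[np.zeros(50), np.full(50, 5.0)])

model = Binseg()
model.fit(series.to_numpy())
# predict() returns the detected change point indices plus the last index of the
# signal, hence the [:-1] above when counting actual change points.
print(model.predict(pen=10))             # expected: [50, 100]
print(len(model.predict(pen=10)[:-1]))   # expected: 1 change point
```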


def _get_next_value(
now_value: float, lower_bound: float, upper_bound: float, need_greater: bool
) -> Tuple[float, float, float]:
"""Give next value according to binary search.
Parameters
----------
now_value:
current value
lower_bound:
lower bound for search
upper_bound:
upper bound for search
need_greater:
True if we need a greater number of change points than on the previous iteration

Returns
-------
:
next value and its bounds
"""
if need_greater:
return np.mean([now_value, lower_bound]), lower_bound, now_value
else:
return np.mean([now_value, upper_bound]), now_value, upper_bound
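
A tiny worked example of this step (values are illustrative): when too few change points were found (need_greater=True), the search moves toward the lower bound, since smaller regularization yields more change points; otherwise it moves toward the upper bound.

```python
# need_greater=True: halve toward the lower bound; the old value becomes the new upper bound.
_get_next_value(now_value=50.0, lower_bound=0.0, upper_bound=100.0, need_greater=True)
# -> (25.0, 0.0, 50.0)

# need_greater=False: halve toward the upper bound; the old value becomes the new lower bound.
_get_next_value(now_value=50.0, lower_bound=0.0, upper_bound=100.0, need_greater=False)
# -> (75.0, 50.0, 100.0)
```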


def bin_search(
series: pd.Series,
change_point_model: BaseEstimator,
n_bkps: int,
opt_param: str,
max_value: float,
max_iters: int = 200,
) -> float:
"""Run binary search for optimal regularizations.
Parameters
----------
series:
series for search
change_point_model:
model to get trend change points
n_bkps:
target number of change points
opt_param:
parameter for optimization
max_value:
maximum possible value, the upper bound for search
max_iters:
maximum number of iterations; if the required number of change points is unattainable, the value found after max_iters iterations is returned

Returns
-------
:
regularization parameter value
"""
zero_param = _get_n_bkps(series, change_point_model, **{opt_param: 0})
max_param = _get_n_bkps(series, change_point_model, **{opt_param: max_value})
if zero_param < n_bkps:
raise ValueError("Impossible number of changepoints. Please, decrease n_bkps value.")
if n_bkps < max_param:
raise ValueError("Impossible number of changepoints. Please, increase max_value or increase n_bkps value.")

lower_bound, upper_bound = 0.0, max_value
now_value = np.mean([lower_bound, upper_bound])
now_n_bkps = _get_n_bkps(series, change_point_model, **{opt_param: now_value})
iters = 0

while now_n_bkps != n_bkps and iters < max_iters:
need_greater = now_n_bkps < n_bkps
now_value, lower_bound, upper_bound = _get_next_value(now_value, lower_bound, upper_bound, need_greater)
now_n_bkps = _get_n_bkps(series, change_point_model, **{opt_param: now_value})
iters += 1
return now_value
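
As a usage sketch (not part of this diff; the synthetic series and parameter values are made up), searching for a pen value that yields exactly two change points:

```python
import numpy as np
import pandas as pd
from ruptures import Binseg

from etna.experimental.change_points.regularization_search import bin_search

# Made-up series with two level shifts (at indices 40 and 80) plus small noise.
np.random.seed(0)
values = np.r_[np.zeros(40), np.full(40, 5.0), np.full(40, -3.0)]
series = pd.Series(values + np.random.normal(0, 0.1, size=120))

pen = bin_search(series, Binseg(), n_bkps=2, opt_param="pen", max_value=10000)

# Check with ruptures directly: this penalty is expected to give 2 change points.
model = Binseg().fit(series.to_numpy())
print(len(model.predict(pen=pen)) - 1)
```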


def get_ruptures_regularization(
ts: TSDataset,
in_column: str,
change_point_model: BaseEstimator,
n_bkps: Union[Dict[str, int], int],
mode: OptimizationMode,
max_value: float = 10000,
max_iters: int = 200,
) -> Dict[str, Dict[str, float]]:
"""Get regularization parameter values for given number of changepoints.
It is assumed that the number of change points decreases as the regularization parameter increases.

Parameters
----------
ts:
Dataset with timeseries data
in_column:
name of processed column
change_point_model:
model to get trend change points
n_bkps:
target number of change points for each segment (or a single value for all segments)
mode:
optimization mode
max_value:
maximum possible value, the upper bound for search
max_iters:
maximum number of iterations; if the required number of change points is unattainable, the value found after max_iters iterations is returned

Returns
-------
:
regularization parameter values in the format {segment: {mode: value}}
Contributor:

Maybe we should somehow explain how this format works.


Raises
------
ValueError:
If the required number of change points cannot be reached with the given max_value
"""
mode = OptimizationMode(mode)
df = ts.to_pandas()
segments = df.columns.get_level_values(0).unique()

if isinstance(n_bkps, int):
n_bkps = dict(zip(segments, [n_bkps] * len(segments)))

regularization = {}
for segment in segments:
series = ts[:, segment, in_column]
regularization[segment] = {
mode.value: bin_search(series, change_point_model, n_bkps[segment], mode, max_value, max_iters)
}
return regularization
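
A usage sketch for the public function (not part of this diff; it reuses the toy dataset construction and per-segment targets from the tests below):

```python
from ruptures import Binseg

from etna.datasets import TSDataset, generate_ar_df
from etna.experimental.change_points import get_ruptures_regularization

# Toy dataset, built the same way as in the tests below.
df = generate_ar_df(periods=125, start_time="2021-05-20", n_segments=3, freq="D", random_seed=42)
ts = TSDataset(TSDataset.to_dataset(df), freq="D")

res = get_ruptures_regularization(
    ts,
    in_column="target",
    change_point_model=Binseg(),
    n_bkps={"segment_0": 3, "segment_1": 14, "segment_2": 19},
    mode="pen",
)
# Expected result format: {"segment_0": {"pen": <value>}, "segment_1": {"pen": <value>}, ...}
```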
Empty file.
@@ -0,0 +1,88 @@
import pytest
from ruptures import Binseg

from etna.datasets import TSDataset
from etna.datasets import generate_ar_df
from etna.experimental.change_points import get_ruptures_regularization
from etna.experimental.change_points.regularization_search import _get_n_bkps


@pytest.fixture
def simple_change_points_ts():
df = generate_ar_df(periods=125, start_time="2021-05-20", n_segments=3, freq="D", random_seed=42)
df_ts_format = TSDataset.to_dataset(df)
return TSDataset(df_ts_format, freq="D")


@pytest.mark.parametrize(
"segment,params,expected",
(
("segment_0", {"pen": 20}, 6),
("segment_0", {"epsilon": 20}, 24),
("segment_1", {"pen": 10}, 7),
("segment_1", {"epsilon": 100}, 12),
("segment_2", {"pen": 2}, 14),
("segment_2", {"epsilon": 200}, 6),
),
)
def test_get_n_bkps(segment, params, expected, simple_change_points_ts):
series = simple_change_points_ts[:, segment, "target"]
assert _get_n_bkps(series, Binseg(), **params) == expected


@pytest.mark.parametrize(
"n_bkps,mode",
(
({"segment_0": 3, "segment_1": 14, "segment_2": 19}, "pen"),
({"segment_0": 5, "segment_1": 2, "segment_2": 8}, "epsilon"),
({"segment_0": 11, "segment_1": 18, "segment_2": 4}, "pen"),
({"segment_0": 18, "segment_1": 21, "segment_2": 7}, "epsilon"),
),
)
def test_get_regularization(n_bkps, mode, simple_change_points_ts):
in_column = "target"
res = get_ruptures_regularization(
simple_change_points_ts, in_column=in_column, change_point_model=Binseg(), n_bkps=n_bkps, mode=mode
)
assert sorted(res.keys()) == sorted(simple_change_points_ts.to_pandas().columns.get_level_values(0).unique())
for seg in res.keys():
series = simple_change_points_ts[:, seg, in_column]
answer = _get_n_bkps(series, Binseg(), **{mode: res[seg][mode]})
assert answer == n_bkps[seg]


@pytest.mark.parametrize(
"n_bkps,mode",
(
({"segment_0": 3, "segment_1": 34, "segment_2": 19}, "pen"),
({"segment_0": 45, "segment_1": 2, "segment_2": 8}, "epsilon"),
),
)
def test_fail_get_regularization_high(n_bkps, mode, simple_change_points_ts):
in_column = "target"
with pytest.raises(ValueError, match="Impossible number of changepoints. Please, decrease n_bkps value."):
_ = get_ruptures_regularization(
simple_change_points_ts, in_column=in_column, change_point_model=Binseg(), n_bkps=n_bkps, mode=mode
)


@pytest.mark.parametrize(
"n_bkps,mode",
(
({"segment_0": 3, "segment_1": 1, "segment_2": 19}, "pen"),
({"segment_0": 1, "segment_1": 2, "segment_2": 8}, "epsilon"),
),
)
def test_fail_get_regularization_low(n_bkps, mode, simple_change_points_ts):
in_column = "target"
with pytest.raises(
ValueError, match="Impossible number of changepoints. Please, increase max_value or increase n_bkps value."
):
_ = get_ruptures_regularization(
simple_change_points_ts,
in_column=in_column,
change_point_model=Binseg(),
n_bkps=n_bkps,
mode=mode,
max_value=1,
)