From 8796da36c8669bfb6551bb5c16f747b6d427c351 Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Thu, 12 Dec 2024 18:17:56 -0800 Subject: [PATCH 1/2] Create noise parameter selector and tests --- src/seer/anomaly_detection/accessors.py | 9 ++- .../anomaly_detection/anomaly_detection.py | 9 ++- .../anomaly_detection/anomaly_detection_di.py | 6 ++ .../anomaly_detection/detectors/__init__.py | 3 + .../detectors/anomaly_detectors.py | 18 +++++- .../anomaly_detection/detectors/mp_scorers.py | 6 +- .../detectors/noise_reducers.py | 54 ++++++++++++++++++ .../detectors/test_noise_reducers.py | 56 +++++++++++++++++++ 8 files changed, 154 insertions(+), 7 deletions(-) create mode 100644 src/seer/anomaly_detection/detectors/noise_reducers.py create mode 100644 tests/seer/anomaly_detection/detectors/test_noise_reducers.py diff --git a/src/seer/anomaly_detection/accessors.py b/src/seer/anomaly_detection/accessors.py index 60f030b1b..ed9d798a3 100644 --- a/src/seer/anomaly_detection/accessors.py +++ b/src/seer/anomaly_detection/accessors.py @@ -1,12 +1,15 @@ import abc import logging +import os import random +import sys from datetime import datetime, timedelta from typing import List, Optional import numpy as np import sentry_sdk -import stumpy # type: ignore # mypy throws "missing library stubs" + +# import stumpy # type: ignore # mypy throws "missing library stubs" from pydantic import BaseModel from sqlalchemy import delete @@ -24,6 +27,10 @@ from seer.dependency_injection import inject, injected from seer.exceptions import ClientError +stumpy_path_src = "/Users/aayushseth/code/stumpy-noise-reduction" +sys.path.insert(0, os.path.abspath(stumpy_path_src)) +import stumpy # type: ignore # mypy throws "missing library stubs" + logger = logging.getLogger(__name__) diff --git a/src/seer/anomaly_detection/anomaly_detection.py b/src/seer/anomaly_detection/anomaly_detection.py index 87620fcdf..e544534d2 100644 --- a/src/seer/anomaly_detection/anomaly_detection.py +++ b/src/seer/anomaly_detection/anomaly_detection.py @@ -1,9 +1,12 @@ import logging +import os +import sys from typing import List, Tuple import numpy as np import sentry_sdk -import stumpy # type: ignore # mypy throws "missing library stubs" + +# import stumpy # type: ignore # mypy throws "missing library stubs" from pydantic import BaseModel from seer.anomaly_detection.accessors import AlertDataAccessor, DbAlertDataAccessor @@ -29,6 +32,10 @@ from seer.exceptions import ClientError, ServerError from seer.tags import AnomalyDetectionModes, AnomalyDetectionTags +stumpy_path_src = "/Users/aayushseth/code/stumpy-noise-reduction" +sys.path.insert(0, os.path.abspath(stumpy_path_src)) +import stumpy # type: ignore # mypy throws "missing library stubs" + anomaly_detection_module.enable() logger = logging.getLogger(__name__) diff --git a/src/seer/anomaly_detection/anomaly_detection_di.py b/src/seer/anomaly_detection/anomaly_detection_di.py index 4db9b3229..1f5f1b9d6 100644 --- a/src/seer/anomaly_detection/anomaly_detection_di.py +++ b/src/seer/anomaly_detection/anomaly_detection_di.py @@ -12,6 +12,7 @@ LocationDetector, ProphetLocationDetector, ) +from seer.anomaly_detection.detectors.noise_reducers import NoiseReducer, VarianceNoiseReducer from seer.anomaly_detection.models import AlgoConfig from seer.dependency_injection import Module @@ -60,3 +61,8 @@ def mp_utils_provider() -> MPUtils: @anomaly_detection_module.provider def location_detector_provider() -> LocationDetector: return ProphetLocationDetector() + + +@anomaly_detection_module.provider +def 
noise_reducer_provider() -> NoiseReducer: + return VarianceNoiseReducer() diff --git a/src/seer/anomaly_detection/detectors/__init__.py b/src/seer/anomaly_detection/detectors/__init__.py index 44602a6b4..348f8530e 100644 --- a/src/seer/anomaly_detection/detectors/__init__.py +++ b/src/seer/anomaly_detection/detectors/__init__.py @@ -2,6 +2,7 @@ anomaly_detectors, mp_scorers, mp_utils, + noise_reducers, normalizers, smoothers, window_size_selectors, @@ -24,3 +25,5 @@ FlagSmoother = smoothers.FlagSmoother MajorityVoteBatchFlagSmoother = smoothers.MajorityVoteBatchFlagSmoother MajorityVoteStreamFlagSmoother = smoothers.MajorityVoteStreamFlagSmoother +NoiseReducer = noise_reducers.NoiseReducer +VarianceNoiseReducer = noise_reducers.VarianceNoiseReducer diff --git a/src/seer/anomaly_detection/detectors/anomaly_detectors.py b/src/seer/anomaly_detection/detectors/anomaly_detectors.py index 7b3023d87..738ab8229 100644 --- a/src/seer/anomaly_detection/detectors/anomaly_detectors.py +++ b/src/seer/anomaly_detection/detectors/anomaly_detectors.py @@ -1,14 +1,18 @@ import abc import logging +import os +import sys import numpy as np import numpy.typing as npt import sentry_sdk -import stumpy # type: ignore # mypy throws "missing library stubs" + +# import stumpy # type: ignore # mypy throws "missing library stubs" from pydantic import BaseModel, ConfigDict, Field from seer.anomaly_detection.detectors.mp_scorers import MPScorer from seer.anomaly_detection.detectors.mp_utils import MPUtils +from seer.anomaly_detection.detectors.noise_reducers import NoiseReducer from seer.anomaly_detection.detectors.smoothers import ( MajorityVoteBatchFlagSmoother, MajorityVoteStreamFlagSmoother, @@ -26,6 +30,10 @@ from seer.dependency_injection import inject, injected from seer.exceptions import ServerError +stumpy_path_src = "/Users/aayushseth/code/stumpy-noise-reduction" +sys.path.insert(0, os.path.abspath(stumpy_path_src)) +import stumpy # type: ignore # mypy throws "missing library stubs" + logger = logging.getLogger(__name__) @@ -81,6 +89,7 @@ def _compute_matrix_profile( ws_selector: WindowSizeSelector = injected, scorer: MPScorer = injected, mp_utils: MPUtils = injected, + noise_reducer: NoiseReducer = injected, ) -> MPTimeSeriesAnomaliesSingleWindow: """ This method calls stumpy.stump to compute the matrix profile and scores the matrix profile distances @@ -107,11 +116,13 @@ def _compute_matrix_profile( # TODO: Add sentry logging of this error raise ServerError("Invalid window size") # Get the matrix profile for the time series + noise_parameter = noise_reducer.get_noise_parameter(ts_values) mp = stumpy.stump( ts_values, m=max(3, window_size), ignore_trivial=algo_config.mp_ignore_trivial, normalize=False, + std_noise=noise_parameter, ) # We do not normalize the matrix profile here as normalizing during stream detection later is not straighforward. 
@@ -176,6 +187,7 @@ def detect( algo_config: AlgoConfig = injected, scorer: MPScorer = injected, mp_utils: MPUtils = injected, + noise_reducer: NoiseReducer = injected, ) -> MPTimeSeriesAnomaliesSingleWindow: """ This method uses stumpy.stumpi to stream compute the matrix profile and scores the matrix profile distances @@ -213,8 +225,10 @@ def detect( streamed_mp: list[list[float]] = [] thresholds: list[list[Threshold]] = [] for cur_val, cur_timestamp in zip(timeseries.values, timeseries.timestamps): + # Update the stumpi stream processor with new data - stream.update(cur_val) + noise_parameter = noise_reducer.get_noise_parameter(np.array(self.history_values)) + stream.update(cur_val, std_noise=noise_parameter) # Get the matrix profile for the new data and score it cur_mp = [stream.P_[-1], stream.I_[-1], stream.left_I_[-1], -1] diff --git a/src/seer/anomaly_detection/detectors/mp_scorers.py b/src/seer/anomaly_detection/detectors/mp_scorers.py index e91d48245..5ff00272b 100644 --- a/src/seer/anomaly_detection/detectors/mp_scorers.py +++ b/src/seer/anomaly_detection/detectors/mp_scorers.py @@ -190,13 +190,13 @@ class MPIQRScorer(MPScorer): { # High sensitivity = more anomalies + higher false positives # Data point outside of bottom 70% of the MP distances considered anomalous - "high": [0.3, 0.7], + "high": [0.35, 0.65], # Medium sensitivity = lesser anomalies + lesser false positives # Data point outside of bottom 80% of the MP distances considered anomalous - "medium": [0.2, 0.8], + "medium": [0.25, 0.75], # Low sensitivity = least anomalies + least false positives # Data point outside of bottom 90% of the MP distances considered anomalous - "low": [0.1, 0.9], + "low": [0.15, 0.85], }, description="Lower and upper bounds for high sensitivity", ) diff --git a/src/seer/anomaly_detection/detectors/noise_reducers.py b/src/seer/anomaly_detection/detectors/noise_reducers.py new file mode 100644 index 000000000..7936541d4 --- /dev/null +++ b/src/seer/anomaly_detection/detectors/noise_reducers.py @@ -0,0 +1,54 @@ +import abc + +import numpy as np +import numpy.typing as npt +from pydantic import BaseModel + + +class NoiseReducer(BaseModel, abc.ABC): + """ + Abstract base class for selecting the noise parameter for stumpy + """ + + @abc.abstractmethod + def get_noise_parameter( + self, timeseries: npt.NDArray, window: int = 12, scale_factor: float = 1.0 + ) -> float: + return NotImplemented + + +class VarianceNoiseReducer(NoiseReducer): + def get_noise_parameter( + self, timeseries: npt.NDArray, window: int = 12, scale_factor: float = 1.0 + ) -> float: + """ + Gets the noise parameter by calculating the median variance across sliding non-overlapping windows of the timeseries. 
+ + Parameters: + ----------- + timeseries : npt.NDArray + Input time series array + window : int, default=12 + Size of sliding window + scale_factor : float, default=1.0 + Factor to scale the final noise parameter + + Returns: + -------- + float + Noise parameter calculated as median variance * scale_factor + """ + + # TODO: The window should be ~half a day so should be based on ad_config + if len(timeseries) == 0 or window <= 0: + return 0.0 + + window = min(window, len(timeseries)) + + n_windows = len(timeseries) // window + windowed_ts = timeseries[: n_windows * window].reshape(n_windows, window) + variances = np.var(windowed_ts, axis=1) + + noise_parameter = np.median(variances) * scale_factor + + return noise_parameter diff --git a/tests/seer/anomaly_detection/detectors/test_noise_reducers.py b/tests/seer/anomaly_detection/detectors/test_noise_reducers.py new file mode 100644 index 000000000..788a2db12 --- /dev/null +++ b/tests/seer/anomaly_detection/detectors/test_noise_reducers.py @@ -0,0 +1,56 @@ +import unittest + +import numpy as np + +from seer.anomaly_detection.detectors.noise_reducers import VarianceNoiseReducer + + +class TestVarianceNoiseReducer(unittest.TestCase): + + def setUp(self): + self.noise_reducer = VarianceNoiseReducer() + + def test_get_noise_parameter_standard_array(self): + timeseries = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + window = 4 + scale_factor = 1.0 + result = self.noise_reducer.get_noise_parameter(timeseries, window, scale_factor) + expected = 1.25 + assert result == expected + + def test_get_noise_parameter_with_scale_factor(self): + timeseries = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]) + window = 4 + scale_factor = 2.0 + result = self.noise_reducer.get_noise_parameter(timeseries, window, scale_factor) + expected = 2.5 + assert result == expected + + def test_get_noise_parameter_constant_array(self): + timeseries = np.array([5, 5, 5, 5, 5, 5, 5, 5]) + window = 4 + result = self.noise_reducer.get_noise_parameter(timeseries, window) + expected = 0.0 + assert result == expected + + def test_get_noise_parameter_window_larger_than_timeseries(self): + timeseries = np.array([1, 2, 3, 4]) + window = 10 + result = self.noise_reducer.get_noise_parameter(timeseries, window) + expected = 1.25 + assert result == expected + + def test_get_noise_parameter_empty_array(self): + timeseries = np.array([]) + window = 4 + result = self.noise_reducer.get_noise_parameter(timeseries, window) + expected = 0.0 + assert result == expected + + def test_get_noise_parameter_single_value(self): + timeseries = np.array([1]) + window = 4 + result = self.noise_reducer.get_noise_parameter(timeseries, window) + # Single value should have 0 variance + expected = 0.0 + assert result == expected From 84dc584dc8bc4779c3a866f125e6b859e3e5ed90 Mon Sep 17 00:00:00 2001 From: Aayush Seth Date: Mon, 16 Dec 2024 11:24:49 -0800 Subject: [PATCH 2/2] update noise parameter calculation --- .../detectors/noise_reducers.py | 41 ++++++++++++------- 1 file changed, 27 insertions(+), 14 deletions(-) diff --git a/src/seer/anomaly_detection/detectors/noise_reducers.py b/src/seer/anomaly_detection/detectors/noise_reducers.py index 7936541d4..e4c5bfc5d 100644 --- a/src/seer/anomaly_detection/detectors/noise_reducers.py +++ b/src/seer/anomaly_detection/detectors/noise_reducers.py @@ -2,6 +2,7 @@ import numpy as np import numpy.typing as npt +from numpy.lib.stride_tricks import as_strided from pydantic import BaseModel @@ -12,14 +13,24 @@ class NoiseReducer(BaseModel, abc.ABC): 
     @abc.abstractmethod
     def get_noise_parameter(
-        self, timeseries: npt.NDArray, window: int = 12, scale_factor: float = 1.0
+        self, timeseries: npt.NDArray, window: int = 0, scale_factor: float = 1.0
     ) -> float:
         return NotImplemented
 
 
 class VarianceNoiseReducer(NoiseReducer):
+
+    def _get_subsequences(self, arr: npt.NDArray, m: int) -> npt.NDArray:
+        """
+        Gets the vectorized subsequences of size m of the input array
+        """
+        n = arr.size - m + 1
+        s = arr.itemsize
+        subs = as_strided(arr, shape=(n, m), strides=(s, s))
+        return subs
+
     def get_noise_parameter(
-        self, timeseries: npt.NDArray, window: int = 12, scale_factor: float = 1.0
+        self, timeseries: npt.NDArray, window: int = 0, scale_factor: float = 1.0
     ) -> float:
         """
         Gets the noise parameter by calculating the median variance across sliding non-overlapping windows of the timeseries.
@@ -29,9 +40,9 @@ def get_noise_parameter(
         timeseries : npt.NDArray
             Input time series array
         window : int, default=12
-            Size of sliding window
+            Size of sliding window for calculating variances. Default is 0, in which case 10% of the reference data is used as the window.
         scale_factor : float, default=1.0
-            Factor to scale the final noise parameter
+            Factor to scale the final noise parameter. A higher value will result in more aggressive noise reduction.
 
         Returns:
         --------
         float
             Noise parameter calculated as median variance * scale_factor
         """
 
-        # TODO: The window should be ~half a day so should be based on ad_config
-        if len(timeseries) == 0 or window <= 0:
-            return 0.0
-
-        window = min(window, len(timeseries))
+        # Limiting our selection to the last num_days
+        # TODO: This should be based on ad_config
+        num_days = 7
+        num_points = 24 * num_days
+        timeseries = timeseries[-num_points:]
 
-        n_windows = len(timeseries) // window
-        windowed_ts = timeseries[: n_windows * window].reshape(n_windows, window)
-        variances = np.var(windowed_ts, axis=1)
+        # Paper suggests 10% of the reference data
+        if window == 0:
+            window = int(0.1 * len(timeseries))
 
-        noise_parameter = np.median(variances) * scale_factor
+        variances = np.var(self._get_subsequences(timeseries, window), axis=1)
 
-        return noise_parameter
+        # Taking the 20th percentile as opposed to the 5th percentile
+        noise_parameter = np.percentile(variances, 20)
+        return noise_parameter * scale_factor
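
Editor's note: the sketch below is a rough, standalone illustration of what the second commit's heuristic computes, not the code under review. It takes variances over overlapping sliding windows of the recent history and uses a low percentile of those variances, scaled, as stumpy's std_noise. It swaps the patch's as_strided helper for numpy's public sliding_window_view; the function name, the synthetic series, the hourly-cadence assumption behind 24 * 7, and the minimum-window guard are illustrative choices that are not in the patch. The std_noise keyword itself is not upstream stumpy; the PR relies on the forked build it inserts onto sys.path.

import numpy as np
from numpy.lib.stride_tricks import sliding_window_view


def sketch_noise_parameter(values: np.ndarray, window: int = 0, scale_factor: float = 1.0) -> float:
    # Keep roughly the last 7 days of points, assuming hourly granularity (as 24 * num_days implies).
    values = np.asarray(values, dtype=float)[-24 * 7 :]
    if window <= 0:
        # "Paper suggests 10% of the reference data"; the max() floor is an illustrative safety net.
        window = max(2, int(0.1 * len(values)))
    if len(values) < window:
        return 0.0
    # Overlapping sliding windows; the patch builds the same matrix with as_strided.
    variances = np.var(sliding_window_view(values, window), axis=1)
    # A low percentile of the per-window variances, scaled, becomes the std_noise value.
    return float(np.percentile(variances, 20) * scale_factor)


if __name__ == "__main__":
    rng = np.random.default_rng(0)
    ts = np.sin(np.linspace(0, 20, 24 * 14)) + rng.normal(0.0, 0.1, 24 * 14)
    std_noise = sketch_noise_parameter(ts)
    print(std_noise)
    # With the forked stumpy the patch points at, this would feed something like
    # stumpy.stump(ts, m=24, normalize=False, std_noise=std_noise).

The percentile is the main tuning knob: the patch comments note the cited paper uses the 5th percentile of window variances, while the PR takes the 20th, which biases the estimate toward quieter stretches of the series and therefore toward stronger noise suppression for the same scale_factor.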