diff --git a/.env b/.env index 08f761d20..ad4d10b58 100644 --- a/.env +++ b/.env @@ -1,15 +1,54 @@ -# Frontend ENV Variables +# For a more detailed description of the configuration variables, please visit: https://docs.chaosgenius.io/docs/Operator_Guides/Configuration/Config%20Parameters + +# User Configurable Variables +## Webapp URL +CHAOSGENIUS_WEBAPP_URL=http://localhost:8080/ # URL of the Chaos Genius deployment. Usually, this will be http://:8080/ (http://localhost:8080/ in local installations). + +## Analytics +### Common Analytics Configuration +DAYS_OFFSET_FOR_ANALTYICS=2 # Sets the days offset from the current date till which your KPIs will run. +HOURS_OFFSET_FOR_ANALTYICS=0 # Sets the hours offset from the latest data point till which Anomaly Detection will run for your KPI. +TIMEZONE=UTC # Timezone in which all your analytics are reported. +METADATA_SYNC_TIME=03:00 # Sync time for your metadata. + +### Anomaly Configuration +MULTIDIM_ANALYSIS_FOR_ANOMALY=False # Enables the generation of multi-dimensional subgroups. +MAX_SUBDIM_CARDINALITY=1000 # Sets the maximum number of unique values allowed in a dimension. +TOP_DIMENSIONS_FOR_ANOMALY_DRILLDOWN=10 # Sets the maximum number of dimensions shown in the Anomaly Drill Downs. +MIN_DATA_IN_SUBGROUP=30 # The minimum population in a subgroup. +TOP_SUBDIMENSIONS_FOR_ANOMALY=10 # Sets the maximum number of sub-dimensions shown in the Anomaly Sub-dimensions page. +MAX_FILTER_SUBGROUPS_ANOMALY=250 # Sets the maximum number of subgroups considered for Anomaly Detection. +MAX_ANOMALY_SLACK_DAYS=14 # Sets the maximum number of days for which we can have no data and still consider the KPI for Anomaly Detection. + +### DeepDrills Configuration +MAX_ROWS_FOR_DEEPDRILLS=10000000 # Sets the maximum number of rows allowed for a KPI to be added. +MAX_DEEPDRILLS_SLACK_DAYS=14 # Sets the maximum number of days for which we can have no data and still consider the KPI for DeepDrills. +DEEPDRILLS_HTABLE_MAX_PARENTS=5 # Sets the maximum number of rows in the first level of the DeepDrills' drilldowns. +DEEPDRILLS_HTABLE_MAX_CHILDREN=5 # Sets the maximum number of rows in the subsequent levels of the DeepDrills' drilldowns. +DEEPDRILLS_HTABLE_MAX_DEPTH=3 # Sets the maximum depth of the drilldowns in DeepDrills. +DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day,month_on_month,month_to_date,week_on_week,week_to_date # Sets the enabled time ranges for which DeepDrills is computed as comma-separated values. 
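# NOTE (illustrative sketch, not part of the diff): the variables documented above are plain
# environment strings. The snippet below shows one way an application could read and parse
# such values (booleans, integers, and the comma-separated DEEPDRILLS_ENABLED_TIME_RANGES
# list). The helper names are assumptions for illustration only; this is not Chaos Genius'
# actual config loader.
import os
from typing import List

def _env_bool(name: str, default: bool = False) -> bool:
    # e.g. MULTIDIM_ANALYSIS_FOR_ANOMALY=False -> False
    return os.getenv(name, str(default)).strip().lower() in ("1", "true", "yes")

def _env_int(name: str, default: int) -> int:
    # e.g. MAX_SUBDIM_CARDINALITY=1000 -> 1000
    return int(os.getenv(name, str(default)))

def _env_list(name: str, default: str = "") -> List[str]:
    # e.g. DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days -> ["last_30_days", "last_7_days"]
    return [part.strip() for part in os.getenv(name, default).split(",") if part.strip()]

# Example usage with the defaults documented above:
MULTIDIM_ANALYSIS_FOR_ANOMALY = _env_bool("MULTIDIM_ANALYSIS_FOR_ANOMALY", False)
MAX_SUBDIM_CARDINALITY = _env_int("MAX_SUBDIM_CARDINALITY", 1000)
DEEPDRILLS_ENABLED_TIME_RANGES = _env_list("DEEPDRILLS_ENABLED_TIME_RANGES", "last_30_days,last_7_days,previous_day")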
+ +## Sentry Logging (leave empty to disable backend telemetry) +SENTRY_DSN= + +## Enterprise Edition Key +CHAOSGENIUS_ENTERPRISE_EDITION_KEY= + +# System Configuration +## Frontend Configuration REACT_APP_BASE_URL= REACT_APP_DISABLE_TELEMETRY=false -AIRBYTE_ENABLED=False - +## Backend Configuration +### Flask Server FLASK_APP=run FLASK_ENV=production FLASK_DEBUG=0 FLASK_RUN_PORT=5000 SECRET_KEY="t8GIEp8hWmR8y6VLqd6qQCMXzjRaKsx8nRruWNtFuec=" SEND_FILE_MAX_AGE_DEFAULT=31556926 + +### Database Configuration DB_HOST=chaosgenius-db DB_USERNAME=postgres DB_PASSWORD=chaosgenius @@ -23,9 +62,10 @@ INTEGRATION_DB_USERNAME=postgres INTEGRATION_DB_PASSWORD=chaosgenius INTEGRATION_DB_PORT=5432 INTEGRATION_DATABASE=chaosgenius_data + +#### Celery Configuration CELERY_RESULT_BACKEND=redis://chaosgenius-redis:6379/1 CELERY_BROKER_URL=redis://chaosgenius-redis:6379/1 -CHAOSGENIUS_WEBAPP_URL=http://localhost:8080/ # Alert configuration ## to enable event alerts @@ -43,6 +83,9 @@ TASK_CHECKPOINT_LIMIT=1000 # Version identification CHAOSGENIUS_VERSION_POSTFIX=git +## CG-Airbyte +AIRBYTE_ENABLED=False + # === airbyte env vars start here === VERSION=0.29.12-alpha @@ -50,6 +93,7 @@ VERSION=0.29.12-alpha # SOURCES LIST # Set value to 'true' if the source is required, 'false' otherwise. ########################## +# Enabling these data sources requires the third-party version. Please install the third-party version or upgrade to it. SOURCE_GOOGLE_ANALYTICS=true SOURCE_GOOGLE_SHEETS=true SOURCE_MYSQL=false @@ -61,6 +105,7 @@ SOURCE_FACEBOOK_ADS=false SOURCE_BING_ADS=false SOURCE_GOOGLE_BIG_QUERY=false SOURCE_SNOWFLAKE=false + # Airbyte Internal Job Database, see https://docs.airbyte.io/operator-guides/configuring-airbyte-db DATABASE_USER=docker DATABASE_PASSWORD=docker @@ -137,23 +182,3 @@ MAX_SYNC_JOB_ATTEMPTS=3 # Time in days to reach a timeout to cancel the synchronization MAX_SYNC_TIMEOUT_DAYS=3 -#Configurable Analytics Setting -MULTIDIM_ANALYSIS_FOR_ANOMALY=False -MAX_SUBDIM_CARDINALITY=1000 -TOP_DIMENSIONS_FOR_ANOMALY_DRILLDOWN=10 -MIN_DATA_IN_SUBGROUP=30 -TOP_SUBDIMENSIONS_FOR_ANOMALY=10 -MAX_ROWS_FOR_DEEPDRILLS=10000000 -MAX_FILTER_SUBGROUPS_ANOMALY=250 -MAX_DEEPDRILLS_SLACK_DAYS=14 -MAX_ANOMALY_SLACK_DAYS=14 -DAYS_OFFSET_FOR_ANALTYICS=2 -HOURS_OFFSET_FOR_ANALTYICS=0 -DEEPDRILLS_HTABLE_MAX_PARENTS=5 -DEEPDRILLS_HTABLE_MAX_CHILDREN=5 -DEEPDRILLS_HTABLE_MAX_DEPTH=3 -DEEPDRILLS_ENABLED_TIME_RANGES=last_30_days,last_7_days,previous_day,month_on_month,month_to_date,week_on_week,week_to_date -TIMEZONE=UTC - -SENTRY_DSN= -CHAOSGENIUS_ENTERPRISE_EDITION_KEY= diff --git a/.flake8 b/.flake8 index 642256acd..5559cd643 100644 --- a/.flake8 +++ b/.flake8 @@ -1,6 +1,5 @@ [flake8] max-line-length = 88 -extend-ignore = E203, E501, D203 exclude = .git, __pycache__, @@ -9,3 +8,7 @@ exclude = sandbox max-complexity = 10 docstring-convention = google +classmethod-decorators = + classmethod + validator + root_validator diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md index 4ffc26ef2..66c2523b2 100644 --- a/.github/ISSUE_TEMPLATE/bug_report.md +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -13,7 +13,7 @@ A clear and concise description of what the bug is. 
## Explain the environment - **Chaos Genius version**: example is 0.1.3-alpha - **OS Version / Instance**: example macOS 11.1, Windows 10, Ubuntu 18.04, AWS EC2 -- **Deployment type**: example are Docker or setup from sratch +- **Deployment type**: example are Docker or setup from scratch ## Current behavior A clear and concise description of what currently happens and what are the steps to reproduce it. diff --git a/.github/workflows/python-test.yml b/.github/workflows/python-test.yml index 9bb222940..ef78864fd 100644 --- a/.github/workflows/python-test.yml +++ b/.github/workflows/python-test.yml @@ -34,8 +34,11 @@ jobs: run: | # stop the build if there are Python syntax errors or undefined names flake8 chaos_genius --count --select=E9,F63,F7,F82 --show-source --statistics - # exit-zero treats all errors as warnings. The GitHub editor is 127 chars wide - flake8 chaos_genius --count --exit-zero --max-complexity=10 --max-line-length=127 --statistics + # exit-zero treats all errors as warnings. + flake8 chaos_genius --count --exit-zero --statistics - name: Test with pytest run: | pytest + - name: Check diverged migrations + run: | + ./scripts/check_diverged_migrations.sh diff --git a/chaos_genius/alerts/__init__.py b/chaos_genius/alerts/__init__.py index 5d2205c52..2f5a48817 100644 --- a/chaos_genius/alerts/__init__.py +++ b/chaos_genius/alerts/__init__.py @@ -1,6 +1,11 @@ +"""Alerting logic, email/slack formats and other utilities. + +Most of the code in this module has extensive type annotation. Please use the Pylance +VS Code extension (or the Pyright equivalent in other editors) along with flake8 when +developing. +""" import logging -from datetime import date -from typing import List, Tuple +from typing import List, Optional, Tuple from chaos_genius.alerts.anomaly_alerts import AnomalyAlertController from chaos_genius.alerts.event_alerts import StaticEventAlertController @@ -12,8 +17,8 @@ logger = logging.getLogger() -def check_and_trigger_alert(alert_id): - """Check the alert and trigger the notification if found +def check_and_trigger_alert(alert_id: int): + """Check the alert and trigger the notification if found. Args: alert_id (int): alert id @@ -24,7 +29,7 @@ def check_and_trigger_alert(alert_id): Returns: bool: status of the alert trigger """ - alert_info = Alert.get_by_id(alert_id) + alert_info: Optional[Alert] = Alert.get_by_id(alert_id) if not alert_info: raise Exception("Alert doesn't exist") @@ -32,6 +37,8 @@ def check_and_trigger_alert(alert_id): print("Alert isn't active. Please activate the alert.") return True + # TODO: extract these values of `alert_type` as an enum + # ref: https://github.com/chaos-genius/chaos_genius/pull/836#discussion_r838077656 if alert_info.alert_type == "Event Alert": data_source_id = alert_info.data_source @@ -43,40 +50,43 @@ def check_and_trigger_alert(alert_id): elif ( alert_info.alert_type == "KPI Alert" and alert_info.kpi_alert_type == "Anomaly" ): - anomaly_obj = AnomalyAlertController(alert_info.as_dict) - return anomaly_obj.check_and_prepare_alert() + anomaly_obj = AnomalyAlertController(alert_info) + return anomaly_obj.check_and_send_alert() elif alert_info.alert_type == "KPI Alert" and alert_info.kpi_alert_type == "Static": - static_kpi_alert = StaticKpiAlertController(alert_info.as_dict) + # TODO: is this still needed? 
+ StaticKpiAlertController(alert_info.as_dict) return True def trigger_anomaly_alerts_for_kpi( - kpi_obj: Kpi, end_date: date -) -> Tuple[List[int], List[int]]: + kpi_obj: Kpi, +) -> Tuple[List[int], List[Tuple[int, Exception]]]: """Triggers anomaly alerts starting from end_date. Args: kpi_obj (Kpi): Object of kpi for which alerts are to be triggered - end_date (dateimte.datetime): Datetime object containing the upper bound of anomaly date values + end_date (datetime.datetime): Datetime object containing the upper bound of + anomaly date values Returns: List[int]: List of alert IDs for which alert messages were successfully sent - List[int]: List of alert IDs for which alert failed + List[Tuple[int, Exception]]: List of alert IDs and exceptions for which alert + failed """ - success_alerts = [] - errors = [] - alerts = Alert.query.filter( - Alert.kpi == kpi_obj.id, Alert.active == True, Alert.alert_status == True + success_alerts: List[int] = [] + errors: List[Tuple[int, Exception]] = [] + alerts: List[Alert] = Alert.query.filter( + Alert.kpi == kpi_obj.id, + Alert.active == True, # noqa: E712 + Alert.alert_status == True, # noqa: E712 ).all() for alert in alerts: try: - anomaly_obj = AnomalyAlertController( - alert.as_dict, anomaly_end_date=end_date - ) - anomaly_obj.check_and_prepare_alert() + anomaly_obj = AnomalyAlertController(alert) + anomaly_obj.check_and_send_alert() success_alerts.append(alert.id) except Exception as e: logger.error(f"Error running alert for Alert ID: {alert.id}", exc_info=e) - errors.append(alert.id) + errors.append((alert.id, e)) return success_alerts, errors diff --git a/chaos_genius/alerts/alert_channel_creds.py b/chaos_genius/alerts/alert_channel_creds.py index 885cb6950..aa8eb6f5b 100644 --- a/chaos_genius/alerts/alert_channel_creds.py +++ b/chaos_genius/alerts/alert_channel_creds.py @@ -1,32 +1,60 @@ +"""Utilities for retrieving channel credentials from config-setting.""" +from typing import Tuple + from chaos_genius.controllers.config_controller import get_config_object -def get_creds(name): - return HELPER_FUNC_DICT[name](name) +# TODO: make a new type here to better represent the return value +# ref: https://github.com/chaos-genius/chaos_genius/pull/836#discussion_r838085548 +def get_email_creds() -> Tuple[str, int, str, str, str]: + """Retrieves email channel configuration. + + Returns: + A tuple of (host, port, username, password, sender_email) + + Raises: + Exception: if email channel was not configured. 
+ """ + # TODO: remove hardcoding of "email" - use a constant or a function + # ref: https://github.com/chaos-genius/chaos_genius/pull/836#discussion_r838110482 + config_obj = get_config_object("email") + if not config_obj: + raise Exception("Email alert channel was not configured") + email_config = config_obj.as_dict.get("config_setting") -def get_email_creds(name): - config_obj = get_config_object(name) - if config_obj is None: - return "", "", "", "", "" + if not email_config: + raise Exception("Email alert channel was not configured") - configs = config_obj.as_dict.get("config_setting", {}) return ( - configs.get("server", ""), - configs.get("port", ""), - configs.get("username", ""), - configs.get("password", ""), - configs.get("sender_email", ""), + email_config.get("server", ""), + email_config.get("port", 0), + email_config.get("username", ""), + email_config.get("password", ""), + email_config.get("sender_email", ""), ) -def get_slack_creds(name): - config_obj = get_config_object(name) - if config_obj is None: - return "" +def get_slack_creds() -> str: + """Retrieves slack channel configuration. + + Returns: + The slack webhook URL + + Raises: + Exception: if slack channel was not configured. + """ + config_obj = get_config_object("slack") + if not config_obj: + raise Exception("Slack alert channel was not configured") - configs = config_obj.as_dict.get("config_setting", {}) - return configs.get("webhook_url", "") + configs = config_obj.as_dict.get("config_setting") + if not configs: + raise Exception("Slack alert channel was not configured") + if "webhook_url" not in configs: + raise Exception( + "Slack alert channel configuration is invalid. webhook_url was not found." + ) -HELPER_FUNC_DICT = {"email": get_email_creds, "slack": get_slack_creds} + return configs["webhook_url"] diff --git a/chaos_genius/alerts/anomaly_alerts.py b/chaos_genius/alerts/anomaly_alerts.py index f0c347994..cefc0b256 100644 --- a/chaos_genius/alerts/anomaly_alerts.py +++ b/chaos_genius/alerts/anomaly_alerts.py @@ -1,514 +1,708 @@ +"""Controller and helpers for KPI or Anomaly alerts.""" import datetime +import heapq import io import logging -import os -import time from copy import deepcopy -from typing import List, Optional +from typing import Any, Dict, List, Optional, Sequence, Tuple, TypeVar, Union import pandas as pd -from jinja2 import Environment, FileSystemLoader, select_autoescape +from pydantic import BaseModel, StrictFloat, StrictInt, root_validator, validator +from pydantic.tools import parse_obj_as from chaos_genius.alerts.constants import ( + ALERT_DATE_FORMAT, ALERT_DATETIME_FORMAT, - ANOMALY_ALERT_COLUMN_NAMES, + ALERT_READABLE_DATA_TIMESTAMP_FORMAT, + ALERT_READABLE_DATE_FORMAT, + ALERT_READABLE_DATETIME_FORMAT, ANOMALY_TABLE_COLUMN_NAMES_MAPPER, - ANOMALY_TABLE_COLUMNS_HOLDING_FLOATS, - FREQUENCY_DICT, - IGNORE_COLUMNS_ANOMALY_TABLE, OVERALL_KPI_SERIES_TYPE_REPR, ) -from chaos_genius.alerts.email import send_static_alert_email from chaos_genius.alerts.slack import anomaly_alert_slack from chaos_genius.alerts.utils import ( + AlertException, change_message_from_percent, - count_anomalies, find_percentage_change, - format_anomaly_points, - save_anomaly_point_formatting, - top_anomalies, + human_readable, + send_email_using_template, webapp_url_prefix, ) +from chaos_genius.controllers.kpi_controller import ( + get_active_kpi_from_id, + get_anomaly_data, + get_last_anomaly_timestamp, +) # from chaos_genius.connectors.base_connector import get_df_from_db_uri from 
chaos_genius.core.rca.rca_utils.string_helpers import ( convert_query_string_to_user_string, ) from chaos_genius.databases.models.alert_model import Alert -from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput from chaos_genius.databases.models.kpi_model import Kpi from chaos_genius.databases.models.triggered_alerts_model import TriggeredAlerts +from chaos_genius.utils.utils import jsonable_encoder + +logger = logging.getLogger(__name__) + + +class AnomalyPointOriginal(BaseModel): + """Representation of a point of anomaly data as received from raw anomaly data.""" + + # TODO: could be generated from AnomalyDataOutput model + + # y-value of point + y: float + # lower bound of expected value + yhat_lower: float + # upper bound of expected value + yhat_upper: float + # severity of the anomaly (0 to 100) + severity: float + + # overall, subdim or data_quality + anomaly_type: str + # subdimension name (when it's a subdim) + series_type: Optional[str] + + # timestamp when this entry was added + created_at: datetime.datetime + # timestamp of the anomaly point + data_datetime: datetime.datetime + + @property + def expected_value(self) -> str: + """Expected value range represented as a string.""" + return f"{self.yhat_lower} to {self.yhat_upper}" + + @property + def readable_data_timestamp(self) -> str: + """Data timestamp as a readable string. + + Also known as Time of Occurrence. + """ + return self.data_datetime.strftime(ALERT_READABLE_DATA_TIMESTAMP_FORMAT) + + @property + def date_only(self) -> str: + """Only date part of the data timestamp (data_datetime).""" + return self.data_datetime.strftime(ALERT_DATE_FORMAT) + + def format_series_type(self): + """Format series_type to be more readable for use in alerts. + + Note: do not call this twice on the same instance. + """ + # TODO: make this idempotent + self.series_type = _format_series_type(self.anomaly_type, self.series_type) + + # -- pydantic specific configuration starts here -- + + # use custom datetime format + _normalize_datetimes = validator("created_at", "data_datetime", allow_reuse=True)( + lambda dt: datetime.datetime.strptime(dt, ALERT_DATETIME_FORMAT) + if not isinstance(dt, datetime.datetime) + else dt + ) + + class Config: + """Custom pydantic configuration.""" + + json_encoders = { + # custom datetime format for JSON conversion + datetime.datetime: lambda dt: dt.strftime(ALERT_DATETIME_FORMAT), + } + + +class AnomalyPoint(AnomalyPointOriginal): + """Representation of a point of anomaly data as used in alerting. + + This is the data stored in triggered alerts. + """ + + # severity value rounded to integer + severity: int + # percentage change from previous day's point + percent_change: Union[StrictFloat, StrictInt, str] + # human readable message describing the percent_change + change_message: str + + @staticmethod + def from_original( + point: AnomalyPointOriginal, + previous_anomaly_point: Optional[AnomalyPointOriginal] = None, + fixed_change_message: Optional[str] = None, + ) -> "AnomalyPoint": + """Constructs a formatted AnomalyPoint from AnomalyPointOriginal. + + Arguments: + point: original anomaly point + previous_anomaly_point: the anomaly point from which change percent will be + calculated. + fixed_change_message: the change message to use when previous anomaly point + cannot be found. If specified, change percent will not be calculated. 
+ """ + series_type = ( + OVERALL_KPI_SERIES_TYPE_REPR + if point.series_type == "overall" + else point.series_type + ) + + y = round(point.y, 2) + yhat_lower = round(point.yhat_lower, 2) + yhat_upper = round(point.yhat_upper, 2) + severity = round(point.severity) + + series_type = _format_series_type(point.anomaly_type, point.series_type) + + if fixed_change_message is not None: + change_message = fixed_change_message + percent_change = "-" + else: + percent_change = find_percentage_change( + point.y, previous_anomaly_point.y if previous_anomaly_point else None + ) + change_message = change_message_from_percent(percent_change) + + return AnomalyPoint( + y=y, + yhat_lower=yhat_lower, + yhat_upper=yhat_upper, + severity=severity, + anomaly_type=point.anomaly_type, + series_type=series_type, + created_at=point.created_at, + data_datetime=point.data_datetime, + percent_change=percent_change, + change_message=change_message, + ) + + @root_validator(pre=True) + def _support_old_field_names(cls, values: Dict[str, Any]) -> Dict[str, Any]: + aliases = { + "percent_change": "percentage_change", + "change_message": "nl_message", + } + + for field_name, alias in aliases.items(): + if field_name not in values: + if alias in values: + values[field_name] = values[alias] + + return values + + +class AnomalyPointFormatted(AnomalyPoint): + """Anomaly point data with formatting used in templates (email, slack, etc). + + Also used in digests as a representation of points in TriggeredAlerts. + """ + + kpi_id: int + kpi_name: str + alert_id: int + alert_name: str + alert_channel: str + # stores alert channel configuration + # in individual alerts, this will be the entire dict (`Dict`) + # in digests, this will be just the list of emails or None (`Optional[List[str]]`) + # TODO: make a different type for digest data or use a consistent type across both + # ref: https://github.com/chaos-genius/chaos_genius/pull/862#discussion_r839400411 + alert_channel_conf: Any + + formatted_date: str + formatted_change_percent: str + + @staticmethod + def from_point( + point: AnomalyPoint, + time_series_frequency: Optional[str], + kpi_id: int, + kpi_name: str, + alert_id: int, + alert_name: str, + alert_channel: str, + alert_channel_conf: Any, + ) -> "AnomalyPointFormatted": + """Constructs a formatted point from an AnomalyPoint.""" + dt_format = ALERT_READABLE_DATETIME_FORMAT + if time_series_frequency is not None and time_series_frequency == "D": + dt_format = ALERT_READABLE_DATE_FORMAT + formatted_date = point.data_datetime.strftime(dt_format) + + formatted_change_percent = point.percent_change + if isinstance(point.percent_change, (int, float)): + if point.percent_change > 0: + formatted_change_percent = f"+{point.percent_change}%" + else: + formatted_change_percent = f"{point.percent_change}%" + + return AnomalyPointFormatted( + **point.dict(), + kpi_id=kpi_id, + kpi_name=kpi_name, + alert_id=alert_id, + alert_name=alert_name, + alert_channel=alert_channel, + alert_channel_conf=alert_channel_conf, + formatted_date=formatted_date, + formatted_change_percent=str(formatted_change_percent), + ) + + @property + def y_readable(self): + """Returns human readable format for y value of anomaly point.""" + return human_readable(self.y) -logger = logging.getLogger() + @property + def yhat_lower_readable(self): + """Returns human readable format for lower bound of expected range.""" + return human_readable(self.yhat_lower) + + @property + def yhat_upper_readable(self): + """Returns human readable format for upper bound of 
expected range.""" + return human_readable(self.yhat_upper) class AnomalyAlertController: - def __init__(self, alert_info, anomaly_end_date=None): - self.alert_info = alert_info + """Controller for KPI/anomaly alerts.""" + + def __init__(self, alert: Alert): + """Initializes a KPI/anomaly alerts controller. + + Note: an AnomalyAlertController instance must only be used for one check/trigger + of an alert. The same object must not be re-used. + + Arguments: + alert: object of the Alert model for which to send alerts + """ + self.alert = alert + self.alert_id: int = self.alert.id + self.kpi_id: int = self.alert.kpi self.now = datetime.datetime.now() - if anomaly_end_date: - self.anomaly_end_date = anomaly_end_date - else: - self.anomaly_end_date = self.now - datetime.timedelta(days=3) - - def check_and_prepare_alert(self): - kpi_id = self.alert_info["kpi"] - alert_id = self.alert_info["id"] - alert: Optional[Alert] = Alert.get_by_id(self.alert_info["id"]) - if alert is None: - logger.info(f"Could not find alert by ID: {self.alert_info['id']}") - return False - - check_time = FREQUENCY_DICT[self.alert_info["alert_frequency"]] - fuzzy_interval = datetime.timedelta( - minutes=30 - ) # this represents the upper bound of the time interval that an alert can fall short of the check_time hours before which it can be sent again - if ( - alert.last_alerted is not None - and alert.last_alerted > (self.now - check_time) - and alert.last_alerted > ((self.now + fuzzy_interval) - check_time) - ): - # this check works in three steps - # 1) Verify if the last alerted value of an alert is not None - # 2) Verify if less than check_time hours have elapsed since the last alert was sent - # 3) If less than check_time hours have elapsed, check if the additonal time to complete check_time hours is greater than fuzzy_interval - logger.info( - f"Skipping alert with ID {self.alert_info['id']} since it was already run" + latest_anomaly_timestamp = get_last_anomaly_timestamp([self.kpi_id]) + + if latest_anomaly_timestamp is None: + raise AlertException( + "Could not get latest anomaly timestamp. No anomaly data was found.", + alert_id=self.alert_id, + kpi_id=self.kpi_id, ) - return True - alert.update(commit=True, last_alerted=self.now) + self.latest_anomaly_timestamp = latest_anomaly_timestamp + logger.info("latest_anomaly_timestamp is %s", latest_anomaly_timestamp) + + def check_and_send_alert(self): + """Determines anomalies, sends alert and stores alert data. - # TODO: Add the series type filter for query optimisation - anomaly_data = AnomalyDataOutput.query.filter( - AnomalyDataOutput.kpi_id == kpi_id, - AnomalyDataOutput.anomaly_type.in_(["overall", "subdim"]), - AnomalyDataOutput.is_anomaly.in_([1, -1]), - AnomalyDataOutput.data_datetime >= self.anomaly_end_date, - AnomalyDataOutput.severity >= self.alert_info["severity_cutoff_score"], - ).all() + Note: must only be called once on an instance. + """ + anomaly_data = self._get_anomalies() if len(anomaly_data) == 0: - logger.info(f"No anomaly exists (Alert ID - {alert_id})") + logger.info( + f"(Alert: {self.alert_id}, KPI: {self.kpi_id}) no anomaly exists." 
+ ) return True - logger.info(f"Alert ID {alert_id} is sent to the respective alert channel") + formatted_anomaly_data = self._format_anomaly_data(anomaly_data) + + status = False + try: + if self._to_send_individual(): + if self.alert.alert_channel == "email": + self._send_email_alert(formatted_anomaly_data) + elif self.alert.alert_channel == "slack": + self._send_slack_alert(formatted_anomaly_data) + else: + raise AlertException( + f"Unknown alert channel: {self.alert.alert_channel}", + alert_id=self.alert_id, + kpi_id=self.kpi_id, + ) + else: + logger.info( + f"(Alert: {self.alert_id}, KPI: {self.kpi_id}) not sending " + "alert as it was configured to be a digest." + ) + + # TODO: last_anomaly_timestamp can be updated even if no anomaly exists. + self._update_alert_metadata(self.alert) + + status = True + finally: + self._save_triggered_alerts(status, formatted_anomaly_data) + + return status + + def _get_anomalies( + self, + time_diff: datetime.timedelta = datetime.timedelta(), + anomalies_only: bool = True, + include_severity_cutoff: bool = True, + ) -> List[AnomalyPointOriginal]: + last_anomaly_timestamp: Optional[ + datetime.datetime + ] = self.alert.last_anomaly_timestamp + + if last_anomaly_timestamp is not None: + # when last_anomaly_timestamp is available + # get data after last_anomaly_timestamp + start_timestamp = last_anomaly_timestamp - time_diff + include_start_timestamp = False + else: + # when last_anomaly_timestamp is not available + # get data of the last timestamp in anomaly table + start_timestamp = self.latest_anomaly_timestamp - time_diff + include_start_timestamp = True + + end_timestamp = self.latest_anomaly_timestamp - time_diff + include_end_timestamp = True + + severity_cutoff = ( + self.alert.severity_cutoff_score if include_severity_cutoff else None + ) - if self.alert_info["alert_channel"] == "email": - outcome, alert_data = self.send_alert_email(anomaly_data) - elif self.alert_info["alert_channel"] == "slack": - outcome, alert_data = self.send_slack_alert(anomaly_data) + logger.info( + f"Checking for anomalies for (KPI: {self.kpi_id}, Alert: " + f"{self.alert_id}) in the range - start: {start_timestamp} (included: " + f"{include_start_timestamp}) and end: {end_timestamp} " + "(included: True)" + ) - if alert_data is None: - return outcome + anomaly_data = get_anomaly_data( + [self.kpi_id], + anomaly_types=["subdim", "overall"], + anomalies_only=anomalies_only, + start_timestamp=start_timestamp, + include_start_timestamp=include_start_timestamp, + # only get anomaly data till latest timestamp + # (ignore newer data added after alert started) + end_timestamp=end_timestamp, + include_end_timestamp=include_end_timestamp, + severity_cutoff=severity_cutoff, + ) + + return parse_obj_as( + List[AnomalyPointOriginal], [point.as_dict for point in anomaly_data] + ) + + def _update_alert_metadata(self, alert: Alert): + """Sets last alerted and last anomaly timestamps.""" + alert.update( + commit=True, + last_alerted=self.now, + last_anomaly_timestamp=self.latest_anomaly_timestamp, + ) + + def _save_triggered_alerts( + self, status: bool, formatted_anomaly_data: List[AnomalyPoint] + ): + """Saves data for alert (which has been sent) in the triggered alerts table.""" + # for digests, we would like the latest anomalies to be displayed first + formatted_anomaly_data = sorted( + formatted_anomaly_data, + key=lambda point: (point.data_datetime, point.severity), + reverse=True, + ) alert_metadata = { - "alert_frequency": self.alert_info["alert_frequency"], - "alert_data": 
alert_data, - "end_date": self.anomaly_end_date.strftime(ALERT_DATETIME_FORMAT), - "severity_cutoff_score": self.alert_info["severity_cutoff_score"], - "kpi": self.alert_info["kpi"], + "alert_frequency": self.alert.alert_frequency, + "alert_data": jsonable_encoder(formatted_anomaly_data), + "severity_cutoff_score": self.alert.severity_cutoff_score, + "kpi": self.kpi_id, } triggered_alert = TriggeredAlerts( - alert_conf_id=self.alert_info["id"], + alert_conf_id=self.alert_id, alert_type="KPI Alert", - is_sent=outcome, + is_sent=status, created_at=datetime.datetime.now(), alert_metadata=alert_metadata, ) triggered_alert.update(commit=True) - logger.info(f"The triggered alert data was successfully stored") - return outcome - - def get_overall_subdim_data(self, anomaly_data): - - anomaly_data = [anomaly_point.as_dict for anomaly_point in anomaly_data] - anomaly_data = [ - { - key: value - for key, value in anomaly_point.items() - if key not in IGNORE_COLUMNS_ANOMALY_TABLE - } - for anomaly_point in anomaly_data - ] - - for anomaly_point in anomaly_data: - anomaly_point["series_type"] = ( - OVERALL_KPI_SERIES_TYPE_REPR - if anomaly_point.get("anomaly_type") == "overall" - else anomaly_point["series_type"] - ) - for key, value in anomaly_point.items(): - if key in ANOMALY_TABLE_COLUMNS_HOLDING_FLOATS: - anomaly_point[key] = round(value, 2) - if anomaly_point["series_type"] != OVERALL_KPI_SERIES_TYPE_REPR: - anomaly_point["series_type"] = convert_query_string_to_user_string( - anomaly_point["series_type"] - ) - - overall_data = [ - anomaly_point - for anomaly_point in anomaly_data - if anomaly_point.get("anomaly_type") == "overall" - ] - subdim_data = [ - anomaly_point - for anomaly_point in anomaly_data - if anomaly_point.get("anomaly_type") == "subdim" - ] - overall_data.sort(key=lambda anomaly: anomaly.get("severity"), reverse=True) - subdim_data.sort(key=lambda anomaly: anomaly.get("severity"), reverse=True) - - return overall_data, subdim_data + logger.info("The triggered alert data was successfully stored") - def _find_point(self, point, prev_data): + def _find_point( + self, point: AnomalyPointOriginal, prev_data: List[AnomalyPointOriginal] + ): """Finds same type of point in previous data.""" intended_point = None for prev_point in prev_data: - if prev_point.get("series_type") == point.get("series_type"): + if prev_point.series_type == point.series_type: intended_point = prev_point break return intended_point - def _save_nl_message_daily_freq(self, anomaly_data: List[dict], kpi: Kpi): - """Saves change message for every point, for a daily frequency KPI.""" - time_diff = datetime.timedelta(days=1, hours=0, minutes=0) - - # TODO: fix circular import - from chaos_genius.controllers.digest_controller import get_previous_data - - prev_day_data = get_previous_data(kpi.id, self.anomaly_end_date, time_diff) - - prev_day_data = [anomaly_point.as_dict for anomaly_point in prev_day_data] - - for point in prev_day_data: - if point.get("anomaly_type") != "overall": - point["series_type"] = convert_query_string_to_user_string( - point["series_type"] - ) - else: - point["series_type"] = OVERALL_KPI_SERIES_TYPE_REPR - - for point in anomaly_data: - intended_point = self._find_point(point, prev_day_data) - - if intended_point is None: - # previous point wasn't found - point["percentage_change"] = "–" - elif point["y"] == 0 and intended_point["y"] == point["y"]: - # previous data was same as current - point["percentage_change"] = "–" - elif intended_point["y"] == 0: - # previous point was 0 - sign_ = "+" 
if point["y"] > 0 else "-" - point["percentage_change"] = sign_ + "inf" - else: - point["percentage_change"] = find_percentage_change( - point["y"], intended_point["y"] - ) + def _format_anomaly_data( + self, anomaly_data: List[AnomalyPointOriginal] + ) -> List[AnomalyPoint]: + kpi = self._get_kpi() - point["nl_message"] = change_message_from_percent( - point["percentage_change"] - ) + time_series_freq: Optional[str] = kpi.anomaly_params.get("frequency") - def _save_nl_message_hourly_freq(self, anomaly_data: List[dict], kpi: Kpi): - """Saves change message for every point, for a hourly frequency KPI.""" - data = dict() + # get previous anomaly point for comparison time_diff = datetime.timedelta(days=1, hours=0, minutes=0) + prev_day_data = self._get_anomalies( + time_diff=time_diff, anomalies_only=False, include_severity_cutoff=False + ) - # TODO: fix circular import - from chaos_genius.controllers.digest_controller import get_previous_data - - prev_day_data = get_previous_data(kpi.id, self.anomaly_end_date, time_diff) - prev_day_data = [anomaly_point.as_dict for anomaly_point in prev_day_data] - - for point in prev_day_data: - if point.get("anomaly_type") != "overall": - point["series_type"] = convert_query_string_to_user_string( - point["series_type"] - ) - else: - point["series_type"] = OVERALL_KPI_SERIES_TYPE_REPR - - for point in prev_day_data: - if point["data_datetime"].hour not in data.keys(): - data[point["data_datetime"].hour] = [] - data[point["data_datetime"].hour].append(point) + # store a mapping of hour => list of anomaly points for that hour + hourly_data: Dict[int, List[AnomalyPointOriginal]] = dict() + if time_series_freq == "H": + for point in prev_day_data: + if point.data_datetime.hour not in hourly_data.keys(): + hourly_data[point.data_datetime.hour] = [] + hourly_data[point.data_datetime.hour].append(point) + formatted_anomaly_data: List[AnomalyPoint] = [] for point in anomaly_data: - hour_val = point["data_datetime"].hour - intended_point = self._find_point(point, data.get(hour_val, [])) - if intended_point is None: - # previous point wasn't found - point["percentage_change"] = "–" - elif point["y"] == 0 and intended_point["y"] == point["y"]: - # previous data was same as current - point["percentage_change"] = "–" - elif intended_point["y"] == 0: - # previous point was 0 - sign_ = "+" if point["y"] > 0 else "-" - point["percentage_change"] = sign_ + "inf" + if time_series_freq == "D": + # in case of daily granularity, find point in the previous day + previous_point = self._find_point(point, prev_day_data) + elif time_series_freq == "H": + # in case of hourly granularity, find the point of the same hour + # but in the previous day. 
+ previous_point = self._find_point( + point, hourly_data.get(point.data_datetime.hour, []) + ) else: - point["percentage_change"] = find_percentage_change( - point["y"], intended_point["y"] + raise AlertException( + f"Time series frequency not found or invalid: {time_series_freq}", + alert_id=self.alert_id, + kpi_id=self.kpi_id, ) - point["nl_message"] = change_message_from_percent( - point["percentage_change"] + formatted_anomaly_data.append( + AnomalyPoint.from_original(point, previous_point) ) - def save_nl_message(self, anomaly_data: List[dict]): - """Constructs and saves change message for every point.""" - kpi_id = self.alert_info["kpi"] - kpi = Kpi.get_by_id(kpi_id) - if kpi is None: - for point in anomaly_data: - point["nl_message"] = "KPI does not exist" - return - - time_series_freq = kpi.anomaly_params.get("frequency") - if time_series_freq is None: - for point in anomaly_data: - point["nl_message"] = "Time series frequency does not exist" - return - - if time_series_freq in ("d", "D", "daily", "Daily"): - self._save_nl_message_daily_freq(anomaly_data, kpi) - elif time_series_freq in ("h", "H", "hourly", "Hourly"): - self._save_nl_message_hourly_freq(anomaly_data, kpi) - else: - for point in anomaly_data: - point["nl_message"] = "Unsupported time series frequency" - - def format_alert_data(self, data: List[dict]): - """Pre-processes anomaly alert data.""" - self.save_nl_message(data) + # Sort in descending order according to severity + formatted_anomaly_data.sort(key=lambda point: point.severity, reverse=True) - for anomaly_point in data: - lower = anomaly_point.get("yhat_lower") - upper = anomaly_point.get("yhat_upper") - anomaly_point["Expected Value"] = f"{lower} to {upper}" + return formatted_anomaly_data - # round off severity for better representation - anomaly_point["severity"] = round(anomaly_point["severity"]) + def _get_kpi(self) -> Kpi: + kpi = get_active_kpi_from_id(self.kpi_id) - # rename column names for human readability - for key, value in ANOMALY_TABLE_COLUMN_NAMES_MAPPER.items(): - anomaly_point[value] = anomaly_point[key] - - my_time = time.strptime( - anomaly_point["Time of Occurrence"].strftime(ALERT_DATETIME_FORMAT), - ALERT_DATETIME_FORMAT, - ) - timestamp = time.mktime(my_time) - date_time = datetime.datetime.fromtimestamp(timestamp) - new_time = date_time.strftime("%b %d %Y %H:%M:%S") - anomaly_point["Time of Occurrence"] = new_time - anomaly_point["data_datetime"] = anomaly_point["data_datetime"].strftime( - ALERT_DATETIME_FORMAT - ) - anomaly_point["created_at"] = anomaly_point["created_at"].strftime( - ALERT_DATETIME_FORMAT + if kpi is None: + raise AlertException( + "KPI does not exist.", alert_id=self.alert_id, kpi_id=self.kpi_id ) - def _remove_attributes_from_anomaly_points( - self, anomaly_data: List[dict], list_attributes: List[str] - ): - for attr in list_attributes: - for point in anomaly_data: - delattr(point, attr) + return kpi - def send_alert_email(self, anomaly_data): + def _to_send_individual(self) -> bool: + """Whether to send individual alert or include in a digest. - alert_channel_conf = self.alert_info["alert_channel_conf"] + Returns: + True if an individual alert needs to be sent, False otherwise. 
+ """ + daily_digest = self.alert.daily_digest + weekly_digest = self.alert.weekly_digest - if type(alert_channel_conf) != dict: - logger.info( - f"The alert channel configuration is incorrect for Alert ID - {self.alert_info['id']}" - ) - return False + return not (daily_digest or weekly_digest) - recipient_emails = alert_channel_conf.get("email", []) + def _get_top_anomalies_and_counts( + self, formatted_anomaly_data: List[AnomalyPoint], kpi: Kpi + ) -> Tuple[Sequence[AnomalyPointFormatted], int, int]: + overall_count, subdim_count = _count_anomalies(formatted_anomaly_data) - if recipient_emails: - subject = f"{self.alert_info['alert_name']} - Chaos Genius Alert ({self.now.strftime('%b %d')})❗" - alert_message = self.alert_info["alert_message"] + top_anomalies_ = deepcopy(_top_anomalies(formatted_anomaly_data, 5)) + top_anomalies_ = _format_anomaly_point_for_template( + top_anomalies_, kpi, self.alert + ) - kpi_id = self.alert_info["kpi"] - kpi_obj = Kpi.query.filter(Kpi.active == True, Kpi.id == kpi_id).first() + return top_anomalies_, overall_count, subdim_count - if kpi_obj is None: - logger.error(f"No KPI exists for Alert ID - {self.alert_info['id']}") - return False + def _send_email_alert(self, formatted_anomaly_data: List[AnomalyPoint]) -> None: + alert_channel_conf = self.alert.alert_channel_conf - kpi_name = getattr(kpi_obj, "name") + if not isinstance(alert_channel_conf, dict): + raise AlertException( + f"Alert channel config was not a dict. Got: {alert_channel_conf}", + alert_id=self.alert_id, + kpi_id=self.kpi_id, + ) - overall_data, subdim_data = self.get_overall_subdim_data(anomaly_data) + recipient_emails = alert_channel_conf.get("email") - overall_data_email_body = ( - deepcopy([overall_data[0]]) if len(overall_data) > 0 else [] - ) - len_subdim = min(10, len(subdim_data)) - subdim_data_email_body = ( - deepcopy(subdim_data[0:len_subdim]) if len(subdim_data) > 0 else [] + if not recipient_emails: + raise AlertException( + f"No recipient emails found. 
Got: {recipient_emails}", + alert_id=self.alert_id, + kpi_id=self.kpi_id, ) - overall_data.extend(subdim_data) - overall_data_email_body.extend(subdim_data_email_body) - - self.format_alert_data(overall_data) - self.format_alert_data(overall_data_email_body) - - column_names = ANOMALY_ALERT_COLUMN_NAMES - overall_data_ = pd.DataFrame(overall_data, columns=column_names) - files = [] - if not overall_data_.empty: - file_detail = {} - file_detail["fname"] = "data.csv" - with io.StringIO() as buffer: - overall_data_.to_csv(buffer, encoding="utf-8") - file_detail["fdata"] = buffer.getvalue() - files = [file_detail] - - daily_digest = self.alert_info.get("daily_digest", False) - weekly_digest = self.alert_info.get("weekly_digest", False) - - if not (daily_digest or weekly_digest): - points = deepcopy( - [anomaly_point.as_dict for anomaly_point in anomaly_data] - ) - format_anomaly_points(points) - self.format_alert_data(points) - save_anomaly_point_formatting( - points, kpi_obj.anomaly_params.get("frequency") - ) - top_anomalies_ = top_anomalies(points, 5) - overall_count, subdim_count = count_anomalies(points) - - test = self.send_template_email( - "email_alert.html", - recipient_emails, - subject, - files, - column_names=column_names, - top_anomalies=top_anomalies_, - alert_message=alert_message, - kpi_name=kpi_name, - alert_frequency=self.alert_info["alert_frequency"].capitalize(), - preview_text="Anomaly Alert", - alert_name=self.alert_info.get("alert_name"), - kpi_link=f"{webapp_url_prefix()}#/dashboard/0/anomaly/{kpi_id}", - alert_dashboard_link=f"{webapp_url_prefix()}api/digest", - overall_count=overall_count, - subdim_count=subdim_count, - str=str - ) - logger.info(f"Status for Alert ID - {self.alert_info['id']} : {test}") - # self.remove_attributes_from_anomaly_data(overall_data, ["nl_message"]) - # TODO: fix this circular import - from chaos_genius.controllers.digest_controller import ( - structure_anomaly_data_for_digests, - ) + subject = ( + f"{self.alert.alert_name} - Chaos Genius Alert " + f"({self.now.strftime('%b %d')})❗" + ) - anomaly_data = structure_anomaly_data_for_digests(overall_data) - return False, anomaly_data - else: - logger.info( - f"No receipent email available (Alert ID - {self.alert_info['id']})" - ) - return False, None + # attach CSV of anomaly data + files = [ + { + "fname": "data.csv", + "fdata": _make_anomaly_data_csv(formatted_anomaly_data), + } + ] + + kpi = self._get_kpi() + + ( + top_anomalies_, + overall_count, + subdim_count, + ) = self._get_top_anomalies_and_counts(formatted_anomaly_data, kpi) + + send_email_using_template( + "email_alert.html", + recipient_emails, + subject, + files, + top_anomalies=top_anomalies_, + alert_message=self.alert.alert_message, + kpi_name=kpi.name, + preview_text="Anomaly Alert", + alert_name=self.alert.alert_name, + kpi_link=f"{webapp_url_prefix()}#/dashboard/0/anomaly/" f"{self.kpi_id}", + alert_dashboard_link=f"{webapp_url_prefix()}api/digest", + overall_count=overall_count, + subdim_count=subdim_count, + str=str, + ) - def send_template_email(self, template, recipient_emails, subject, files, **kwargs): - """Sends an email using a template.""" - path = os.path.join(os.path.dirname(__file__), "email_templates") - env = Environment( - loader=FileSystemLoader(path), autoescape=select_autoescape(["html", "xml"]) + logger.info( + f"(Alert: {self.alert_id}, KPI: {self.kpi_id}) The email alert was " + "successfully sent" ) - template = env.get_template(template) - test = send_static_alert_email( - recipient_emails, subject, 
template.render(**kwargs), self.alert_info, files + def _send_slack_alert(self, formatted_anomaly_data: List[AnomalyPoint]): + kpi = self._get_kpi() + + ( + top_anomalies_, + overall_count, + subdim_count, + ) = self._get_top_anomalies_and_counts(formatted_anomaly_data, kpi) + + err = anomaly_alert_slack( + kpi.name, + self.alert.alert_name, + self.kpi_id, + self.alert.alert_message, + top_anomalies_, + overall_count, + subdim_count, ) - if test == True: + + if err == "": logger.info( - f"The email for Alert ID - {self.alert_info['id']} was successfully sent" + f"(Alert: {self.alert_id}, KPI: {self.kpi_id}) The slack alert was " + "successfully sent" ) else: - logger.info( - f"The email for Alert ID - {self.alert_info['id']} has not been sent" + raise AlertException( + f"Slack alert was not sent: {err}", + alert_id=self.alert_id, + kpi_id=self.kpi_id, ) - return test - def send_slack_alert(self, anomaly_data): - kpi_id = self.alert_info["kpi"] - kpi_obj = Kpi.query.filter(Kpi.active == True, Kpi.id == kpi_id).first() +def _format_series_type(anomaly_type: str, series_type: Optional[str]) -> str: + """Format a anomaly point's series type for use in alerts. - if kpi_obj is None: - logger.info(f"No KPI exists for Alert ID - {self.alert_info['id']}") - return False, None + Do not call this function twice on the same data. - kpi_name = getattr(kpi_obj, "name") - alert_name = self.alert_info.get("alert_name") - alert_message = self.alert_info["alert_message"] + Arguments: + anomaly_type: see AnomalyPointOriginal + series_type: see AnomalyPointOriginal + """ + series_type = ( + OVERALL_KPI_SERIES_TYPE_REPR + if anomaly_type == "overall" + else convert_query_string_to_user_string(series_type or "") + ) - overall_data, subdim_data = self.get_overall_subdim_data(anomaly_data) + return series_type - overall_data_alert_body = ( - deepcopy([overall_data[0]]) if len(overall_data) > 0 else [] - ) - len_subdim = min(5, len(subdim_data)) - subdim_data_alert_body = ( - deepcopy(subdim_data[0:len_subdim]) if len(subdim_data) > 0 else [] + +def _make_anomaly_data_csv(anomaly_points: List[AnomalyPoint]) -> str: + """Create an in-memory string containing the CSV of given anomaly data.""" + anomaly_df = pd.DataFrame( + [ + point.dict(include=ANOMALY_TABLE_COLUMN_NAMES_MAPPER.keys()) + for point in anomaly_points + ] + ) + + anomaly_df.rename(ANOMALY_TABLE_COLUMN_NAMES_MAPPER, inplace=True) + + # this is a property that is calculated, so it needs to be assigned separately + anomaly_df[ANOMALY_TABLE_COLUMN_NAMES_MAPPER["expected_value"]] = [ + point.expected_value for point in anomaly_points + ] + + with io.StringIO() as buffer: + anomaly_df.to_csv(buffer, encoding="utf-8") + csv_data = buffer.getvalue() + + return csv_data + + +def _format_anomaly_point_for_template( + points: Sequence[AnomalyPoint], kpi: Kpi, alert: Alert +) -> Sequence[AnomalyPointFormatted]: + """Formats fields of each point, to be used in alert templates.""" + return list( + map( + lambda point: AnomalyPointFormatted.from_point( + point, + kpi.anomaly_params.get("frequency"), + kpi.id, + kpi.name, + alert.id, + alert.alert_name, + alert.alert_channel, + alert.alert_channel_conf, + ), + points, ) + ) - overall_data.extend(subdim_data) - overall_data_alert_body.extend(subdim_data_alert_body) - self.format_alert_data(overall_data) - self.format_alert_data(overall_data_alert_body) +# ref: https://stackoverflow.com/a/53287607/11199009 +TAnomalyPoint = TypeVar("TAnomalyPoint", bound=AnomalyPointOriginal) - daily_digest = 
self.alert_info.get("daily_digest", False) - weekly_digest = self.alert_info.get("weekly_digest", False) - test = "failed" - if not (daily_digest or weekly_digest): - points = deepcopy([anomaly_point.as_dict for anomaly_point in anomaly_data]) - format_anomaly_points(points) - self.format_alert_data(points) - save_anomaly_point_formatting( - points, kpi_obj.anomaly_params.get("frequency") - ) - top_anomalies_ = top_anomalies(points, 5) - overall_count, subdim_count = count_anomalies(points) - - test = anomaly_alert_slack( - kpi_name, - alert_name, - kpi_id, - alert_message, - top_anomalies_, - overall_count, - subdim_count, - ) +def _top_anomalies(points: Sequence[TAnomalyPoint], n=10) -> Sequence[TAnomalyPoint]: + """Returns top n anomalies according to severity.""" + return heapq.nlargest(n, points, key=lambda point: point.severity) - if test == "ok": - logger.info( - f"The slack alert for Alert ID - {self.alert_info['id']} was successfully sent" - ) - else: - logger.info( - f"The slack alert for Alert ID - {self.alert_info['id']} has not been sent" - ) - message = f"Status for KPI ID - {self.alert_info['kpi']}: {test}" - test = test == "ok" - # self.remove_attributes_from_anomaly_data(overall_data, ["nl_message"]) - # TODO: fix this circular import - from chaos_genius.controllers.digest_controller import ( - structure_anomaly_data_for_digests, - ) +def _count_anomalies(points: Sequence[TAnomalyPoint]) -> Tuple[int, int]: + """Returns a count of overall anomalies and subdim anomalies.""" + total = len(points) + overall = sum( + 1 for point in points if point.series_type == OVERALL_KPI_SERIES_TYPE_REPR + ) + subdims = total - overall + return overall, subdims + + +def get_top_anomalies_and_counts( + formatted_anomaly_data: Sequence[AnomalyPointFormatted], + n: int = 10, +) -> Tuple[Sequence[AnomalyPointFormatted], int, int]: + """Returns top anomalies and counts of all anomalies for digests. 
+ + Arguments: + formatted_anomaly_data: list of `AnomalyPointFormatted`s + n: number of top anomalies to be returned + """ + overall_count, subdim_count = _count_anomalies(formatted_anomaly_data) + + top_anomalies_ = deepcopy(_top_anomalies(formatted_anomaly_data, n)) - anomaly_data = structure_anomaly_data_for_digests(overall_data) - return test, anomaly_data + return top_anomalies_, overall_count, subdim_count diff --git a/chaos_genius/alerts/base_alert_digests.py b/chaos_genius/alerts/base_alert_digests.py index 21d48a62e..1e24a5371 100644 --- a/chaos_genius/alerts/base_alert_digests.py +++ b/chaos_genius/alerts/base_alert_digests.py @@ -1,28 +1,24 @@ +"""Controller and helpers for alert digests.""" import datetime -import heapq import logging -import os from collections import defaultdict -from typing import Dict, List, Tuple +from typing import DefaultDict, Dict, List, Sequence, Set, Tuple -from jinja2 import Environment, FileSystemLoader, select_autoescape - -from chaos_genius.alerts.constants import ( - ALERT_DATE_FORMAT, - ALERT_DATETIME_FORMAT, - ALERT_READABLE_DATETIME_FORMAT, - FREQUENCY_DICT, +from chaos_genius.alerts.anomaly_alerts import ( + AnomalyPointFormatted, + get_top_anomalies_and_counts, ) -from chaos_genius.alerts.email import send_static_alert_email +from chaos_genius.alerts.constants import ALERT_DATE_FORMAT, FREQUENCY_DICT from chaos_genius.alerts.slack import alert_digest_slack_formatted -from chaos_genius.alerts.utils import ( - count_anomalies, - save_anomaly_point_formatting, - top_anomalies, - webapp_url_prefix, -) +from chaos_genius.alerts.utils import send_email_using_template, webapp_url_prefix from chaos_genius.controllers.config_controller import get_config_object -from chaos_genius.controllers.digest_controller import get_alert_kpi_configurations +from chaos_genius.controllers.digest_controller import ( + extract_anomaly_points_from_triggered_alerts, + get_alert_kpi_configurations, + preprocess_triggered_alert, +) +from chaos_genius.databases.models.alert_model import Alert +from chaos_genius.databases.models.kpi_model import Kpi from chaos_genius.databases.models.triggered_alerts_model import TriggeredAlerts logger = logging.getLogger(__name__) @@ -31,17 +27,29 @@ class AlertDigestController: + """Controller for anomaly alert digests.""" + def __init__(self, frequency: str): + """Initializes an anomaly alert digests controller. + Note: an AlertDigestController instance must only be used for one check/trigger + of a digest. The same object must not be re-used. + + Arguments: + frequency: digest frequency. See keys of FREQUENCY_DICT for possible values. + """ self.time_diff = FREQUENCY_DICT[frequency] self.curr_time = datetime.datetime.now() - self.alert_config_cache = dict() - self.kpi_cache = dict() + self.alert_config_cache: Dict[int, Alert] = dict() + self.kpi_cache: Dict[int, Kpi] = dict() self.frequency = frequency - def prepare_digests(self): + def check_and_send_digests(self): + """Collects alerts to be sent and sends them to respective channels. - data = ( + Note: must only be called once on an instance. 
+ """ + triggered_alerts: List[TriggeredAlerts] = ( TriggeredAlerts.query.filter( TriggeredAlerts.created_at >= (self.curr_time - self.time_diff) ) @@ -49,72 +57,77 @@ def prepare_digests(self): .all() ) - slack_digests = [] - email_digests = [] - - self.alert_config_cache, self.kpi_cache = get_alert_kpi_configurations(data) - - for alert in data: - alert_conf_id = alert.alert_conf_id - alert_conf = self.alert_config_cache.get(alert_conf_id) - - kpi_id = alert_conf.get("kpi") - kpi = self.kpi_cache.get(kpi_id) if kpi_id is not None else None + slack_digests: List[TriggeredAlerts] = [] + email_digests: List[TriggeredAlerts] = [] - alert.kpi_id = kpi_id - alert.kpi_name = kpi.get("name") if kpi is not None else "Doesn't Exist" - alert.alert_name = alert_conf.get("alert_name") - alert.alert_channel = alert_conf.get("alert_channel") + self.alert_config_cache, self.kpi_cache = get_alert_kpi_configurations( + triggered_alerts + ) - if not isinstance(alert_conf.get("alert_channel_conf"), dict): - alert.alert_channel_conf = None - else: - alert.alert_channel_conf = alert_conf.get("alert_channel_conf", {}).get( - alert.alert_channel, None - ) + for triggered_alert in triggered_alerts: + triggered_alert = preprocess_triggered_alert( + triggered_alert, self.alert_config_cache, self.kpi_cache + ) - if alert_conf.get(ALERT_ATTRIBUTES_MAPPER[self.frequency]): - if alert.alert_channel == "slack": - slack_digests.append(alert) - if alert.alert_channel == "email": - email_digests.append(alert) + if getattr( + self.alert_config_cache[triggered_alert.alert_conf_id], + ALERT_ATTRIBUTES_MAPPER[self.frequency], + ): + if triggered_alert.alert_channel == "slack": + slack_digests.append(triggered_alert) + if triggered_alert.alert_channel == "email": + email_digests.append(triggered_alert) if len(email_digests) > 0: - email_status = self.segregate_email_digests(email_digests) + self._send_email_digests(email_digests) if len(slack_digests) > 0: - slack_status = self.send_slack_digests(slack_digests) + self._send_slack_digests(slack_digests) - def segregate_email_digests(self, email_digests): - user_triggered_alerts = defaultdict(set) + def _send_email_digests(self, email_digests: List[TriggeredAlerts]): + user_triggered_alerts: DefaultDict[str, Set[int]] = defaultdict(set) for alert in email_digests: for user in alert.alert_channel_conf: user_triggered_alerts[user].add(alert.id) - triggered_alert_dict = {alert.id: alert for alert in email_digests} + triggered_alert_dict: Dict[int, TriggeredAlerts] = { + alert.id: alert for alert in email_digests + } for recipient in user_triggered_alerts.keys(): - self.send_alert_digest( + self._send_email_digest( recipient, user_triggered_alerts[recipient], triggered_alert_dict ) - def send_alert_digest(self, recipient, triggered_alert_ids, triggered_alert_dict): - triggered_alerts = [ - triggered_alert_dict[id_].__dict__ for id_ in triggered_alert_ids - ] - points = _all_anomaly_points(triggered_alerts) - top_anomalies_ = top_anomalies(points) - overall_count, subdim_count = count_anomalies(points) - save_anomaly_point_formatting(points) - - test = self.send_template_email( + def _get_top_anomalies_and_counts( + self, triggered_alerts: List[TriggeredAlerts] + ) -> Tuple[Sequence[AnomalyPointFormatted], int, int]: + points = extract_anomaly_points_from_triggered_alerts( + triggered_alerts, self.kpi_cache + ) + return get_top_anomalies_and_counts(points) + + def _send_email_digest( + self, + recipient: str, + triggered_alert_ids: Set[int], + triggered_alert_dict: Dict[int, 
TriggeredAlerts], + ): + triggered_alerts = [triggered_alert_dict[id_] for id_ in triggered_alert_ids] + ( + top_anomalies_, + overall_count, + subdim_count, + ) = self._get_top_anomalies_and_counts(triggered_alerts) + + send_email_using_template( "digest_template.html", [recipient], - f"Daily Alerts Report ({self.curr_time.strftime(ALERT_DATE_FORMAT)}) - Chaos Genius Alert❗", + ( + f"Daily Alerts Report ({self.curr_time.strftime(ALERT_DATE_FORMAT)}) - " + "Chaos Genius Alert❗" + ), [], - column_names=["alert_name", "kpi_name", "created_at", "link"], preview_text="", - getattr=getattr, - isinstance=isinstance, str=str, overall_count=overall_count, subdim_count=subdim_count, @@ -123,69 +136,39 @@ def send_alert_digest(self, recipient, triggered_alert_ids, triggered_alert_dict top_anomalies=top_anomalies_, ) - def send_template_email(self, template, recipient_emails, subject, files, **kwargs): - """Sends an email using a template.""" - - path = os.path.join(os.path.dirname(__file__), "email_templates") - env = Environment( - loader=FileSystemLoader(path), autoescape=select_autoescape(["html", "xml"]) - ) - - template = env.get_template(template) - test = send_static_alert_email( - recipient_emails, subject, template.render(**kwargs), None, files - ) - - return test - - def send_slack_digests(self, triggered_alerts): + def _send_slack_digests(self, triggered_alerts: List[TriggeredAlerts]): """Sends a slack alert containing a summary of triggered alerts.""" - triggered_alerts = [alert.__dict__ for alert in triggered_alerts] - - points = _all_anomaly_points(triggered_alerts) - top10 = top_anomalies(points) - overall_count, subdim_count = count_anomalies(points) - save_anomaly_point_formatting(points) - - test = alert_digest_slack_formatted( - self.frequency, top10, overall_count, subdim_count + ( + top_anomalies_, + overall_count, + subdim_count, + ) = self._get_top_anomalies_and_counts(triggered_alerts) + + err = alert_digest_slack_formatted( + self.frequency, top_anomalies_, overall_count, subdim_count ) - if test == "ok": - logger.info("The slack alert digest was successfully sent") + if err == "": + logger.info( + f"(frequency: {self.frequency}) The slack alert digest was successfully" + " sent" + ) else: - logger.info("The slack alert digest has not been sent") - - message = f"Status for slack alert digest: {test}" - return message - - -def _all_anomaly_points(triggered_alerts: List[Dict]) -> List[Dict]: - return [ - dict( - point, - kpi_name=alert["kpi_name"], - alert_name=alert["alert_name"], - kpi_id=alert["kpi_id"], - ) - for alert in triggered_alerts - for point in alert["alert_metadata"]["alert_data"] - ] + raise Exception( + f"(frequency: {self.frequency}) Error in sending slack digest: {err}" + ) def check_and_trigger_digest(frequency: str): - """Check the alert and trigger alert digest + """Check the alert and trigger alert digest. Args: frequency (str): frequency of alert digest Raises: - Exception: Raise if digest frequency is incorrect or alert digests have not been configured - - Returns: - bool: status of the alert digest trigger + Exception: Raise if digest frequency is incorrect or alert digests have not + been configured """ - if frequency not in ALERT_ATTRIBUTES_MAPPER.keys(): msg = f"Alert Digest frequency is not valid. Got: {frequency}." 
logger.error(msg) @@ -206,6 +189,4 @@ def check_and_trigger_digest(frequency: str): raise Exception(msg) digest_obj = AlertDigestController(frequency) - digest_obj.prepare_digests() - - return True + digest_obj.check_and_send_digests() diff --git a/chaos_genius/alerts/constants.py b/chaos_genius/alerts/constants.py index 2ee2d74ae..372caa530 100644 --- a/chaos_genius/alerts/constants.py +++ b/chaos_genius/alerts/constants.py @@ -10,29 +10,17 @@ ALERT_READABLE_DATETIME_FORMAT = "%b %d, %I %p" -DIGEST_DATETIME_FORMAT = "%b %d %Y %H:%M:%S" +ALERT_READABLE_DATA_TIMESTAMP_FORMAT = "%b %d %Y %H:%M:%S" ANOMALY_TABLE_COLUMN_NAMES_MAPPER = { "series_type": "Dimension", "data_datetime": "Time of Occurrence", "y": "Value", "severity": "Severity Score", - "nl_message": "Change", + "change_message": "Change", + "expected_value": "Expected Value", } -IGNORE_COLUMNS_ANOMALY_TABLE = ["id", "index", "kpi_id", "is_anomaly"] - -ANOMALY_ALERT_COLUMN_NAMES = [ - "Dimension", - "Time of Occurrence", - "Value", - "Expected Value", - "Severity Score", - "Change", -] - -ANOMALY_TABLE_COLUMNS_HOLDING_FLOATS = ["y", "yhat_upper", "yhat_lower", "severity"] - FREQUENCY_DICT = { "weekly": datetime.timedelta(days=7, hours=0, minutes=0), "daily": datetime.timedelta(days=1, hours=0, minutes=0), diff --git a/chaos_genius/alerts/email.py b/chaos_genius/alerts/email.py index 1780935aa..ea0a095c4 100644 --- a/chaos_genius/alerts/email.py +++ b/chaos_genius/alerts/email.py @@ -1,27 +1,24 @@ +"""Utilities for sending emails.""" +import logging import smtplib -import traceback from email.mime.application import MIMEApplication from email.mime.multipart import MIMEMultipart from email.mime.text import MIMEText +from typing import Dict, List, Sequence -from chaos_genius.alerts.alert_channel_creds import get_creds +from chaos_genius.alerts.alert_channel_creds import get_email_creds -# TODO: Need little refactoring - - -EMAIL_HOST = None -EMAIL_HOST_PORT = None -EMAIL_HOST_USER = None -EMAIL_HOST_PASSWORD = None -EMAIL_SENDER = None DEBUG = False TEMPLATE_DIR = "chaos_genius/alerts/templates" EMAIL_TEMPLATE_MAPPING = {"STATIC_ALERT": "static_alert.html"} -def init_smtp_server(): - """Initiate the SMTP server +logger = logging.getLogger(__name__) + + +def init_smtp_server(host: str, port: int, user: str, password: str): + """Initiate the SMTP server. 
Raises: Exception: Raise if env variable not found @@ -30,41 +27,50 @@ def init_smtp_server(): Returns: obj: SMTP server object """ - if not (EMAIL_HOST and EMAIL_HOST_PASSWORD and EMAIL_HOST_USER): - raise Exception("SMTP ENV Variable not found...") - retry_count = 0 - def connect_smtp(retry_count): - server = None + def connect_smtp(retry_count: int) -> smtplib.SMTP: try: retry_count += 1 - server = smtplib.SMTP(EMAIL_HOST, EMAIL_HOST_PORT) + server = smtplib.SMTP(host, port) server.ehlo() server.starttls() # stmplib docs recommend calling ehlo() before & after starttls() server.ehlo() - server.login(EMAIL_HOST_USER, EMAIL_HOST_PASSWORD) - except Exception as err_msg: - print("Error: ", err_msg) - print("Retrying again to connect...") + server.login(user, password) + except smtplib.SMTPException as e: + logger.error("Error in initializing SMTP connection", exc_info=e) if retry_count < 4: + logger.warn("Retrying SMTP connection") server = connect_smtp(retry_count) + else: + raise return server - server = connect_smtp(retry_count) - if not server: - raise Exception("SMTP Connection Failed...") - return server + return connect_smtp(retry_count) -def send_email(recipient_emails, message, count=0): - """send the email to the provided recipients +def send_email( + recipient_emails: List[str], + message: MIMEMultipart, + host: str, + port: int, + user: str, + password: str, + sender: str, + count=0, +): + """Send the email to the provided recipients. Args: - recipient_emails (list): List of emails - message (MIMEMultipart): email MIMEMultipart object - count (int, optional): Retry count for the emails. Defaults to 0. + recipient_emails: List of emails + message: email MIMEMultipart object + host: hostname of SMTP server + port: port number of SMTP server + user: username to authenticate to SMTP server + password: password to authenticate to SMTP server + sender: email sender + count: Retry count for the emails. Defaults to 0 """ count += 1 try: @@ -74,81 +80,54 @@ def send_email(recipient_emails, message, count=0): else: toaddr = recipient_emails - server = init_smtp_server() - server.sendmail(EMAIL_SENDER, toaddr, message.as_string()) + server = init_smtp_server(host, port, user, password) + server.sendmail(sender, toaddr, message.as_string()) server.quit() - print("Email sent to " + ", ".join(toaddr)) + logger.info(f"Email sent to {', '.join(toaddr)}") except smtplib.SMTPServerDisconnected: - print(f"Retry ({count}) for the email") + logger.info(f"Retry ({count}) for the email") if count < 3: - send_email(recipient_emails, message, count) + send_email( + recipient_emails, message, host, port, user, password, sender, count + ) else: - print("Email Sending Failed after max retries") - except Exception: - print(traceback.format_exc()) - - -def initialize_env_variables(): - global EMAIL_HOST - global EMAIL_HOST_PORT - global EMAIL_HOST_USER - global EMAIL_HOST_PASSWORD - global EMAIL_SENDER - global DEBUG - creds = get_creds("email") - ( - EMAIL_HOST, - EMAIL_HOST_PORT, - EMAIL_HOST_USER, - EMAIL_HOST_PASSWORD, - EMAIL_SENDER, - ) = creds + logger.error("Email Sending Failed after max retries") + raise def send_static_alert_email( - recipient_emails, subject, messsage_body, alert_info, files=[] -): - """Send the static event alert email with the CSV attachment + recipient_emails: List[str], + subject: str, + messsage_body: str, + files: Sequence[Dict] = [], +) -> None: + """Send an alert email with the CSV attachment. 
Args: recipient_emails (list): List of emails subject (str): Subject of the email messsage_body (str): Main configurable body text - alert_info (dict): alert information - files (list, optional): List of the files with the file name and file data as base64. Defaults to []. - - Returns: - bool: status of the email + files (list, optional): List of the files with the file name and file data as + base64. Defaults to []. """ - status = False - initialize_env_variables() - - try: - message = MIMEMultipart() - message["From"] = EMAIL_SENDER - message["To"] = ",".join(recipient_emails) - message["Subject"] = subject - - msgAlternative = MIMEMultipart("alternative") - # msgText = MIMEText(parsed_template, 'html') - msgText = MIMEText( - messsage_body, "html" - ) # TODO: To be changed according to use - msgAlternative.attach(msgText) - message.attach(msgAlternative) - - for file_detail in files: - fname = file_detail["fname"] - fdata = file_detail["fdata"] - attachment = MIMEApplication(fdata, fname) - attachment["Content-Disposition"] = 'attachment; filename="{}"'.format( - fname - ) - message.attach(attachment) - - send_email(recipient_emails, message) - status = True - except Exception as err_msg: - print(err_msg) - - return status + host, port, user, password, sender = get_email_creds() + + message = MIMEMultipart() + message["From"] = sender + message["To"] = ",".join(recipient_emails) + message["Subject"] = subject + + msg_alternative = MIMEMultipart("alternative") + # msgText = MIMEText(parsed_template, 'html') + msg_text = MIMEText(messsage_body, "html") # TODO: To be changed according to use + msg_alternative.attach(msg_text) + message.attach(msg_alternative) + + for file_detail in files: + fname = file_detail["fname"] + fdata = file_detail["fdata"] + attachment = MIMEApplication(fdata, fname) + attachment["Content-Disposition"] = 'attachment; filename="{}"'.format(fname) + message.attach(attachment) + + send_email(recipient_emails, message, host, port, user, password, sender) diff --git a/chaos_genius/alerts/email_templates/README.md b/chaos_genius/alerts/email_templates/README.md new file mode 100644 index 000000000..506cdbe67 --- /dev/null +++ b/chaos_genius/alerts/email_templates/README.md @@ -0,0 +1,10 @@ +# HTML templates for Email alerts + +These are the Jinja templates used for sending email alerts. + +## Notes for contributing/making changes to these files + +- All CSS styles must be inlined to each HTML element. Style tags are not supported well and the formatting may break when an email is forwarded or is inside a thread. +- Use [this tool](https://templates.mailchimp.com/resources/inline-css/) to automatically convert style tags to inline CSS. + - NOTE: recheck the HTML generated from this - especially the tags that included Jinja template variables. The tool does not understand Jinja template blocks such as if, while, etc. which could lead to multiple extra closing or opening blocks being added. Also check for attributes which are assigned to a template variable. + - TODO: find a better tool that can work with Jinja templates. diff --git a/chaos_genius/alerts/email_templates/digest_template.html b/chaos_genius/alerts/email_templates/digest_template.html index 95e63de86..85f80ed04 100644 --- a/chaos_genius/alerts/email_templates/digest_template.html +++ b/chaos_genius/alerts/email_templates/digest_template.html @@ -1,39 +1,12 @@ {% extends "layout.html" %} -{% block head %} - -{% endblock %} - {% block content %}

Daily Alert Digest


Summary

    @@ -41,30 +14,32 @@

    Summary

  • Total alerts generated (including subdimensions): {{ overall_count + subdim_count }}
-Alerts Dashboard +Alerts Dashboard

Top 10 Anomalies

    {% for point in top_anomalies %}
  •
-          {{point["kpi_name"]}} ({{point["Dimension"]}})
+          {{point.kpi_name}} ({{point.series_type}})
-          changed to {{point["y"]}}
-          {% if point["percentage_change"] is string and "inf" not in point["percentage_change"] %}
-          ({{ point["change_message"] }})
-          {% elif "+inf" in str(point["percentage_change"]) or point["percentage_change"] > 0 %}
-          ({{ point["change_message"] }})
-          {% elif "-inf" in str(point["percentage_change"]) or point["percentage_change"] < 0 %}
-          ({{ point["change_message"] }})
-          {% else %}
-          ({{ point["change_message"] }})
-          {% endif %}
-          on {{ point["formatted_date"] }}
-          (expected: {{point["yhat_lower"]}} to {{point["yhat_upper"]}},
-          severity:
-          {{point["severity"]}})
+          changed to {{point.y_readable}}
+          {% if point.percent_change is string and "inf" not in point.percent_change %}
+
+          {% elif "+inf" in str(point.percent_change) or point.percent_change > 0 %}
+
+          {% elif "-inf" in str(point.percent_change) or point.percent_change < 0 %}
+
+          {% else %}
+
+          {% endif %}
+          ({{ point.formatted_change_percent }})
+
+          on {{ point.formatted_date }}
+          (expected: {{point.yhat_lower_readable}} to {{point.yhat_upper_readable}},
+          severity:
+          {{point.severity}})
  • {% endfor %} diff --git a/chaos_genius/alerts/email_templates/email_alert.html b/chaos_genius/alerts/email_templates/email_alert.html index 08f57a048..5f939c63f 100644 --- a/chaos_genius/alerts/email_templates/email_alert.html +++ b/chaos_genius/alerts/email_templates/email_alert.html @@ -1,37 +1,10 @@ {% extends "layout.html" %} -{% block head %} - -{% endblock %} - {% block content %}

    {{ alert_name }}

    @@ -50,31 +23,33 @@

    Alert Message

    {{ alert_message }}
    -View KPI -Alerts Dashboard +View KPI +Alerts Dashboard

    Top Anomalies

      {% for point in top_anomalies %}
    • - {{kpi_name}} ({{point["Dimension"]}}) - changed to - {{point["y"]}} - {% if point["percentage_change"] is string and "inf" not in point["percentage_change"] %} - ({{ point["change_message"] }}) - {% elif "+inf" in str(point["percentage_change"]) or point["percentage_change"] > 0 %} - ({{ point["change_message"] }}) - {% elif "-inf" in str(point["percentage_change"]) or point["percentage_change"] < 0 %} - ({{ point["change_message"] }}) - {% else %} - ({{ point["change_message"] }}) - {% endif %} - on {{ point["formatted_date"] }} - (expected: {{point["yhat_lower"]}} to {{point["yhat_upper"]}}, - severity: - {{point["severity"]}}) + {{kpi_name}} ({{point.series_type}}) + + changed to {{point.y_readable}} + {% if point.percent_change is string and "inf" not in point.percent_change %} + + {% elif "+inf" in str(point.percent_change) or point.percent_change > 0 %} + + {% elif "-inf" in str(point.percent_change) or point.percent_change < 0 %} + + {% else %} + + {% endif %} + ({{ point.formatted_change_percent }}) + + on {{ point.formatted_date }} + (expected: {{point.yhat_lower_readable}} to {{point.yhat_upper_readable}}, + severity: + {{point.severity}})
    • {% endfor %} diff --git a/chaos_genius/alerts/event_alerts.py b/chaos_genius/alerts/event_alerts.py index 7daef8497..a59b8d086 100644 --- a/chaos_genius/alerts/event_alerts.py +++ b/chaos_genius/alerts/event_alerts.py @@ -264,7 +264,7 @@ def send_template_email(self, template, recipient_emails, subject, files, **kwar template = env.get_template(template) test = send_static_alert_email( - recipient_emails, subject, template.render(**kwargs), self.alert_info, files + recipient_emails, subject, template.render(**kwargs), files ) if test == True: diff --git a/chaos_genius/alerts/slack.py b/chaos_genius/alerts/slack.py index 73242c595..7ab74f3e3 100644 --- a/chaos_genius/alerts/slack.py +++ b/chaos_genius/alerts/slack.py @@ -1,32 +1,36 @@ -from datetime import datetime -from typing import List, Optional +"""Utilities for sending slack alert messages.""" +import logging +from typing import Optional, Sequence -from slack_sdk.webhook import WebhookClient +from slack_sdk.webhook.client import WebhookClient -from chaos_genius.alerts.alert_channel_creds import get_creds -from chaos_genius.alerts.constants import ( - ALERT_DATE_FORMAT, - ALERT_DATETIME_FORMAT, - ALERT_READABLE_DATETIME_FORMAT, -) +import chaos_genius.alerts.anomaly_alerts as anomaly_alerts +from chaos_genius.alerts.alert_channel_creds import get_slack_creds from chaos_genius.alerts.utils import webapp_url_prefix +logger = logging.getLogger(__name__) -def get_webhook_client(): - url = get_creds("slack") - try: - return WebhookClient(url) - except Exception as err_msg: - print(err_msg) - return None + +def get_webhook_client() -> WebhookClient: + """Initializes a Slack Webhook client.""" + url = get_slack_creds() + return WebhookClient(url) def anomaly_alert_slack( - kpi_name, alert_name, kpi_id, alert_message, points, overall_count, subdim_count -): + kpi_name: str, + alert_name: str, + kpi_id: int, + alert_message: str, + points: "Sequence[anomaly_alerts.AnomalyPointFormatted]", + overall_count: int, + subdim_count: int, +) -> str: + """Sends an anomaly alert on slack. + + Returns an empty string if successful or the error as a string if not. 
+ """ client = get_webhook_client() - if not client: - raise Exception("Slack not configured properly.") response = client.send( blocks=[ { @@ -56,8 +60,11 @@ def anomaly_alert_slack( "type": "section", "text": { "type": "mrkdwn", - "text": f"- Total alerts generated (Overall KPI): *{overall_count}*\n" - f"- Total alerts generated (including subdimenions): *{subdim_count + overall_count}*\n", + "text": ( + f"- Total alerts generated (Overall KPI): *{overall_count}*\n" + + "- Total alerts generated (including subdimenions): " + + f"*{subdim_count + overall_count}*\n" + ), }, }, { @@ -115,9 +122,9 @@ def anomaly_alert_slack( ) if response.body != "ok": - print(response.body) + return response.body - return response.body + return "" def event_alert_slack(alert_name, alert_frequency, alert_message, alert_overview): @@ -163,7 +170,9 @@ def event_alert_slack(alert_name, alert_frequency, alert_message, alert_overview def _format_slack_anomalies( - top10: List[dict], kpi_name=None, include_kpi_link=True + top10: "Sequence[anomaly_alerts.AnomalyPointFormatted]", + kpi_name: Optional[str] = None, + include_kpi_link: bool = True, ) -> str: out = "" @@ -171,31 +180,37 @@ def _format_slack_anomalies( if include_kpi_link: kpi_name_link = ( - f'<{webapp_url_prefix()}#/dashboard/0/anomaly/{point["kpi_id"]}' - f'|{point["kpi_name"]} (*{point["Dimension"]}*)>' + f"<{webapp_url_prefix()}#/dashboard/0/anomaly/{point.kpi_id}" + + f"|{point.kpi_name} (*{point.series_type}*)>" ) else: - kpi_name_link = f'{kpi_name} ({point["Dimension"]})' + kpi_name_link = f"{kpi_name} ({point.series_type})" - date = point.get("formatted_date") + date = point.formatted_date threshold_message = ( - f'expected: *{point["yhat_lower"]} to {point["yhat_upper"]}*' + f"expected: *{point.yhat_lower_readable} to {point.yhat_upper_readable}*" ) - change_message = point["change_message"] out += ( f"- *{kpi_name_link}* changed to " - f'*{point["y"]}* (*{change_message}*) ' - f'on {date} ({threshold_message}, severity: *{point["severity"]}*)\n' + + f"*{point.y_readable}* (*{point.formatted_change_percent}*) " + + f"on {date} ({threshold_message}, severity: *{point.severity}*)\n" ) return out def alert_digest_slack_formatted( - frequency: str, top10: List[dict], overall_count: int, subdim_count: int -): + frequency: str, + top10: "Sequence[anomaly_alerts.AnomalyPointFormatted]", + overall_count: int, + subdim_count: int, +) -> str: + """Sends an anomaly digest on slack. + + Returns an empty string if successful or the error as a string if not. 
+ """ client = get_webhook_client() if not client: raise Exception("Slack not configured properly.") @@ -225,8 +240,11 @@ def alert_digest_slack_formatted( "type": "section", "text": { "type": "mrkdwn", - "text": f"- Total alerts generated (Overall KPI): *{overall_count}*\n" - f"- Total alerts generated (including subdimenions): *{subdim_count + overall_count}*\n", + "text": ( + f"- Total alerts generated (Overall KPI): *{overall_count}*\n" + + "- Total alerts generated (including subdimenions): " + + f"*{subdim_count + overall_count}*\n" + ), }, }, { @@ -263,43 +281,9 @@ def alert_digest_slack_formatted( ) if response.body != "ok": - print(response.body) - - return response.body - - -def anomaly_alert_slack_formatted(alert_name, kpi_name, data_source_name, table_data): - client = get_webhook_client() - if not client: - raise Exception("Slack not configured properly.") - response = client.send( - text=f"Anomaly Alert: {kpi_name}", - blocks=[ - { - "type": "header", - "text": { - "type": "plain_text", - "text": f"Alert: {alert_name}", - "emoji": True, - }, - }, - { - "type": "section", - "text": { - "type": "mrkdwn", - "text": f"This is the alert generated from KPI *{kpi_name}* and Data Source *{data_source_name}*.", - }, - }, - ], - ) - - subsequent_response = "failed" - if response.body == "ok": - subsequent_response = alert_table_sender(client, table_data) + return response.body - if response.body == "ok" and subsequent_response == "ok": - return "ok" - return subsequent_response + return "" def alert_table_sender(client, table_data): diff --git a/chaos_genius/alerts/static_kpi_alerts.py b/chaos_genius/alerts/static_kpi_alerts.py index 4b8584e92..45724f9f0 100644 --- a/chaos_genius/alerts/static_kpi_alerts.py +++ b/chaos_genius/alerts/static_kpi_alerts.py @@ -1,3 +1,4 @@ +# TODO: is this still needed? class StaticKpiAlertController: def __init__(self, alert_info): self.alert_info = alert_info diff --git a/chaos_genius/alerts/utils.py b/chaos_genius/alerts/utils.py index 5713c47e3..2d56de045 100644 --- a/chaos_genius/alerts/utils.py +++ b/chaos_genius/alerts/utils.py @@ -1,23 +1,38 @@ """Common utilities for alerts and alert digests.""" -import datetime -import heapq -from typing import Dict, List, Tuple, Union - -from chaos_genius.alerts.constants import ( - ALERT_DATETIME_FORMAT, - ALERT_READABLE_DATE_FORMAT, - ALERT_READABLE_DATETIME_FORMAT, - ANOMALY_TABLE_COLUMNS_HOLDING_FLOATS, - OVERALL_KPI_SERIES_TYPE_REPR, -) -from chaos_genius.core.rca.rca_utils.string_helpers import ( - convert_query_string_to_user_string, -) +import os +from math import floor, log10 +from typing import List, Optional, Union + +from jinja2 import Environment, FileSystemLoader, select_autoescape + +from chaos_genius.alerts.email import send_static_alert_email from chaos_genius.core.utils.round import round_number from chaos_genius.settings import CHAOSGENIUS_WEBAPP_URL +class AlertException(Exception): + """A general exception in a specific alert. + + Stores and prints alert ID and KPI ID. + """ + + def __init__(self, message: str, alert_id: int, kpi_id: Optional[int] = None): + """Initialize a new alert exception. + + Args: + message: exception message. + alert_id: ID of alert where this originated from. + kpi_id: ID of KPI associated with the alert. + """ + if kpi_id: + message = f"(KPI: {kpi_id}, Alert: {alert_id}) {message}" + else: + message = f"(Alert: {alert_id}) {message}" + + super().__init__(message) + + def webapp_url_prefix(): """Constructs webapp URL prefix with a trailing slash. 
@@ -32,43 +47,6 @@ def webapp_url_prefix(): return f"{CHAOSGENIUS_WEBAPP_URL}{forward_slash}" -def save_anomaly_point_formatting(points: List[Dict], frequency: str = None): - """Adds formatted fields to each point, to be used in alert templates.""" - for point in points: - dt = datetime.datetime.strptime(point["data_datetime"], ALERT_DATETIME_FORMAT) - - dt_format = ALERT_READABLE_DATETIME_FORMAT - if frequency is not None and frequency == "D": - dt_format = ALERT_READABLE_DATE_FORMAT - - date = dt.strftime(dt_format) - point["formatted_date"] = date - - change_percent = point["percentage_change"] - change_message = change_percent - if isinstance(change_percent, (int, float)): - if change_percent > 0: - change_message = f"+{change_percent}%" - else: - change_message = f"{change_percent}%" - point["change_message"] = change_message - - -def top_anomalies(points: List[Dict], n=10) -> List[Dict]: - """Returns top n anomalies according to severity.""" - return heapq.nlargest(n, points, key=lambda point: point["severity"]) - - -def count_anomalies(points: List[Dict]) -> Tuple[int, int]: - """Returns a count of overall anomalies and subdim anomalies.""" - total = len(points) - overall = sum( - 1 for point in points if point["Dimension"] == OVERALL_KPI_SERIES_TYPE_REPR - ) - subdims = total - overall - return overall, subdims - - def change_message_from_percent(percent_change: Union[str, int, float]) -> str: """Creates a change message from given percentage change. @@ -88,28 +66,70 @@ def change_message_from_percent(percent_change: Union[str, int, float]) -> str: return f"Decreased by ({percent_change}%)" -def format_anomaly_points(points: List[dict]): - for anomaly_point in points: - anomaly_point["series_type"] = ( - OVERALL_KPI_SERIES_TYPE_REPR - if anomaly_point.get("anomaly_type") == "overall" - else anomaly_point["series_type"] - ) - for key, value in anomaly_point.items(): - if key in ANOMALY_TABLE_COLUMNS_HOLDING_FLOATS: - anomaly_point[key] = round(value, 2) - if anomaly_point["series_type"] != OVERALL_KPI_SERIES_TYPE_REPR: - anomaly_point["series_type"] = convert_query_string_to_user_string( - anomaly_point["series_type"] - ) - - def find_percentage_change( - curr_val: Union[int, float], prev_val: Union[int, float] + curr_val: Union[int, float], prev_val: Optional[Union[int, float]] ) -> Union[int, float, str]: """Calculates percentage change between previous and current value.""" - if prev_val == 0: - return "-" - change = curr_val - prev_val - percentage_change = (change / prev_val) * 100 - return round_number(percentage_change) + if prev_val is None: + # previous point wasn't found + return "–" + elif curr_val == 0 and prev_val == curr_val: + # both current and previous value are 0 + return "–" + elif prev_val == 0: + # previous value is 0, but current value isn't + sign_ = "+" if curr_val > 0 else "-" + return sign_ + "inf" + else: + change = curr_val - prev_val + percentage_change = (change / prev_val) * 100 + return round_number(percentage_change) + + +def send_email_using_template( + template_name: str, + recipient_emails: List[str], + subject: str, + files: List[dict], + **kwargs, +) -> None: + """Sends an email using a template.""" + path = os.path.join(os.path.dirname(__file__), "email_templates") + env = Environment( + loader=FileSystemLoader(path), autoescape=select_autoescape(["html", "xml"]) + ) + + template = env.get_template(template_name) + send_static_alert_email(recipient_emails, subject, template.render(**kwargs), files) + + +HRN_PREFIXES = { + -9: "n", + -6: "µ", + 
-3: "m", + 0: "", + 3: "K", + 6: "M", + 9: "B", + 12: "T", +} + + +def _get_exponent(num: float) -> int: + """Returns the power of 10 to which the number is raised to.""" + if num == 0: + return 0 + + return floor(log10(abs(num))) + + +def human_readable(num: float) -> str: + """Returns the human readable format of a number.""" + exponent = _get_exponent(num) + + new_exponent = min((3 * floor(exponent / 3)), 12) + precision = 10 ** (new_exponent) + + new_val = round(num / precision, 3) + human_readable_format = str(new_val) + HRN_PREFIXES[new_exponent] + return human_readable_format diff --git a/chaos_genius/app.py b/chaos_genius/app.py index 1abf2e1a5..993bed9bd 100644 --- a/chaos_genius/app.py +++ b/chaos_genius/app.py @@ -14,6 +14,7 @@ from chaos_genius.utils.utils import DEMO_ENDPOINT_WHITELIST from chaos_genius.views import ( data_source_view, + download_view, kpi_view, public_view, meta_view, @@ -82,6 +83,7 @@ def register_blueprints(app): app.register_blueprint(anomaly_data_view.blueprint, url_prefix='/api/anomaly-data') app.register_blueprint(alert_view.blueprint, url_prefix='/api/alert') app.register_blueprint(dashboard_view.blueprint, url_prefix='/api/dashboard') + app.register_blueprint(download_view.blueprint, url_prefix='/api/downloads') app.register_blueprint(status_view.blueprint, url_prefix='/api/status') app.register_blueprint(meta_view.blueprint, url_prefix='/api/meta') app.register_blueprint(digest_view.blueprint, url_prefix='/api/digest') @@ -159,3 +161,4 @@ def register_commands(app): app.cli.add_command(commands.insert_demo_data) app.cli.add_command(commands.run_anomaly_rca_scheduler) app.cli.add_command(commands.run_digest) + app.cli.add_command(commands.fetch_metadata) diff --git a/chaos_genius/celery_config.py b/chaos_genius/celery_config.py index 644f0a276..bd45798c9 100644 --- a/chaos_genius/celery_config.py +++ b/chaos_genius/celery_config.py @@ -1,8 +1,9 @@ from datetime import timedelta from celery.schedules import crontab, schedule +from chaos_genius.settings import METADATA_SYNC_TIME -CELERY_IMPORTS = ("chaos_genius.jobs") +CELERY_IMPORTS = "chaos_genius.jobs" CELERY_TASK_RESULT_EXPIRES = 30 CELERY_TIMEZONE = "UTC" @@ -10,6 +11,8 @@ CELERY_TASK_SERIALIZER = "json" CELERY_RESULT_SERIALIZER = "json" +METADATA_SYNC_TIME_HRS, METADATA_SYNC_TIME_MINS = METADATA_SYNC_TIME.split(":") + CELERYBEAT_SCHEDULE = { "anomaly-scheduler": { "task": "chaos_genius.jobs.analytics_scheduler.scheduler_wrapper", @@ -31,14 +34,23 @@ "schedule": crontab(minute="0"), # Hourly: at 0th minute "args": ("hourly",), }, + "metadata-prefetch-daily": { + "task": "chaos_genius.jobs.metadata_prefetch.metadata_prefetch_daily_scheduler", + "schedule": crontab( + hour=METADATA_SYNC_TIME_HRS, minute=METADATA_SYNC_TIME_MINS + ), + "args": (), + }, } CELERY_ROUTES = { "chaos_genius.jobs.anomaly_tasks.*": {"queue": "anomaly-rca"}, "chaos_genius.jobs.analytics_scheduler.*": {"queue": "anomaly-rca"}, "chaos_genius.jobs.alert_tasks.*": {"queue": "alerts"}, + "chaos_genius.jobs.metadata_prefetch.*": {"queue": "alerts"}, } + # Scheduler runs every hour # looks at tasks in last n hour # if they are in processing in 24 hours, schedule them right away diff --git a/chaos_genius/commands.py b/chaos_genius/commands.py index 6c37714f0..9243c8163 100644 --- a/chaos_genius/commands.py +++ b/chaos_genius/commands.py @@ -8,7 +8,12 @@ import click from flask.cli import with_appcontext +from chaos_genius.controllers.data_source_controller import get_data_source_list +from 
chaos_genius.controllers.data_source_metadata_controller import ( + run_metadata_prefetch, +) from chaos_genius.settings import AIRBYTE_ENABLED +from chaos_genius.utils.utils import time_my_func HERE = os.path.abspath(os.path.dirname(__file__)) PROJECT_ROOT = os.path.join(HERE, os.pardir) @@ -125,6 +130,36 @@ def run_rca(kpi, end_date): click.echo(f"Completed the RCA for KPI ID: {kpi}.") +def _fetch_metadata(id: int): + if id == 0: + to_run_ids = [data_source.id for data_source in get_data_source_list()] + click.echo(f"Fetching the metadata for all active datasources: {to_run_ids}") + else: + to_run_ids = [id] + + for data_source_id in to_run_ids: + click.echo(f"Fetching the metadata for data source ID: {data_source_id}") + status = run_metadata_prefetch(data_source_id) + click.echo( + f"Completed the metadata fetch for data source ID: {data_source_id} with " + + f"status: {status}." + ) + + +@click.command() +@with_appcontext +@click.option( + "--id", required=True, type=int, help="Fetch the metadata of provided data source." +) +@time_my_func +def fetch_metadata(id): + """Fetch the metadata of the given data source. + + if id is 0 run for all active datasources + """ + _fetch_metadata(id) + + @click.command() @with_appcontext @click.option('--id', required=True, type=int, help="Perform the alert operation for provided Alert ID.") @@ -163,9 +198,9 @@ def run_anomaly_rca_scheduler(): @with_appcontext def reinstall_db(): """Delete the db and reinstall again.""" - from chaos_genius.settings import META_DATABASE - from chaos_genius.extensions import db from chaos_genius.databases.demo_data import install_demo_db + from chaos_genius.extensions import db + from chaos_genius.settings import META_DATABASE if click.confirm(click.style(f"Do you want to delete and reinstall the database: {META_DATABASE}?", fg="red", bold=True)): click.echo('Deleting the database...') db.drop_all() diff --git a/chaos_genius/connectors/__init__.py b/chaos_genius/connectors/__init__.py index 1a8a3b89f..19682fdb2 100644 --- a/chaos_genius/connectors/__init__.py +++ b/chaos_genius/connectors/__init__.py @@ -4,7 +4,7 @@ from chaos_genius.connectors.snowflake import SnowflakeDb from chaos_genius.connectors.redshift import Redshift from chaos_genius.connectors.druid import Druid -from chaos_genius.connectors.connector_utils import merge_dataframe_chunks + DB_CLASS_MAPPER = { "Postgres": PostgresDb, @@ -12,25 +12,31 @@ "BigQuery": BigQueryDb, "Snowflake": SnowflakeDb, "Redshift": Redshift, - "Druid": Druid + "Druid": Druid, } def get_sqla_db_conn(data_source_info=None, connection_config=None): database = None if not (data_source_info or connection_config): - raise Exception("Either provide the data source info or the database connection config") + raise Exception( + "Either provide the data source info or the database connection config" + ) if data_source_info: ds_type = data_source_info["connection_type"] ds_third_party = data_source_info["is_third_party"] if ds_third_party is False: db_class = DB_CLASS_MAPPER[ds_type] - db_connection_info = data_source_info["sourceConfig"]["connectionConfiguration"] + db_connection_info = data_source_info["sourceConfig"][ + "connectionConfiguration" + ] database = db_class(connection_info=db_connection_info) else: # TODO: Make this configurable from the integration constants db_class = DB_CLASS_MAPPER["Postgres"] - db_connection_info = data_source_info["destinationConfig"]["connectionConfiguration"] + db_connection_info = data_source_info["destinationConfig"][ + "connectionConfiguration" + 
] database = db_class(connection_info=db_connection_info) elif connection_config: ds_type = connection_config["connection_type"] @@ -40,19 +46,13 @@ def get_sqla_db_conn(data_source_info=None, connection_config=None): return database -def get_metadata(data_source_info, from_query=False, query=''): +def get_metadata(data_source_info, from_query=False, query=""): db_tables = data_source_info["dbConfig"]["tables"] db_connection = get_sqla_db_conn(data_source_info=data_source_info) db_connection.init_inspector() - metadata = { - "tables": { - "query": { - "table_columns": [] - } - } - } + metadata = {"tables": {"query": {"table_columns": []}}} all_schema = {} - err_msg = '' + err_msg = "" try: if not from_query: all_schema = db_connection.get_schema_metadata(tables=db_tables) @@ -66,33 +66,49 @@ def get_metadata(data_source_info, from_query=False, query=''): metadata = all_schema return metadata, err_msg -def get_table_info(data_source_info, schema, table_name): - db_connection = get_sqla_db_conn(data_source_info=data_source_info) - table_info = {} + +def get_table_info(data_source_info, schema, table_name, from_db_conn=False, db_conn=None): + if from_db_conn and db_conn: + db_connection = db_conn + else: + db_connection = get_sqla_db_conn(data_source_info=data_source_info) if db_connection is None: return None + table_info = {} db_connection.init_inspector() table_info["columns"] = db_connection.get_columns(table_name, schema) table_info["primary_key"] = db_connection.get_primary_key(table_name, schema) return table_info -def get_schema_names(data_source_info): - db_connection = get_sqla_db_conn(data_source_info=data_source_info) + +def get_schema_names(data_source_info, from_db_conn=False, db_conn=None): + if from_db_conn and db_conn: + db_connection = db_conn + else: + db_connection = get_sqla_db_conn(data_source_info=data_source_info) if db_connection is None: return None db_connection.init_inspector() - return db_connection.get_schema_names_list() + schema_list = db_connection.get_schema_names_list() + if schema_list is None: + schema_list = [None] + return schema_list -def get_table_list(data_source_info, schema): - db_connection = get_sqla_db_conn(data_source_info=data_source_info) + +def get_table_list(data_source_info, schema, from_db_conn=False, db_conn=None): + if from_db_conn and db_conn: + db_connection = db_conn + else: + db_connection = get_sqla_db_conn(data_source_info=data_source_info) if db_connection is None: return None db_connection.init_inspector() return db_connection.get_tables(schema) + def get_view_list(data_source_info, schema): db_connection = get_sqla_db_conn(data_source_info=data_source_info) if db_connection is None: @@ -101,6 +117,7 @@ def get_view_list(data_source_info, schema): db_connection.init_inspector() return db_connection.get_view_names_list(schema) + def test_connection(data_source_info): db_connection = get_sqla_db_conn(connection_config=data_source_info) status, message = db_connection.test_connection() diff --git a/chaos_genius/controllers/data_source_controller.py b/chaos_genius/controllers/data_source_controller.py index 7a1caf74e..9da07f3d9 100644 --- a/chaos_genius/controllers/data_source_controller.py +++ b/chaos_genius/controllers/data_source_controller.py @@ -1,15 +1,16 @@ from collections import defaultdict +from typing import List + +from chaos_genius.connectors import test_connection from chaos_genius.databases.models.data_source_model import DataSource from chaos_genius.extensions import integration_connector as connector from 
chaos_genius.settings import AIRBYTE_ENABLED -from chaos_genius.third_party.integration_server_config import ( - SOURCE_WHITELIST_AND_TYPE -) -from chaos_genius.connectors import test_connection +from chaos_genius.third_party.integration_server_config import SOURCE_WHITELIST_AND_TYPE +from chaos_genius.utils.metadata_api_config import NON_THIRD_PARTY_DATASOURCES def get_datasource_data_from_id(n: int, as_obj: bool = False) -> dict: - """Returns the corresponding Data-Source data for the given Data-Source ID + """Returns the corresponding Data-Source data for the given Data-Source ID from DATA_SOURCE_DATA. :param n: ID of Data-Source @@ -45,17 +46,23 @@ def mask_sensitive_info(data_source_type_def: dict, data_source_details: dict) - for prop, value in data_source_details.items(): if not isinstance(value, dict): prop_def_details = source_def_prop.get(prop, {}) - if prop_def_details.get('airbyte_secret', False): + if prop_def_details.get("airbyte_secret", False): value = str(value) masked_dict[prop] = f"{value[:2]}********{value[-2:]}" else: masked_dict[prop] = value else: for inner_prop, inner_value in value.items(): - prop_def_details = source_def_prop.get(prop, {}).get('properties', {}).get(inner_prop, {}) - if prop_def_details.get('airbyte_secret', False): + prop_def_details = ( + source_def_prop.get(prop, {}) + .get("properties", {}) + .get(inner_prop, {}) + ) + if prop_def_details.get("airbyte_secret", False): inner_value = str(inner_value) - masked_dict[prop][inner_prop] = f"{inner_value[:2]}********{inner_value[-2:]}" + masked_dict[prop][ + inner_prop + ] = f"{inner_value[:2]}********{inner_value[-2:]}" else: masked_dict[prop][inner_prop] = inner_value return masked_dict @@ -74,18 +81,24 @@ def test_data_source(payload: dict) -> dict: if is_third_party and not AIRBYTE_ENABLED: return { "message": "Airbyte is not enabled. 
Please enable Airbyte to test the third party connection", - "status": "failed" + "status": "failed", } if is_third_party: connector_client = connector.connection - for _property in ["connection_type", "sourceId", "workspaceId", "name", "sourceName"]: + for _property in [ + "connection_type", + "sourceId", + "workspaceId", + "name", + "sourceName", + ]: payload.pop(_property, None) connection_status = connector_client.test_connection(payload) else: db_status, message = test_connection(payload) connection_status = { "message": message, - "status": "succeeded" if db_status is True else "failed" + "status": "succeeded" if db_status is True else "failed", } return connection_status @@ -104,7 +117,24 @@ def update_third_party(payload): if is_third_party: if AIRBYTE_ENABLED: connector_client = connector.connection - for _property in ["sourceDefinitionId", "workspaceId", "sourceName", "connection_type"]: + for _property in [ + "sourceDefinitionId", + "workspaceId", + "sourceName", + "connection_type", + ]: payload.pop(_property, None) connection_status = connector_client.update_source(payload) return connection_status + + +def get_data_source_list(exclude_third_party=True) -> List[DataSource]: + args = () + if exclude_third_party: + args = (DataSource.connection_type.in_(NON_THIRD_PARTY_DATASOURCES),) + data_sources = ( + DataSource.query.filter(DataSource.active == True, *args) # noqa: E712 + .order_by(DataSource.created_at.desc()) + .all() + ) + return data_sources diff --git a/chaos_genius/controllers/data_source_metadata_controller.py b/chaos_genius/controllers/data_source_metadata_controller.py new file mode 100644 index 000000000..32a718be4 --- /dev/null +++ b/chaos_genius/controllers/data_source_metadata_controller.py @@ -0,0 +1,226 @@ +import json +import logging +from datetime import datetime +from typing import List, cast + +from chaos_genius.connectors import ( + get_schema_names, + get_sqla_db_conn, + get_table_info, + get_table_list, +) +from chaos_genius.controllers.data_source_controller import get_datasource_data_from_id +from chaos_genius.databases.models.data_source_metadata_model import DataSourceMetadata +from chaos_genius.databases.models.data_source_model import DataSource +from chaos_genius.utils.metadata_api_config import NON_THIRD_PARTY_DATASOURCES + +logger = logging.getLogger(__name__) + + +def fetch_schema_list(data_source_id: int, as_obj: bool = False): + """Fetch the schema list from the metadata of the given data source.""" + schema_list = [] + data_source_metadata: DataSourceMetadata = ( + DataSourceMetadata.query.filter( + DataSourceMetadata.data_source_id == data_source_id, + DataSourceMetadata.metadata_type == "schema_list", + ) + .order_by(DataSourceMetadata.created_at.desc()) + .first() + ) + if data_source_metadata: + schema_list: List[str] = data_source_metadata.metadata_info.get("schema_list", []) + + if as_obj: + return data_source_metadata + else: + return schema_list + + +def fetch_table_list(data_source_id: int, schema: str, as_obj: bool=False): + """Fetch the table list from the metadata of the given data source and schema.""" + table_list = [] + data_source_metadata: DataSourceMetadata = ( + DataSourceMetadata.query.filter( + DataSourceMetadata.data_source_id == data_source_id, + DataSourceMetadata.metadata_type == "table_list", + DataSourceMetadata.metadata_param == get_metadata_param_str([schema]), + ) + .order_by(DataSourceMetadata.created_at.desc()) + .first() + ) + if data_source_metadata: + table_list = 
data_source_metadata.metadata_info.get("table_list", []) + + if as_obj: + return data_source_metadata + else: + return table_list + + +def delete_table_list(data_source_id: int, schema: str): + """Delete the table list from the metadata of the given data source and schema.""" + data_source_metadata: DataSourceMetadata = ( + DataSourceMetadata.query.filter( + DataSourceMetadata.data_source_id == data_source_id, + DataSourceMetadata.metadata_type == "table_list", + DataSourceMetadata.metadata_param == get_metadata_param_str([schema]), + ) + .order_by(DataSourceMetadata.created_at.desc()) + .first() + ) + if data_source_metadata: + data_source_metadata.delete(commit=True) + + +def fetch_table_info(data_source_id: int, schema: str, table: str, as_obj: bool=False): + """Fetch the table info from the metadata of the given data source and table.""" + table_info = {} + data_source_metadata: DataSourceMetadata = ( + DataSourceMetadata.query.filter( + DataSourceMetadata.data_source_id == data_source_id, + DataSourceMetadata.metadata_type == "table_info", + DataSourceMetadata.metadata_param + == get_metadata_param_str([schema, table]), + ) + .order_by(DataSourceMetadata.created_at.desc()) + .first() + ) + if data_source_metadata: + table_info = data_source_metadata.metadata_info + + if as_obj: + return data_source_metadata + else: + return table_info + + +def delete_table_info(data_source_id: int, schema: str, table: str): + """Delete the table info from the metadata of the given data source and table""" + data_source_metadata: DataSourceMetadata = ( + DataSourceMetadata.query.filter( + DataSourceMetadata.data_source_id == data_source_id, + DataSourceMetadata.metadata_type == "table_info", + DataSourceMetadata.metadata_param + == get_metadata_param_str([schema, table]), + ) + .order_by(DataSourceMetadata.created_at.desc()) + .first() + ) + if data_source_metadata: + data_source_metadata.delete(commit=True) + + +def run_metadata_prefetch(data_source_id: int): + """Fetch the metadata of the given data source.""" + + data_source_obj = cast(DataSource, get_datasource_data_from_id(data_source_id, as_obj=True)) + sync_error = False + + if data_source_obj.connection_type not in NON_THIRD_PARTY_DATASOURCES: + logger.warning( + f"Datasource with id: {data_source_id} is a third-party datasource" + ) + return False + + if data_source_obj.sync_status == "In Progress": + logger.warning( + f"Datasource with id: {data_source_id} already in Progress, skipping.." 
+ ) + return True + + try: + data_source_obj.sync_status = "In Progress" + data_source_obj.update(commit=True) + + db_connection = get_sqla_db_conn(data_source_obj.as_dict) + + schema_list, old_schemas_list = scan_db_and_save_schema_list( + data_source_id, db_connection + ) + for schema in schema_list: + table_list, old_tables_list = scan_db_and_save_table_list( + data_source_id, db_connection, schema + ) + for table in table_list: + _ = scan_db_and_save_table_info( + data_source_id, db_connection, schema, table + ) + + table_to_delete = list(set(old_tables_list) - set(table_list)) + for table in table_to_delete: + delete_table_info(data_source_id, schema, table) + + schema_to_delete = list(set(old_schemas_list) - set(schema_list)) + for schema in schema_to_delete: + delete_table_list(data_source_id, schema) + + except Exception as err: + sync_error = True + logger.error("Error in metadata prefetch.", exc_info=err) + + data_source_obj = cast(DataSource, get_datasource_data_from_id(data_source_id, as_obj=True)) + data_source_obj.sync_status = "Completed" if not sync_error else "Error" + data_source_obj.last_sync = datetime.now() + data_source_obj.update(commit=True) + + return True if not sync_error else False + + +def scan_db_and_save_schema_list(data_source_id, db_connection): + """Scan the database for schema list.""" + schema_list = get_schema_names({}, from_db_conn=True, db_conn=db_connection) + old_schemas = fetch_schema_list(data_source_id, as_obj=True) + data_source_metadata = DataSourceMetadata( + data_source_id=data_source_id, + metadata_type="schema_list", + metadata_param=get_metadata_param_str(), + metadata_info={"schema_list": schema_list}, + ) + data_source_metadata.save(commit=True) + old_schemas_list = [] + if old_schemas: + old_schemas_list: List[str] = old_schemas.metadata_info.get("schema_list", []) + old_schemas.delete(commit=True) + return schema_list, old_schemas_list + + +def scan_db_and_save_table_list(data_source_id, db_connection, schema): + """Scan the database for table list.""" + table_list = get_table_list({}, schema, from_db_conn=True, db_conn=db_connection) + old_tables = fetch_table_list(data_source_id, schema, as_obj=True) + data_source_metadata = DataSourceMetadata( + data_source_id=data_source_id, + metadata_type="table_list", + metadata_param=get_metadata_param_str([schema]), + metadata_info={"table_list": table_list}, + ) + data_source_metadata.save(commit=True) + old_tables_list = [] + if old_tables: + old_tables_list = old_tables.metadata_info.get("table_list", []) + old_tables.delete(commit=True) + return table_list, old_tables_list + + +def scan_db_and_save_table_info(data_source_id, db_connection, schema, table): + """Scan the database for table info.""" + table_info = get_table_info( + {}, schema, table, from_db_conn=True, db_conn=db_connection + ) + old_table_info = fetch_table_info(data_source_id, schema, table, as_obj=True) + data_source_metadata = DataSourceMetadata( + data_source_id=data_source_id, + metadata_type="table_info", + metadata_param=get_metadata_param_str([schema, table]), + metadata_info=table_info, + ) + data_source_metadata.save(commit=True) + if old_table_info: + old_table_info.delete(commit=True) + return table_info + + +def get_metadata_param_str(list_of_params=[]): + """Get the metadata param string.""" + return json.dumps(list_of_params) diff --git a/chaos_genius/controllers/digest_controller.py b/chaos_genius/controllers/digest_controller.py index 62f0ba88e..c925eb949 100644 --- 
a/chaos_genius/controllers/digest_controller.py +++ b/chaos_genius/controllers/digest_controller.py @@ -1,151 +1,165 @@ import datetime +import logging from collections import defaultdict -from typing import DefaultDict, List - -from chaos_genius.alerts.utils import change_message_from_percent -from chaos_genius.alerts.constants import ( - ALERT_DATE_FORMAT, - ALERT_DATETIME_FORMAT, - DIGEST_DATETIME_FORMAT, - OVERALL_KPI_SERIES_TYPE_REPR +from typing import DefaultDict, Dict, List, Optional, Sequence, Tuple + +from chaos_genius.alerts.anomaly_alerts import AnomalyPoint, AnomalyPointFormatted +from chaos_genius.alerts.constants import ( + ALERT_DATE_FORMAT, + ALERT_READABLE_DATA_TIMESTAMP_FORMAT, + OVERALL_KPI_SERIES_TYPE_REPR, ) from chaos_genius.databases.models.alert_model import Alert from chaos_genius.databases.models.kpi_model import Kpi from chaos_genius.databases.models.triggered_alerts_model import TriggeredAlerts -from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput -from typing import Iterator - - -def structure_anomaly_data_for_digests(anomaly_data): - - data = dict() - for point in anomaly_data: - dt_obj = datetime.datetime.strptime(point["data_datetime"], ALERT_DATETIME_FORMAT) - if dt_obj.hour not in data.keys(): - data[dt_obj.hour] = [] - data[dt_obj.hour].append(point) - segregated_data = list(data.items()) - segregated_data.sort(key=lambda arr: arr[0], reverse=True) +logger = logging.getLogger(__name__) - anomaly_data_formatted = [] - for _, arr in segregated_data: - arr.sort(key=lambda point: point["severity"], reverse=True) - anomaly_data_formatted.extend(arr) - return anomaly_data_formatted - -def get_alert_kpi_configurations(data): - - alert_config_cache = dict() - alert_conf_ids = list(set([alert.alert_conf_id for alert in data])) +def get_alert_kpi_configurations(triggered_alerts: Sequence[TriggeredAlerts]): + """Gets all alert and KPI configs for given triggered alerts.""" + alert_conf_ids = list(set([alert.alert_conf_id for alert in triggered_alerts])) alert_confs = Alert.query.filter(Alert.id.in_(alert_conf_ids)).all() - alert_config_cache = {alert.id: alert.as_dict for alert in alert_confs} + alert_config_cache: Dict[int, Alert] = {alert.id: alert for alert in alert_confs} - kpi_cache = dict() kpi_ids = list( set( [ alert.alert_metadata.get("kpi") - for alert in data + for alert in triggered_alerts if alert.alert_metadata.get("kpi") is not None ] ) ) kpis = Kpi.query.filter(Kpi.id.in_(kpi_ids)).all() - kpi_cache = {kpi.id: kpi.as_dict for kpi in kpis} + kpi_cache: Dict[int, Kpi] = {kpi.id: kpi for kpi in kpis} return alert_config_cache, kpi_cache -def triggered_alert_data_processing(data): - - alert_config_cache, kpi_cache = get_alert_kpi_configurations(data) +def preprocess_triggered_alert( + triggered_alert: TriggeredAlerts, + alert_config_cache: Dict[int, Alert], + kpi_cache: Dict[int, Kpi], +) -> TriggeredAlerts: + """Preprocess a triggered alert for use in digests and alerts dashboard.""" + alert_conf_id = triggered_alert.alert_conf_id + alert_conf = alert_config_cache[alert_conf_id] + + kpi_id = alert_conf.kpi + kpi = kpi_cache.get(kpi_id) + + # TODO: make a dataclass for this + triggered_alert.kpi_id = kpi_id + triggered_alert.kpi_name = kpi.name if kpi is not None else "Doesn't Exist" + triggered_alert.alert_name = alert_conf.alert_name + triggered_alert.alert_channel = alert_conf.alert_channel + triggered_alert.alert_channel_conf = alert_conf.alert_channel_conf + triggered_alert.alert_message = alert_conf.alert_message + + if 
not isinstance(alert_conf.alert_channel_conf, dict): + triggered_alert.alert_channel_conf = None + else: + # in case of email, this makes triggered_alert.alert_channel_conf the list of + # emails + triggered_alert.alert_channel_conf = getattr( + alert_conf, "alert_channel_conf", {} + ).get(triggered_alert.alert_channel) + + return triggered_alert + + +def extract_anomaly_points_from_triggered_alerts( + triggered_alerts: List[TriggeredAlerts], kpi_cache: Dict[int, Kpi] +) -> List[AnomalyPointFormatted]: + """Extracts all anomaly points from given (anomaly/KPI) triggered alerts. + + Arguments: + triggered_alerts: the sequence of triggered alerts to extract points from. Must + be anomaly alerts. + kpi_cache: obtained from `get_alert_kpi_configurations` + """ + anomaly_points: List[AnomalyPointFormatted] = [] + for triggered_alert in triggered_alerts: + for point in triggered_alert.alert_metadata["alert_data"]: - for alert in data: - alert_conf_id = getattr(alert, "alert_conf_id") - alert_conf = alert_config_cache.get(alert_conf_id, None) + try: + point = AnomalyPointFormatted.from_point( + AnomalyPoint.parse_obj(point), + time_series_frequency=getattr( + kpi_cache.get(triggered_alert.kpi_id), "anomaly_params", {} + ).get("frequency"), + kpi_id=triggered_alert.kpi_id, + kpi_name=triggered_alert.kpi_name, + alert_id=triggered_alert.alert_conf_id, + alert_name=triggered_alert.alert_name, + alert_channel=triggered_alert.alert_channel, + alert_channel_conf=triggered_alert.alert_channel_conf, + ) - kpi_id = alert_conf.get("kpi", None) - kpi = kpi_cache.get(kpi_id) if kpi_id is not None else None + anomaly_points.append(point) + except OverflowError as e: + logger.error( + "Error in extracting an anomaly point from triggered alert", + exc_info=e, + ) - alert.kpi_name = kpi.get("name") if kpi is not None else "Doesn't Exist" - alert.kpi_id = kpi_id - alert.alert_name = alert_conf.get("alert_name") - alert.alert_channel = alert_conf.get("alert_channel") - alert.alert_message = alert_conf.get("alert_message") + return anomaly_points - if not isinstance(alert_conf.get("alert_channel_conf"), dict): - alert.alert_channel_conf = None - else: - alert.alert_channel_conf = alert_conf.get("alert_channel_conf").get( - alert.alert_channel, None - ) - return data +def _preprocess_triggered_alerts( + triggered_alerts: Sequence[TriggeredAlerts], + alert_config_cache: Dict[int, Alert], + kpi_cache: Dict[int, Kpi], +) -> List[TriggeredAlerts]: + """Preprocess triggered alerts for use in the Alert Dashboard.""" + return [ + preprocess_triggered_alert(ta, alert_config_cache, kpi_cache) + for ta in triggered_alerts + ] def _filter_anomaly_alerts( - anomaly_alerts_data: List[TriggeredAlerts], include_subdims: bool = False -): + anomaly_points: Sequence[AnomalyPointFormatted], include_subdims: bool = False +) -> List[AnomalyPointFormatted]: if not include_subdims: - for alert in anomaly_alerts_data: - alert.alert_metadata["alert_data"] = list( - filter( - lambda point: point["Dimension"] == OVERALL_KPI_SERIES_TYPE_REPR, - alert.alert_metadata["alert_data"], - ) - ) + return [ + point + for point in anomaly_points + if point.series_type == OVERALL_KPI_SERIES_TYPE_REPR + ] else: - for alert in anomaly_alerts_data: - anomaly_data = [] - counts: DefaultDict[str, int] = defaultdict(lambda: 0) - max_subdims = 20 - - for point in alert.alert_metadata["alert_data"]: - if point["Dimension"] != OVERALL_KPI_SERIES_TYPE_REPR: - counts[point["data_datetime"]] += 1 - if counts[point["data_datetime"]] > max_subdims: - continue + counts: 
DefaultDict[Tuple[int, datetime.datetime], int] = defaultdict(lambda: 0) + filtered_points: List[AnomalyPointFormatted] = [] + max_subdims = 20 - anomaly_data.append(point) + for point in anomaly_points: - alert.alert_metadata["alert_data"] = anomaly_data + if point.series_type != OVERALL_KPI_SERIES_TYPE_REPR: + counts[(point.alert_id, point.data_datetime)] += 1 + if counts[(point.alert_id, point.data_datetime)] > max_subdims: + continue + filtered_points.append(point) -def _add_nl_messages_anomaly_alerts(anomaly_alerts_data): - for triggered_alert in anomaly_alerts_data: - for point in triggered_alert.alert_metadata["alert_data"]: - percentage_change = point.get("percentage_change", None) - if percentage_change is None: - point["nl_message"] = "Not available for older triggered alerts." - else: - point["nl_message"] = change_message_from_percent(percentage_change) - - -def _preprocess_anomaly_alerts(anomaly_alerts_data: list): - for triggered_alert in anomaly_alerts_data: - for point in triggered_alert.alert_metadata["alert_data"]: - exact_time = point.get("data_datetime") - - if exact_time is None: - point["date_only"] = "Older Alerts" - else: - exact_time = datetime.datetime.strptime(exact_time, ALERT_DATETIME_FORMAT) - point["date_only"] = exact_time.strftime(ALERT_DATE_FORMAT) - - _add_nl_messages_anomaly_alerts(anomaly_alerts_data) + return filtered_points def _preprocess_event_alerts(event_alerts_data: list): for triggered_alert in event_alerts_data: - new_time = triggered_alert.created_at.strftime(DIGEST_DATETIME_FORMAT) - triggered_alert.date_only = triggered_alert.created_at.strftime(ALERT_DATE_FORMAT) + new_time = triggered_alert.created_at.strftime( + ALERT_READABLE_DATA_TIMESTAMP_FORMAT + ) + triggered_alert.date_only = triggered_alert.created_at.strftime( + ALERT_DATE_FORMAT + ) triggered_alert.created_at = new_time -def get_digest_view_data(triggered_alert_id=None, include_subdims: bool = False): - +def get_digest_view_data( + triggered_alert_id: Optional[int] = None, include_subdims: bool = False +): + """Collects triggered alerts data for alerts dashboard.""" curr_time = datetime.datetime.now() time_diff = datetime.timedelta(days=7) @@ -153,32 +167,26 @@ def get_digest_view_data(triggered_alert_id=None, include_subdims: bool = False) if triggered_alert_id is not None: filters.append(TriggeredAlerts.id == triggered_alert_id) - data = ( + triggered_alerts: Sequence[TriggeredAlerts] = ( TriggeredAlerts.query.filter(*filters) .order_by(TriggeredAlerts.created_at.desc()) .all() ) - data = triggered_alert_data_processing(data) - anomaly_alerts_data = [alert for alert in data if alert.alert_type == "KPI Alert"] - _filter_anomaly_alerts(anomaly_alerts_data, include_subdims) - _preprocess_anomaly_alerts(anomaly_alerts_data) - event_alerts_data = [alert for alert in data if alert.alert_type == "Event Alert"] + alert_config_cache, kpi_cache = get_alert_kpi_configurations(triggered_alerts) + + triggered_alerts = _preprocess_triggered_alerts( + triggered_alerts, alert_config_cache, kpi_cache + ) + + anomaly_alerts = extract_anomaly_points_from_triggered_alerts( + [alert for alert in triggered_alerts if alert.alert_type == "KPI Alert"], + kpi_cache, + ) + anomaly_alerts = _filter_anomaly_alerts(anomaly_alerts, include_subdims) + event_alerts_data = [ + alert for alert in triggered_alerts if alert.alert_type == "Event Alert" + ] _preprocess_event_alerts(event_alerts_data) - return anomaly_alerts_data, event_alerts_data - - -def get_previous_data( - kpi_id: int, - point_timestamp: 
datetime.datetime, - time_diff: datetime.timedelta -) -> Iterator[AnomalyDataOutput]: - """Queries anomaly data in range [ts - time_diff, ts).""" - prev_day_data = AnomalyDataOutput.query.filter( - AnomalyDataOutput.kpi_id == kpi_id, - AnomalyDataOutput.anomaly_type.in_(["overall", "subdim"]), - AnomalyDataOutput.data_datetime < point_timestamp, - AnomalyDataOutput.data_datetime >= (point_timestamp - time_diff), - ).all() - return prev_day_data + return anomaly_alerts, event_alerts_data diff --git a/chaos_genius/controllers/kpi_controller.py b/chaos_genius/controllers/kpi_controller.py index bee13e368..88796742a 100644 --- a/chaos_genius/controllers/kpi_controller.py +++ b/chaos_genius/controllers/kpi_controller.py @@ -1,29 +1,27 @@ import logging -import typing from datetime import date, datetime, timedelta -from typing import Optional, Union, Iterator +from typing import Iterator, List, Optional -from flask import current_app # noqa: F401 +from sqlalchemy import delete from chaos_genius.controllers.task_monitor import checkpoint_failure, checkpoint_success from chaos_genius.core.anomaly.controller import AnomalyDetectionController +from chaos_genius.core.rca.constants import TIME_RANGES_BY_KEY from chaos_genius.core.rca.rca_controller import RootCauseAnalysisController from chaos_genius.core.utils.data_loader import DataLoader +from chaos_genius.core.utils.round import round_number +from chaos_genius.databases.models.anomaly_data_model import AnomalyDataOutput from chaos_genius.databases.models.kpi_model import Kpi - -from chaos_genius.settings import ( - MAX_DEEPDRILLS_SLACK_DAYS, - DAYS_OFFSET_FOR_ANALTYICS, -) - +from chaos_genius.databases.models.rca_data_model import RcaData +from chaos_genius.extensions import db +from chaos_genius.settings import DAYS_OFFSET_FOR_ANALTYICS, MAX_DEEPDRILLS_SLACK_DAYS logger = logging.getLogger(__name__) -def _is_data_present_for_end_date( - kpi_info: dict, - end_date: date = None -) -> bool: +def _is_data_present_for_end_date(kpi_info: dict, end_date: date = None) -> bool: + if end_date is None: + end_date = datetime.now().date() df_count = DataLoader(kpi_info, end_date=end_date, days_before=0).get_count() return df_count != 0 @@ -49,46 +47,33 @@ def get_kpi_data_from_id(n: int) -> dict: def run_anomaly_for_kpi( - kpi_id: int, - end_date: date = None, - task_id: Optional[int] = None -) -> Union["typing.Literal[False]", date]: - - try: - logger.info(f"Starting Anomaly Detection for KPI ID: {kpi_id}.") - kpi_info = get_kpi_data_from_id(kpi_id) - logger.info("Retrieved KPI information.") + kpi_id: int, end_date: Optional[date] = None, task_id: Optional[int] = None +): - logger.info("Selecting end date.") + logger.info(f"Starting Anomaly Detection for KPI ID: {kpi_id}.") + kpi_info = get_kpi_data_from_id(kpi_id) + logger.info(f"(KPI ID: {kpi_id}) Retrieved KPI information.") - if end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "D": - # by default we always calculate for n-days_offset_for_analytics - end_date = datetime.today().date() - timedelta(days=(DAYS_OFFSET_FOR_ANALTYICS)) - # Check if data is available or not then try for n-days_offset_for_analytics-1 - if not _is_data_present_for_end_date(kpi_info, end_date): - end_date = end_date - timedelta(days=1) - logger.info("Decreasing end date by 1.") + logger.info(f"(KPI ID: {kpi_id}) Selecting end date.") - elif end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "H": - end_date = datetime.today().date() - - logger.info(f"End date is {end_date}.") + if
end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "D": + # by default we always calculate for n-days_offset_for_analytics + end_date = datetime.today().date() - timedelta(days=(DAYS_OFFSET_FOR_ANALTYICS)) + # Check if data is available or not then try for n-days_offset_for_analytics-1 + if not _is_data_present_for_end_date(kpi_info, end_date): + end_date = end_date - timedelta(days=1) + logger.info(f"(KPI ID: {kpi_id}) Decreasing end date by 1.") - adc = AnomalyDetectionController(kpi_info, end_date, task_id=task_id) - adc.detect() - logger.info(f"Anomaly Detection has completed for KPI ID: {kpi_id}.") - - if kpi_info["scheduler_params"]["scheduler_frequency"] == "H": - end_date = adc.end_date - logger.info(f"End date for hourly alerts is {end_date}.") + elif ( + end_date is None and kpi_info["scheduler_params"]["scheduler_frequency"] == "H" + ): + end_date = datetime.today().date() - except Exception: # noqa: B902 - logger.error( - f"Anomaly Detection encountered an error for KPI ID: {kpi_id}", exc_info=True - ) - return False + logger.info(f"(KPI ID: {kpi_id}) End date is {end_date}.") - return end_date + adc = AnomalyDetectionController(kpi_info, end_date, task_id=task_id) + adc.detect() + logger.info(f"Anomaly Detection has completed for KPI ID: {kpi_id}.") def _get_end_date_for_rca_kpi(kpi_info: dict, end_date: date = None) -> date: @@ -102,12 +87,16 @@ def _get_end_date_for_rca_kpi(kpi_info: dict, end_date: date = None) -> date: end_date = end_date - timedelta(days=1) count += 1 if count > MAX_DEEPDRILLS_SLACK_DAYS: - raise ValueError(f"KPI has no data for the last {MAX_DEEPDRILLS_SLACK_DAYS} days.") + raise ValueError( + f"KPI has no data for the last {MAX_DEEPDRILLS_SLACK_DAYS} days." + ) return end_date -def run_rca_for_kpi(kpi_id: int, end_date: date = None, task_id: Optional[int] = None) -> bool: +def run_rca_for_kpi( + kpi_id: int, end_date: date = None, task_id: Optional[int] = None +) -> bool: try: logger.info(f"Starting RCA for KPI ID: {kpi_id}.") kpi_info = get_kpi_data_from_id(kpi_id) @@ -131,11 +120,13 @@ def run_rca_for_kpi(kpi_id: int, end_date: date = None, task_id: Optional[int] = ) logger.error( f"(Task: {task_id}, KPI: {kpi_id}) DeepDrills - Data Loader and Validation - Exception occured.", - exc_info=e + exc_info=e, ) return False - rca_controller = RootCauseAnalysisController(kpi_info, end_date, task_id=task_id) + rca_controller = RootCauseAnalysisController( + kpi_info, end_date, task_id=task_id + ) rca_controller.compute() logger.info(f"Completed RCA for KPI ID: {kpi_id}.") @@ -167,6 +158,102 @@ def get_active_kpis() -> Iterator[Kpi]: return kpis -# def delete_data(kpi, query): -# db.session.execute(query) -# db.session.commit() \ No newline at end of file +def get_anomaly_data( + kpi_ids: List[int], + anomaly_types: List[str] = None, + anomalies_only: bool = False, + start_timestamp: datetime = None, + include_start_timestamp: bool = True, + end_timestamp: datetime = None, + include_end_timestamp: bool = True, + severity_cutoff: float = None, +) -> List[AnomalyDataOutput]: + """Returns list of anomaly points using parameters to filter the output.""" + filters = [] + if kpi_ids: + filters.append(AnomalyDataOutput.kpi_id.in_(kpi_ids)) + + if anomaly_types: + # TODO: Add the series type filter for query optimisation + filters.append(AnomalyDataOutput.anomaly_type.in_(anomaly_types)) + + if anomalies_only: + filters.append(AnomalyDataOutput.is_anomaly.in_([1, -1])) + + if start_timestamp: + if include_start_timestamp: +
filters.append(AnomalyDataOutput.data_datetime >= start_timestamp) + else: + filters.append(AnomalyDataOutput.data_datetime > start_timestamp) + + if end_timestamp: + if include_end_timestamp: + filters.append(AnomalyDataOutput.data_datetime <= end_timestamp) + else: + filters.append(AnomalyDataOutput.data_datetime < end_timestamp) + + if severity_cutoff: + filters.append(AnomalyDataOutput.severity >= severity_cutoff) + + anomaly_data = AnomalyDataOutput.query.filter(*filters).all() + + return anomaly_data + + +def get_last_anomaly_timestamp( + kpi_ids: List[int], + anomaly_types: List[str] = ["overall", "subdim"], +) -> Optional[datetime]: + """Returns the timestamp of the latest anomaly data.""" + result = ( + AnomalyDataOutput.query.filter( + (AnomalyDataOutput.kpi_id.in_(kpi_ids)) + & (AnomalyDataOutput.anomaly_type.in_(anomaly_types)) + ) + .order_by(AnomalyDataOutput.data_datetime.desc()) + .first() + ) + + if result: + return result.data_datetime + + +def get_active_kpi_from_id(kpi_id: int) -> Optional[Kpi]: + """Returns a kpi obj for an active Kpi using the input kpi id.""" + kpi_obj = Kpi.query.filter( + Kpi.active == True, Kpi.id == kpi_id # noqa: E712 + ).first() + + return kpi_obj + + +def delete_rca_output_for_kpi(kpi_id: int): + """Delete RCA output for a prticular KPI.""" + delete_kpi_query = delete(RcaData).where(RcaData.kpi_id == kpi_id) + db.session.execute(delete_kpi_query) + db.session.commit() + + +def delete_anomaly_output_for_kpi(kpi_id: int): + """Delete Anomaly output for a particular KPI.""" + delete_kpi_query = delete(AnomalyDataOutput).where( + AnomalyDataOutput.kpi_id == kpi_id + ) + db.session.execute(delete_kpi_query) + db.session.commit() + + +def get_anomaly_count(kpi_id, timeline): + + curr_date = datetime.now().date() + (_, _), (sd, _) = TIME_RANGES_BY_KEY[timeline]["function"](curr_date) + + # TODO: Add the series type filter + anomaly_data = AnomalyDataOutput.query.filter( + AnomalyDataOutput.kpi_id == kpi_id, + AnomalyDataOutput.anomaly_type == "overall", + AnomalyDataOutput.is_anomaly == 1, + AnomalyDataOutput.data_datetime >= sd, + ).all() + + return len(anomaly_data) diff --git a/chaos_genius/core/anomaly/controller.py b/chaos_genius/core/anomaly/controller.py index 8f6cc556c..37cf855b4 100644 --- a/chaos_genius/core/anomaly/controller.py +++ b/chaos_genius/core/anomaly/controller.py @@ -104,14 +104,19 @@ def _load_anomaly_data(self) -> pd.DataFrame: if self.kpi_info["anomaly_params"]["frequency"] == "H": period /= 24 - start_date = last_date - timedelta(days=period) if last_date else None - - return DataLoader( - self.kpi_info, - end_date=self.end_date, - start_date=start_date, - days_before=period, - ).get_data() + if last_date: + start_date = last_date - timedelta(days=period) + return DataLoader( + self.kpi_info, + end_date=self.end_date, + start_date=start_date, + ).get_data() + else: + return DataLoader( + self.kpi_info, + end_date=self.end_date, + days_before=period, + ).get_data() def _get_last_date_in_db(self, series: str, subgroup: str = None) -> datetime: """Return the last date for which we have data for the given series. 
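For context: a minimal usage sketch of the new kpi_controller.py helpers above (get_anomaly_data, get_last_anomaly_timestamp). This sketch is not part of the patch; the helpers issue Flask-SQLAlchemy queries, so a running application context is assumed, and the factory import path chaos_genius.app.create_app plus the KPI id 1 are assumptions for illustration only.

from datetime import timedelta

from chaos_genius.app import create_app  # assumed factory location
from chaos_genius.controllers.kpi_controller import (
    get_anomaly_data,
    get_last_anomaly_timestamp,
)

app = create_app()
with app.app_context():
    # Timestamp of the most recent anomaly point (overall or subdim) for KPI 1.
    last_ts = get_last_anomaly_timestamp([1])

    if last_ts is not None:
        # Overall-series anomalies from the trailing 7 days, severity >= 50.
        points = get_anomaly_data(
            [1],
            anomaly_types=["overall"],
            anomalies_only=True,
            start_timestamp=last_ts - timedelta(days=7),
            end_timestamp=last_ts,
            severity_cutoff=50,
        )
        print(f"{len(points)} anomalous points up to {last_ts}")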
diff --git a/chaos_genius/core/rca/rca_utils/api_utils.py b/chaos_genius/core/rca/rca_utils/api_utils.py index 6a17540dc..45cbcfa3f 100644 --- a/chaos_genius/core/rca/rca_utils/api_utils.py +++ b/chaos_genius/core/rca/rca_utils/api_utils.py @@ -3,6 +3,7 @@ from datetime import date, datetime from typing import List +from chaos_genius.extensions import db from chaos_genius.controllers.kpi_controller import get_kpi_data_from_id from chaos_genius.core.rca.constants import TIME_RANGES_BY_KEY from chaos_genius.databases.models.rca_data_model import RcaData @@ -12,6 +13,7 @@ get_lastscan_string_with_tz, get_rca_date_from_string, ) +from sqlalchemy import func, and_ logger = logging.getLogger(__name__) @@ -93,7 +95,7 @@ def kpi_aggregation(kpi_id, timeline="last_30_days"): return status, message, final_data -def kpi_line_data(kpi_id): +def kpi_line_data(kpi_id, download=False): """Get KPI line data.""" final_data = [] status = "success" @@ -116,10 +118,15 @@ def kpi_line_data(kpi_id): raise ValueError("No data found.") final_data = data_point.data - for row in final_data: - row["date"] = convert_datetime_to_timestamp( - get_rca_date_from_string(row["date"]) - ) + if not download: + for row in final_data: + row["date"] = convert_datetime_to_timestamp( + get_rca_date_from_string(row["date"]) + ) + else: + for row in final_data: + row["date"] = get_rca_date_from_string(row["date"]) + except Exception as err: # noqa: B902 logger.error(f"Error in KPI Line data retrieval: {err}", exc_info=1) status = "error" @@ -209,6 +216,62 @@ def rca_hierarchical_data(kpi_id, timeline="last_30_days", dimension=None): return status, message, final_data +def rca_hierarchical_data_all_dims(kpi_id, timeline="last_30_days"): + """Get RCA hierarchical data for all dimensions.""" + final_data_list = {} + status = "success" + message = "" + try: + kpi_info = get_kpi_data_from_id(kpi_id) + end_date = get_rca_output_end_date(kpi_info) + + subq = ( + db.session.query( + RcaData.dimension, + func.max(RcaData.created_at).label("latest_created_at"), + ) + .filter(RcaData.kpi_id == kpi_id) + .group_by(RcaData.dimension) + .subquery() + ) + + data_points = ( + db.session.query(RcaData) + .filter( + (RcaData.kpi_id == kpi_id) + & (RcaData.data_type == "htable") + & (RcaData.timeline == timeline) + & (RcaData.end_date <= end_date) + ) + .join( + subq, + and_( + RcaData.dimension == subq.c.dimension, + RcaData.created_at == subq.c.latest_created_at, + ), + ) + .all() + ) + + final_data_list = [] + if data_points: + for data_point in data_points: + final_data = data_point.data + final_data["analysis_date"] = get_datetime_string_with_tz( + get_analysis_date(kpi_id, end_date) + ) + final_data["dimension"] = data_point.dimension + final_data_list.append(final_data) + else: + raise ValueError("No data found.") + except Exception as err: # noqa: B902 + logger.error(f"Error in RCA hierarchical table retrieval: {err}", exc_info=1) + status = "error" + message = str(err) + final_data_list = [] + return status, message, final_data_list + + def get_rca_output_end_date(kpi_info: dict) -> date: """Get RCA end date.""" end_date = None diff --git a/chaos_genius/core/utils/constants.py b/chaos_genius/core/utils/constants.py index 754f4e84d..d73a61a99 100644 --- a/chaos_genius/core/utils/constants.py +++ b/chaos_genius/core/utils/constants.py @@ -1,8 +1,6 @@ """Provides constants for Chaos Genius.""" SUPPORTED_TIMEZONES = { - "GMT": "GMT+00:00", - "UTC": "GMT+00:00", "ECT": "GMT+01:00", "EET": "GMT+02:00", "ART": "GMT+02:00", diff --git 
a/chaos_genius/core/utils/data_loader.py b/chaos_genius/core/utils/data_loader.py index 59279cec3..115ac8bdd 100644 --- a/chaos_genius/core/utils/data_loader.py +++ b/chaos_genius/core/utils/data_loader.py @@ -1,18 +1,19 @@ """Provides utilties for loading data from KPIs.""" -from datetime import date, datetime, timedelta +import contextlib import logging -import random -import string -import pytz +from datetime import date, datetime, timedelta +from typing import List, Optional import pandas as pd +from pandas.api.types import is_datetime64_any_dtype as is_datetime +import pytz + from chaos_genius.connectors import get_sqla_db_conn -from chaos_genius.databases.models.data_source_model import DataSource -from chaos_genius.settings import TIMEZONE from chaos_genius.core.utils.constants import SUPPORTED_TIMEZONES from chaos_genius.core.utils.utils import randomword - +from chaos_genius.databases.models.data_source_model import DataSource +from chaos_genius.settings import TIMEZONE _SQL_IDENTIFIERS = { "MySQL": "`", @@ -23,20 +24,26 @@ class DataLoader: + """Data Loader Class.""" + def __init__( self, kpi_info: dict, - end_date: date = None, - start_date: date = None, - days_before: int = None, - tail: int = None, + end_date: Optional[date] = None, + start_date: Optional[date] = None, + days_before: Optional[int] = None, + tail: Optional[int] = None, validation: bool = False, ): """Initialize Data Loader for KPI. - If end_date is none, end_date is set to current datetime. - If start_date is none, days_before is used to determine start_date. - You must specify either start_date or days_before. + Accepted combinations of end_date, start_date and days_before: + - none + - end_date + - start_date + - end_date, start_date + - end_date, days_before + - start_date, days_before :param kpi_info: kpi info to load data for :type kpi_info: dict @@ -50,24 +57,37 @@ def __init__( :type tail: int, optional :param validation: if validation is True, we do not perform preprocessing :type validation: bool, optional - :raises ValueError: Raises error if both start_date and days_before are - not specified + :raises ValueError: Raises error if start_date, end_date and days_before not in accepted combinations """ self.kpi_info = kpi_info self.tail = tail self.validation = validation - if end_date is None: - end_date = datetime.today().date() + self.end_date = end_date + self.start_date = start_date + self.days_before = days_before - if start_date is None and days_before is not None: - start_date = end_date - timedelta(days=days_before) + if self.end_date is None and self.start_date is None and self.days_before is not None: + raise ValueError( + "If days_before is specified, either start_date or end_date must be specified" + ) + + if self.end_date is not None and self.start_date is not None and self.days_before is not None: + raise ValueError( + "end_date, start_date and days_before cannot be specified at the same time" + ) + + if self.end_date is None and self.start_date is not None and self.days_before is not None: + self.end_date = self.start_date + timedelta(days=self.days_before) + + if self.end_date is not None and self.start_date is None and self.days_before is not None: + self.start_date = self.end_date - timedelta(days=self.days_before) - self.start_date = start_date # when we do date <= "6 Feb 2022", we get data till "6 Feb 2022 00:00:00" # (inclusive), but we need data till "7 Feb 2022 00:00:00" (exclusive) # so we add one day here and make our query date < "7 Feb 2022" - self.end_date = end_date + 
timedelta(days=1) + if self.end_date is not None: + self.end_date = self.end_date + timedelta(days=1) self.connection_info = DataSource.get_by_id( kpi_info["data_source"] @@ -80,24 +100,42 @@ def __init__( def _get_id_string(self, value): return f"{self.identifier}{value}{self.identifier}" - def _build_date_filter(self): - dt_col_str = self._get_id_string(self.dt_col) + def _convert_date_to_string(self, date: date, offset: str): + # TODO: Once SUPPOERTED_TIMEZONES is deprecated, + # we shouldn't need to take offset as a string, but rather + # take in a pytz timezone and skip using strings. + date = date.strftime("%Y-%m-%d") + date += f"T00:00:00{offset}" + if not self.kpi_info.get("timezone_aware"): + date = ( + pd.Timestamp(datetime.strptime(date, "%Y-%m-%dT%H:%M:%S%z")) + .tz_convert(self.connection_info["database_timezone"]) + .tz_localize(None) + # TODO: We should also use date.isoformat() here + .strftime("%Y-%m-%dT%H:%M:%S") + ) + return date - start_date_str = self.start_date.strftime("%Y-%m-%d") - end_date_str = self.end_date.strftime("%Y-%m-%d") + def _build_date_filter(self) -> List[str]: + dt_col_str = self._get_id_string(self.dt_col) - # TODO: Write tests for tz aware date strings - # if we have tz aware data, we need to add tz info to data - if self.kpi_info.get("timezone_aware"): + # TODO: Deprecate SUPPORTED_TIMEZONES over releases. + # Use reporting timezone to localize start & end date + if TIMEZONE in SUPPORTED_TIMEZONES: tz_offset_string = SUPPORTED_TIMEZONES[TIMEZONE][-6:] - tz_offset_string = f"T00:00:00{tz_offset_string}" - start_date_str += tz_offset_string - end_date_str += tz_offset_string + else: + tz_offset_string = datetime.now(pytz.timezone(TIMEZONE)).strftime("%z") + tz_offset_string = tz_offset_string[:3] + ":" + tz_offset_string[3:] - start_query = f"{dt_col_str} >= '{start_date_str}'" - end_query = f"{dt_col_str} < '{end_date_str}'" + filters = [] + if self.start_date is not None: + start_date_str = self._convert_date_to_string(self.start_date, tz_offset_string) + filters.append(f"{dt_col_str} >= '{start_date_str}'") + if self.end_date is not None: + end_date_str = self._convert_date_to_string(self.end_date, tz_offset_string) + filters.append(f"{dt_col_str} < '{end_date_str}'") - return f" where {start_query} and {end_query} " + return filters def _get_tz_from_offset_str(self, utc_offset_str="GMT+00:00"): # TODO: Move to utils file @@ -110,13 +148,12 @@ def _get_tz_from_offset_str(self, utc_offset_str="GMT+00:00"): timezones = pytz.all_timezones for tz_name in timezones: - try: + # TODO: use getattr here with a default value instead of supressing + with contextlib.suppress(AttributeError): tz = pytz.timezone(tz_name) tz_offset = tz._transition_info[-1][0] if utc_offset == tz_offset: return tz - except AttributeError: - pass raise ValueError(f"No timezone found for offset {utc_offset_str}") def _get_table_name(self): @@ -129,42 +166,21 @@ def _get_table_name(self): return f"{schema_name}.{table_name}" return table_name - def _get_filters_for_query(self): - query = "" - kpi_filters = self.kpi_info.get("filters") - if kpi_filters: - kpi_filters_query = " " - for key, values in kpi_filters.items(): - if values: - # TODO: Bad Hack to remove the last comma, fix it - values_str = str(tuple(values)) - values_str = values_str[:-2] + ")" - kpi_filters_query += ( - f" and {self._get_id_string(key)} in {values_str}" - ) - kpi_filters_query += " " - query += kpi_filters_query - return query.strip() - def _build_query(self, count=False): table_name = 
self._get_table_name() - date_filter = ( - self._build_date_filter().strip() - if self.start_date is not None - else "" - ) + all_filters = [] + + all_filters.extend(self._build_date_filter()) if count: - query = f"select count(*) from {table_name} {date_filter}" + query = f"select count(*) from {table_name}" else: - query = f"select * from {table_name} {date_filter}" + query = f"select * from {table_name}" - filters_for_query = self._get_filters_for_query() - if date_filter == "" and filters_for_query != "": + if all_filters: query += " where " - - query += filters_for_query + query += " and ".join(all_filters) if self.tail is not None: query += f" limit {self.tail}" @@ -175,20 +191,46 @@ def _run_query(self, query): db_connection = get_sqla_db_conn(data_source_info=self.connection_info) return db_connection.run_query(query) + def _prepare_date_column(self, df): + if is_datetime(df[self.dt_col]): + # this should handle tz naive cases as all data points are in timestamp + return + + dtypes = df[self.dt_col].apply(lambda x: type(x)).unique() + + if len(dtypes) == 1 and dtypes[0] == str: + # strings should be parsed later + return + + # convert to timestamp and convert to UTC + df[self.dt_col] = pd.to_datetime(df[self.dt_col], utc=True) + def _preprocess_df(self, df): df[self.dt_col] = pd.to_datetime(df[self.dt_col]) - if self.kpi_info.get("timezone_aware"): - # if tz aware data, convert to given timezone - # and then strip tz information + # TODO: use the timezone_aware column in kpi table once updated + # tz-naive timestamps get localized to their database timezone. + if df[self.dt_col].dt.tz is None: + df[self.dt_col] = df[self.dt_col].dt.tz_localize( + self.connection_info["database_timezone"] + ) + + # TODO: Deprecate SUPPORTED_TIMEZONES over releases. 
+ # maps the abbreviations to respective tz regions + if TIMEZONE in SUPPORTED_TIMEZONES: tz_to_convert_to = self._get_tz_from_offset_str( SUPPORTED_TIMEZONES[TIMEZONE] ) - df[self.dt_col] = ( - df[self.dt_col] - .dt.tz_convert(tz_to_convert_to) - .dt.tz_localize(None) - ) + else: + tz_to_convert_to = TIMEZONE + + # convert to reporting timezone + # and then strip tz information + df[self.dt_col] = ( + df[self.dt_col] + .dt.tz_convert(tz_to_convert_to) + .dt.tz_localize(None) + ) def get_count(self) -> int: """Return count of rows in KPI data.""" @@ -228,12 +270,12 @@ def get_data(self, return_empty=False) -> pd.DataFrame: if len(df) == 0: if return_empty: - logger.warn( - "Returning empty dataframe for KPI {}".format(kpi_id) - ) + logger.warn(f"Returning empty dataframe for KPI {kpi_id}") return df raise ValueError("Dataframe is empty.") + self._prepare_date_column(df) + if not self.validation: self._preprocess_df(df) diff --git a/chaos_genius/core/utils/kpi_validation.py b/chaos_genius/core/utils/kpi_validation.py index 0d26017bb..518cdce98 100644 --- a/chaos_genius/core/utils/kpi_validation.py +++ b/chaos_genius/core/utils/kpi_validation.py @@ -1,13 +1,15 @@ """Provides utility functions for validating KPIs.""" import logging -from typing import Any, Dict, List, Tuple, Union +from datetime import date, datetime +from typing import Any, Dict, List, Optional, Tuple, Union import pandas as pd from pandas.api.types import is_datetime64_any_dtype as is_datetime from chaos_genius.core.rca.root_cause_analysis import SUPPORTED_AGGREGATIONS from chaos_genius.core.utils.data_loader import DataLoader +from chaos_genius.databases.models.data_source_model import DataSource from chaos_genius.settings import MAX_ROWS_FOR_DEEPDRILLS KPI_VALIDATION_TAIL_SIZE = 1000 @@ -15,36 +17,48 @@ logger = logging.getLogger(__name__) -def validate_kpi(kpi_info: Dict[str, Any], data_source: Dict[str, Any]) -> Tuple[bool, str]: +def validate_kpi(kpi_info: Dict[str, Any], check_tz_aware: bool = False) -> Tuple[bool, str, Optional[bool]]: """Load data for KPI and invoke all validation checks. :param kpi_info: Dictionary with all params for the KPI :type kpi_info: Dict[str, Any] - :param data_source: Dictionary describing the data source - :type data_source: Dict[str, Any] - :return: Returns a tuple with the status as a bool and a status message - :rtype: Tuple[bool, str] + :param check_tz_aware: Bool for checking if the data is timezone aware + :return: Returns a tuple with the status as a bool, a status message and None if check_tz_aware is False otherwise a bool telling whether the data is timezone aware + :rtype: Tuple[bool, str, Optional[bool]] """ try: df = DataLoader( kpi_info, tail=KPI_VALIDATION_TAIL_SIZE, validation=True ).get_data() logger.info(f"Created df with {len(df)} rows for validation") - except Exception as e: # noqa: B902 + except Exception as e: logger.error("Unable to load data for KPI validation", exc_info=1) - return False, "Could not load data. Error: " + str(e) + return False, f"Could not load data. Error: {str(e)}", None - supports_tz_aware = data_source["connection_type"] == "Druid" + # TODO: Take in connection info as an argument instead of + # getting it here as it will help with mocking for tests. 
+ connection_info = DataSource.get_by_id( + kpi_info["data_source"] + ).as_dict + supports_date_string_parsing = connection_info["name"] == "Druid" - return _validate_kpi_from_df( + status, message = _validate_kpi_from_df( df, kpi_info, kpi_column_name=kpi_info["metric"], agg_type=kpi_info["aggregation"], date_column_name=kpi_info["datetime_column"], - supports_tz_aware=supports_tz_aware, + supports_date_string_parsing=supports_date_string_parsing ) + if check_tz_aware: + df[kpi_info["datetime_column"]] = pd.to_datetime(df[kpi_info["datetime_column"]]) + # check if timezone is present + is_tz_aware = df[kpi_info["datetime_column"]].dt.tz is not None + return status, message, is_tz_aware + else: + return status, message, None + def _validate_kpi_from_df( df: pd.core.frame.DataFrame, @@ -52,8 +66,7 @@ def _validate_kpi_from_df( kpi_column_name: str, agg_type: str, date_column_name: str, - debug: bool = False, - supports_tz_aware: bool = False, + supports_date_string_parsing: bool = False, ) -> Tuple[bool, str]: """Invoke each validation check and break if there's a falsy check. @@ -67,9 +80,8 @@ def _validate_kpi_from_df( :type agg_type: str :param date_column_name: Name of the date column :type date_column_name: str - :param debug: Bool for using debug mode with extra print statements at each - validation, defaults to False - :type debug: bool, optional + :param supports_date_string_parsing: Bool for allowing parsing of strings, defaults to False + :type supports_date_string_parsing: bool, optional :return: returns a tuple with the status as a bool and a status message :rtype: Tuple[bool, str] """ @@ -114,22 +126,16 @@ def _validate_kpi_from_df( { "debug_str": "Check #4: Validate date column is parseable", "status": lambda: _validate_date_column_is_parseable( - df, date_column_name=date_column_name, supports_tz_aware=supports_tz_aware + df, date_column_name=date_column_name, supports_date_string_parsing=supports_date_string_parsing ), }, { - "debug_str": "Check #5: Validate date column is tz-naive if tz-aware not supported", - "status": lambda: _validate_date_column_is_tz_naive( - df, date_column_name=date_column_name - ) if not supports_tz_aware else (True, "Accepted!"), - }, - { - "debug_str": "Check #6: Validate dimensions", + "debug_str": "Check #5: Validate dimensions", "status": lambda: _validate_dimensions(kpi_info), }, { "debug_str": ( - "Check #7: Validate KPI has no more than " + "Check #6: Validate KPI has no more than " f"{MAX_ROWS_FOR_DEEPDRILLS} rows" ), "status": lambda: _validate_for_maximum_kpi_size(kpi_info), @@ -238,7 +244,7 @@ def _validate_kpi_not_datetime( def _validate_date_column_is_parseable( df: pd.core.frame.DataFrame, date_column_name: str, - supports_tz_aware: bool, + supports_date_string_parsing: bool, ) -> Tuple[bool, str]: """Validate if specified date column is parseable. @@ -246,11 +252,14 @@ def _validate_date_column_is_parseable( :type df: pd.core.frame.DataFrame :param date_column_name: Name of the date column :type date_column_name: str + :param supports_date_string_parsing: Whether the date column supports + parsing date strings. + :type supports_date_string_parsing: bool :return: returns a tuple with the status as a bool and a status message :rtype: Tuple[bool, str] """ # has to be datetime only then proceed else exit - if supports_tz_aware: + if supports_date_string_parsing: # try to parse date col # TODO: ensure this parses only tz-aware data and nothing else # (str, int, float, etc.) 
@@ -259,7 +268,9 @@ def _validate_date_column_is_parseable( # support only datetime type (not datetime with tz, strings, etc.) date_col = df[date_column_name] - if not is_datetime(date_col): + if not ( + is_datetime(date_col) or date_col.apply(lambda x: isinstance(x, (date, datetime))).all() + ): invalid_type_err_msg = ( "The datetime column is of the type" f" {df[date_column_name].dtype}, use 'cast' to convert to datetime." @@ -269,31 +280,6 @@ def _validate_date_column_is_parseable( return True, "Accepted!" -def _validate_date_column_is_tz_naive( - df: pd.core.frame.DataFrame, - date_column_name: str, -) -> Tuple[bool, str]: - """Validate if specified date column is tz-naive. - - :param df: A pandas DataFrame - :type df: pd.core.frame.DataFrame - :param date_column_name: Name of the date column - :type date_column_name: str - :return: returns a tuple with the status as a bool and a status message - :rtype: Tuple[bool, str] - """ - date_col = df[date_column_name] - all_tz_naive = date_col.apply(lambda t: t.tz is None).all() - if not all_tz_naive: - invalid_type_err_msg = ( - "The datetime column has timezone aware data," - " use 'cast' to convert to timezone naive." - ) - return False, invalid_type_err_msg - - return True, "Accepted!" - - def _validate_for_maximum_kpi_size( kpi_info: Dict[str, Any], ) -> Tuple[bool, str]: @@ -304,12 +290,13 @@ def _validate_for_maximum_kpi_size( :rtype: Tuple[bool, str] """ try: - num_rows = DataLoader(kpi_info, days_before=60).get_count() + end_date = datetime.now().date() + num_rows = DataLoader(kpi_info, end_date=end_date, days_before=60).get_count() except Exception as e: # noqa: B902 logger.error( - "Unable to load data for KPI validation of max size", exc_info=1 + "Unable to load data for KPI validation of max size", exc_info=e ) - return False, "Could not load data. Error: " + str(e) + return False, f"Could not load data. Error: {str(e)}" if num_rows <= MAX_ROWS_FOR_DEEPDRILLS: return True, "Accepted!" 
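For context: a minimal sketch of how a caller might consume the new three-value return of validate_kpi from kpi_validation.py above. This is not part of the patch; the kpi_info payload is a hypothetical example, and a Flask application context with a reachable data source (id 1) is assumed.

from chaos_genius.core.utils.kpi_validation import validate_kpi

# Hypothetical KPI definition; real payloads come from the KPI model / add-KPI API.
kpi_info = {
    "data_source": 1,
    "kpi_type": "table",
    "table_name": "orders",
    "metric": "amount",
    "aggregation": "sum",
    "datetime_column": "created_at",
    "dimensions": ["country"],
}

status, message, is_tz_aware = validate_kpi(kpi_info, check_tz_aware=True)
if not status:
    raise ValueError(f"KPI validation failed: {message}")
# is_tz_aware is None when check_tz_aware=False; otherwise it reports whether
# the datetime column holds timezone-aware values.
print("timezone-aware data:", is_tz_aware)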
diff --git a/chaos_genius/databases/models/alert_model.py b/chaos_genius/databases/models/alert_model.py index 30bdd6cf3..c0ebfc1f2 100644 --- a/chaos_genius/databases/models/alert_model.py +++ b/chaos_genius/databases/models/alert_model.py @@ -14,6 +14,7 @@ class Alert(PkModel): alert_name = Column(db.Text(), nullable=False) alert_type = Column(db.String(80), nullable=False) # Event Alert, KPI Alert alert_status = Column(db.Boolean(), default=True, nullable=False, server_default=sqlalchemy.sql.expression.literal(True)) + last_anomaly_timestamp = Column(db.DateTime, nullable=True, default=None) data_source = Column(db.Integer) alert_query = Column(db.Text()) @@ -61,6 +62,7 @@ def as_dict(self): "active": self.active, "created_at": self.created_at, "alert_status": self.alert_status, + "last_anomaly_timestamp": self.last_anomaly_timestamp, "daily_digest": self.daily_digest, "weekly_digest": self.weekly_digest } diff --git a/chaos_genius/databases/models/data_source_metadata_model.py b/chaos_genius/databases/models/data_source_metadata_model.py new file mode 100644 index 000000000..783e920e2 --- /dev/null +++ b/chaos_genius/databases/models/data_source_metadata_model.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +"""data source metadata model.""" +import datetime as dt + +from sqlalchemy.dialects.postgresql import JSONB + +from chaos_genius.databases.base_model import Column, PkModel, db + + +class DataSourceMetadata(PkModel): + """Table for storing the metadata information of the data source.""" + __tablename__ = "data_source_metadata" + + data_source_id = Column(db.Integer, nullable=False, index=True) + metadata_type = Column(db.String(80), nullable=False, index=True) + metadata_param = Column(db.Text()) # TODO: should this be kept? current not being used + metadata_info = Column(JSONB, default=lambda: {}) + created_at = Column(db.DateTime, nullable=False, default=dt.datetime.utcnow) + + def __init__(self, **kwargs): + """Create instance.""" + super().__init__(**kwargs) + + def __repr__(self): + """Represent instance as a unique string""" + return f"" + + @property + def as_dict(self): + return { + "id": self.id, + "data_source_id": self.data_source_id, + "metadata_type": self.metadata_type, + "metadata_param": self.metadata_param, + "metadata_info": self.metadata_info, + "created_at": self.created_at + } diff --git a/chaos_genius/databases/models/data_source_model.py b/chaos_genius/databases/models/data_source_model.py index 8c24cbbb4..ee578c9e8 100644 --- a/chaos_genius/databases/models/data_source_model.py +++ b/chaos_genius/databases/models/data_source_model.py @@ -10,12 +10,13 @@ class DataSource(PkModel): __tablename__ = "data_source" name = Column(db.String(80), nullable=False) - connection_type = Column(db.String(80)) # TODO: Make the nullable=False + connection_type = Column(db.String(80)) # TODO: Make the nullable=False db_uri = Column(db.Text()) active = Column(db.Boolean(), default=False) is_third_party = Column(db.Boolean(), default=True) connection_status = Column(db.String(80)) sync_status = Column(db.String(80)) + database_timezone = Column(db.String(80), server_default='UTC', nullable=False) # configs field sourceConfig = Column(db.JSON) @@ -45,6 +46,7 @@ def safe_dict(self): "is_third_party": self.is_third_party, "connection_status": self.connection_status, "sync_status": self.sync_status, + "database_timezone": self.database_timezone, "last_sync": self.last_sync, "created_at": self.created_at } @@ -59,6 +61,7 @@ def as_dict(self): "is_third_party": self.is_third_party, 
"connection_status": self.connection_status, "sync_status": self.sync_status, + "database_timezone": self.database_timezone, "sourceConfig": self.sourceConfig, "destinationConfig": self.destinationConfig, "connectionConfig": self.connectionConfig, @@ -73,7 +76,7 @@ def meta_info(cls): return{ "name": "data_source", "table_name": "data_source", - "fields":[ + "fields": [ { "name": "name", "is_editable": True, @@ -107,6 +110,12 @@ def meta_info(cls): "is_editable": False, "is_sensitive": False, + }, + { + "name": "database_timezone", + "is_editable": True, + "is_sensitive": True, + }, { "name": "sourceConfig", @@ -147,4 +156,3 @@ def meta_info(cls): ] } - diff --git a/chaos_genius/databases/models/kpi_model.py b/chaos_genius/databases/models/kpi_model.py index 0fa0f5a26..262a660e8 100644 --- a/chaos_genius/databases/models/kpi_model.py +++ b/chaos_genius/databases/models/kpi_model.py @@ -120,29 +120,34 @@ def meta_info(cls): "is_editable": False, "is_sensitive": False, }, + { + "name": "schema_name", + "is_editable": True, + "is_sensitive": False + }, { "name": "kpi_type", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "kpi_query", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "table_name", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "metric", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "aggregation", - "is_editable": False, + "is_editable": True, "is_sensitive": False, "options": [{ "label": "Mean", @@ -157,22 +162,22 @@ def meta_info(cls): }, { "name": "datetime_column", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "filters", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "dimensions", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, { "name": "timezone_aware", - "is_editable": False, + "is_editable": True, "is_sensitive": False, }, # TODO: Fix this with some better implementation diff --git a/chaos_genius/jobs/__init__.py b/chaos_genius/jobs/__init__.py index fc2439567..023864bbb 100644 --- a/chaos_genius/jobs/__init__.py +++ b/chaos_genius/jobs/__init__.py @@ -1,3 +1,4 @@ from .anomaly_tasks import * from .alert_tasks import * -from .analytics_scheduler import * \ No newline at end of file +from .analytics_scheduler import * +from .metadata_prefetch import * \ No newline at end of file diff --git a/chaos_genius/jobs/analytics_scheduler.py b/chaos_genius/jobs/analytics_scheduler.py index d1355927e..0eee92361 100644 --- a/chaos_genius/jobs/analytics_scheduler.py +++ b/chaos_genius/jobs/analytics_scheduler.py @@ -48,7 +48,7 @@ def _get_scheduled_time_daily(self, kpi: Kpi, time_field: str = "time"): if time_field in scheduler_params: hour, minute, second = map(int, scheduler_params[time_field].split(":")) scheduled_time = scheduled_time.replace( - hour=hour, minute=minute, second=second + hour=hour, minute=minute, second=second, microsecond=0 ) else: creation_time = kpi.created_at @@ -56,6 +56,7 @@ def _get_scheduled_time_daily(self, kpi: Kpi, time_field: str = "time"): hour=creation_time.hour, minute=creation_time.minute, second=creation_time.second, + microsecond=0, ) return scheduled_time @@ -73,11 +74,12 @@ def _get_scheduled_time_hourly(self, kpi: Kpi, time_field: str = "time"): if time_field in scheduler_params: _, minute, second = map(int, scheduler_params[time_field].split(":")) - scheduled_time = scheduled_time.replace(minute=minute, 
second=second) + scheduled_time = scheduled_time.replace( + minute=minute, second=second, microsecond=0 + ) else: scheduled_time = scheduled_time.replace( - minute=HOURLY_SCHEDULE_RUN_MINUTE, - second=0, + minute=HOURLY_SCHEDULE_RUN_MINUTE, second=0, microsecond=0 ) return scheduled_time diff --git a/chaos_genius/jobs/anomaly_tasks.py b/chaos_genius/jobs/anomaly_tasks.py index acf6ab653..44addab20 100644 --- a/chaos_genius/jobs/anomaly_tasks.py +++ b/chaos_genius/jobs/anomaly_tasks.py @@ -54,10 +54,6 @@ def anomaly_single_kpi(kpi_id, end_date=None): ) task_id = checkpoint.task_id - anomaly_end_date = run_anomaly_for_kpi(kpi_id, end_date, task_id=task_id) - - kpi = cast(Kpi, Kpi.get_by_id(kpi_id)) - def _checkpoint_success(checkpoint: str): checkpoint_success(task_id, kpi.id, "Anomaly", checkpoint) logger.info( @@ -80,32 +76,35 @@ def _checkpoint_failure(checkpoint: str, e: Optional[Exception]): exc_info=e, ) - if anomaly_end_date: - logger.info(f"Completed the anomaly for KPI ID: {kpi_id}.") + try: + run_anomaly_for_kpi(kpi_id, end_date, task_id=task_id) + + kpi = cast(Kpi, Kpi.get_by_id(kpi_id)) kpi.scheduler_params = update_scheduler_params("anomaly_status", "completed") _checkpoint_success("Anomaly complete") + logger.info(f"Completed the anomaly for KPI ID: {kpi_id}.") + try: - # anomaly_end_date is same as the last date (of data in DB) - _, errors = trigger_anomaly_alerts_for_kpi(kpi, anomaly_end_date) + _, errors = trigger_anomaly_alerts_for_kpi(kpi) if not errors: logger.info(f"Triggered the alerts for KPI {kpi_id}.") _checkpoint_success("Alert trigger") else: logger.error(f"Alert trigger failed for the KPI ID: {kpi_id}.") - _checkpoint_failure("Alert trigger", None) + # we only log the first exception + _checkpoint_failure("Alert trigger", errors[0][1]) except Exception as e: logger.error(f"Alert trigger failed for the KPI ID: {kpi_id}.", exc_info=e) _checkpoint_failure("Alert trigger", e) - else: - logger.error(f"Anomaly failed for the for KPI ID: {kpi_id}.") + + except Exception as e: + kpi = cast(Kpi, Kpi.get_by_id(kpi_id)) kpi.scheduler_params = update_scheduler_params("anomaly_status", "failed") - _checkpoint_failure("Anomaly complete", None) + _checkpoint_failure("Anomaly complete", e) flag_modified(kpi, "scheduler_params") kpi.update(commit=True) - return anomaly_end_date - @celery.task def rca_single_kpi(kpi_id: int): diff --git a/chaos_genius/jobs/metadata_prefetch.py b/chaos_genius/jobs/metadata_prefetch.py new file mode 100644 index 000000000..d28b562e1 --- /dev/null +++ b/chaos_genius/jobs/metadata_prefetch.py @@ -0,0 +1,45 @@ +import logging +from typing import cast + +from celery import group +from celery.app.base import Celery + +from chaos_genius.controllers.data_source_controller import get_data_source_list +from chaos_genius.controllers.data_source_metadata_controller import ( + run_metadata_prefetch, +) +from chaos_genius.extensions import celery as celery_ext +from chaos_genius.settings import METADATA_SYNC_TIME + +celery = cast(Celery, celery_ext.celery) +logger = logging.getLogger(__name__) + + +@celery.task +def metadata_prefetch_daily_scheduler(): + """Celery task to check and trigger metadata prefetch from all active data sources.""" + data_sources = get_data_source_list() + ds_task_groups = [] + for data_source in data_sources: + logger.info(f"Starting metadata prefetch for Data Source: {data_source.id}") + ds_task_groups.append(fetch_data_source_schema.s(data_source.id)) + g = group(ds_task_groups) + res = g.apply_async() + return res + + +@celery.task 
+def fetch_data_source_schema(data_source_id): + """Scan schema of the data source and store that in the database. + + Args: + data_source_id (int): Id of the data source. + + Raises: + Exception: Raise if no data source is found. + + """ + if not data_source_id: + raise Exception("No data source id provided") + status = run_metadata_prefetch(data_source_id=data_source_id) + return status diff --git a/chaos_genius/settings.py b/chaos_genius/settings.py index fa55a6424..5ada7d1a5 100644 --- a/chaos_genius/settings.py +++ b/chaos_genius/settings.py @@ -8,7 +8,9 @@ """ import os from typing import Union +import warnings +import pytz from dotenv import load_dotenv from chaos_genius.core.rca.constants import TIME_RANGES_BY_KEY @@ -37,7 +39,7 @@ def _make_bool(val: Union[str, bool]) -> bool: DEBUG = ENV == "development" SQLALCHEMY_DATABASE_URI = os.getenv("DATABASE_URL_CG_DB") SECRET_KEY = os.getenv("SECRET_KEY", default="t8GIEp8hWmR8y6VLqd6qQCMXzjRaKsx8nRruWNtFuec=") -SEND_FILE_MAX_AGE_DEFAULT = os.getenv("SEND_FILE_MAX_AGE_DEFAULT") +SEND_FILE_MAX_AGE_DEFAULT = int(os.getenv("SEND_FILE_MAX_AGE_DEFAULT")) BCRYPT_LOG_ROUNDS = os.getenv("BCRYPT_LOG_ROUNDS", default=13) DEBUG_TB_ENABLED = DEBUG DEBUG_TB_INTERCEPT_REDIRECTS = False @@ -83,8 +85,15 @@ def _make_bool(val: Union[str, bool]) -> bool: if enabled_time_range not in TIME_RANGES_BY_KEY.keys(): raise ValueError(f"Values in DEEPDRILLS_ENABLED_TIME_RANGES must be one of {', '.join(TIME_RANGES_BY_KEY.keys())}. Got: {enabled_time_range}.") TIMEZONE = os.getenv('TIMEZONE', default='UTC') -if TIMEZONE not in SUPPORTED_TIMEZONES: - raise ValueError(f"Value of TIMEZONE must be one of {', '.join(SUPPORTED_TIMEZONES)}. Got: {TIMEZONE}.") +# TODO : Deprecate SUPPORTED_TIMEZONES over releases. +if TIMEZONE in SUPPORTED_TIMEZONES: + warnings.warn( + "TIMEZONE as 3 letter abbreviation will be deprecated in the future. Please refer to https://docs.chaosgenius.io/docs/Operator_Guides/Configuration/supported-timezones for the list of supported timezones.", + FutureWarning + ) +elif TIMEZONE not in pytz.all_timezones: + raise ValueError(f"Invalid Timezone Provided. Got: {TIMEZONE}. Please refer to https://docs.chaosgenius.io/docs/Operator_Guides/Configuration/supported-timezones for the list of supported timezones.") +# else, timezone is valid SENTRY_DSN = os.getenv('SENTRY_DSN') @@ -93,7 +102,7 @@ def _make_bool(val: Union[str, bool]) -> bool: TASK_CHECKPOINT_LIMIT: int = int(os.getenv("TASK_CHECKPOINT_LIMIT", 1000)) """Number of last checkpoints to retrieve in Task Monitor""" -CHAOSGENIUS_VERSION_MAIN = os.getenv("CHAOSGENIUS_VERSION_MAIN", "0.5.1") +CHAOSGENIUS_VERSION_MAIN = os.getenv("CHAOSGENIUS_VERSION_MAIN", "0.6.0") """ChaosGenius version - semver part only""" CHAOSGENIUS_VERSION_POSTFIX = os.getenv("CHAOSGENIUS_VERSION_POSTFIX", "git") """ChaosGenius version - postfix to identify deployment""" @@ -119,3 +128,7 @@ def _make_bool(val: Union[str, bool]) -> bool: """Alert Configuration""" EVENT_ALERTS_ENABLED = _make_bool(os.getenv("REACT_APP_EVENT_ALERT", default=False)) + +METADATA_SYNC_TIME = os.getenv("METADATA_SYNC_TIME", "03:00") +if len(METADATA_SYNC_TIME.split(':')) != 2: + raise ValueError("Metadata prefetch time is invalid. Must be in HH:MM format.") diff --git a/chaos_genius/templates/digest.html b/chaos_genius/templates/digest.html index b50db9d22..05b30f2a5 100644 --- a/chaos_genius/templates/digest.html +++ b/chaos_genius/templates/digest.html @@ -341,18 +341,17 @@

      Alerts Dashboard