Add Prometheus alert rules for the oplog populator.

Creates monitoring/oplog-populator/alerts.yaml and its rule unit tests,
adds a workflow step that renders and tests them, and appends the alerts
file to the file lists in the docker-build and release workflows.

diff --git a/.github/workflows/alerts.yaml b/.github/workflows/alerts.yaml
index ce93834c9..fc37ceb96 100644
--- a/.github/workflows/alerts.yaml
+++ b/.github/workflows/alerts.yaml
@@ -40,3 +40,12 @@ jobs:
           alert_inputs: >-
             namespace=zenko,job_notification_producer=artesca-data-backbeat-notification-producer-headless,job_notification_processors=artesca-data-backbeat-notification-processor,notificationFailuresWarningThreshold=0.1,notificationFailuresCriticalThreshold=0.5,notification_producer_replicas=3,notification_processor_replicas=2
           github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
+
+      - name: Render and test oplog populator
+        uses: scality/action-prom-render-test@1.0.2
+        with:
+          alert_file_path: monitoring/oplog-populator/alerts.yaml
+          test_file_path: monitoring/oplog-populator/alerts.test.yaml
+          alert_inputs: >-
+            namespace=zenko,oplog_populator_job=artesca-data-backbeat-oplog-populator-headless,oplogPopulatorChangeStreamLagThreshold=10
+          github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
diff --git a/.github/workflows/docker-build.yaml b/.github/workflows/docker-build.yaml
index 9586c99c6..94501ac8b 100644
--- a/.github/workflows/docker-build.yaml
+++ b/.github/workflows/docker-build.yaml
@@ -52,7 +52,8 @@ jobs:
             replication/alerts.yaml:application/prometheus-alerts+yaml \
             notification/dashboard.json:application/grafana-dashboard+json \
             notification/alerts.yaml:application/prometheus-alerts+yaml \
-            oplog-populator/dashboard.json:application/grafana-dashboard+json
+            oplog-populator/dashboard.json:application/grafana-dashboard+json \
+            oplog-populator/alerts.yaml:application/prometheus-alerts+yaml
         working-directory: monitoring
 
       - name: Push policies into the development namespace
diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index ed49f374c..cf7cf18bc 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -53,7 +53,8 @@ jobs:
             replication/alerts.yaml:application/prometheus-alerts+yaml \
             notification/dashboard.json:application/grafana-dashboard+json \
             notification/alerts.yaml:application/prometheus-alerts+yaml \
-            oplog-populator/dashboard.json:application/grafana-dashboard+json
+            oplog-populator/dashboard.json:application/grafana-dashboard+json \
+            oplog-populator/alerts.yaml:application/prometheus-alerts+yaml
         working-directory: monitoring
 
       - name: Push policies into the production namespace
diff --git a/monitoring/oplog-populator/alerts.test.yaml b/monitoring/oplog-populator/alerts.test.yaml
new file mode 100644
index 000000000..75a1b41d9
--- /dev/null
+++ b/monitoring/oplog-populator/alerts.test.yaml
@@ -0,0 +1,52 @@
+evaluation_interval: 1m
+rule_files:
+  - alerts.rendered.yaml
+
+tests:
+
+  - name: Oplog Populator Replicas
+    interval: 1m
+    input_series:
+      - series: up{namespace="zenko",job="artesca-data-backbeat-oplog-populator-headless",pod="oplog-populator-1"}
+        values: 1 1 0
+    alert_rule_test:
+      - alertname: OplogPopulatorUnavailable
+        eval_time: 1m
+        exp_alerts: []
+      - alertname: OplogPopulatorUnavailable
+        eval_time: 2m
+        exp_alerts: []
+      - alertname: OplogPopulatorUnavailable
+        eval_time: 3m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+            exp_annotations:
+              zenko_service: backbeat-oplog-populator
+              description: "Oplog populator pod is down"
+              summary: "Oplog populator service is in critical state"
+
+  - name: Connector Configuration Failure
+    interval: 1m
+    input_series:
+      - series: oplog_populator_reconfiguration{connector="example-connector",success="false",job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}
+        values: 0+0x4 0+40x4 160+50x6
+      - series: oplog_populator_reconfiguration{connector="example-connector",job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}
+        values: 100+100x16
+    alert_rule_test:
+      - alertname: KafkaConnectFailedConnectorConfiguration
+        eval_time: 5m
+        exp_alerts: []
+      - alertname: KafkaConnectFailedConnectorConfiguration
+        eval_time: 10m
+        exp_alerts: []
+      - alertname: KafkaConnectFailedConnectorConfiguration
+        eval_time: 16m
+        exp_alerts:
+          - exp_labels:
+              severity: critical
+              connector: example-connector
+            exp_annotations:
+              zenko_service: backbeat-oplog-populator
+              description: "Oplog populator failed to configure connector"
+              summary: "Oplog populator couldn't update kafka connect connector"
diff --git a/monitoring/oplog-populator/alerts.yaml b/monitoring/oplog-populator/alerts.yaml
new file mode 100644
index 000000000..cd172eb21
--- /dev/null
+++ b/monitoring/oplog-populator/alerts.yaml
@@ -0,0 +1,53 @@
+x-inputs:
+- name: namespace
+  type: constant
+  value: zenko
+- name: oplog_populator_job
+  type: config
+  value: artesca-data-backbeat-oplog-populator-headless
+- name: oplogPopulatorChangeStreamLagThreshold
+  type: config
+  value: 10
+
+groups:
+- name: Oplog Populator
+  rules:
+
+  - alert: OplogPopulatorUnavailable
+    Expr: |
+      sum(up{namespace="${namespace}",job="${oplog_populator_job}"}) < 1
+    For: "30s"
+    Labels:
+      severity: critical
+    Annotations:
+      zenko_service: backbeat-oplog-populator
+      description: "Oplog populator pod is down"
+      summary: "Oplog populator service is in critical state"
+
+  - alert: KafkaConnectFailedConnectorConfiguration
+    Expr: |
+      sum by(connector) (increase(oplog_populator_reconfiguration{success="false",job="${oplog_populator_job}",namespace="${namespace}"}[1m]))
+        > 0
+    For: "5m"
+    Labels:
+      severity: critical
+    Annotations:
+      zenko_service: backbeat-oplog-populator
+      description: "Oplog populator failed to configure connector"
+      summary: "Oplog populator couldn't update kafka connect connector"
+
+  - alert: OplogPopulatorMetastoreChangeStreamLagThreshold
+    Expr: |
+      histogram_quantile(
+        0.99,
+        sum by(le) (rate(oplog_populator_acknowledgement_lag_sec_bucket{job="${oplog_populator_job}",namespace="${namespace}"}[1m]))
+      )
+        >= ${oplogPopulatorChangeStreamLagThreshold}
+    For: "5m"
+    Labels:
+      severity: critical
+    Annotations:
+      zenko_service: backbeat-oplog-populator
+      description: "Oplog populator metastore change stream lag is too big"
+      summary: "Oplog populator configuration lag is above threshold"
+