Skip to content

Commit

Permalink
BB-311 add oplog populator alerts
Browse files Browse the repository at this point in the history
  • Loading branch information
Kerkesni committed Jan 6, 2023
1 parent d1d9e63 commit 3ae6f5e
Show file tree
Hide file tree
Showing 3 changed files with 114 additions and 0 deletions.
9 changes: 9 additions & 0 deletions .github/workflows/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,12 @@ jobs:
alert_inputs: >-
namespace=zenko,job_notification_producer=artesca-data-backbeat-notification-producer-headless,job_notification_processors=artesca-data-backbeat-notification-processor,notificationFailuresWarningThreshold=0.1,notificationFailuresCriticalThreshold=0.5,notification_producer_replicas=3,notification_processor_replicas=2
github_token: ${{ secrets.GIT_ACCESS_TOKEN }}

# Runs the render-and-test action against the oplog populator alert template
# and its rule tests, using the inputs listed in `alert_inputs`.
- name: Render and test oplog populator
  # NOTE(review): this reference was garbled in the captured page
  # ("scality/[email protected]"); restored to the action name and tag
  # below — confirm the tag matches the other render/test steps in this
  # workflow.
  uses: scality/action-prom-render-test@1.0.1
  with:
    alert_file_path: monitoring/oplog-populator/alerts.yaml
    test_file_path: monitoring/oplog-populator/alerts.test.yaml
    # Comma-separated name=value pairs; the names correspond to the x-inputs
    # declared in monitoring/oplog-populator/alerts.yaml.
    alert_inputs: >-
      namespace=zenko,oplog_populator_job=artesca-data-backbeat-oplog-populator-headless,oplogPopulatorChangeStreamLagThreshold=10
    github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
52 changes: 52 additions & 0 deletions monitoring/oplog-populator/alerts.test.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
# Rule unit tests for the oplog populator alerts.
# Rules are evaluated once per minute during the tests.
evaluation_interval: 1m
# File holding the rules under test; presumably the rendered output of
# alerts.yaml produced by the CI render step — confirm.
rule_files:
- alerts.rendered.yaml

tests:

# OplogPopulatorUnavailable: a single populator pod reports up=1 at 0m and 1m,
# then up=0 at 2m. The rule carries For: "30s", so nothing is expected at 1m
# or 2m and the critical alert is expected at 3m.
- name: Oplog Populator Replicas
  interval: 1m
  input_series:
  - series: 'up{namespace="zenko",job="artesca-data-backbeat-oplog-populator-headless",pod="oplog-populator-1"}'
    values: '1 1 0'
  alert_rule_test:
  # Pod still up: no alert.
  - alertname: OplogPopulatorUnavailable
    eval_time: 1m
    exp_alerts: []
  # First sample with up=0: no firing alert expected yet.
  - alertname: OplogPopulatorUnavailable
    eval_time: 2m
    exp_alerts: []
  # One minute after the drop: alert expected with the rule's labels and
  # annotations.
  - alertname: OplogPopulatorUnavailable
    eval_time: 3m
    exp_alerts:
    - exp_labels:
        severity: critical
      exp_annotations:
        zenko_service: backbeat-oplog-populator
        description: "Oplog populator pod is down"
        summary: "Oplog populator service is in critical state"

# KafkaConnectFailedConnectorConfiguration: the rule needs the 1m increase of
# failed reconfigurations to stay above 0 for 5m (For: "5m").
- name: Connector Configuration Failure
  interval: 1m
  input_series:
  # Failed-reconfiguration counter (success="false"):
  #   0m-4m    flat at 0
  #   5m-9m    0 -> 160, +40 per minute
  #   10m      sequence restarts at 160, i.e. no growth between 9m and 10m
  #   10m-16m  160 -> 460, +50 per minute
  - series: 'oplog_populator_reconfiguration{connector="example-connector",success="false",job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}'
    values: '0+0x4 0+40x4 160+50x6'
  # Same metric without a success label (100 -> 1700, +100 per minute); the
  # rule's success="false" matcher does not select it.
  - series: 'oplog_populator_reconfiguration{connector="example-connector",job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}'
    values: '100+100x16'
  alert_rule_test:
  # Counter has not grown yet.
  - alertname: KafkaConnectFailedConnectorConfiguration
    eval_time: 5m
    exp_alerts: []
  # No growth over the last minute, so no alert is expected here.
  - alertname: KafkaConnectFailedConnectorConfiguration
    eval_time: 10m
    exp_alerts: []
  # Growth has been continuous since 11m: alert expected, labelled with the
  # failing connector.
  - alertname: KafkaConnectFailedConnectorConfiguration
    eval_time: 16m
    exp_alerts:
    - exp_labels:
        severity: critical
        connector: example-connector
      exp_annotations:
        zenko_service: backbeat-oplog-populator
        description: "Oplog populator failed to configure connector"
        summary: "Oplog populator couldn't update kafka connect connector"
53 changes: 53 additions & 0 deletions monitoring/oplog-populator/alerts.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
# Inputs referenced as ${name} placeholders by the alert rules in this file.
# The values listed here are the same ones the CI workflow passes when
# rendering the rules for tests.
x-inputs:
# Namespace matcher applied to every metric selector.
- name: namespace
  type: constant
  value: zenko
# Value of the `job` label carried by the oplog populator metrics.
- name: oplog_populator_job
  type: config
  value: artesca-data-backbeat-oplog-populator-headless
# Threshold compared against the p99 acknowledgement lag
# (metric oplog_populator_acknowledgement_lag_sec).
- name: oplogPopulatorChangeStreamLagThreshold
  type: config
  value: 10

# Alert rules for the backbeat oplog populator. ${...} placeholders are
# substituted from this file's x-inputs.
# NOTE(review): the capitalised rule keys (Expr / For / Labels / Annotations)
# are kept exactly as written — presumably what the render tooling expects;
# plain Prometheus rule files use lowercase keys. Confirm before changing.
groups:
- name: Oplog Populator
  rules:

  # Critical when the summed `up` value of the populator job stays below 1
  # for 30s.
  - alert: OplogPopulatorUnavailable
    Expr: |
      sum(up{namespace="${namespace}",job="${oplog_populator_job}"}) < 1
    For: "30s"
    Labels:
      severity: critical
    Annotations:
      zenko_service: backbeat-oplog-populator
      description: "Oplog populator pod is down"
      summary: "Oplog populator service is in critical state"

  # Critical, per connector, when failed reconfigurations (success="false")
  # keep increasing over 1m windows for 5m.
  - alert: KafkaConnectFailedConnectorConfiguration
    Expr: |
      sum by(connector) (increase(oplog_populator_reconfiguration{success="false",job="${oplog_populator_job}",namespace="${namespace}"}[1m]))
      > 0
    For: "5m"
    Labels:
      severity: critical
    Annotations:
      zenko_service: backbeat-oplog-populator
      description: "Oplog populator failed to configure connector"
      summary: "Oplog populator couldn't update kafka connect connector"

  # Critical when the 0.99 quantile of the acknowledgement-lag histogram
  # (1m rate, aggregated by bucket) stays at or above the configured
  # threshold for 5m.
  - alert: OplogPopulatorMetastoreChangeStreamLagThreshold
    Expr: |
      histogram_quantile(
      0.99,
      sum by(le) (rate(oplog_populator_acknowledgement_lag_sec_bucket{job="${oplog_populator_job}",namespace="${namespace}"}[1m]))
      )
      >= ${oplogPopulatorChangeStreamLagThreshold}
    For: "5m"
    Labels:
      severity: critical
    Annotations:
      zenko_service: backbeat-oplog-populator
      description: "Oplog populator metastore change stream lag is too big"
      summary: "Oplog populator configuration lag is above threshold"

0 comments on commit 3ae6f5e

Please sign in to comment.