Skip to content

Commit

Permalink
Add alert if the number of connectors exceeds the soft limit
Browse files Browse the repository at this point in the history
issue: BB-601
  • Loading branch information
williamlardier committed Sep 9, 2024
1 parent 73669eb commit 1d94901
Show file tree
Hide file tree
Showing 4 changed files with 90 additions and 0 deletions.
1 change: 1 addition & 0 deletions .github/workflows/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -81,4 +81,5 @@ jobs:
namespace=zenko
oplog_populator_job=artesca-data-backbeat-oplog-populator-headless
oplogPopulatorChangeStreamLagThreshold=10
oplogPopulatorChangeStreamSoftLimit=10
github_token: ${{ secrets.GIT_ACCESS_TOKEN }}
49 changes: 49 additions & 0 deletions monitoring/oplog-populator/alerts.rendered.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# Prometheus alerting rules for the Backbeat oplog populator.
# NOTE(review): this looks like the rendered form of the templated alerts.yaml
# (placeholders replaced by concrete values) - keep both in sync.
groups:
  - name: Oplog Populator
    rules:
      # Fires when no oplog populator target is reported as up.
      - alert: OplogPopulatorUnavailable
        annotations:
          description: Oplog populator pod is down
          summary: Oplog populator service is in critical state
          zenko_service: backbeat-oplog-populator
        expr: |
          sum(up{namespace="zenko",job="artesca-data-backbeat-oplog-populator-headless"}) < 1
        for: 30s
        labels:
          severity: critical
      # Fires when a connector reconfiguration has kept failing for 5 minutes.
      - alert: KafkaConnectFailedConnectorConfiguration
        annotations:
          description: Oplog populator failed to configure connector
          summary: Oplog populator couldn't update kafka connect connector
          zenko_service: backbeat-oplog-populator
        expr: |
          sum by(connector) (increase(oplog_populator_reconfiguration{success="false",job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}[1m]))
          > 0
        for: 5m
        labels:
          severity: critical
      # Warns when the number of "up" Kafka Connect targets exceeds the soft
      # limit (10 here).
      - alert: KafkaConnectReplicasAboveThreshold
        annotations:
          description: Kafka connect replica count is above soft limit
          summary: Kafka connect replica count is above soft limit. Consider using a single
            replica for your cluster
          zenko_service: backbeat-oplog-populator
        # Both factors use the `bool` modifier, so their product is a 0/1
        # sample that is always returned. Prometheus raises an alert for every
        # returned sample whatever its value, hence the trailing `> 0`: without
        # it the alert fires permanently. The second factor is 0 when the limit
        # is 0, which switches the alert off.
        expr: |
          (
            (sum(up{namespace="zenko",job="artesca-data-base-queue-connector-metrics"}) > bool 10)
            * (10 > bool 0)
          ) > 0
        for: 5m
        labels:
          severity: warning
      # Fires when the p99 acknowledgement lag stays at or above 10 seconds.
      - alert: OplogPopulatorMetastoreChangeStreamLagThreshold
        annotations:
          description: Oplog populator metastore change stream lag is too big
          summary: Oplog populator configuration lag is above threshold
          zenko_service: backbeat-oplog-populator
        expr: |
          histogram_quantile(
            0.99,
            sum by(le) (rate(s3_oplog_populator_acknowledgement_lag_seconds_bucket{job="artesca-data-backbeat-oplog-populator-headless",namespace="zenko"}[1m]))
          )
          >= 10
        for: 5m
        labels:
          severity: critical
22 changes: 22 additions & 0 deletions monitoring/oplog-populator/alerts.test.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -73,3 +73,25 @@ tests:
zenko_service: backbeat-oplog-populator
description: "Oplog populator restarted connectors too many times"
summary: "Oplog populator connectors keep failing after restarts"

# Checks that KafkaConnectReplicasAboveThreshold fires only while the summed
# `up` value stays above the soft limit, and clears once it drops back.
- name: Number of Kafka Connect Replicas
  interval: 1m
  input_series:
    # Timeline (1 sample per minute): 1 for minutes 0-3, 11 for minutes 4-24,
    # then 1 again from minute 25.
    - series: up{namespace="zenko",job="artesca-data-base-queue-connector-metrics"}
      values: 1x3 11x20 1x3
  alert_rule_test:
    # The limit has only just been exceeded: the 5m "for" delay has not
    # elapsed, so nothing is firing yet.
    - alertname: KafkaConnectReplicasAboveThreshold
      eval_time: 4m
      exp_alerts: []
    # Above the limit for more than 5 minutes: the warning must be firing.
    - alertname: KafkaConnectReplicasAboveThreshold
      eval_time: 15m
      exp_alerts:
        - exp_labels:
            severity: warning
          exp_annotations:
            zenko_service: backbeat-oplog-populator
            description: "Kafka connect replica count is above soft limit"
            summary: "Kafka connect replica count is above soft limit. Consider using a single replica for your cluster"
    # Back under the limit: the alert must have cleared. The full alert name is
    # required here - a name that matches no rule makes an empty expectation
    # pass vacuously and verifies nothing.
    - alertname: KafkaConnectReplicasAboveThreshold
      eval_time: 25m
      exp_alerts: []
18 changes: 18 additions & 0 deletions monitoring/oplog-populator/alerts.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,12 @@ x-inputs:
- name: oplogPopulatorChangeStreamLagThreshold
type: config
value: 10
# Soft limit compared against the number of "up" targets of kafka_connect_job
# by the KafkaConnectReplicasAboveThreshold rule. That rule multiplies its
# condition by (limit > bool 0), so the default of 0 appears intended to
# disable the alert.
# NOTE(review): the name says "ChangeStream" but the value is used as a
# replica-count limit - consider renaming in a follow-up.
- name: oplogPopulatorChangeStreamSoftLimit
type: config
value: 0
# Value of the `job` label selecting the `up` series counted by the
# KafkaConnectReplicasAboveThreshold rule.
- name: kafka_connect_job
type: config
value: artesca-data-base-queue-connector-metrics

groups:
- name: Oplog Populator
Expand Down Expand Up @@ -36,6 +42,18 @@ groups:
description: "Oplog populator failed to configure connector"
summary: "Oplog populator couldn't update kafka connect connector"

# Warns when the number of "up" targets of ${kafka_connect_job} exceeds
# ${oplogPopulatorChangeStreamSoftLimit}. A limit of 0 disables the alert.
- alert: KafkaConnectReplicasAboveThreshold
  # Both factors use the `bool` modifier, so their product is a 0/1 sample that
  # is always returned. Prometheus raises an alert for every returned sample
  # whatever its value, hence the trailing `> 0`: without it the alert fires
  # permanently. The second factor is 0 when the limit is 0, which makes the
  # product 0 and keeps the alert silent.
  Expr: |
    (
      (sum(up{namespace="${namespace}",job="${kafka_connect_job}"}) > bool ${oplogPopulatorChangeStreamSoftLimit})
      * (${oplogPopulatorChangeStreamSoftLimit} > bool 0)
    ) > 0
  For: "5m"
  Labels:
    severity: warning
  Annotations:
    zenko_service: backbeat-oplog-populator
    description: "Kafka connect replica count is above soft limit"
    summary: "Kafka connect replica count is above soft limit. Consider using a single replica for your cluster"

- alert: OplogPopulatorMetastoreChangeStreamLagThreshold
Expr: |
histogram_quantile(
Expand Down

0 comments on commit 1d94901

Please sign in to comment.