Skip to content

Commit

Permalink
refactor: unique alert names (opendatahub-io#1397)
Browse files: browse the repository at this point in the history
  • Loading branch information
jackdelahunt authored Nov 28, 2024
1 parent 55fd878 commit 76cae1b
Show file tree
Hide file tree
Showing 9 changed files with 248 additions and 177 deletions.
136 changes: 68 additions & 68 deletions config/monitoring/prometheus/apps/prometheus-configs.yaml

Large diffs are not rendered by default.

26 changes: 16 additions & 10 deletions tests/prometheus_unit_tests/codeflare-alerting.unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,13 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high
exp_alerts: []

- interval: 1m
Expand All @@ -32,16 +38,16 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 5m and 1h Burn Rate high
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 3)."
summary: CodeFlare Operator Probe Success Burn Rate
summary: CodeFlare Operator Probe Success 5m and 1h Burn Rate high
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-availability.md'

- interval: 1m
Expand All @@ -52,16 +58,16 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 30m and 6h Burn Rate high
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 16)."
summary: CodeFlare Operator Probe Success Burn Rate
summary: CodeFlare Operator Probe Success 30m and 6h Burn Rate high
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md'

- interval: 1m
Expand All @@ -72,16 +78,16 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: CodeFlare Operator Probe Success Burn Rate
alertname: CodeFlare Operator Probe Success 2h and 1d Burn Rate high
instance: "codeflare-operator"
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
message: "High error budget burn for codeflare-operator (current value: 61)."
summary: CodeFlare Operator Probe Success Burn Rate
summary: CodeFlare Operator Probe Success 2h and 1d Burn Rate high
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Distributed-Workloads/codeflare-operator-probe-success-burn-rate.md'

# operator running
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,20 @@ tests:
values: "0x60"
- series: haproxy_backend_http_responses_total:burnrate1d{component="dsp"}
values: "0x60"
- series: haproxy_backend_http_responses_total:burnrate3d{component="dsp"}
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high
exp_alerts: []

- interval: 1m
Expand All @@ -31,14 +42,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 5m and 1h Burn Rate high
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
summary: "Data Science Pipelines Application Route Error Burn Rate"
summary: "Data Science Pipelines Application Route Error 5m and 1h Burn Rate high"
message: "High error budget burn for (current value: 3)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

Expand All @@ -50,14 +61,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 30m and 6h Burn Rate high
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
summary: "Data Science Pipelines Application Route Error Burn Rate"
summary: "Data Science Pipelines Application Route Error 30m and 6h Burn Rate high"
message: "High error budget burn for (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

Expand All @@ -69,14 +80,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 2h and 1d Burn Rate high
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
summary: "Data Science Pipelines Application Route Error Burn Rate"
summary: "Data Science Pipelines Application Route Error 2h and 1d Burn Rate high"
message: "High error budget burn for (current value: 61)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

Expand All @@ -88,14 +99,14 @@ tests:
values: "1+1x200"
alert_rule_test:
- eval_time: 3h
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Application Route Error Burn Rate
alertname: Data Science Pipelines Application Route Error 6h and 3d Burn Rate high
namespace: "redhat-ods-applications"
severity: info
exp_annotations:
summary: "Data Science Pipelines Application Route Error Burn Rate"
summary: "Data Science Pipelines Application Route Error 6h and 3d Burn Rate high"
message: "High error budget burn for (current value: 181)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-application-error-burn-rate.md'

Expand All @@ -116,7 +127,13 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 3h
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 3h
alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 3h
alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high
exp_alerts: []


Expand All @@ -128,15 +145,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high
instance: "data-science-pipelines-operator"
namespace: "redhat-ods-applications"
severity: critical
exp_annotations:
summary: "Data Science Pipelines Operator Probe Success Burn Rate"
summary: "Data Science Pipelines Operator Probe Success 5m and 1h Burn Rate high"
message: "High error budget burn for data-science-pipelines-operator (current value: 3)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'

Expand All @@ -148,15 +165,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high
instance: "data-science-pipelines-operator"
namespace: "redhat-ods-applications"
severity: critical
exp_annotations:
summary: "Data Science Pipelines Operator Probe Success Burn Rate"
summary: "Data Science Pipelines Operator Probe Success 30m and 6h Burn Rate high"
message: "High error budget burn for data-science-pipelines-operator (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'

Expand All @@ -168,15 +185,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: Data Science Pipelines Operator Probe Success Burn Rate
alertname: Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high
instance: "data-science-pipelines-operator"
namespace: "redhat-ods-applications"
severity: warning
exp_annotations:
summary: "Data Science Pipelines Operator Probe Success Burn Rate"
summary: "Data Science Pipelines Operator Probe Success 2h and 1d Burn Rate high"
message: "High error budget burn for data-science-pipelines-operator (current value: 61)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Data-Science-Pipelines/data-science-pipelines-operator-probe-success-burn-rate.md'

Expand Down
26 changes: 16 additions & 10 deletions tests/prometheus_unit_tests/kserve-alerting.unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -21,8 +21,14 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high
exp_alerts: []

- interval: 1m
input_series:
Expand All @@ -32,15 +38,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 5m and 1h Burn Rate high
instance: "kserve-controller-manager"
severity: critical
exp_annotations:
message: "High error budget burn for kserve-controller-manager (current value: 3)."
summary: Kserve Controller Probe Success Burn Rate
summary: Kserve Controller Probe Success 5m and 1h Burn Rate high
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"

- interval: 1m
Expand All @@ -51,15 +57,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 30m and 6h Burn Rate high
instance: "kserve-controller-manager"
severity: critical
exp_annotations:
message: "High error budget burn for kserve-controller-manager (current value: 16)."
summary: Kserve Controller Probe Success Burn Rate
summary: Kserve Controller Probe Success 30m and 6h Burn Rate high
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"

- interval: 1m
Expand All @@ -70,13 +76,13 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: Kserve Controller Probe Success Burn Rate
alertname: Kserve Controller Probe Success 2h and 1d Burn Rate high
instance: "kserve-controller-manager"
severity: warning
exp_annotations:
message: "High error budget burn for kserve-controller-manager (current value: 61)."
summary: Kserve Controller Probe Success Burn Rate
summary: Kserve Controller Probe Success 2h and 1d Burn Rate high
triage: "https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-kserve-controller-probe-success-burn-rate.md"
26 changes: 16 additions & 10 deletions tests/prometheus_unit_tests/model-mesh-alerting.unit-tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,13 @@ tests:
values: "0x60"
alert_rule_test:
- eval_time: 1h
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high
exp_alerts: []
- eval_time: 1h
alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high
exp_alerts: []

- interval: 1m
Expand All @@ -31,15 +37,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 2m
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 5m and 1h Burn Rate high
namespace: "redhat-ods-applications"
instance: "modelmesh-controller"
severity: critical
exp_annotations:
summary: "Modelmesh Controller Probe Success Burn Rate"
summary: "Modelmesh Controller Probe Success 5m and 1h Burn Rate high"
message: "High error budget burn for modelmesh-controller (current value: 3)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'

Expand All @@ -51,15 +57,15 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 15m
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high
exp_alerts:
- exp_labels:
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 30m and 6h Burn Rate high
namespace: "redhat-ods-applications"
instance: "modelmesh-controller"
severity: critical
exp_annotations:
summary: "Modelmesh Controller Probe Success Burn Rate"
summary: "Modelmesh Controller Probe Success 30m and 6h Burn Rate high"
message: "High error budget burn for modelmesh-controller (current value: 16)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'

Expand All @@ -71,14 +77,14 @@ tests:
values: "1+1x60"
alert_rule_test:
- eval_time: 1h
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high
exp_alerts:
- exp_labels:
alertname: Modelmesh Controller Probe Success Burn Rate
alertname: Modelmesh Controller Probe Success 2h and 1d Burn Rate high
namespace: "redhat-ods-applications"
instance: "modelmesh-controller"
severity: warning
exp_annotations:
summary: "Modelmesh Controller Probe Success Burn Rate"
summary: "Modelmesh Controller Probe Success 2h and 1d Burn Rate high"
message: "High error budget burn for modelmesh-controller (current value: 61)."
triage: 'https://gitlab.cee.redhat.com/service/managed-tenants-sops/-/blob/main/RHODS/Model-Serving/rhods-modelmesh-controller-probe-success-burn-rate.md'
Loading

0 comments on commit 76cae1b

Please sign in to comment.