-
Notifications
You must be signed in to change notification settings - Fork 9
/
Copy pathalertrules.yaml
90 lines (89 loc) · 6.01 KB
/
alertrules.yaml
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
# max by(job,namespace,network,address,name) is the primary key which all the metrics are aggregated by to avoid alerts flapping: i.e. when a K8s pod restarts
# The same result could be achieved by applying max without(instance,pod)
# --
# last_over_time is used to avoid alerts flapping when the alert is firing and the alert disappear, to override and extend the default endsAt behaviour: i.e. when the watcher takes a while to restart
# --
# All the queries are also filtered by the "environment" label: i.e. to not mix metrics coming from staging and production environments
{{ if and .Values.prometheusRules.enabled ( ne .Values.config.environment "ci" ) }}
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
labels:
{{ toYaml .Values.prometheusRules.labels | indent 4 }}
name: {{ include "app.name" . }}
spec:
groups:
- name: {{ include "app.name" . }}-{{ .Values.config.environment }}.rules
rules:
- alert: ValidatorOutOfActiveSet
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/account/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> is not present in the current validation active set because it has not been selected by Phragmen'
expr: max without(instance,pod) (last_over_time(polkadot_validator_out_of_active_set_state{environment="{{ .Values.config.environment }}"}[10m])) > 0
for: 2m
labels:
severity: info
origin: {{ .Values.prometheusRules.origin }}
{{ if ne .Values.prometheusRules.producerStall false }}
- alert: ProducerStallShort
annotations:
message: 'Blocks were not produced for 1 hour by <a href="https://{{`{{ $labels.network }}`}}.subscan.io/account/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a>'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Producer-Stall"
expr: max without(instance,pod) (increase(polkadot_validator_blocks_produced{environment="{{ .Values.config.environment }}"}[10m])) == 0 and max without(instance,pod) (last_over_time(polkadot_validator_out_of_active_set_state{environment="{{ .Values.config.environment }}"}[10m])) == 0
for: 60m
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
- alert: ProducerStallLong
annotations:
message: 'Blocks were not produced for 3 hours by <a href="https://{{`{{ $labels.network }}`}}.subscan.io/account/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a>'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Producer-Stall"
expr: max without(instance,pod) (increase(polkadot_validator_blocks_produced{environment="{{ .Values.config.environment }}"}[10m])) == 0 and max without(instance,pod) (last_over_time(polkadot_validator_out_of_active_set_state{environment="{{ .Values.config.environment }}"}[10m])) == 0
for: 180m
labels:
severity: critical
origin: {{ .Values.prometheusRules.origin }}
{{ end }}
- alert: ValidatorSlashed
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> was reported for Slash, an advanced double check can be carried <a href="https://{{`{{ $labels.network }}`}}.subscan.io/event?address={{`{{ $labels.address }}`}}&module=staking&event_id=slashreported">here</a>. This message is going to RESOLVE by itself soon.'
runbook_url: "https://github.com/w3f/infrastructure/wiki/Validator-Slashed"
expr: max without(instance,pod) (increase(polkadot_validator_slashed_reports{environment="{{ .Values.config.environment }}"}[5m])) > 0
for: 30s
labels:
severity: critical
origin: {{ .Values.prometheusRules.origin }}
{{ if ne .Values.prometheusRules.changed false }}
- alert: ValidatorRewardDestinationChanged
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> may have changed his reward destination recently, please double check. This message is going to RESOLVE by itself soon.'
expr: max without(instance,pod) (increase(polkadot_validator_payee_changed_reports{environment="{{ .Values.config.environment }}"}[5m])) > 0
for: 30s
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
- alert: ValidatorCommissionRateChanged
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> may have changed his commission rate recently, please double check. This message is going to RESOLVE by itself soon.'
expr: max without(instance,pod) (increase(polkadot_validator_commission_changed_reports{environment="{{ .Values.config.environment }}"}[5m])) > 0
for: 30s
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
{{ end }}
- alert: ValidatorCommissionRateUnexpected
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> has an unexpected commission rate, please double check.'
expr: max without(instance,pod) (last_over_time(polkadot_validator_unexpected_commission_state{environment="{{ .Values.config.environment }}"}[10m])) > 0
for: 1m
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
- alert: ValidatorRewardDestinationUnexpected
annotations:
message: 'Target <a href="https://{{`{{ $labels.network }}`}}.subscan.io/validator/{{`{{ $labels.address }}`}}">{{`{{ $labels.name }}`}}</a> has an unexpected reward destination, please double check.'
expr: max without(instance,pod) (last_over_time(polkadot_validator_unexpected_payee_state{environment="{{ .Values.config.environment }}"}[10m])) > 0
for: 1m
labels:
severity: warning
origin: {{ .Values.prometheusRules.origin }}
{{ end }}