Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Extended alerting #254

Merged
merged 10 commits into from
Feb 9, 2024
Original file line number Diff line number Diff line change
Expand Up @@ -37,14 +37,22 @@ spec:
value: DNS DHCP
- name: severity
value: warning
- receiver: NVVS DevOps Dev PreProd
# Route: alerts labelled service "DNS DHCP Dev PreProd" with severity
# "critical" go to the "NVVS DevOps Dev PreProd Critical" receiver.
# Notifications wait 10s to group and repeat every 24h while still firing.
- receiver: NVVS DevOps Dev PreProd Critical
groupWait: 10s
repeatInterval: 24h
matchers:
- name: service
value: DNS DHCP Dev PreProd
- name: severity
value: critical
# Route: alerts labelled service "DNS DHCP Dev PreProd" with severity
# "warning" go to the "NVVS DevOps Dev PreProd" receiver.
# Notifications wait 10s to group and repeat every 24h while still firing.
- receiver: NVVS DevOps Dev PreProd
groupWait: 10s
repeatInterval: 24h
matchers:
- name: service
value: DNS DHCP Dev PreProd
- name: severity
value: warning
- receiver: Team PKI
groupWait: 10s
repeatInterval: 7d
Expand Down Expand Up @@ -114,6 +122,19 @@ spec:
username: {{`'{{ template "slack.alerts.username" . }}'`}}
color: {{`'{{ template "slack.alerts.color" . }}'`}}
iconEmoji: {{`'{{ template "slack.alerts.icon_emoji" . }}'`}}
# Slack receiver used by the critical Dev/PreProd DNS-DHCP route.
# The webhook URL is read from key
# development_pre_production_dhcp_dns_slack_webhook_url of the object named
# slack-webhooks (presumably a Secret - confirm); optional: false makes the
# key mandatory. sendResolved: true also posts a message when an alert clears.
# Title, text, username, colour and icon come from the slack.alerts.* templates.
- name: NVVS DevOps Dev PreProd Critical
slackConfigs:
- apiURL:
key: development_pre_production_dhcp_dns_slack_webhook_url
name: slack-webhooks
optional: false
channel: '#mojo-staff-device-dhcp-dns-alerts-dev-preprod'
sendResolved: true
title: {{`'{{ template "slack.alerts.title" . }}'`}}
text: {{`'{{ template "slack.alerts.text" . }}'`}}
username: {{`'{{ template "slack.alerts.username" . }}'`}}
color: {{`'{{ template "slack.alerts.color" . }}'`}}
iconEmoji: {{`'{{ template "slack.alerts.icon_emoji" . }}'`}}
- name: NVVS DevOps Dev PreProd
slackConfigs:
- apiURL:
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,177 @@
# PrometheusRule that carries the DNS/DHCP pre-production alert group.
# Server-populated metadata (creationTimestamp, generation) is deliberately
# not set here: the API server assigns those fields, so values pasted from a
# cluster export are stale noise in a chart template.
# NOTE(review): release-namespace is hard-coded to "monitoring" while
# metadata.namespace uses the release namespace - confirm they always agree.
apiVersion: monitoring.coreos.com/v1
kind: PrometheusRule
metadata:
annotations:
meta.helm.sh/release-name: {{ .Release.Name }}
meta.helm.sh/release-namespace: monitoring
prometheus-operator-validated: "true"
labels:
app: {{ .Release.Name }}
app.kubernetes.io/instance: {{ .Release.Name }}
app.kubernetes.io/managed-by: Helm
app.kubernetes.io/part-of: {{ .Release.Name }}
chart: {{ .Chart.Name }}
heritage: Helm
release: {{ .Release.Name }}
name: {{ .Release.Name }}.dns-dhcp-pre-production.rules
namespace: {{ .Release.Namespace }}
spec:
groups:
- name: {{ .Release.Name }}.dns-dhcp-pre-production.rules
rules:
# Critical: aws_kea_dhcp_config_reload_failed_sum for the pre-production
# account has been above 0 for 1 minute.
# NOTE(review): expr takes the account id from
# .Values.pre_production_account_id, but the dashboard URL hard-codes
# var-account_id - confirm both refer to the same account.
- alert: DHCP KEA Configuration Reload Failed
expr: aws_kea_dhcp_config_reload_failed_sum{account_id="{{ .Values.pre_production_account_id }}" } > 0
for: 1m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: The configuration reload has failed
description: The {{ "{{ $labels.name }}" }} Configuration reload has failed {{ "{{ $value }}" }} time(s).
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: the average running task count reported for ECS cluster
# staff-device-pre-production-dhcp-cluster has been below 1 for 5 minutes.
# NOTE(review): a "< 1" comparison only matches while the series exists; if
# the metric stops being reported altogether this rule stays silent -
# confirm that case is covered elsewhere.
- alert: DHCP ECS Task count
expr: aws_ecs_containerinsights_running_task_count_average{dimension_ClusterName="staff-device-pre-production-dhcp-cluster"} < 1
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP ECS Task count is below 1
description: The ECS task count is below 1 for DHCP
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: either the CPU utilisation average or the memory utilisation
# average of ECS cluster staff-device-pre-production-dhcp-cluster has been
# above 60 for 5 minutes.
# The two conditions are joined with "or", so the value printed in the
# description belongs to whichever series matched; the text itself does not
# say which resource tripped.
- alert: DHCP ECS High CPU/Memory Alert
expr: aws_ecs_cpuutilization_average{dimension_ClusterName="staff-device-pre-production-dhcp-cluster"} > 60 or aws_ecs_memory_utilization_average{dimension_ClusterName="staff-device-pre-production-dhcp-cluster"} > 60
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP ECS CPU or MemoryUtilization is above 60 percent
description: The Memory or CPU is currently {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: aws_kea_dhcp_alloc_engine_v4_alloc_fail_sum for the
# pre-production account has been above 10 for 10 minutes.
- alert: DHCP KEA Failed Leases
expr: aws_kea_dhcp_alloc_engine_v4_alloc_fail_sum{account_id="{{ .Values.pre_production_account_id }}" } > 10
for: 10m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: KEA DHCP Failed leases > 10
description: KEA DHCP Failed leases is greater than 10. The current value is {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: either aws_kea_dhcp_error_sum or aws_kea_dhcp_fatal_sum for the
# pre-production account has been above 150 for 7 minutes.
- alert: DHCP KEA Server Alert
expr: aws_kea_dhcp_error_sum{account_id="{{ .Values.pre_production_account_id }}" } > 150 or aws_kea_dhcp_fatal_sum{account_id="{{ .Values.pre_production_account_id }}" } > 150
for: 7m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP KEA Server Alert ERROR or FATAL > 150
description: The server alert ERROR or FATAL is greater than 150. The current value is {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: the CPU utilisation average of RDS instance
# staff-device-pre-production-dhcp-db has been above 60 for 5 minutes.
- alert: DHCP RDS CPU Alert
expr: aws_rds_cpuutilization_average{dimension_DBInstanceIdentifier="staff-device-pre-production-dhcp-db"} > 60
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP RDS CPU is above 60 percent
description: The RDS CPU for DHCP has been above 60 percent for 5 minutes.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Warning tier: subnet_statistics_usage has been above 80 for 5 minutes.
# NOTE(review): unlike the other rules in this group, expr has no account or
# cluster selector - confirm it cannot match series from other environments.
# NOTE(review): a rule with the same alert name fires at severity critical
# above 90, so both tiers are active at once beyond 90 unless an inhibit
# rule suppresses the warning - confirm.
- alert: DHCP Subnet Usage Alert
expr: subnet_statistics_usage > 80
for: 5m
labels:
severity: warning
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP Subnet utilisation > 80%
description: The DHCP Subnet utilisation has been above 80 percent for 5 minutes.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical tier: subnet_statistics_usage has been above 90 for 5 minutes.
# Shares its alert name with the warning tier (threshold 80); the severity
# label is what tells the two apart.
# NOTE(review): expr has no account or cluster selector - confirm it cannot
# match series from other environments.
- alert: DHCP Subnet Usage Alert
expr: subnet_statistics_usage > 90
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DHCP Subnet utilisation > 90%
description: The DHCP Subnet utilisation has been above 90 percent for 5 minutes.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: either the CPU utilisation average or the memory utilisation
# average of ECS cluster staff-device-pre-production-dns-cluster has been
# above 60 for 5 minutes.
# The two conditions are joined with "or", so the value printed in the
# description belongs to whichever series matched.
- alert: DNS ECS High CPU/Memory Alert
expr: aws_ecs_cpuutilization_average{dimension_ClusterName="staff-device-pre-production-dns-cluster"} > 60 or aws_ecs_memory_utilization_average{dimension_ClusterName="staff-device-pre-production-dns-cluster"} > 60
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DNS ECS CPU or MemoryUtilization is above 60 percent
description: The Memory or CPU is currently {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics?orgId=1&var-account_id=473630360727
# Critical: the unhealthy host count average of any network load balancer
# whose dimension_LoadBalancer label matches the regex .+dns.+ in the
# pre-production account has been above 0 for 1 minute.
- alert: DNS ECS Unhealthy Container Alert
expr: aws_networkelb_un_healthy_host_count_average{dimension_LoadBalancer=~".+dns.+",account_id="{{ .Values.pre_production_account_id }}"} > 0
for: 1m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DNS ECS Unhealthy Container is above 0
description: Unhealthy Container count is currently {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics?orgId=1&var-account_id=473630360727
# Critical: the average running task count reported for ECS cluster
# staff-device-pre-production-dns-cluster has been below 1 for 5 minutes.
# NOTE(review): a "< 1" comparison only matches while the series exists; if
# the metric stops being reported altogether this rule stays silent -
# confirm that case is covered elsewhere.
- alert: DNS ECS Task Count
expr: aws_ecs_containerinsights_running_task_count_average{dimension_ClusterName="staff-device-pre-production-dns-cluster"} < 1
for: 5m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: DNS ECS Task count is below 1
description: The ECS task count has fallen below 1 for DNS.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics?orgId=1&var-account_id=473630360727
# Critical: no aws_kea_dhcp_pkt4_request_received_average samples exist for
# the pre-production account over a 10 minute window.
# absent_over_time yields 1 when the window holds no samples, and "for: 10m"
# requires that to stay true for a further 10 minutes before firing.
- alert: Missing Request Metrics for Pre Production
expr: absent_over_time(aws_kea_dhcp_pkt4_request_received_average{account_id="{{ .Values.pre_production_account_id }}" }[10m]) == 1
for: 10m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: Missing request metrics for Pre Production
description: No metrics received for 10m.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: no aws_kea_dhcp_pkt4_offer_sent_average samples exist for the
# pre-production account over a 10 minute window.
# absent_over_time yields 1 when the window holds no samples, and "for: 10m"
# requires that to stay true for a further 10 minutes before firing.
- alert: Missing Offer Metrics for Pre Production
expr: absent_over_time(aws_kea_dhcp_pkt4_offer_sent_average{account_id="{{ .Values.pre_production_account_id }}" }[10m]) == 1
for: 10m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: Missing offer metrics for Pre Production
description: No metrics received for 10m.
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/cEwjsH1Gk/kea-dhcp-metrics?orgId=1&refresh=1m&var-account_id=473630360727
# Critical: prometheus_build_info for instance localhost:9090 has been above
# 1 for 1 minute. The summary states the same threshold as expr.
# NOTE(review): prometheus_build_info is normally a constant-1 info metric,
# so "> 1" would never match - confirm this rule is meant to stay dormant
# (it looks like a disabled copy of a test alert).
- alert: Prometheus_Build_Info Alert
expr: prometheus_build_info{instance="localhost:9090"} > 1
for: 1m
labels:
severity: critical
service: DNS DHCP Dev PreProd
namespace: {{ .Release.Namespace }}
annotations:
summary: Prometheus_build_info is above 1
description: prometheus_build_info of {{ "{{ $labels.instance }}" }} in {{ "{{ $labels.job }}" }} is currently {{ "{{ $value }}" }}
grafana_dashboard_url: ""
Original file line number Diff line number Diff line change
Expand Up @@ -121,7 +121,7 @@ spec:
description: The Memory or CPU is currently {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics
- alert: DNS ECS Unhealthy Container Alert
expr: aws_networkelb_un_healthy_host_count_minimum{dimension_LoadBalancer=~".+dns.+",account_id="{{ .Values.production_account_id }}"} > 0
expr: aws_networkelb_un_healthy_host_count_average{dimension_LoadBalancer=~".+dns.+",account_id="{{ .Values.production_account_id }}"} > 0
for: 1m
labels:
severity: critical
Expand All @@ -130,7 +130,7 @@ spec:
annotations:
summary: DNS ECS Unhealthy Container is above 0
description: Unhealthy Container count is currently {{ "{{ $value }}" }}
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics
grafana_dashboard_url: https://monitoring-alerting.staff.service.justice.gov.uk/d/tm5gLH1Gz/bind-dns-metrics
- alert: DNS ECS Task Count
expr: aws_ecs_containerinsights_running_task_count_average{dimension_ClusterName="staff-device-production-dns-cluster"} < 1
for: 5m
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,4 @@ data:
network_access_control_critical_slack_webhook_url: {{ .Values.alertmanager.alert_rules.network_access_control_critical_slack_webhook_url }}
pagerduty_routing_key: {{ .Values.alertmanager.alert_rules.pagerduty_routing_key }}
dhcp_dns_slack_webhook_url: {{ .Values.alertmanager.alert_rules.dhcp_dns_slack_webhook_url }}
development_pre_production_dhcp_dns_slack_webhook_url: {{ .Values.alertmanager.alert_rules.development_pre_production_dhcp_dns_slack_webhook_url }}
Loading