From 27b5ee5a206a096e61f6c953c5b62f8c73daacd9 Mon Sep 17 00:00:00 2001 From: Nigel Pain Date: Tue, 17 Dec 2024 14:28:26 +0000 Subject: [PATCH 1/3] added config map for crime evidence grafana dashboard --- .../laa-crime-evidence-prod/06-grafana.yaml | 648 ++++++++++++++++++ 1 file changed, 648 insertions(+) create mode 100644 namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/06-grafana.yaml diff --git a/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/06-grafana.yaml b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/06-grafana.yaml new file mode 100644 index 00000000000..9ca76e7643a --- /dev/null +++ b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/06-grafana.yaml @@ -0,0 +1,648 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: laa-crime-evidence-dashboard + namespace: laa-crime-evidence-prod + labels: + grafana_dashboard: "laa-crime-evidence" +data: + laa-crime-evidence-dashboard.json: | + { + "annotations": { + "list": [ + { + "builtIn": 1, + "datasource": { + "type": "datasource", + "uid": "grafana" + }, + "enable": true, + "hide": true, + "iconColor": "rgba(0, 211, 255, 1)", + "limit": 100, + "name": "Annotations & Alerts", + "showIn": 0, + "target": { + "limit": 100, + "matchAny": false, + "tags": [], + "type": "dashboard" + }, + "type": "dashboard" + } + ] + }, + "editable": true, + "fiscalYearStartMonth": 0, + "graphTooltip": 1, + "id": 107, + "links": [], + "liveNow": false, + "panels": [ + { + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 0 + }, + "id": 23, + "title": "Crime Evidence Metrics", + "type": "row" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "% of Object Heap Type Memory Used", + "axisPlacement": "left", + 
"barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 0, + "y": 1 + }, + "id": 17, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(jvm_memory_used_bytes{area=\"heap\", id=\"Tenured Gen\", namespace=\"$namespace\"}/jvm_memory_max_bytes{area=\"heap\", id=\"Tenured Gen\", namespace=\"$namespace\"})*100", + "legendFormat": "Tenured Gen Object Heap Usage for Pod: {{pod}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(jvm_memory_used_bytes{area=\"heap\", id=\"Survivor Space\", namespace=\"$namespace\"}/jvm_memory_max_bytes{area=\"heap\", id=\"Survivor Space\", namespace=\"$namespace\"})*100", + "hide": false, + "legendFormat": "Survivor Space Object Heap Usage for Pod: {{pod}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "(jvm_memory_used_bytes{area=\"heap\", id=\"Eden Space\", namespace=\"$namespace\"}/jvm_memory_max_bytes{area=\"heap\", id=\"Eden Space\", namespace=\"$namespace\"})*100", + "hide": false, + 
"legendFormat": "Eden Space Object Heap Usage for Pod: {{pod}}", + "range": true, + "refId": "C" + } + ], + "title": "Object Heap Memory Used", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "% CPU Usage", + "axisPlacement": "left", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 12, + "w": 12, + "x": 12, + "y": 1 + }, + "id": 25, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "process_cpu_usage{namespace=\"$namespace\"} * 100", + "legendFormat": "JVM Process CPU Usage for Pod: {{pod}}", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "system_cpu_usage{namespace=\"$namespace\"} * 100", + "hide": false, + "legendFormat": "System CPU Usage for Pod: {{pod}}", + "range": true, + "refId": "B" + } + ], + "title": "JVM and System CPU Usage for Pods", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + 
"uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Time in Seconds", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 13 + }, + "id": 27, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(http_server_requests_seconds_sum{outcome=\"SUCCESS\", namespace=\"$namespace\"})/\nsum(http_server_requests_seconds_count{outcome=\"SUCCESS\", namespace=\"$namespace\"})", + "legendFormat": "Average Successful Response Time", + "range": true, + "refId": "A" + } + ], + "title": "Average Successful Response Time", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Increase in Errors", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + 
"tooltip": false, + "viz": false + }, + "lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 12, + "y": 13 + }, + "id": 29, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(http_server_requests_seconds_count{outcome=\"SERVER_ERROR\", namespace=\"$namespace\"}[10m]))", + "legendFormat": "Increase in server errors over the past 10 minutes", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(http_server_requests_seconds_count{outcome=\"CLIENT_ERROR\", namespace=\"$namespace\"}[10m]))", + "hide": false, + "legendFormat": "Increase in client errors over the past 10 minutes", + "range": true, + "refId": "B" + } + ], + "title": "Increases in Client and Server Error Responses", + "type": "timeseries" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "palette-classic" + }, + "custom": { + "axisCenteredZero": false, + "axisColorMode": "text", + "axisLabel": "Number of logs", + "axisPlacement": "auto", + "barAlignment": 0, + "drawStyle": "line", + "fillOpacity": 0, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + 
"lineInterpolation": "linear", + "lineWidth": 1, + "pointSize": 5, + "scaleDistribution": { + "type": "linear" + }, + "showPoints": "auto", + "spanNulls": false, + "stacking": { + "group": "A", + "mode": "none" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 10, + "w": 12, + "x": 0, + "y": 23 + }, + "id": 31, + "options": { + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(logback_events_total{level=\"error\", namespace=\"$namespace\"}[10m]))", + "legendFormat": "Increase in error level logs in the past 10 minutes", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "editorMode": "code", + "expr": "sum(increase(logback_events_total{level=\"info\", namespace=\"$namespace\"}[10m]))", + "hide": false, + "legendFormat": "Increase in info level logs in the past 10 minutes", + "range": true, + "refId": "B" + } + ], + "title": "Increases in logs in past 10 minutes (error and info level)", + "type": "timeseries" + } + ], + "refresh": "30s", + "schemaVersion": 37, + "style": "dark", + "tags": [ + "laa-crime-evidence" + ], + "templating": { + "list": [ + { + "current": { + "selected": true, + "text": "laa-crime-evidence-prod", + "value": "laa-crime-evidence-prod" + }, + "datasource": { + "type": "prometheus", + "uid": "prometheus" + }, + "definition": "label_values(kube_deployment_metadata_generation, namespace)", + "hide": 0, + "includeAll": false, + "label": "Namespace", + "multi": false, + "name": "namespace", + "options": [], + "query": { + "query": 
"label_values(kube_deployment_metadata_generation, namespace)", + "refId": "Prometheus-namespace-Variable-Query" + }, + "refresh": 1, + "regex": "/^laa-crime-evidence-/", + "skipUrlSync": false, + "sort": 0, + "tagValuesQuery": "", + "tagsQuery": "", + "type": "query", + "useTags": false + } + ] + }, + "time": { + "from": "now-30m", + "to": "now" + }, + "timepicker": { + "refresh_intervals": [ + "10s", + "30s", + "1m", + "5m", + "15m", + "30m", + "1h", + "2h", + "1d" + ], + "time_options": [ + "5m", + "15m", + "1h", + "6h", + "12h", + "24h", + "2d", + "7d", + "30d" + ] + }, + "timezone": "browser", + "title": "LAA Crime Evidence", + "uid": "6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2", + "version": 2, + "weekStart": "" + } \ No newline at end of file From e87806a6d4438d69ede9b5f9b2cc9d1c31576574 Mon Sep 17 00:00:00 2001 From: Nigel Pain Date: Tue, 17 Dec 2024 15:56:50 +0000 Subject: [PATCH 2/3] added config for prometheus alerts for crime evidence --- .../05-prometheus-rules.yaml | 134 ++++++++++++++++++ 1 file changed, 134 insertions(+) create mode 100644 namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/05-prometheus-rules.yaml diff --git a/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/05-prometheus-rules.yaml b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/05-prometheus-rules.yaml new file mode 100644 index 00000000000..c17c964703e --- /dev/null +++ b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/05-prometheus-rules.yaml @@ -0,0 +1,134 @@ +apiVersion: monitoring.coreos.com/v1 +kind: PrometheusRule +metadata: + namespace: laa-crime-evidence-prod + labels: + role: alert-rules + name: prometheus-custom-rules-laa-crime-evidence +spec: + groups: + - name: application-rules + rules: + - alert: Client Response Errors + expr: sum(increase(http_server_requests_seconds_count{outcome="CLIENT_ERROR", namespace="laa-crime-evidence-prod"}[10m])) > 1 + for: 
1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: There has been an increase in client error responses in the Crime Evidence Service running on production in the past 10 minutes. This may indicate a problem with clients calling the application - including intrusion attempts. + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Server Response Error + expr: sum(increase(http_server_requests_seconds_count{outcome="SERVER_ERROR", namespace="laa-crime-evidence-prod"}[10m])) > 1 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: There has been an increase in server error responses from the Crime Evidence Service running on production in the past 10 minutes. This may indicate a problem with the server processing client requests - likely a bug. + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: JVM CPU Usage + expr: (process_cpu_usage{namespace="laa-crime-evidence-prod"} * 100) > 95 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: The crime-evidence application running on a production pod has been using over 95% of the CPU for a minute. This may indicate the pods running this application require more CPU resources. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: System CPU Usage + expr: (system_cpu_usage{namespace="laa-crime-evidence-prod"} * 100) > 95 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: A pod that runs the crime-evidence on production has been using over 95% of the pod's CPU for a minute. This may indicate there is some underlying process other than our application on the pod that is using up all the CPU resources and warrants further investigation. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Object Heap Memory Usage - Tenured Gen + expr: ((jvm_memory_used_bytes{area="heap", id="Tenured Gen", namespace="laa-crime-evidence-prod"}/jvm_memory_max_bytes{area="heap", id="Tenured Gen", namespace="laa-crime-evidence-prod"})*100) > 95 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: Over 95% of the "Tenured Gen" object heap memory on a production pod has been used. This may indicate our application needs more object heap memory or we have a memory resource leak in our application. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Object Heap Memory Usage - Survivor Space + expr: ((jvm_memory_used_bytes{area="heap", id="Survivor Space", namespace="laa-crime-evidence-prod"}/jvm_memory_max_bytes{area="heap", id="Survivor Space", namespace="laa-crime-evidence-prod"})*100) > 95 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: Over 95% of the "Survivor Space" object heap memory on a production pod has been used. This may indicate our application needs more object heap memory or we have a memory resource leak in our application. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Object Heap Memory Usage - Eden Space + expr: ((jvm_memory_used_bytes{area="heap", id="Eden Space", namespace="laa-crime-evidence-prod"}/jvm_memory_max_bytes{area="heap", id="Eden Space", namespace="laa-crime-evidence-prod"})*100) > 95 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: Over 95% of the "Eden Space" object heap memory on a production pod has been used. This may indicate our application needs more object heap memory or we have a memory resource leak in our application. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Response Time Excessive + expr: (sum(http_server_requests_seconds_sum{outcome="SUCCESS", namespace="laa-crime-evidence-prod"})/sum(http_server_requests_seconds_count{outcome="SUCCESS", namespace="laa-crime-evidence-prod"})) > 5 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: The response time for successful responses on production is taking on average over five seconds. This indicates responses to our clients are taking too long. + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Logging Error + expr: sum(increase(logback_events_total{level="error", namespace="laa-crime-evidence-prod"}[10m])) > 1 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: There has been an error in the logs of the Crime Evidence service in the past 10 minutes. This indicates that there may be a bug with the application. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md + dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod + - alert: Instance-Down + expr: absent(up{namespace="laa-crime-evidence-prod"}) == 1 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: The production instance of the Crime Evidence service has been down for >1m. + - alert: Quota-Exceeded + expr: 100 * kube_resourcequota{job="kube-state-metrics",type="used",namespace="laa-crime-evidence-prod"} / ignoring(instance, job, type) (kube_resourcequota{job="kube-state-metrics",type="hard",namespace="laa-crime-evidence-prod"} > 0) > 90 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: Namespace {{ $labels.namespace }} is using {{ printf "%0.0f" $value}}% of its {{ $labels.resource }} quota. 
+ runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubequotaexceeded + - alert: KubePodCrashLooping + expr: round(rate(kube_pod_container_status_restarts_total{job="kube-state-metrics", namespace="laa-crime-evidence-prod"}[10m]) * 60 * 10) > 1 + for: 5m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: Pod {{ $labels.namespace }}/{{ $labels.pod }} ({{ $labels.container }}) is restarting excessively + runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubepodcrashlooping + - alert: Increase in 403 Blocked Requests + expr: sum(increase(nginx_ingress_controller_requests{exported_namespace="laa-crime-evidence-prod", ingress="laa-crime-evidence", status="403"}[5m])) > 1 + for: 1m + labels: + severity: laa-crime-evidence-alerts-prod + namespace: laa-crime-evidence-prod + annotations: + message: The rate of requests blocked by the internal ingress has been increasing over the past 5 minutes. 
+ dashboard_url: https://grafana.live.cloud-platform.service.justice.gov.uk/d/6ab2e7a7933f4d42bd5871e8df8ff6b3703625e2/laa-crime-evidence?var-namespace=laa-crime-evidence-prod \ No newline at end of file From 9db6bdff38f6ac70ce4f24eaea0b18850c4f9cec Mon Sep 17 00:00:00 2001 From: Nigel Pain Date: Wed, 18 Dec 2024 08:13:59 +0000 Subject: [PATCH 3/3] added config for new crime evidence secret for slack webhook --- .../laa-crime-evidence-prod/resources/secret.tf | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/resources/secret.tf b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/resources/secret.tf index a6ab1785910..07b490fcdcb 100644 --- a/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/resources/secret.tf +++ b/namespaces/live.cloud-platform.service.justice.gov.uk/laa-crime-evidence-prod/resources/secret.tf @@ -20,5 +20,10 @@ module "secrets_manager" { recovery_window_in_days = 7 k8s_secret_name = "maat-api-oauth-client-secret" }, + "crime_evidence_alert_webhook_prod" = { + description = "Crime Evidence Slack Webhook", + recovery_window_in_days = 7, + k8s_secret_name = "crime-evidence-alert-webhook-prod" + } } } \ No newline at end of file