diff --git a/monitoring-mixins/agent-flow-mixin/deploy/alerts.yaml b/monitoring-mixins/agent-flow-mixin/deploy/alerts.yaml index 14b3f3e4..1a4c02ea 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/alerts.yaml +++ b/monitoring-mixins/agent-flow-mixin/deploy/alerts.yaml @@ -50,3 +50,15 @@ groups: max by (cluster, namespace, sha256) (agent_config_hash and on(cluster, namespace) cluster_node_info) ) > 1 for: 5m + - name: agent_controller + rules: + - alert: SlowComponentEvaluations + annotations: + message: Flow component evaluations are taking too long. + expr: sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0 + for: 15m + - alert: UnhealthyComponents + annotations: + message: Unhealthy Flow components detected. + expr: sum(agent_component_controller_running_components{health_type!="healthy"}) > 0 + for: 15m diff --git a/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-controller.json b/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-controller.json index 1068e500..d431a453 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-controller.json +++ b/monitoring-mixins/agent-flow-mixin/deploy/dashboards_out/agent-flow-controller.json @@ -343,13 +343,39 @@ }, { "datasource": "${datasource}", - "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", + "description": "The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate.\n\nIdeally, no component should take more than 1 minute to evaluate. The components displayed in this chart\nmay be a sign of a problem with the pipeline.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, "gridPos": { "h": 10, "w": 8, "x": 16, "y": 12 }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\n/ scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))\n", + "instant": false, + "legendFormat": "{{component_id}}", + "range": true + } + ], + "title": "Slow components evaluation times", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 22 + }, "maxDataPoints": 30, "options": { "calculate": false, @@ -395,7 +421,7 @@ "gridPos": { "h": 10, "w": 8, - "x": 0, + "x": 8, "y": 22 }, "maxDataPoints": 30, diff --git a/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml index 8c05f5d0..6d7fd673 100644 --- a/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml +++ b/monitoring-mixins/agent-flow-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -1196,13 +1196,39 @@ data: }, { "datasource": "${datasource}", - "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", + "description": "The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate.\n\nIdeally, no component should take more than 1 minute to evaluate. The components displayed in this chart\nmay be a sign of a problem with the pipeline.\n", + "fieldConfig": { + "defaults": { + "unit": "percentunit" + } + }, "gridPos": { "h": 10, "w": 8, "x": 16, "y": 12 }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval]))\n/ scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster=\"$cluster\", namespace=\"$namespace\"}[$__rate_interval])))\n", + "instant": false, + "legendFormat": "{{component_id}}", + "range": true + } + ], + "title": "Slow components evaluation times", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Detailed histogram view of how long component evaluations take.\n\nThe goal is to design your config so that evaluations take as little\ntime as possible; under 100ms is a good goal.\n", + "gridPos": { + "h": 10, + "w": 8, + "x": 0, + "y": 22 + }, "maxDataPoints": 30, "options": { "calculate": false, @@ -1248,7 +1274,7 @@ data: "gridPos": { "h": 10, "w": 8, - "x": 0, + "x": 8, "y": 22 }, "maxDataPoints": 30, diff --git a/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json b/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json index 4b9a01d0..8f5007b3 100644 --- a/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/agent-flow-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "operations/agent-flow-mixin" } }, - "version": "5a7c16a8d6511535d815d50d974ca26f4e99524a", - "sum": "SoE2uqP81c1mYT38Q39mfWAAuv9ItXKO/hkIJp+hM9E=" + "version": "5c0fa9bf270beafcabfe46f9966f5b2c2dc84fff", + "sum": "TyMxO1sQ2p21MKAYqufOO5HVp/UtTyMBMLmBK4kM6Pw=" } ], "legacyImports": false diff --git a/monitoring-mixins/agent-flow-mixin/prometheus-alert.yaml b/monitoring-mixins/agent-flow-mixin/prometheus-alert.yaml index 05aabbf8..9eacc644 100644 --- a/monitoring-mixins/agent-flow-mixin/prometheus-alert.yaml +++ b/monitoring-mixins/agent-flow-mixin/prometheus-alert.yaml @@ -57,4 +57,15 @@ spec: max by (cluster, namespace, sha256) (agent_config_hash and on(cluster, namespace) cluster_node_info) ) > 1 for: 5m - + - name: agent_controller + rules: + - alert: SlowComponentEvaluations + annotations: + message: Flow component evaluations are taking too long. + expr: sum by (cluster, namespace, component_id) (rate(agent_component_evaluation_slow_seconds[10m])) > 0 + for: 15m + - alert: UnhealthyComponents + annotations: + message: Unhealthy Flow components detected. + expr: sum(agent_component_controller_running_components{health_type!="healthy"}) > 0 + for: 15m diff --git a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts.libsonnet b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts.libsonnet index 92555dc8..d1473e60 100644 --- a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts.libsonnet +++ b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/alerts.libsonnet @@ -2,6 +2,7 @@ prometheusAlerts+: { groups+: [ (import './alerts/clustering.libsonnet'), + (import './alerts/controller.libsonnet'), ], }, } diff --git a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet index 1dcfcd3c..3876d612 100644 --- a/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet +++ b/monitoring-mixins/agent-flow-mixin/vendor/github.com/grafana/agent/operations/agent-flow-mixin/dashboards/controller.libsonnet @@ -233,6 +233,28 @@ local filename = 'agent-flow-controller.json'; ]) ), + // Slow components evaluation time % + ( + panel.new(title='Slow components evaluation times', type='timeseries') + + panel.withUnit('percentunit') + + panel.withDescription(||| + The percentage of time spent evaluating 'slow' components - components that took longer than 1 minute to evaluate. + + Ideally, no component should take more than 1 minute to evaluate. The components displayed in this chart + may be a sign of a problem with the pipeline. + |||) + + panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + + panel.withQueries([ + panel.newQuery( + expr=||| + sum by (component_id) (rate(agent_component_evaluation_slow_seconds{cluster="$cluster", namespace="$namespace"}[$__rate_interval])) + / scalar(sum(rate(agent_component_evaluation_seconds_sum{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))) + |||, + legendFormat='{{component_id}}', + ), + ]) + ), + // Component evaluation histogram ( panel.newHeatmap('Component evaluation histogram') + @@ -242,7 +264,7 @@ local filename = 'agent-flow-controller.json'; The goal is to design your config so that evaluations take as little time as possible; under 100ms is a good goal. |||) + - panel.withPosition({ x: 16, y: 12, w: 8, h: 10 }) + + panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( expr='sum by (le) (increase(agent_component_evaluation_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))', @@ -261,7 +283,7 @@ local filename = 'agent-flow-controller.json'; The goal is to design your config so that most of the time components do not queue for long; under 10ms is a good goal. |||) + - panel.withPosition({ x: 0, y: 22, w: 8, h: 10 }) + + panel.withPosition({ x: 8, y: 22, w: 8, h: 10 }) + panel.withQueries([ panel.newQuery( expr='sum by (le) (increase(agent_component_dependencies_wait_seconds_bucket{cluster="$cluster", namespace="$namespace"}[$__rate_interval]))',