From 57e8a24c07b26e4e963da5aebf1f60597afdafe7 Mon Sep 17 00:00:00 2001 From: Weifeng Wang Date: Sat, 28 Sep 2024 11:58:49 +0800 Subject: [PATCH] update alloy-mixin to 1.4.1 Signed-off-by: Weifeng Wang regenerates k8s manifests Signed-off-by: Weifeng Wang --- .../alloy/manifests/k8s-all-in-one.yaml | 266 +++++++++++------- .../logs/k8s-all-in-one.yaml | 266 +++++++++++------- .../metrics/k8s-all-in-one.yaml | 266 +++++++++++------- .../profiles/k8s-all-in-one.yaml | 266 +++++++++++------- .../traces/k8s-all-in-one.yaml | 266 +++++++++++------- .../all-in-one/k8s-all-in-one.yaml | 266 +++++++++++------- .../monolithic-mode/logs/k8s-all-in-one.yaml | 266 +++++++++++------- .../metrics/k8s-all-in-one.yaml | 266 +++++++++++------- .../profiles/k8s-all-in-one.yaml | 266 +++++++++++------- .../traces/k8s-all-in-one.yaml | 266 +++++++++++------- .../read-write-mode/logs/k8s-all-in-one.yaml | 266 +++++++++++------- .../metrics/k8s-all-in-one.yaml | 266 +++++++++++------- .../deploy/alloy-mixin-alerts.yaml | 66 +++-- .../dashboards_out/alloy-cluster-node.json | 5 +- .../alloy-cluster-overview.json | 31 +- .../dashboards_out/alloy-controller.json | 5 +- .../deploy/dashboards_out/alloy-logs.json | 2 +- .../dashboards_out/alloy-opentelemetry.json | 17 +- .../alloy-prometheus-remote-write.json | 121 ++++---- .../dashboards_out/alloy-resources.json | 3 - .../deploy/manifests/k8s-all-in-one.yaml | 266 +++++++++++------- .../alloy-mixin/deploy/prometheus-alerts.yaml | 66 +++-- .../alloy-mixin/jsonnetfile.json | 2 +- .../alloy-mixin/jsonnetfile.lock.json | 10 +- 24 files changed, 2421 insertions(+), 1365 deletions(-) diff --git a/kubernetes/common/alloy/manifests/k8s-all-in-one.yaml b/kubernetes/common/alloy/manifests/k8s-all-in-one.yaml index dd7fe42e..6bbdcf82 100644 --- a/kubernetes/common/alloy/manifests/k8s-all-in-one.yaml +++ b/kubernetes/common/alloy/manifests/k8s-all-in-one.yaml @@ -162,7 +162,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -583,11 +583,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -682,7 +679,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -913,6 +910,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -963,11 +986,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1093,7 +1113,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1541,11 +1561,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2919,7 +2936,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -2951,7 +2968,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3062,7 +3079,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3084,7 +3101,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3105,7 +3122,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3148,7 +3165,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3180,7 +3197,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3238,11 +3255,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3445,66 +3459,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3512,34 +3533,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3614,9 +3654,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3663,7 +3703,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3696,7 +3736,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3729,7 +3769,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3755,7 +3795,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3786,7 +3826,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3812,7 +3852,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3875,11 +3915,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4276,11 +4313,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4611,76 +4645,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/microservices-mode/logs/k8s-all-in-one.yaml b/kubernetes/microservices-mode/logs/k8s-all-in-one.yaml index e5aa6187..34919a4c 100644 --- a/kubernetes/microservices-mode/logs/k8s-all-in-one.yaml +++ b/kubernetes/microservices-mode/logs/k8s-all-in-one.yaml @@ -408,7 +408,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -829,11 +829,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -928,7 +925,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1159,6 +1156,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1209,11 +1232,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1368,7 +1388,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1816,11 +1836,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3194,7 +3211,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3226,7 +3243,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3337,7 +3354,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3359,7 +3376,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3380,7 +3397,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3423,7 +3440,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3455,7 +3472,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3513,11 +3530,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3720,66 +3734,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3787,34 +3808,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3889,9 +3929,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3938,7 +3978,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3971,7 +4011,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -4004,7 +4044,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4030,7 +4070,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4061,7 +4101,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4087,7 +4127,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4150,11 +4190,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4551,11 +4588,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -6960,76 +6994,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml b/kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml index d01750f8..def4c7fa 100644 --- a/kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml +++ b/kubernetes/microservices-mode/metrics/k8s-all-in-one.yaml @@ -186,7 +186,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -607,11 +607,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -706,7 +703,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -937,6 +934,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -987,11 +1010,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1138,7 +1158,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1586,11 +1606,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2964,7 +2981,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -2996,7 +3013,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3107,7 +3124,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3129,7 +3146,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3150,7 +3167,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3193,7 +3210,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3225,7 +3242,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3283,11 +3300,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3490,66 +3504,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3557,34 +3578,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3659,9 +3699,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3708,7 +3748,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3741,7 +3781,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3774,7 +3814,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3800,7 +3840,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3831,7 +3871,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3857,7 +3897,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3920,11 +3960,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4321,11 +4358,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -6777,76 +6811,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml b/kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml index 8109e437..13e5a001 100644 --- a/kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml +++ b/kubernetes/microservices-mode/profiles/k8s-all-in-one.yaml @@ -246,7 +246,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -667,11 +667,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -766,7 +763,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -997,6 +994,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1047,11 +1070,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1203,7 +1223,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1651,11 +1671,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3029,7 +3046,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3061,7 +3078,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3172,7 +3189,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3194,7 +3211,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3215,7 +3232,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3258,7 +3275,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3290,7 +3307,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3348,11 +3365,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3555,66 +3569,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3622,34 +3643,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3724,9 +3764,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3773,7 +3813,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3806,7 +3846,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3839,7 +3879,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3865,7 +3905,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3896,7 +3936,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3922,7 +3962,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3985,11 +4025,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4386,11 +4423,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -6087,76 +6121,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/microservices-mode/traces/k8s-all-in-one.yaml b/kubernetes/microservices-mode/traces/k8s-all-in-one.yaml index 02c62bf4..f3583420 100644 --- a/kubernetes/microservices-mode/traces/k8s-all-in-one.yaml +++ b/kubernetes/microservices-mode/traces/k8s-all-in-one.yaml @@ -578,7 +578,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -999,11 +999,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1098,7 +1095,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1329,6 +1326,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1379,11 +1402,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1548,7 +1568,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1996,11 +2016,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3374,7 +3391,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3406,7 +3423,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3517,7 +3534,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3539,7 +3556,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3560,7 +3577,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3603,7 +3620,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3635,7 +3652,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3693,11 +3710,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3900,66 +3914,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3967,34 +3988,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -4069,9 +4109,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -4118,7 +4158,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -4151,7 +4191,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -4184,7 +4224,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4210,7 +4250,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4241,7 +4281,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4267,7 +4307,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4330,11 +4370,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4731,11 +4768,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -6846,76 +6880,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml b/kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml index 4af60d71..e02c5124 100644 --- a/kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml +++ b/kubernetes/monolithic-mode/all-in-one/k8s-all-in-one.yaml @@ -910,7 +910,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1331,11 +1331,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1430,7 +1427,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1661,6 +1658,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1711,11 +1734,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1896,7 +1916,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -2344,11 +2364,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3722,7 +3739,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3754,7 +3771,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3865,7 +3882,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3887,7 +3904,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3908,7 +3925,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3951,7 +3968,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3983,7 +4000,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -4041,11 +4058,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4248,66 +4262,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -4315,34 +4336,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -4417,9 +4457,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -4466,7 +4506,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -4499,7 +4539,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -4532,7 +4572,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4558,7 +4598,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4589,7 +4629,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4615,7 +4655,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4678,11 +4718,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5079,11 +5116,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -6729,76 +6763,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a
-          split brain state.
+          split brain state. Job is {{ $labels.job }}
+        summary: Nodes report different number of peers vs. the count of observed
+          Alloy metrics.
       expr: |
         sum without (state) (cluster_node_peers) !=
-          on (cluster, namespace, job) group_left
-          count by (cluster, namespace, job) (cluster_node_info)
+          on (cluster, namespace, job, cluster_name) group_left
+          count by (cluster, namespace, job, cluster_name) (cluster_node_info)
       for: 15m
+      labels:
+        severity: warning
     - alert: ClusterNodeUnhealthy
       annotations:
-        message: Cluster node is reporting a gossip protocol health score > 0.
+        description: Cluster node is reporting a gossip protocol health score > 0.
+          Job is {{ $labels.job }}
+        summary: Cluster unhealthy.
       expr: |
         cluster_node_gossip_health_score > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterNodeNameConflict
       annotations:
-        message: A node tried to join the cluster with a name conflicting with an
-          existing peer.
-      expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m]))
+        description: A node tried to join the cluster with a name conflicting with
+          an existing peer. Job is {{ $labels.job }}
+        summary: Cluster Node Name Conflict.
+      expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m]))
         > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterNodeStuckTerminating
       annotations:
-        message: Cluster node stuck in Terminating state.
-      expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"})
+        description: There is a node within the cluster that is stuck in Terminating
+          state. Job is {{ $labels.job }}
+        summary: Cluster node stuck in Terminating state.
+      expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"})
         > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterConfigurationDrift
       annotations:
-        message: Cluster nodes are not using the same configuration file.
+        description: Cluster nodes are not using the same configuration file. Job
+          is {{ $labels.job }}
+        summary: Cluster configuration drifting.
       expr: |
         count without (sha256) (
-            max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
+            max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info)
         ) > 1
       for: 5m
+      labels:
+        severity: warning
   - name: alloy_controller
     rules:
     - alert: SlowComponentEvaluations
       annotations:
-        message: Component evaluations are taking too long.
+        description: Component evaluations are taking too long under job {{ $labels.job
+          }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id
+          }}.
+        summary: Component evaluations are taking too long.
       expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m]))
         > 0
       for: 15m
+      labels:
+        severity: warning
     - alert: UnhealthyComponents
       annotations:
-        message: Unhealthy components detected.
+        description: Unhealthy components detected under job {{ $labels.job }}
+        summary: Unhealthy components detected.
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml b/kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml index fdecd2e0..6cc80c33 100644 --- a/kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml +++ b/kubernetes/monolithic-mode/logs/k8s-all-in-one.yaml @@ -387,7 +387,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -808,11 +808,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -907,7 +904,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1138,6 +1135,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1188,11 +1211,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1347,7 +1367,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1795,11 +1815,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3173,7 +3190,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3205,7 +3222,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3316,7 +3333,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3338,7 +3355,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3359,7 +3376,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3402,7 +3419,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3434,7 +3451,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3492,11 +3509,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3699,66 +3713,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3766,34 +3787,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3868,9 +3908,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3917,7 +3957,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3950,7 +3990,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3983,7 +4023,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4009,7 +4049,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4040,7 +4080,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4066,7 +4106,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4129,11 +4169,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4530,11 +4567,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5367,76 +5401,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a
-          split brain state.
+          split brain state. Job is {{ $labels.job }}
+        summary: Nodes report different number of peers vs. the count of observed
+          Alloy metrics.
       expr: |
         sum without (state) (cluster_node_peers) !=
-          on (cluster, namespace, job) group_left
-          count by (cluster, namespace, job) (cluster_node_info)
+          on (cluster, namespace, job, cluster_name) group_left
+          count by (cluster, namespace, job, cluster_name) (cluster_node_info)
       for: 15m
+      labels:
+        severity: warning
     - alert: ClusterNodeUnhealthy
       annotations:
-        message: Cluster node is reporting a gossip protocol health score > 0.
+        description: Cluster node is reporting a gossip protocol health score > 0.
+          Job is {{ $labels.job }}
+        summary: Cluster unhealthy.
       expr: |
         cluster_node_gossip_health_score > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterNodeNameConflict
       annotations:
-        message: A node tried to join the cluster with a name conflicting with an
-          existing peer.
-      expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m]))
+        description: A node tried to join the cluster with a name conflicting with
+          an existing peer. Job is {{ $labels.job }}
+        summary: Cluster Node Name Conflict.
+      expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m]))
         > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterNodeStuckTerminating
       annotations:
-        message: Cluster node stuck in Terminating state.
-      expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"})
+        description: There is a node within the cluster that is stuck in Terminating
+          state. Job is {{ $labels.job }}
+        summary: Cluster node stuck in Terminating state.
+      expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"})
         > 0
       for: 10m
+      labels:
+        severity: warning
     - alert: ClusterConfigurationDrift
       annotations:
-        message: Cluster nodes are not using the same configuration file.
+        description: Cluster nodes are not using the same configuration file. Job
+          is {{ $labels.job }}
+        summary: Cluster configuration drifting.
       expr: |
         count without (sha256) (
-            max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info)
+            max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info)
         ) > 1
       for: 5m
+      labels:
+        severity: warning
   - name: alloy_controller
     rules:
     - alert: SlowComponentEvaluations
       annotations:
-        message: Component evaluations are taking too long.
+        description: Component evaluations are taking too long under job {{ $labels.job
+          }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id
+          }}.
+        summary: Component evaluations are taking too long.
       expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m]))
         > 0
       for: 15m
+      labels:
+        severity: warning
     - alert: UnhealthyComponents
       annotations:
-        message: Unhealthy components detected.
+        description: Unhealthy components detected under job {{ $labels.job }}
+        summary: Unhealthy components detected.
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml b/kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml index 1889b970..c2c1fb1f 100644 --- a/kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml +++ b/kubernetes/monolithic-mode/metrics/k8s-all-in-one.yaml @@ -174,7 +174,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -595,11 +595,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -694,7 +691,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -925,6 +922,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -975,11 +998,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1126,7 +1146,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1574,11 +1594,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2952,7 +2969,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -2984,7 +3001,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3095,7 +3112,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3117,7 +3134,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3138,7 +3155,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3181,7 +3198,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3213,7 +3230,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3271,11 +3288,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3478,66 +3492,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3545,34 +3566,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3647,9 +3687,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3696,7 +3736,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3729,7 +3769,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3762,7 +3802,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3788,7 +3828,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3819,7 +3859,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3845,7 +3885,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3908,11 +3948,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4309,11 +4346,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4934,76 +4968,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml b/kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml index 5acb286e..bbf2fe5c 100644 --- a/kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml +++ b/kubernetes/monolithic-mode/profiles/k8s-all-in-one.yaml @@ -237,7 +237,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -658,11 +658,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -757,7 +754,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -988,6 +985,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1038,11 +1061,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1194,7 +1214,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1642,11 +1662,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3020,7 +3037,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3052,7 +3069,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3163,7 +3180,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3185,7 +3202,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3206,7 +3223,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3249,7 +3266,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3281,7 +3298,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3339,11 +3356,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3546,66 +3560,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3613,34 +3634,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3715,9 +3755,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3764,7 +3804,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3797,7 +3837,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3830,7 +3870,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3856,7 +3896,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3887,7 +3927,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3913,7 +3953,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3976,11 +4016,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4377,11 +4414,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5255,76 +5289,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml b/kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml index f34f4f5d..de80ade6 100644 --- a/kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml +++ b/kubernetes/monolithic-mode/traces/k8s-all-in-one.yaml @@ -567,7 +567,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -988,11 +988,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1087,7 +1084,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1318,6 +1315,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1368,11 +1391,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1552,7 +1572,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -2000,11 +2020,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3378,7 +3395,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3410,7 +3427,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3521,7 +3538,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3543,7 +3560,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3564,7 +3581,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3607,7 +3624,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3639,7 +3656,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3697,11 +3714,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3904,66 +3918,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3971,34 +3992,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -4073,9 +4113,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -4122,7 +4162,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -4155,7 +4195,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -4188,7 +4228,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4214,7 +4254,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4245,7 +4285,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4271,7 +4311,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4334,11 +4374,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4735,11 +4772,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5894,76 +5928,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/read-write-mode/logs/k8s-all-in-one.yaml b/kubernetes/read-write-mode/logs/k8s-all-in-one.yaml index e3cef7b4..5967fdd6 100644 --- a/kubernetes/read-write-mode/logs/k8s-all-in-one.yaml +++ b/kubernetes/read-write-mode/logs/k8s-all-in-one.yaml @@ -396,7 +396,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -817,11 +817,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -916,7 +913,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -1147,6 +1144,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -1197,11 +1220,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1356,7 +1376,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1804,11 +1824,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3182,7 +3199,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3214,7 +3231,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3325,7 +3342,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3347,7 +3364,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3368,7 +3385,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3411,7 +3428,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3443,7 +3460,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3501,11 +3518,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3708,66 +3722,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3775,34 +3796,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3877,9 +3917,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3926,7 +3966,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3959,7 +3999,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3992,7 +4032,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -4018,7 +4058,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -4049,7 +4089,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -4075,7 +4115,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -4138,11 +4178,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4539,11 +4576,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5806,76 +5840,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml b/kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml index 540f82fb..014c8bc9 100644 --- a/kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml +++ b/kubernetes/read-write-mode/metrics/k8s-all-in-one.yaml @@ -174,7 +174,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -595,11 +595,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -694,7 +691,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -925,6 +922,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -975,11 +998,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1126,7 +1146,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1574,11 +1594,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2952,7 +2969,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -2984,7 +3001,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -3095,7 +3112,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -3117,7 +3134,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": 
"otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3138,7 +3155,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3181,7 +3198,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3213,7 +3230,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -3271,11 +3288,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -3478,66 +3492,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -3545,34 +3566,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -3647,9 +3687,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -3696,7 +3736,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -3729,7 +3769,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -3762,7 +3802,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -3788,7 +3828,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -3819,7 +3859,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -3845,7 +3885,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -3908,11 +3948,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -4309,11 +4346,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -5113,76 +5147,116 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning --- apiVersion: monitoring.coreos.com/v1 kind: ServiceMonitor diff --git a/monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml b/monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml index 4cdcfb4e..3691868c 100644 --- a/monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml +++ b/monitoring-mixins/alloy-mixin/deploy/alloy-mixin-alerts.yaml @@ -3,62 +3,92 @@ groups: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0 + description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. + description: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. Job is {{ $labels.job }} + summary: Cluster unhealthy. 
expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 + description: A node tried to join the cluster with a name conflicting with an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0 + description: There is a node within the cluster that is stuck in Terminating state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0 + description: The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. 
- expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0 + description: The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-node.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-node.json index 2db346fd..1e880875 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-node.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-node.json @@ -20,7 +20,7 @@ "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -441,11 +441,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-overview.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-overview.json index 34cbde1a..8d31aaeb 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-overview.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-cluster-overview.json @@ -20,7 +20,7 @@ "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -251,6 +251,32 @@ ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. 
When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -301,11 +327,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-controller.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-controller.json index 47b9c39f..8cbcca23 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-controller.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-controller.json @@ -57,7 +57,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -505,11 +505,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-logs.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-logs.json index ed430941..b6f5eeb7 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-logs.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-logs.json @@ -320,5 +320,5 @@ }, "timezone": "utc", "title": "Alloy / Logs Overview", - "uid": "alloy--logs-overview" + "uid": "53c1ecddc3a1d5d4b8d6cd0c23676c31" } \ No newline at end of file diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-opentelemetry.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-opentelemetry.json index acae4280..5294d6f8 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-opentelemetry.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-opentelemetry.json @@ -49,7 +49,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -81,7 +81,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": 
"rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -192,7 +192,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -214,7 +214,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": "otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -235,7 +235,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -278,7 +278,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -310,7 +310,7 @@ "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -368,11 +368,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-prometheus-remote-write.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-prometheus-remote-write.json index daf80da8..c26e91c4 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-prometheus-remote-write.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-prometheus-remote-write.json @@ -128,66 +128,73 @@ }, { "datasource": "${datasource}", - "description": "How far behind 
prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": 
"{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -195,34 +202,53 @@ }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": 
"timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + }, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -297,9 +323,9 @@ }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -346,7 +372,7 @@ "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -379,7 +405,7 @@ "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -412,7 +438,7 @@ "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -438,7 +464,7 @@ "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -469,7 +495,7 @@ "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -495,7 +521,7 @@ "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -558,11 +584,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-resources.json b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-resources.json index 5bfd381e..47d26e4d 100644 --- a/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-resources.json +++ b/monitoring-mixins/alloy-mixin/deploy/dashboards_out/alloy-resources.json @@ -277,11 +277,8 @@ "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", diff --git a/monitoring-mixins/alloy-mixin/deploy/manifests/k8s-all-in-one.yaml b/monitoring-mixins/alloy-mixin/deploy/manifests/k8s-all-in-one.yaml index 79b55e92..c2a3f911 100644 --- a/monitoring-mixins/alloy-mixin/deploy/manifests/k8s-all-in-one.yaml +++ 
b/monitoring-mixins/alloy-mixin/deploy/manifests/k8s-all-in-one.yaml @@ -23,7 +23,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -444,11 +444,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -543,7 +540,7 @@ data: "title": "Documentation", "tooltip": "Clustering documentation", "type": "link", - "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustered-mode" + "url": "https://grafana.com/docs/alloy/latest/reference/cli/run/#clustering" }, { "asDropdown": true, @@ -774,6 +771,32 @@ data: ], "title": "Convergance state timeline", "type": "state-timeline" + }, + { + "datasource": "${datasource}", + "description": "The number of cluster peers seen by each instance.\n\nWhen cluster is converged, every peer should see all the other instances. When we have a split brain or one\npeer not joining the cluster, we will see two or more groups of instances that report different peer numbers\nfor an extended period of time and not converging.\n\nThis graph helps to identify which instances may be in a split brain state.\n", + "fieldConfig": { + "defaults": { + "unit": "peers" + } + }, + "gridPos": { + "h": 12, + "w": 24, + "x": 0, + "y": 18 + }, + "targets": [ + { + "datasource": "${datasource}", + "expr": "sum by(instance) (cluster_node_peers{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "instant": false, + "legendFormat": "{{instance}}", + "range": true + } + ], + "title": "Number of peers seen by each instance", + "type": "timeseries" } ], "refresh": "10s", @@ -824,11 +847,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -945,7 +965,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "count(alloy_component_controller_evaluating{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"})\n", + "expr": "count(group(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\"}) by (instance))\n", "instant": false, "legendFormat": "__auto", "range": true @@ -1393,11 +1413,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -1506,7 +1523,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(receiver_accepted_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_accepted_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -1538,7 +1555,7 @@ data: "targets": [ { 
"datasource": "${datasource}", - "expr": "rate(receiver_refused_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_receiver_refused_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }} / {{ transport }}", "range": true @@ -1649,7 +1666,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "sum by (le) (increase(processor_batch_batch_send_size_ratio_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", + "expr": "sum by (le) (increase(otelcol_processor_batch_batch_send_size_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval]))\n", "format": "heatmap", "instant": false, "legendFormat": "{{le}}", @@ -1671,7 +1688,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "processor_batch_metadata_cardinality_ratio{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", + "expr": "otelcol_processor_batch_metadata_cardinality{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -1692,7 +1709,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(processor_batch_timeout_trigger_send_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_processor_batch_timeout_trigger_send_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -1735,7 +1752,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_sent_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_sent_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -1767,7 +1784,7 @@ data: "targets": [ { "datasource": "${datasource}", - "expr": "rate(exporter_send_failed_spans_ratio_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", + "expr": "rate(otelcol_exporter_send_failed_spans_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\"}[$__rate_interval])\n", "instant": false, "legendFormat": "{{ pod }}", "range": true @@ -1825,11 +1842,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2032,66 +2046,73 @@ data: }, { "datasource": "${datasource}", - "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. 
If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", + "description": "Percentage of samples sent by prometheus.remote_write that succeeded.\n\nLow success rates can indicate a problem with Alloy or the remote storage.\n", "fieldConfig": { "defaults": { - "unit": "s" + "unit": "percentunit" } }, "gridPos": { "h": 10, - "w": 6, + "w": 12, "x": 0, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", + "expr": "(\n 1 - \n (\n sum(rate(prometheus_remote_storage_samples_failed_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n /\n (\n sum(rate(prometheus_remote_storage_samples_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]))\n )\n)\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "% of samples successfully sent", "range": true } ], - "title": "WAL delay", + "title": "Remote write success rate in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", "fieldConfig": { "defaults": { - "custom": { - "fillOpacity": 20, - "gradientMode": "hue", - "stacking": { - "mode": "normal" - } - }, - "unit": "Bps" + "unit": "s" } }, "gridPos": { "h": 10, - "w": 6, - "x": 6, + "w": 12, + "x": 12, "y": 12 }, "targets": [ { "datasource": "${datasource}", - "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", + "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", "instant": false, - "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", + "legendFormat": "99th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", 
namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "instant": false, + "legendFormat": "50th percentile", + "range": true + }, + { + "datasource": "${datasource}", + "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "instant": false, + "legendFormat": "Average", "range": true } ], - "title": "Data write throughput", + "title": "Write latency in $cluster", "type": "timeseries" }, { "datasource": "${datasource}", - "description": "Latency of writes to the remote system made by\nprometheus.remote_write.\n", + "description": "How far behind prometheus.remote_write from samples recently written\nto the WAL.\n\nEach endpoint prometheus.remote_write is configured to send metrics\nhas its own delay. The time shown here is the sum across all\nendpoints for the given component.\n\nIt is normal for the WAL delay to be within 1-3 scrape intervals. If\nthe WAL delay continues to increase beyond that amount, try\nincreasing the number of maximum shards.\n", "fieldConfig": { "defaults": { "unit": "s" @@ -2099,34 +2120,53 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 12, - "y": 12 + "w": 8, + "x": 0, + "y": 22 }, "targets": [ { "datasource": "${datasource}", - "expr": "histogram_quantile(0.99, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", - "instant": false, - "legendFormat": "99th percentile", - "range": true - }, - { - "datasource": "${datasource}", - "expr": "histogram_quantile(0.50, sum by (le) (\n rate(prometheus_remote_storage_sent_batch_duration_seconds_bucket{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n))\n", + "expr": "sum by (instance, component_path, component_id) (\n prometheus_remote_storage_highest_timestamp_in_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}\n - ignoring(url, remote_name) group_right(instance)\n prometheus_remote_storage_queue_highest_sent_timestamp_seconds{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}\n)\n", "instant": false, - "legendFormat": "50th percentile", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true - }, + } + ], + "title": "WAL delay", + "type": "timeseries" + }, + { + "datasource": "${datasource}", + "description": "Rate of data containing samples and metadata sent by\nprometheus.remote_write.\n", + "fieldConfig": { + "defaults": { + "custom": { + "fillOpacity": 20, + "gradientMode": "hue", + "stacking": { + "mode": "normal" + } + 
}, + "unit": "Bps" + } + }, + "gridPos": { + "h": 10, + "w": 8, + "x": 8, + "y": 22 + }, + "targets": [ { "datasource": "${datasource}", - "expr": "sum(rate(prometheus_remote_storage_sent_batch_duration_seconds_sum{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval])) /\nsum(rate(prometheus_remote_storage_sent_batch_duration_seconds_count{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\"}[$__rate_interval]))\n", + "expr": "sum without (remote_name, url) (\n rate(prometheus_remote_storage_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval]) +\n rate(prometheus_remote_storage_metadata_bytes_total{cluster=~\"$cluster\", namespace=~\"$namespace\", job=~\"$job\", instance=~\"$instance\", component_path=~\"$component_path\", component_id=~\"$component\", url=~\"$url\"}[$__rate_interval])\n)\n", "instant": false, - "legendFormat": "Average", + "legendFormat": "{{instance}} / {{component_path}} {{component_id}}", "range": true } ], - "title": "Write latency", + "title": "Data write throughput", "type": "timeseries" }, { @@ -2201,9 +2241,9 @@ data: }, "gridPos": { "h": 10, - "w": 6, - "x": 18, - "y": 12 + "w": 8, + "x": 16, + "y": 22 }, "targets": [ { @@ -2250,7 +2290,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 22 + "y": 32 }, "targets": [ { @@ -2283,7 +2323,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 22 + "y": 32 }, "targets": [ { @@ -2316,7 +2356,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 22 + "y": 32 }, "targets": [ { @@ -2342,7 +2382,7 @@ data: "h": 10, "w": 8, "x": 0, - "y": 32 + "y": 42 }, "options": { "legend": { @@ -2373,7 +2413,7 @@ data: "h": 10, "w": 8, "x": 8, - "y": 32 + "y": 42 }, "targets": [ { @@ -2399,7 +2439,7 @@ data: "h": 10, "w": 8, "x": 16, - "y": 32 + "y": 42 }, "targets": [ { @@ -2462,11 +2502,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2863,11 +2900,8 @@ data: "type": "query" }, { - "allValue": ".*", "datasource": "${datasource}", - "includeAll": true, "label": "job", - "multi": true, "name": "job", "query": { "query": "label_values(alloy_component_controller_running_components{cluster=~\"$cluster\", namespace=~\"$namespace\"}, job)\n", @@ -2948,73 +2982,113 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers - in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) - != 0 + description: 'Cluster is not converging: nodes report different number of + peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) + (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed + description: Nodes report different number of peers vs. the count of observed Alloy metrics. 
Some Alloy metrics may be missing or the cluster is in a - split brain state. + split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed + Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. + Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an - existing peer. - expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) + description: A node tried to join the cluster with a name conflicting with + an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) + description: There is a node within the cluster that is stuck in Terminating + state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job + is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job + }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id + }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. 
expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) + description: The receiver could not push some spans to the pipeline under + job {{ $labels.job }}. This could be due to reaching a limit such as the + ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) + description: The exporter failed to send spans to their destination under + job {{ $labels.job }}. There could be an issue with the payload or with + the destination endpoint. + summary: The exporter failed to send spans to their destination. + expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning diff --git a/monitoring-mixins/alloy-mixin/deploy/prometheus-alerts.yaml b/monitoring-mixins/alloy-mixin/deploy/prometheus-alerts.yaml index ef4d3432..e196687e 100644 --- a/monitoring-mixins/alloy-mixin/deploy/prometheus-alerts.yaml +++ b/monitoring-mixins/alloy-mixin/deploy/prometheus-alerts.yaml @@ -10,62 +10,92 @@ spec: rules: - alert: ClusterNotConverging annotations: - message: 'Cluster is not converging: nodes report different number of peers in the cluster.' - expr: stddev by (cluster, namespace, job) (sum without (state) (cluster_node_peers)) != 0 + description: 'Cluster is not converging: nodes report different number of peers in the cluster. Job is {{ $labels.job }}' + summary: Cluster is not converging. + expr: stddev by (cluster, namespace, job, cluster_name) (sum without (state) (cluster_node_peers)) != 0 for: 10m + labels: + severity: warning - alert: ClusterNodeCountMismatch annotations: - message: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. + description: Nodes report different number of peers vs. the count of observed Alloy metrics. Some Alloy metrics may be missing or the cluster is in a split brain state. Job is {{ $labels.job }} + summary: Nodes report different number of peers vs. the count of observed Alloy metrics. expr: | sum without (state) (cluster_node_peers) != - on (cluster, namespace, job) group_left - count by (cluster, namespace, job) (cluster_node_info) + on (cluster, namespace, job, cluster_name) group_left + count by (cluster, namespace, job, cluster_name) (cluster_node_info) for: 15m + labels: + severity: warning - alert: ClusterNodeUnhealthy annotations: - message: Cluster node is reporting a gossip protocol health score > 0. + description: Cluster node is reporting a gossip protocol health score > 0. Job is {{ $labels.job }} + summary: Cluster unhealthy. expr: | cluster_node_gossip_health_score > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeNameConflict annotations: - message: A node tried to join the cluster with a name conflicting with an existing peer. 
- expr: sum by (cluster, namespace, job) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 + description: A node tried to join the cluster with a name conflicting with an existing peer. Job is {{ $labels.job }} + summary: Cluster Node Name Conflict. + expr: sum by (cluster, namespace, job, cluster_name) (rate(cluster_node_gossip_received_events_total{event="node_conflict"}[2m])) > 0 for: 10m + labels: + severity: warning - alert: ClusterNodeStuckTerminating annotations: - message: Cluster node stuck in Terminating state. - expr: sum by (cluster, namespace, job, instance) (cluster_node_peers{state="terminating"}) > 0 + description: There is a node within the cluster that is stuck in Terminating state. Job is {{ $labels.job }} + summary: Cluster node stuck in Terminating state. + expr: sum by (cluster, namespace, job, instance, cluster_name) (cluster_node_peers{state="terminating"}) > 0 for: 10m + labels: + severity: warning - alert: ClusterConfigurationDrift annotations: - message: Cluster nodes are not using the same configuration file. + description: Cluster nodes are not using the same configuration file. Job is {{ $labels.job }} + summary: Cluster configuration drifting. expr: | count without (sha256) ( - max by (cluster, namespace, sha256, job) (alloy_config_hash and on(cluster, namespace, job) cluster_node_info) + max by (cluster, namespace, sha256, job, cluster_name) (alloy_config_hash and on(cluster, namespace, job, cluster_name) cluster_node_info) ) > 1 for: 5m + labels: + severity: warning - name: alloy_controller rules: - alert: SlowComponentEvaluations annotations: - message: Component evaluations are taking too long. + description: Component evaluations are taking too long under job {{ $labels.job }}, component_path {{ $labels.component_path }}, component_id {{ $labels.component_id }}. + summary: Component evaluations are taking too long. expr: sum by (cluster, namespace, job, component_path, component_id) (rate(alloy_component_evaluation_slow_seconds[10m])) > 0 for: 15m + labels: + severity: warning - alert: UnhealthyComponents annotations: - message: Unhealthy components detected. + description: Unhealthy components detected under job {{ $labels.job }} + summary: Unhealthy components detected. expr: sum by (cluster, namespace, job) (alloy_component_controller_running_components{health_type!="healthy"}) > 0 for: 15m + labels: + severity: warning - name: alloy_otelcol rules: - alert: OtelcolReceiverRefusedSpans annotations: - message: The receiver could not push some spans to the pipeline. - expr: sum by (cluster, namespace, job) (rate(receiver_refused_spans_ratio_total{}[1m])) > 0 + description: The receiver could not push some spans to the pipeline under job {{ $labels.job }}. This could be due to reaching a limit such as the ones imposed by otelcol.processor.memory_limiter. + summary: The receiver could not push some spans to the pipeline. + expr: sum by (cluster, namespace, job) (rate(otelcol_receiver_refused_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning - alert: OtelcolExporterFailedSpans annotations: - message: The exporter failed to send spans to their destination. - expr: sum by (cluster, namespace, job) (rate(exporter_send_failed_spans_ratio_total{}[1m])) > 0 + description: The exporter failed to send spans to their destination under job {{ $labels.job }}. There could be an issue with the payload or with the destination endpoint. + summary: The exporter failed to send spans to their destination. 
+ expr: sum by (cluster, namespace, job) (rate(otelcol_exporter_send_failed_spans_total{}[1m])) > 0 for: 5m + labels: + severity: warning diff --git a/monitoring-mixins/alloy-mixin/jsonnetfile.json b/monitoring-mixins/alloy-mixin/jsonnetfile.json index 97d0b073..830b968a 100644 --- a/monitoring-mixins/alloy-mixin/jsonnetfile.json +++ b/monitoring-mixins/alloy-mixin/jsonnetfile.json @@ -8,7 +8,7 @@ "subdir": "operations/alloy-mixin" } }, - "version": "v1.2.0" + "version": "v1.4.1" } ], "legacyImports": true diff --git a/monitoring-mixins/alloy-mixin/jsonnetfile.lock.json b/monitoring-mixins/alloy-mixin/jsonnetfile.lock.json index 25f86fde..d91bbb8c 100644 --- a/monitoring-mixins/alloy-mixin/jsonnetfile.lock.json +++ b/monitoring-mixins/alloy-mixin/jsonnetfile.lock.json @@ -8,8 +8,8 @@ "subdir": "operations/alloy-mixin" } }, - "version": "9608b1a9d0f4a4f69dbee00c23bc3650bab2d4c1", - "sum": "UIW6KSLSxdyGf4HYhn7b9v9d0lgAL5+B9zQopQKhzvk=" + "version": "5ae33d7579f2d227d8de293232c40b7fdd864e62", + "sum": "oEPCDzcAw1sGH8Qoo/RfGBB/ec7YmXspNdmxx3Mz0wE=" }, { "source": { @@ -18,7 +18,7 @@ "subdir": "gen/grafonnet-v10.0.0" } }, - "version": "119d65363dff84a1976bba609f2ac3a8f450e760", + "version": "733beadbc8dab55c5fe1bcdcf0d8a2d215759a55", "sum": "xdcrJPJlpkq4+5LpGwN4tPAuheNNLXZjE6tDcyvFjr0=" }, { @@ -28,7 +28,7 @@ "subdir": "logs-lib/" } }, - "version": "49029fcf5e33da03d7dffda631c03dd5e989a1d4", + "version": "349a4d4936014ab83ace72f500424d6ef098201e", "sum": "tUeoie1Cc7Ih7eKOX/H9pD4PkDnTzLZzP9fupYIBMwA=" }, { @@ -38,7 +38,7 @@ "subdir": "logs-lib/logs" } }, - "version": "49029fcf5e33da03d7dffda631c03dd5e989a1d4", + "version": "349a4d4936014ab83ace72f500424d6ef098201e", "sum": "/P3y+jKwI4IDIPT6dUuaNH0PscvYUa/j7iPvyHndLXM=" }, {