From 82532a8970ff1dd6e5e8359cc72c905cd1f25c16 Mon Sep 17 00:00:00 2001
From: Stephen Lang
Date: Mon, 3 Feb 2025 10:42:09 +0000
Subject: [PATCH] fix(KubeletTooManyPods): handle inconsistent instance labels

---
 alerts/kubelet.libsonnet | 12 +++++++++---
 tests.yaml               | 14 +++++++++++---
 2 files changed, 20 insertions(+), 6 deletions(-)

diff --git a/alerts/kubelet.libsonnet b/alerts/kubelet.libsonnet
index b08d11388..a8709a9dc 100644
--- a/alerts/kubelet.libsonnet
+++ b/alerts/kubelet.libsonnet
@@ -60,10 +60,16 @@ local utils = import '../lib/utils.libsonnet';
         // Some node has a capacity of 1 like AWS's Fargate and only exists while a pod is running on it.
         // We have to ignore this special node in the KubeletTooManyPods alert.
         expr: |||
-          max by (cluster, node) (
-            label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)")
+          (
+            max by (cluster, instance) (
+              kubelet_running_pods{%(kubeletSelector)s} > 1
+            )
+            * on (cluster, instance) group_left(node)
+            max by (cluster, instance, node) (
+              kubelet_node_name{%(kubeletSelector)s}
+            )
           )
-          /
+          / on (cluster, node) group_left()
           max by (%(clusterLabel)s, node) (
             kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1
           ) > 0.95
diff --git a/tests.yaml b/tests.yaml
index 32c1a52e2..3f1f1ba15 100644
--- a/tests.yaml
+++ b/tests.yaml
@@ -403,11 +403,13 @@ tests:
   - eval_time: 61m
     alertname: KubePersistentVolumeInodesFillingUp
 
-- name: KubeletTooManyPods alert (single-node)
+- name: KubeletTooManyPods alert (single-node with instance as IP address)
   interval: 1m
   input_series:
-  - series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
+  - series: 'kubelet_running_pods{cluster="kubernetes", instance="10.0.0.0", job="kubelet"}'
     values: '3x15'
+  - series: 'kubelet_node_name{cluster="kubernetes", instance="10.0.0.0", node="node0", job="kubelet"}'
+    values: '1x15'
   - series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
     values: '3x15'
   alert_rule_test:
@@ -418,6 +420,7 @@
     exp_alerts:
     - exp_labels:
         cluster: kubernetes
+        instance: 10.0.0.0
         node: node0
         severity: info
       exp_annotations:
@@ -425,15 +428,19 @@
        description: "Kubelet 'node0' is running at 100% of its Pod capacity."
        runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods
 
-- name: KubeletTooManyPods alert (multi-node)
+- name: KubeletTooManyPods alert (multi-node with instance as node name)
   interval: 1m
   input_series:
   - series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
     values: '3x15'
+  - series: 'kubelet_node_name{cluster="kubernetes", instance="node0", node="node0", job="kubelet"}'
+    values: '1x15'
   - series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
     values: '6x15'
   - series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}'
     values: '3x15'
+  - series: 'kubelet_node_name{cluster="kubernetes", instance="node1", node="node1", job="kubelet"}'
+    values: '1x15'
   - series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}'
     values: '3x15'
   alert_rule_test:
@@ -444,6 +451,7 @@
     exp_alerts:
     - exp_labels:
         cluster: kubernetes
+        instance: node1
         node: node1
         severity: info
       exp_annotations:
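
For reference, rendering the template fields the way tests.yaml above does -- job="kubelet" standing in for %(kubeletSelector)s, job="kube-state-metrics" for %(kubeStateMetricsSelector)s, and a plain cluster label for %(clusterLabel)s (an assumption; deployments may configure different selectors) -- gives the following PromQL sketch of the updated expression. kubelet_node_name carries both the scrape-time instance label and the node label, so the group_left join copies node onto kubelet_running_pods whether instance is an IP address or a node name, and the ratio against kube_node_status_capacity then matches cleanly on node:

    # Pods currently running per kubelet, keyed by (cluster, instance).
    (
      max by (cluster, instance) (
        kubelet_running_pods{job="kubelet"} > 1
      )
      # Join on (cluster, instance) to pull the real node label from kubelet_node_name.
      * on (cluster, instance) group_left(node)
      max by (cluster, instance, node) (
        kubelet_node_name{job="kubelet"}
      )
    )
    # Divide by per-node pod capacity, ignoring capacity-1 nodes such as Fargate.
    / on (cluster, node) group_left()
    max by (cluster, node) (
      kube_node_status_capacity{job="kube-state-metrics", resource="pods"} != 1
    ) > 0.95

The result keeps cluster, instance and node, which is why the expected alert labels in tests.yaml now include instance. The new test cases are typically exercised with Prometheus's rule unit tester (promtool test rules tests.yaml) against the rendered alert rules.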