Skip to content

Commit

Permalink
fix(KubeletTooManyPods): handle inconsistent instance labels
Browse files Browse the repository at this point in the history
  • Loading branch information
skl committed Feb 3, 2025
1 parent e3b6eee commit 82532a8
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 6 deletions.
12 changes: 9 additions & 3 deletions alerts/kubelet.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -60,10 +60,16 @@ local utils = import '../lib/utils.libsonnet';
// Some nodes, such as AWS Fargate nodes, have a capacity of 1 and exist only while a pod is running on them.
// We have to ignore these special nodes in the KubeletTooManyPods alert.
expr: |||
max by (cluster, node) (
label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)")
(
max by (cluster, instance) (
kubelet_running_pods{%(kubeletSelector)s} > 1
)
* on (cluster, instance) group_left(node)
max by (cluster, instance, node) (
kubelet_node_name{%(kubeletSelector)s}
)
)
/
/ on (cluster, node) group_left()
max by (%(clusterLabel)s, node) (
kube_node_status_capacity{%(kubeStateMetricsSelector)s, resource="pods"} != 1
) > 0.95
Expand Down
14 changes: 11 additions & 3 deletions tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -403,11 +403,13 @@ tests:
- eval_time: 61m
alertname: KubePersistentVolumeInodesFillingUp

- name: KubeletTooManyPods alert (single-node)
- name: KubeletTooManyPods alert (single-node with instance as IP address)
interval: 1m
input_series:
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
- series: 'kubelet_running_pods{cluster="kubernetes", instance="10.0.0.0", job="kubelet"}'
values: '3x15'
- series: 'kubelet_node_name{cluster="kubernetes", instance="10.0.0.0", node="node0", job="kubelet"}'
values: '1x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '3x15'
alert_rule_test:
Expand All @@ -418,22 +420,27 @@ tests:
exp_alerts:
- exp_labels:
cluster: kubernetes
instance: 10.0.0.0
node: node0
severity: info
exp_annotations:
summary: "Kubelet is running at capacity."
description: "Kubelet 'node0' is running at 100% of its Pod capacity."
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods

- name: KubeletTooManyPods alert (multi-node)
- name: KubeletTooManyPods alert (multi-node with instance as node name)
interval: 1m
input_series:
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
values: '3x15'
- series: 'kubelet_node_name{cluster="kubernetes", instance="node0", node="node0", job="kubelet"}'
values: '1x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '6x15'
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}'
values: '3x15'
- series: 'kubelet_node_name{cluster="kubernetes", instance="node1", node="node1", job="kubelet"}'
values: '1x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '3x15'
alert_rule_test:
Expand All @@ -444,6 +451,7 @@ tests:
exp_alerts:
- exp_labels:
cluster: kubernetes
instance: node1
node: node1
severity: info
exp_annotations:
Expand Down

0 comments on commit 82532a8

Please sign in to comment.