Skip to content

Commit

Permalink
fix: KubeletTooManyPods duplicate series error (#1017)
Browse files Browse the repository at this point in the history
  • Loading branch information
skl authored Feb 3, 2025
1 parent 1ec7a45 commit 234c773
Show file tree
Hide file tree
Showing 2 changed files with 36 additions and 23 deletions.
8 changes: 2 additions & 6 deletions alerts/kubelet.libsonnet
Original file line number Diff line number Diff line change
Expand Up @@ -60,12 +60,8 @@ local utils = import '../lib/utils.libsonnet';
// Some nodes, such as AWS Fargate nodes, have a capacity of 1 and only exist while a pod is running on them.
// These special nodes have to be ignored in the KubeletTooManyPods alert.
expr: |||
count by (%(clusterLabel)s, node) (
(kube_pod_status_phase{%(kubeStateMetricsSelector)s, phase="Running"} == 1)
* on (%(clusterLabel)s, namespace, pod) group_left (node)
group by (%(clusterLabel)s, namespace, pod, node) (
kube_pod_info{%(kubeStateMetricsSelector)s}
)
max by (cluster, node) (
label_replace(kubelet_running_pods{%(kubeletSelector)s} > 1, "node", "$1", "instance", "(.*)")
)
/
max by (%(clusterLabel)s, node) (
Expand Down
51 changes: 34 additions & 17 deletions tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -403,22 +403,13 @@ tests:
- eval_time: 61m
alertname: KubePersistentVolumeInodesFillingUp

- interval: 1m
- name: KubeletTooManyPods alert (single-node)
interval: 1m
input_series:
- series: 'kube_node_status_capacity{resource="pods",instance="172.17.0.5:8443",cluster="kubernetes",node="minikube",job="kube-state-metrics",namespace="kube-system"}'
values: '3+0x15'
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-1",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-1",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-2",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-2",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kube_pod_info{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",node="minikube",pod="pod-3",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kube_pod_status_phase{endpoint="https-main",instance="172.17.0.5:8443",job="kube-state-metrics",cluster="kubernetes",namespace="kube-system",phase="Running",pod="pod-3",service="kube-state-metrics"}'
values: '1+0x15'
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
values: '3x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '3x15'
alert_rule_test:
- eval_time: 10m
alertname: KubeletTooManyPods
Expand All @@ -427,11 +418,37 @@ tests:
exp_alerts:
- exp_labels:
cluster: kubernetes
node: minikube
node: node0
severity: info
exp_annotations:
summary: "Kubelet is running at capacity."
description: "Kubelet 'node0' is running at 100% of its Pod capacity."
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods

- name: KubeletTooManyPods alert (multi-node)
interval: 1m
input_series:
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node0", job="kubelet"}'
values: '3x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node0", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '6x15'
- series: 'kubelet_running_pods{cluster="kubernetes", instance="node1", job="kubelet"}'
values: '3x15'
- series: 'kube_node_status_capacity{cluster="kubernetes", node="node1", job="kube-state-metrics", resource="pods", unit="integer"}'
values: '3x15'
alert_rule_test:
- eval_time: 10m
alertname: KubeletTooManyPods
- eval_time: 15m
alertname: KubeletTooManyPods
exp_alerts:
- exp_labels:
cluster: kubernetes
node: node1
severity: info
exp_annotations:
summary: "Kubelet is running at capacity."
description: "Kubelet 'minikube' is running at 100% of its Pod capacity."
description: "Kubelet 'node1' is running at 100% of its Pod capacity."
runbook_url: https://github.com/kubernetes-monitoring/kubernetes-mixin/tree/master/runbook.md#alert-name-kubelettoomanypods

- interval: 1m
Expand Down

0 comments on commit 234c773

Please sign in to comment.