Skip to content

Commit

Permalink
kubelet: Fix semantics for *ReservedCgroup and enforceNodeAllocatable
Browse files Browse the repository at this point in the history
* Setting the {kube,system}ReservedCgroup does not make the kubelet
  enforce the limits, adding the corresponding entry in
  enforceNodeAllocatable does.
  - more explicit variable names
  - add a warning for enforcing kube and system limits.

* Streamline kubelet resource reservation:
- remove "master" variants: those should be handled by group_vars
- Use empty defaults to leave them to the kubelet default configuration

* Exercise the new semantics in CI.
  • Loading branch information
VannTen committed Oct 21, 2024
1 parent cac814c commit faed356
Show file tree
Hide file tree
Showing 5 changed files with 30 additions and 22 deletions.
31 changes: 19 additions & 12 deletions roles/kubernetes/node/defaults/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,9 +8,6 @@ kubelet_bind_address: "{{ ip | default('0.0.0.0') }}"
# resolv.conf to base dns config
kube_resolv_conf: "/etc/resolv.conf"

# Set to empty to avoid cgroup creation
kubelet_enforce_node_allocatable: "\"\""

# Set systemd service hardening features
kubelet_systemd_hardening: false

Expand All @@ -24,28 +21,38 @@ kube_node_addresses: >-
{%- endfor -%}
kubelet_secure_addresses: "localhost link-local {{ kube_pods_subnet }} {{ kube_node_addresses }}"

# Reserve this space for kube resources
# Whether to run kubelet and container-engine daemons in a dedicated cgroup. (Not required for resource reservations).
kube_reserved: false
kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
## Reserving compute resources
# https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/

# Resource reservations for kube daemons
kube_memory_reserved: "256Mi"
kube_cpu_reserved: "100m"
kube_ephemeral_storage_reserved: "500Mi"
kube_pid_reserved: "1000"
kube_pid_reserved: 1000

# Set slice for host system daemons (sshd, NetworkManager, ...)
# You probably don't want to change this
system_slice: system.slice

# Set to true to reserve resources for system daemons
system_reserved: false
system_reserved_cgroups_for_service_slice: system.slice
system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
# Resource reservations for system daemons
system_memory_reserved: "512Mi"
system_cpu_reserved: "500m"
system_ephemeral_storage_reserved: "500Mi"
system_pid_reserved: 1000

# Make the kubelet enforce the limits of Pods using cgroups
enforce_allocatable_pods: true

# Enforce kube_*_reserved as limits
# WARNING: this limits the resources the kubelet and the container engine can
# use which can cause instability on your nodes
enforce_allocatable_kube_reserved: false

# Enforce system_*_reserved as limits
# WARNING: this limits the resources system daemons can use which can lock you
# out of your nodes (by OOMkilling sshd for instance)
enforce_allocatable_system_reserved: false

## Eviction Thresholds to avoid system OOMs
# https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#eviction-thresholds
eviction_hard: {}
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -15,12 +15,13 @@ authorization:
{% else %}
mode: AlwaysAllow
{% endif %}
{% if kubelet_enforce_node_allocatable is defined and kubelet_enforce_node_allocatable != "\"\"" %}
{% set kubelet_enforce_node_allocatable_list = kubelet_enforce_node_allocatable.split(",") %}
enforceNodeAllocatable:
{% for item in kubelet_enforce_node_allocatable_list %}
{% if enforce_node_allocatable %}
{% for item in enforce_node_allocatable %}
- {{ item }}
{% endfor %}
{% else %}
- none # don't enforce anything
{% endif %}
staticPodPath: {{ kube_manifest_dir }}
cgroupDriver: {{ kubelet_cgroup_driver | default('systemd') }}
Expand Down Expand Up @@ -62,9 +63,7 @@ clusterDNS:
{% endfor %}
{# Node reserved CPU/memory #}
{% for scope in "kube", "system" %}
{% if lookup('ansible.builtin.vars', scope + "_reserved") | bool %}
{{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_slice_cgroup') }}
{% endif %}
{{ scope }}Reserved:
{% for resource in "cpu", "memory", "ephemeral-storage", "pid" %}
{{ resource }}: "{{ lookup('ansible.builtin.vars', scope + '_' ~ (resource | replace('-', '_')) + '_reserved') }}"
Expand Down
5 changes: 5 additions & 0 deletions roles/kubernetes/node/vars/main.yml
Original file line number Diff line number Diff line change
@@ -1,3 +1,8 @@
---
# Cgroup paths built from the slice names: the name is split on '-' and the
# parts are joined with '.slice/', wrapped in a leading and trailing '/'.
kube_slice_cgroup: "/{{ kube_slice.split('-') | join('.slice/') }}/"
system_slice_cgroup: "/{{ system_slice.split('-') | join('.slice/') }}/"

# Maps each enforceNodeAllocatable entry name to the variable that toggles it.
# The entries must be nested under this key so that dict2items sees a mapping.
enforce_node_allocatable_stub:
  pods: "{{ enforce_allocatable_pods }}"
  kube-reserved: "{{ enforce_allocatable_kube_reserved }}"
  system-reserved: "{{ enforce_allocatable_system_reserved }}"

# Names of the entries whose toggle is truthy. The trailing `list` turns the
# lazy result of `map` into a real list, so an empty selection is falsy and the
# value can be iterated by the kubelet config template.
enforce_node_allocatable: >-
  {{ enforce_node_allocatable_stub
     | dict2items
     | selectattr('value')
     | map(attribute='key')
     | list }}
3 changes: 0 additions & 3 deletions roles/kubespray-defaults/defaults/main/main.yml
Original file line number Diff line number Diff line change
Expand Up @@ -34,9 +34,6 @@ kube_proxy_mode: ipvs
## The timeout for init first control-plane
kubeadm_init_timeout: 300s

# TODO: remove this
kube_reserved_cgroups_for_service_slice: kube.slice

## List of kubeadm init phases that should be skipped during control plane setup
## By default 'addon/coredns' is skipped
## 'addon/kube-proxy' gets skipped for some network plugins
Expand Down
4 changes: 2 additions & 2 deletions tests/files/packet_ubuntu24-calico-all-in-one.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,5 +23,5 @@ containerd_registries_mirrors:
capabilities: ["pull", "resolve", "push"]
skip_verify: true

kube_reserved: true
system_reserved: true
enforce_allocatable_kube_reserved: true
enforce_allocatable_system_reserved: true

0 comments on commit faed356

Please sign in to comment.