From faed3564d18d6a1e352d1718edb362e213baf182 Mon Sep 17 00:00:00 2001
From: Max Gautier
Date: Thu, 23 Nov 2023 21:28:18 +0100
Subject: [PATCH] kubelet: Fix semantics for *ReservedCgroup and
 enforceNodeAllocatable

* Setting the {kube,system}ReservedCgroup does not make the kubelet
  enforce the limits; adding the corresponding entry in
  enforceNodeAllocatable does.
  - more explicit variable names
  - add warnings for enforcing kube and system limits.

* Streamline kubelet resource reservation:
  - remove "master" variants: those should be handled by group_vars
  - use empty defaults to leave them to the kubelet default configuration

* Exercise the new semantics in CI.
---
 roles/kubernetes/node/defaults/main.yml       | 31 ++++++++++++-------
 .../templates/kubelet-config.v1beta1.yaml.j2  |  9 +++---
 roles/kubernetes/node/vars/main.yml           |  5 +++
 .../kubespray-defaults/defaults/main/main.yml |  3 --
 .../packet_ubuntu24-calico-all-in-one.yml     |  4 +--
 5 files changed, 30 insertions(+), 22 deletions(-)

diff --git a/roles/kubernetes/node/defaults/main.yml b/roles/kubernetes/node/defaults/main.yml
index 83943cb484c..ae19048a2e6 100644
--- a/roles/kubernetes/node/defaults/main.yml
+++ b/roles/kubernetes/node/defaults/main.yml
@@ -8,9 +8,6 @@ kubelet_bind_address: "{{ ip | default('0.0.0.0') }}"
 # resolv.conf to base dns config
 kube_resolv_conf: "/etc/resolv.conf"
 
-# Set to empty to avoid cgroup creation
-kubelet_enforce_node_allocatable: "\"\""
-
 # Set systemd service hardening features
 kubelet_systemd_hardening: false
 
@@ -24,28 +21,38 @@ kube_node_addresses: >-
   {%- endfor -%}
 kubelet_secure_addresses: "localhost link-local {{ kube_pods_subnet }} {{ kube_node_addresses }}"
 
-# Reserve this space for kube resources
-# Whether to run kubelet and container-engine daemons in a dedicated cgroup. (Not required for resource reservations).
-kube_reserved: false
-kube_reserved_cgroups: "/{{ kube_reserved_cgroups_for_service_slice }}"
+## Reserving compute resources
+# https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/
+
+# Resource reservations for kube daemons
 kube_memory_reserved: "256Mi"
 kube_cpu_reserved: "100m"
 kube_ephemeral_storage_reserved: "500Mi"
-kube_pid_reserved: "1000"
+kube_pid_reserved: 1000
 
 # Set slice for host system daemons (sshd, NetworkManager, ...)
 # You probably don't want to change this
 system_slice: system.slice
 
-# Set to true to reserve resources for system daemons
-system_reserved: false
-system_reserved_cgroups_for_service_slice: system.slice
-system_reserved_cgroups: "/{{ system_reserved_cgroups_for_service_slice }}"
+# Resource reservations for system daemons
 system_memory_reserved: "512Mi"
 system_cpu_reserved: "500m"
 system_ephemeral_storage_reserved: "500Mi"
 system_pid_reserved: 1000
 
+# Make the kubelet enforce the limits on Pods with cgroups
+enforce_allocatable_pods: true
+
+# Enforce kube_*_reserved as limits
+# WARNING: this limits the resources the kubelet and the container engine can
+# use, which can cause instability on your nodes
+enforce_allocatable_kube_reserved: false
+
+# Enforce system_*_reserved as limits
+# WARNING: this limits the resources system daemons can use, which can lock
+# you out of your nodes (by OOM-killing sshd, for instance)
+enforce_allocatable_system_reserved: false
+
 ## Eviction Thresholds to avoid system OOMs
 # https://kubernetes.io/docs/tasks/administer-cluster/reserve-compute-resources/#eviction-thresholds
 eviction_hard: {}
diff --git a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2 b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
index 8164e51ba46..c645b300dde 100644
--- a/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
+++ b/roles/kubernetes/node/templates/kubelet-config.v1beta1.yaml.j2
@@ -15,12 +15,13 @@ authorization:
 {% else %}
   mode: AlwaysAllow
 {% endif %}
-{% if kubelet_enforce_node_allocatable is defined and kubelet_enforce_node_allocatable != "\"\"" %}
-{% set kubelet_enforce_node_allocatable_list = kubelet_enforce_node_allocatable.split(",") %}
 enforceNodeAllocatable:
-{% for item in kubelet_enforce_node_allocatable_list %}
+{% if enforce_node_allocatable %}
+{% for item in enforce_node_allocatable %}
 - {{ item }}
 {% endfor %}
+{% else %}
+- none # don't enforce anything
 {% endif %}
 staticPodPath: {{ kube_manifest_dir }}
 cgroupDriver: {{ kubelet_cgroup_driver | default('systemd') }}
@@ -62,9 +63,7 @@ clusterDNS:
 {% endfor %}
 {# Node reserved CPU/memory #}
 {% for scope in "kube", "system" %}
-{% if lookup('ansible.builtin.vars', scope + "_reserved") | bool %}
 {{ scope }}ReservedCgroup: {{ lookup('ansible.builtin.vars', scope + '_slice_cgroup') }}
-{% endif %}
 {{ scope }}Reserved:
 {% for resource in "cpu", "memory", "ephemeral-storage", "pid" %}
   {{ resource }}: "{{ lookup('ansible.builtin.vars', scope + '_' ~ (resource | replace('-', '_')) + '_reserved') }}"
diff --git a/roles/kubernetes/node/vars/main.yml b/roles/kubernetes/node/vars/main.yml
index dec0ee6225e..f38510657fa 100644
--- a/roles/kubernetes/node/vars/main.yml
+++ b/roles/kubernetes/node/vars/main.yml
@@ -1,3 +1,8 @@
 ---
 kube_slice_cgroup: "/{{ kube_slice.split('-') | join('.slice/') }}/"
 system_slice_cgroup: "/{{ system_slice.split('-') | join('.slice/') }}/"
+enforce_node_allocatable_stub:
+  pods: "{{ enforce_allocatable_pods }}"
+  kube-reserved: "{{ enforce_allocatable_kube_reserved }}"
+  system-reserved: "{{ enforce_allocatable_system_reserved }}"
+enforce_node_allocatable: "{{ enforce_node_allocatable_stub | dict2items | selectattr('value') | map(attribute='key') }}"
diff --git a/roles/kubespray-defaults/defaults/main/main.yml b/roles/kubespray-defaults/defaults/main/main.yml
index d1f6922828b..725afd7c212 100644
--- a/roles/kubespray-defaults/defaults/main/main.yml
+++ b/roles/kubespray-defaults/defaults/main/main.yml
@@ -34,9 +34,6 @@ kube_proxy_mode: ipvs
 ## The timeout for init first control-plane
 kubeadm_init_timeout: 300s
 
-# TODO: remove this
-kube_reserved_cgroups_for_service_slice: kube.slice
-
 ## List of kubeadm init phases that should be skipped during control plane setup
 ## By default 'addon/coredns' is skipped
 ## 'addon/kube-proxy' gets skipped for some network plugins
diff --git a/tests/files/packet_ubuntu24-calico-all-in-one.yml b/tests/files/packet_ubuntu24-calico-all-in-one.yml
index 310bf349fa8..f25ba19e14b 100644
--- a/tests/files/packet_ubuntu24-calico-all-in-one.yml
+++ b/tests/files/packet_ubuntu24-calico-all-in-one.yml
@@ -23,5 +23,5 @@ containerd_registries_mirrors:
       capabilities: ["pull", "resolve", "push"]
       skip_verify: true
 
-kube_reserved: true
-system_reserved: true
+enforce_allocatable_kube_reserved: true
+enforce_allocatable_system_reserved: true
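
How the new enforce_node_allocatable list is derived, as a worked example of the
vars/main.yml filter chain (dict2items | selectattr('value') | map(attribute='key'));
the three scenarios below are taken from this patch:

    # enforce_node_allocatable, depending on the three toggles:
    #   defaults (only enforce_allocatable_pods: true)        -> ['pods']
    #   CI file (kube and system reservations also enforced)  -> ['pods', 'kube-reserved', 'system-reserved']
    #   all three toggles false                                -> []   (the template then emits "- none")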
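
A sketch of the KubeletConfiguration fragment the updated template would render
with the defaults above. The cgroup paths come from kube_slice_cgroup and
system_slice_cgroup; kube_slice is defined outside this patch, so the
"/kube.slice/" value is only an assumed example:

    enforceNodeAllocatable:
    - pods
    kubeReservedCgroup: /kube.slice/      # assumed; depends on kube_slice
    kubeReserved:
      cpu: "100m"
      memory: "256Mi"
      ephemeral-storage: "500Mi"
      pid: "1000"
    systemReservedCgroup: /system.slice/  # from system_slice: system.slice
    systemReserved:
      cpu: "500m"
      memory: "512Mi"
      ephemeral-storage: "500Mi"
      pid: "1000"

Note that both the ReservedCgroup and Reserved fields are now always rendered;
the kubelet only enforces the reservations for the scopes listed in
enforceNodeAllocatable.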
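
Operators who previously set kube_reserved/system_reserved would now opt in
through the new toggles in their inventory, as the CI file in this patch does;
the group_vars path and values below are illustrative only:

    # group_vars/k8s_cluster/k8s-cluster.yml (illustrative path)
    enforce_allocatable_kube_reserved: true
    enforce_allocatable_system_reserved: true
    # the reservation sizes themselves stay configurable, e.g.:
    kube_memory_reserved: "512Mi"
    system_cpu_reserved: "1"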