From af995d4a80429a5e4c6022c79994109793079374 Mon Sep 17 00:00:00 2001 From: Guillaume Demonet Date: Fri, 8 Jul 2022 18:03:37 +0200 Subject: [PATCH] salt/cri: Add utils to wait for kube-apiserver When waiting for kube-apiserver to restart, we can't rely on K8s API to give us a proper status (in single node, it would not be reachable). However, we have observed that sometimes, changing its manifest wouldn't trigger a restart by kubelet, which leads to a broken state where nothing converges (again, in single node). So we add simple utility methods to the `cri` module, and use it in `metalk8s.kubernetes.apiserver.installed` to ensure we at least attempt to restart kubelet if nothing moved after a while. --- salt/_modules/cri.py | 126 +++++++++++-- .../kubernetes/apiserver/installed.sls | 28 ++- salt/tests/unit/formulas/fixtures/salt.py | 1 + salt/tests/unit/modules/files/test_cri.yaml | 166 ++++++++++++++++-- salt/tests/unit/modules/test_cri.py | 118 ++++++++++--- 5 files changed, 379 insertions(+), 60 deletions(-) diff --git a/salt/_modules/cri.py b/salt/_modules/cri.py index 95b12b615c..5594e75fe3 100644 --- a/salt/_modules/cri.py +++ b/salt/_modules/cri.py @@ -2,8 +2,8 @@ Various functions to interact with a CRI daemon (through :program:`crictl`). """ -import re import logging +import re import time from salt.exceptions import CommandExecutionError @@ -229,25 +229,129 @@ def stop_pod(labels): This uses the :command:`crictl` command, which should be configured correctly on the system, e.g. in :file:`/etc/crictl.yaml`. """ - selector = ",".join([f"{key}={value}" for key, value in labels.items()]) + pod_ids = get_pod_id(labels=labels, ignore_not_found=True, multiple=True) + if not pod_ids: + return "No pods to stop" + + out = __salt__["cmd.run_all"](f"crictl stopp {' '.join(pod_ids)}") + + if out["retcode"] != 0: + selector = ",".join([f"{key}={value}" for key, value in labels.items()]) + raise CommandExecutionError( + f"Unable to stop pods with labels '{selector}':\n" + f"IDS: {' '.join(pod_ids)}\nSTDERR: {out['stderr']}\nSTDOUT: {out['stdout']}" + ) + + return out["stdout"] + - pod_ids_out = __salt__["cmd.run_all"](f"crictl pods --quiet --label={selector}") +def get_pod_id( + name=None, labels=None, state=None, multiple=False, ignore_not_found=False +): + """Retrieve the pod(s) ID(s) in CRI. + + .. note:: + + This uses the :command:`crictl` command, which should be configured + correctly on the system, e.g. in :file:`/etc/crictl.yaml`. + + name (str, optional) + Name of the target pod + + labels (dict, optional) + Labels of the target pod(s) + + state (str, optional) + The state in which we want to find the target pod(s) (`None` if all states are + acceptable) + + multiple (bool) + Whether to accept multiple pods returned (raise otherwise) + + ignore_not_found (bool) + Whether to raise if no target pod can be found + """ + pod_ids_cmd = "crictl pods --quiet" + info_parts = [] + if name is not None: + pod_ids_cmd += f" --name {name}" + info_parts.append(f"name '{name}'") + if labels is not None: + selector = ",".join([f"{key}={value}" for key, value in labels.items()]) + pod_ids_cmd += f" --label {selector}" + info_parts.append(f"labels '{selector}'") + if state is not None: + pod_ids_cmd += f" --state {state}" + info_parts.append(f"state '{state}'") + info = f"with {' and '.join(info_parts)}" + + pod_ids_out = __salt__["cmd.run_all"](pod_ids_cmd) if pod_ids_out["retcode"] != 0: raise CommandExecutionError( - f"Unable to get pods with labels {selector}:\n" + f"Unable to get pod {info}:\n" f"STDERR: {pod_ids_out['stderr']}\nSTDOUT: {pod_ids_out['stdout']}" ) - pod_ids = pod_ids_out["stdout"] + pod_ids = pod_ids_out["stdout"].splitlines() if not pod_ids: - return "No pods to stop" + if ignore_not_found: + return None + raise CommandExecutionError(f"No pod found {info}") - out = __salt__["cmd.run_all"](f"crictl stopp {pod_ids}") + if multiple: + return pod_ids - if out["retcode"] != 0: + if len(pod_ids) > 1: + raise CommandExecutionError(f"More than one pod found {info}") + + return pod_ids[0] + + +def wait_pod( + name, state="ready", last_id=None, timeout=60, sleep=5, raise_on_timeout=True +): + """Wait until the pod has been created/updated. + + .. note:: + + This uses the :command:`crictl` command, which should be configured + correctly on the system, e.g. in :file:`/etc/crictl.yaml`. + + name (str) + Name of the target pod + + state (str, optional) + The state in which we want to find the target pod (`None` if all states are + acceptable) + + last_id (str, optional) + ID of the pod before it was updated (set to `None` if just waiting for a new + pod) + + timeout (int) + Number of seconds to wait before bailing out + + sleep (int) + Number of seconds to wait between two checks + + raise_on_timeout (bool) + Whether to raise if the timeout period is exceeded (otherwise, return False) + """ + start_time = time.time() + + while time.time() - start_time < timeout: + current_id = get_pod_id(name=name, state=state, ignore_not_found=True) + if current_id and current_id != last_id: + return True + remaining = timeout + start_time - time.time() + if remaining < sleep: # Don't sleep if we know it's going to time out + break + time.sleep(sleep) + + if raise_on_timeout: + verb = "updated" if last_id else "created" raise CommandExecutionError( - f"Unable to stop pods with labels {selector}:\n" - f"IDS: {pod_ids}\nSTDERR: {out['stderr']}\nSTDOUT: {out['stdout']}" + f"Pod {name} was not {verb} after {(time.time() - start_time):.0f} seconds" ) - return out["stdout"] + return False diff --git a/salt/metalk8s/kubernetes/apiserver/installed.sls b/salt/metalk8s/kubernetes/apiserver/installed.sls index 04a79d39c1..b9c1505993 100644 --- a/salt/metalk8s/kubernetes/apiserver/installed.sls +++ b/salt/metalk8s/kubernetes/apiserver/installed.sls @@ -47,6 +47,11 @@ include: }) %} {%- endif %} +{%- set pod_name = "kube-apiserver-" ~ grains.id %} +{%- set last_pod_id = salt.cri.get_pod_id( + name=pod_name, state="ready", ignore_not_found=True +) %} + Create kube-apiserver Pod manifest: metalk8s.static_pod_managed: - name: /etc/kubernetes/manifests/kube-apiserver.yaml @@ -144,12 +149,23 @@ Create kube-apiserver Pod manifest: - file: Ensure SA pub key is present - file: Ensure Ingress CA cert is present -Delay after apiserver pod deployment: - module.run: - - test.sleep: - - length: 10 - - onchanges: +{%- if last_pod_id %} +Restart kubelet to make it pick up the manifest changes: + service.running: + - name: kubelet + - watch: - metalk8s: Create kube-apiserver Pod manifest + - unless: + # Do not restart kubelet if we see the Pod was updated + - fun: cri.wait_pod + name: {{ pod_name }} + last_id: {{ last_pod_id }} + timeout: 120 + raise_on_timeout: false + - require_in: + - module: Make sure kube-apiserver container is up and ready + +{%- endif %} Make sure kube-apiserver container is up and ready: module.run: @@ -159,8 +175,6 @@ Make sure kube-apiserver container is up and ready: - timeout: 120 - onchanges: - metalk8s: Create kube-apiserver Pod manifest - - require: - - module: Delay after apiserver pod deployment http.wait_for_successful_query: - name: https://{{ host }}:6443/healthz - verify_ssl: True diff --git a/salt/tests/unit/formulas/fixtures/salt.py b/salt/tests/unit/formulas/fixtures/salt.py index dec31fdfbb..705f2ab6c2 100644 --- a/salt/tests/unit/formulas/fixtures/salt.py +++ b/salt/tests/unit/formulas/fixtures/salt.py @@ -406,6 +406,7 @@ def slsutil_renderer(salt_mock: SaltMock, source: str, **_kwargs: Any) -> Any: # Static mocks {{{ +register_basic("cri.get_pod_id")(MagicMock(return_value="abcd1234")) register_basic("file.find")(MagicMock(return_value=[])) register_basic("file.join")(lambda *args: "/".join(args)) register_basic("file.read")(MagicMock(return_value="")) diff --git a/salt/tests/unit/modules/files/test_cri.yaml b/salt/tests/unit/modules/files/test_cri.yaml index f698b4bd9c..b68b412259 100644 --- a/salt/tests/unit/modules/files/test_cri.yaml +++ b/salt/tests/unit/modules/files/test_cri.yaml @@ -3,17 +3,16 @@ stop_pod: - result: "No pods to stop" # 2. Nominal a pod get stopped - - pod_ids_out: - stdout: "123abc" + - pod_ids: + - 123abc pod_stop_out: stdout: "Stopped sandbox 123abc" result: "Stopped sandbox 123abc" # 3. 2 pods get stopped - - pod_ids_out: - stdout: |- - 123abc - 456def + - pod_ids: + - 123abc + - 456def pod_stop_out: stdout: |- Stopped sandbox 123abc @@ -23,24 +22,159 @@ stop_pod: Stopped sandbox 456def # 4. Unable to get pod ID - - pod_ids_out: - retcode: 1 - stderr: "Oo eRroR" + - pod_ids_raise: Oo eRroR raises: True - result: |- - Unable to get pods with labels my.label=ABCD: - STDERR: Oo eRroR - STDOUT: + result: Oo eRroR # 5. Unable to stop a pod - - pod_ids_out: - stdout: "123abc" + - pod_ids: + - 123abc pod_stop_out: retcode: 1 stderr: "Apparently there is an eRrOr" raises: True result: |- - Unable to stop pods with labels my.label=ABCD: + Unable to stop pods with labels 'my.label=ABCD': IDS: 123abc STDERR: Apparently there is an eRrOr STDOUT: + +get_pod_id: + # 0. Pod found by name (ok) + - &_get_pod_id_base_ok + name: example + expected_cmd_args: "--name example" + pod_ids_out: + retcode: 0 + stdout: &_get_pod_id_found_id abcdef123456 + result: *_get_pod_id_found_id + # 1. Pod found by labels (ok) + - <<: *_get_pod_id_base_ok + labels: + my.label: ABCD + expected_cmd_args: "--name example --label my.label=ABCD" + # 2. Pod found by state (ok) + - <<: *_get_pod_id_base_ok + state: ready + expected_cmd_args: "--name example --state ready" + # 3. Pod not found by name (ok) + - &_get_pod_id_base_err + <<: *_get_pod_id_base_ok + pod_ids_out: + retcode: 0 + stdout: "" + raises: True + result: No pod found with name 'example' + # 4. Pod not found by labels (ok) + - <<: *_get_pod_id_base_err + name: null + labels: + my.label: ABCD + expected_cmd_args: "--label my.label=ABCD" + result: No pod found with labels 'my.label=ABCD' + # 5. Pod not found by state (ok) + - <<: *_get_pod_id_base_err + state: ready + expected_cmd_args: "--name example --state ready" + result: No pod found with name 'example' and state 'ready' + # 6. Multiple pods found (raise) + - &_get_pod_id_multiple + <<: *_get_pod_id_base_ok + pod_ids_out: + retcode: 0 + stdout: |- + abcdef123456 + ghijkl789123 + raises: True + result: More than one pod found with name 'example' + # 7. Multiple pods found (ok) + - <<: *_get_pod_id_multiple + multiple: True + raises: False + result: + - abcdef123456 + - ghijkl789123 + # 8. No pod found and no arg (raise) + - &_get_pod_id_none + <<: *_get_pod_id_base_ok + name: null + expected_cmd_args: null + pod_ids_out: + retcode: 0 + stdout: "" + raises: True + result: No pod found + # 9. No pod found and no arg (ok) + - <<: *_get_pod_id_none + ignore_not_found: True + raises: False + result: null + # 10. Some crictl error (raise) + - <<: *_get_pod_id_base_ok + pod_ids_out: + retcode: 1 + stderr: Some Standard Error + raises: True + result: |- + Unable to get pod with name 'example': + STDERR: Some Standard Error + STDOUT: + +wait_pod: + # 0. Pod was created + - name: example + state: ready + timeout: 5 + sleep: 1 + pod_ids: + - null + - null + - abc123 + result: True + # 1. Pod was updated + - name: example + timeout: 5 + sleep: 1 + last_id: abc123 + pod_ids: + - abc123 + - null + - def456 + result: True + # 2. Some crictl error (raise) + - name: example + pod_ids_raise: Oo ErRoR + raises: True + result: Oo ErRoR + # 3. Timed out - check updated (raise) + - name: example + timeout: 2 + sleep: 1 + last_id: abc123 + pod_ids: + - abc123 + - null + - null + raises: True + result: Pod example was not updated after 2 seconds + # 4. Timed out - check created (raise) + - name: example + timeout: 2 + sleep: 1 + pod_ids: + - null + - null + - null + raises: True + result: Pod example was not created after 2 seconds + # 5. Timed out (no raise) + - name: example + timeout: 2 + sleep: 1.5 + last_id: abc123 + raise_on_timeout: False + pod_ids: + - abc123 + - null + - null + result: False diff --git a/salt/tests/unit/modules/test_cri.py b/salt/tests/unit/modules/test_cri.py index bde0de1666..2bc163747b 100644 --- a/salt/tests/unit/modules/test_cri.py +++ b/salt/tests/unit/modules/test_cri.py @@ -288,37 +288,103 @@ def test_ready(self, retcode, result): self.assertEqual(cri.ready(), result) @utils.parameterized_from_cases(YAML_TESTS_CASES["stop_pod"]) - def test_stop_pod(self, result, pod_ids_out=None, pod_stop_out=None, raises=False): - """ - Tests the return of `stop_pod` function - """ - pod_selector = {"my.label": "ABCD"} - pod_selector_str = "--label=my.label=ABCD" - pod_id = (pod_ids_out or {}).get("stdout") + def test_stop_pod( + self, result, pod_ids=None, pod_ids_raise=None, pod_stop_out=None, raises=False + ): + """Test the return value of `stop_pod`.""" + pod_labels = {"my.label": "ABCD"} + + def pod_id_mock(*args, **kwargs): + self.assertEqual(args, ()) + self.assertEqual( + kwargs, dict(labels=pod_labels, multiple=True, ignore_not_found=True) + ) + if pod_ids_raise: + raise CommandExecutionError(pod_ids_raise) + return pod_ids + + def cmd_run_mock(cmd): + self.assertEqual(cmd, f"crictl stopp {' '.join(pod_ids)}") + return utils.cmd_output(**(pod_stop_out or {})) + + salt_dict = {"cmd.run_all": MagicMock(side_effect=cmd_run_mock)} + + with patch.dict(cri.__salt__, salt_dict), patch.object( + cri, "get_pod_id", pod_id_mock + ): + if raises: + with self.assertRaises(CommandExecutionError, msg=result): + cri.stop_pod(pod_labels) + else: + self.assertEqual(cri.stop_pod(pod_labels), result) + + @utils.parameterized_from_cases(YAML_TESTS_CASES["get_pod_id"]) + def test_get_pod_id( + self, + result, + pod_ids_out=None, + pod_stop_out=None, + raises=False, + expected_cmd_args=None, + **kwargs, + ): + """Test the return value of `get_pod_id`.""" + expect_cmd = f"crictl pods --quiet" + if expected_cmd_args: + expect_cmd += f" {expected_cmd_args}" def cmd_run_mock(cmd): - if cmd.startswith("crictl pods"): - self.assertIn(pod_selector_str, cmd) - return utils.cmd_output(**(pod_ids_out or {})) - elif cmd.startswith("crictl stopp"): - self.assertIn(pod_id, cmd) - return utils.cmd_output(**(pod_stop_out or {})) - raise Exception("Should not happen !!") + self.assertEqual(cmd, expect_cmd) + return utils.cmd_output(**(pod_ids_out or {})) salt_dict = {"cmd.run_all": MagicMock(side_effect=cmd_run_mock)} with patch.dict(cri.__salt__, salt_dict): if raises: - self.assertRaisesRegex( - CommandExecutionError, - result, - cri.stop_pod, - pod_selector, - ) + with self.assertRaises(CommandExecutionError, msg=result): + cri.get_pod_id(**kwargs) else: - self.assertEqual( - cri.stop_pod( - pod_selector, - ), - result, - ) + self.assertEqual(cri.get_pod_id(**kwargs), result) + + @utils.parameterized_from_cases(YAML_TESTS_CASES["wait_pod"]) + def test_wait_pod( + self, + result, + pod_ids=None, + pod_ids_raise=None, + raises=False, + **kwargs, + ): + """Test the return value of `wait_pod`.""" + timer = [0] + + def sleep_mock(duration): + timer.append(timer[-1] + duration) + + def time_mock(): + return timer[-1] + + result_iterator = iter(pod_ids or []) + + def pod_ids_mock(*a, **k): + self.assertEqual(a, ()) + self.assertEqual( + k, + dict( + name=kwargs.get("name"), + state=kwargs.get("state", "ready"), + ignore_not_found=True, + ), + ) + if pod_ids_raise: + raise CommandExecutionError(pod_ids_raise) + return next(result_iterator) + + with patch("time.sleep", sleep_mock), patch( + "time.time", time_mock + ), patch.object(cri, "get_pod_id", MagicMock(side_effect=pod_ids_mock)): + if raises: + with self.assertRaises(CommandExecutionError, msg=result): + cri.wait_pod(**kwargs) + else: + self.assertEqual(cri.wait_pod(**kwargs), result)