From 73b25c6f480400acf50e28f6f3143c4baa89e1c2 Mon Sep 17 00:00:00 2001 From: David Louks <2402775+dlouks@users.noreply.github.com> Date: Wed, 13 Jan 2021 16:15:11 -0600 Subject: [PATCH 1/2] Remove ignore_errors from drain tasks and enable retries --- .../remove-node/pre-remove/defaults/main.yml | 2 ++ roles/remove-node/pre-remove/tasks/main.yml | 35 +++++++++++++------ 2 files changed, 26 insertions(+), 11 deletions(-) diff --git a/roles/remove-node/pre-remove/defaults/main.yml b/roles/remove-node/pre-remove/defaults/main.yml index 1e5b849cc92..deaa8afb7f5 100644 --- a/roles/remove-node/pre-remove/defaults/main.yml +++ b/roles/remove-node/pre-remove/defaults/main.yml @@ -2,3 +2,5 @@ allow_ungraceful_removal: false drain_grace_period: 300 drain_timeout: 360s +drain_retries: 3 +drain_retry_delay_seconds: 10 diff --git a/roles/remove-node/pre-remove/tasks/main.yml b/roles/remove-node/pre-remove/tasks/main.yml index 563fa036eb8..681804cbf81 100644 --- a/roles/remove-node/pre-remove/tasks/main.yml +++ b/roles/remove-node/pre-remove/tasks/main.yml @@ -1,14 +1,26 @@ --- -- name: cordon-node | Mark all nodes as unschedulable before drain # noqa 301 - command: >- - {{ bin_dir }}/kubectl cordon {{ hostvars[item]['kube_override_hostname']|default(item) }} - with_items: - - "{{ node.split(',') | default(groups['kube-node']) }}" - register: result - failed_when: result.rc != 0 and not allow_ungraceful_removal +- name: remove-node | Set `nodes_to_drain` as empty list + set_fact: + nodes_to_drain: [] + +- name: remove-node | Identify nodes to drain, ignore non-cluster nodes + shell: | + set -o pipefail + {{ bin_dir }}/kubectl get nodes -o json \ + | jq .items[].metadata.name \ + | jq "select(. 
| test(\"^{{ hostvars[item]['kube_override_hostname']|default(item) }}$\"))" + loop: "{{ node.split(',') | default(groups['kube-node']) }}" + register: nodes delegate_to: "{{ groups['kube-master']|first }}" + changed_when: false + run_once: true + +- name: remove-node | Generate list of nodes to drain + set_fact: + nodes_to_drain: "{{ nodes_to_drain }} + [ '{{ item.stdout | regex_replace('\"', '') }}' ]" + loop: "{{ nodes.results }}" + when: item.stdout != "" run_once: true - ignore_errors: yes - name: remove-node | Drain node except daemonsets resource # noqa 301 command: >- @@ -18,10 +30,11 @@ --grace-period {{ drain_grace_period }} --timeout {{ drain_timeout }} --delete-local-data {{ hostvars[item]['kube_override_hostname']|default(item) }} - with_items: - - "{{ node.split(',') | default(groups['kube-node']) }}" + loop: "{{ nodes_to_drain }}" register: result failed_when: result.rc != 0 and not allow_ungraceful_removal delegate_to: "{{ groups['kube-master']|first }}" run_once: true - ignore_errors: yes + until: result.rc == 0 or allow_ungraceful_removal + retries: "{{ drain_retries }}" + delay: "{{ drain_retry_delay_seconds }}" From 80735526ba2e4bd71a82de171b52a2aa638dfb6e Mon Sep 17 00:00:00 2001 From: David Louks <2402775+dlouks@users.noreply.github.com> Date: Fri, 15 Jan 2021 12:06:29 -0600 Subject: [PATCH 2/2] Fix lint error by checking if stdout length is not 0, i.e. the string is not empty. 
--- roles/remove-node/pre-remove/tasks/main.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/roles/remove-node/pre-remove/tasks/main.yml b/roles/remove-node/pre-remove/tasks/main.yml index 681804cbf81..42316e209d4 100644 --- a/roles/remove-node/pre-remove/tasks/main.yml +++ b/roles/remove-node/pre-remove/tasks/main.yml @@ -19,7 +19,7 @@ set_fact: nodes_to_drain: "{{ nodes_to_drain }} + [ '{{ item.stdout | regex_replace('\"', '') }}' ]" loop: "{{ nodes.results }}" - when: item.stdout != "" + when: item.stdout | length != 0 run_once: true - name: remove-node | Drain node except daemonsets resource # noqa 301