From b547906101a44486c1fd3e0e5bf90fc4414df15b Mon Sep 17 00:00:00 2001 From: Dean Roehrich Date: Mon, 21 Oct 2024 09:27:11 -0500 Subject: [PATCH] Add ability repair the state of the NoExecute/NoSchedule taints (#403) A rabbit that has lost its NoSchedule taint, but retains its nnf.cray.hpe.com/taints_and_labels_completed=true label, was not able to repair its taints. This change allows the nnf_systemconfiguration_controller to examine the node and determine whether the label is stale with respect to the state of the taints, and to correct the taints if necessary. Signed-off-by: Dean Roehrich --- .../nnf_systemconfiguration_controller.go | 44 ++++++++++++++++--- ...nnf_systemconfiguration_controller_test.go | 32 ++++++++++++++ 2 files changed, 69 insertions(+), 7 deletions(-) diff --git a/internal/controller/nnf_systemconfiguration_controller.go b/internal/controller/nnf_systemconfiguration_controller.go index 3dacf722..651c99e5 100644 --- a/internal/controller/nnf_systemconfiguration_controller.go +++ b/internal/controller/nnf_systemconfiguration_controller.go @@ -220,14 +220,43 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context, labels = make(map[string]string) } - if _, present := labels[nnfv1alpha3.TaintsAndLabelsCompletedLabel]; present { - continue + taint := &corev1.Taint{ + Key: nnfv1alpha3.RabbitNodeTaintKey, + Value: "true", } - taint := &corev1.Taint{ - Key: nnfv1alpha3.RabbitNodeTaintKey, - Value: "true", - Effect: effect, + staleLabel := false + _, hasCompletedLabel := labels[nnfv1alpha3.TaintsAndLabelsCompletedLabel] + if effect == corev1.TaintEffectNoSchedule && hasCompletedLabel { + // We're in pass 1. + // The presence of the label means that the taint state has been + // completed. On the first pass we verify that this node is + // correctly labelled. + + // NoSchedule should be there... + taint.Effect = corev1.TaintEffectNoSchedule + if !taints.TaintExists(node.Spec.Taints, taint) { + // The label is incorrect; the expected taint was not present. + staleLabel = true + + } + // NoExecute should NOT be there... + taint.Effect = corev1.TaintEffectNoExecute + if taints.TaintExists(node.Spec.Taints, taint) { + // The label is incorrect; this taint should not have been here. + staleLabel = true + + } + if !staleLabel { + // This node is complete and correct; go to the next one. + continue + } + // Clear the label and continue working on this node. + delete(labels, nnfv1alpha3.TaintsAndLabelsCompletedLabel) + node.SetLabels(labels) + } else if hasCompletedLabel { + // All other passes honor the label. + continue } if effect == clearNoExecute { @@ -244,6 +273,7 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context, node.SetLabels(labels) } else { // Add the taint. + taint.Effect = effect node, doUpdate, err = taints.AddOrUpdateTaint(node, taint) if err != nil { log.Error(err, "unable to add taint to spec", "key", taint.Key, "effect", taint.Effect) @@ -258,7 +288,7 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context, node.SetLabels(labels) } - if doUpdate { + if doUpdate || staleLabel { updatedNode = true if err := r.Update(ctx, node); err != nil { log.Error(err, "unable to update taints and/or labels") diff --git a/internal/controller/nnf_systemconfiguration_controller_test.go b/internal/controller/nnf_systemconfiguration_controller_test.go index 18453cf3..a9d2e886 100644 --- a/internal/controller/nnf_systemconfiguration_controller_test.go +++ b/internal/controller/nnf_systemconfiguration_controller_test.go @@ -157,6 +157,8 @@ var _ = Describe("Adding taints and labels to nodes", func() { When("a node is added", func() { It("should be properly tainted and labeled", func() { + var err error + var updated bool node1 := makeNode("rabbit1") Expect(k8sClient.Create(context.TODO(), node1)).To(Succeed()) By("verifying node1") @@ -186,6 +188,36 @@ var _ = Describe("Adding taints and labels to nodes", func() { By("verifying node1 was not touched when nodes 2 and 3 were added") node1ResVer3 := verifyTaintsAndLabels(node1) Expect(node1ResVer3).To(Equal(node1ResVer2)) + + By("verifying node1 repairs itself when its two taints are flipped") + Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1)) + node1, updated, err = taints.RemoveTaint(node1, taintNoSchedule) + Expect(err).To(BeNil()) + Expect(updated).To(BeTrue()) + node1, updated, err = taints.AddOrUpdateTaint(node1, taintNoExecute) + Expect(err).To(BeNil()) + Expect(updated).To(BeTrue()) + Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed()) + By("verifying node1") + _ = verifyTaintsAndLabels(node1) + + By("verifying node1 repairs itself when it loses its NoSchedule taint") + Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1)) + node1, updated, err = taints.RemoveTaint(node1, taintNoSchedule) + Expect(err).To(BeNil()) + Expect(updated).To(BeTrue()) + Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed()) + By("verifying node1") + _ = verifyTaintsAndLabels(node1) + + By("verifying node1 repairs itself when it acquires its NoExecute taint") + Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1)) + node1, updated, err = taints.AddOrUpdateTaint(node1, taintNoExecute) + Expect(err).To(BeNil()) + Expect(updated).To(BeTrue()) + Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed()) + By("verifying node1") + _ = verifyTaintsAndLabels(node1) }) It("should not taint and label a non-Rabbit node", func() {