Skip to content

Commit

Permalink
Add ability repair the state of the NoExecute/NoSchedule taints (#403)
Browse files Browse the repository at this point in the history
A rabbit that has lost its NoSchedule taint, but retains its
nnf.cray.hpe.com/taints_and_labels_completed=true label, was not able to
repair its taints.

This change allows the nnf_systemconfiguration_controller to examine the node
and determine whether the label is stale with respect to the state of the
taints, and to correct the taints if necessary.

Signed-off-by: Dean Roehrich <[email protected]>
  • Loading branch information
roehrich-hpe authored Oct 21, 2024
1 parent 49d8beb commit b547906
Show file tree
Hide file tree
Showing 2 changed files with 69 additions and 7 deletions.
44 changes: 37 additions & 7 deletions internal/controller/nnf_systemconfiguration_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -220,14 +220,43 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context,
labels = make(map[string]string)
}

if _, present := labels[nnfv1alpha3.TaintsAndLabelsCompletedLabel]; present {
continue
taint := &corev1.Taint{
Key: nnfv1alpha3.RabbitNodeTaintKey,
Value: "true",
}

taint := &corev1.Taint{
Key: nnfv1alpha3.RabbitNodeTaintKey,
Value: "true",
Effect: effect,
staleLabel := false
_, hasCompletedLabel := labels[nnfv1alpha3.TaintsAndLabelsCompletedLabel]
if effect == corev1.TaintEffectNoSchedule && hasCompletedLabel {
// We're in pass 1.
// The presence of the label means that the taint state has been
// completed. On the first pass we verify that this node is
// correctly labelled.

// NoSchedule should be there...
taint.Effect = corev1.TaintEffectNoSchedule
if !taints.TaintExists(node.Spec.Taints, taint) {
// The label is incorrect; the expected taint was not present.
staleLabel = true

}
// NoExecute should NOT be there...
taint.Effect = corev1.TaintEffectNoExecute
if taints.TaintExists(node.Spec.Taints, taint) {
// The label is incorrect; this taint should not have been here.
staleLabel = true

}
if !staleLabel {
// This node is complete and correct; go to the next one.
continue
}
// Clear the label and continue working on this node.
delete(labels, nnfv1alpha3.TaintsAndLabelsCompletedLabel)
node.SetLabels(labels)
} else if hasCompletedLabel {
// All other passes honor the label.
continue
}

if effect == clearNoExecute {
Expand All @@ -244,6 +273,7 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context,
node.SetLabels(labels)
} else {
// Add the taint.
taint.Effect = effect
node, doUpdate, err = taints.AddOrUpdateTaint(node, taint)
if err != nil {
log.Error(err, "unable to add taint to spec", "key", taint.Key, "effect", taint.Effect)
Expand All @@ -258,7 +288,7 @@ func (r *NnfSystemConfigurationReconciler) labelsAndTaints(ctx context.Context,
node.SetLabels(labels)
}

if doUpdate {
if doUpdate || staleLabel {
updatedNode = true
if err := r.Update(ctx, node); err != nil {
log.Error(err, "unable to update taints and/or labels")
Expand Down
32 changes: 32 additions & 0 deletions internal/controller/nnf_systemconfiguration_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -157,6 +157,8 @@ var _ = Describe("Adding taints and labels to nodes", func() {

When("a node is added", func() {
It("should be properly tainted and labeled", func() {
var err error
var updated bool
node1 := makeNode("rabbit1")
Expect(k8sClient.Create(context.TODO(), node1)).To(Succeed())
By("verifying node1")
Expand Down Expand Up @@ -186,6 +188,36 @@ var _ = Describe("Adding taints and labels to nodes", func() {
By("verifying node1 was not touched when nodes 2 and 3 were added")
node1ResVer3 := verifyTaintsAndLabels(node1)
Expect(node1ResVer3).To(Equal(node1ResVer2))

By("verifying node1 repairs itself when its two taints are flipped")
Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1))
node1, updated, err = taints.RemoveTaint(node1, taintNoSchedule)
Expect(err).To(BeNil())
Expect(updated).To(BeTrue())
node1, updated, err = taints.AddOrUpdateTaint(node1, taintNoExecute)
Expect(err).To(BeNil())
Expect(updated).To(BeTrue())
Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed())
By("verifying node1")
_ = verifyTaintsAndLabels(node1)

By("verifying node1 repairs itself when it loses its NoSchedule taint")
Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1))
node1, updated, err = taints.RemoveTaint(node1, taintNoSchedule)
Expect(err).To(BeNil())
Expect(updated).To(BeTrue())
Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed())
By("verifying node1")
_ = verifyTaintsAndLabels(node1)

By("verifying node1 repairs itself when it acquires its NoExecute taint")
Expect(k8sClient.Get(context.TODO(), client.ObjectKeyFromObject(node1), node1))
node1, updated, err = taints.AddOrUpdateTaint(node1, taintNoExecute)
Expect(err).To(BeNil())
Expect(updated).To(BeTrue())
Expect(k8sClient.Update(context.TODO(), node1)).To(Succeed())
By("verifying node1")
_ = verifyTaintsAndLabels(node1)
})

It("should not taint and label a non-Rabbit node", func() {
Expand Down

0 comments on commit b547906

Please sign in to comment.