Skip to content

Commit

Permalink
fix(10661): volumes don't block deletion of unreachable nodes
Browse files Browse the repository at this point in the history
  • Loading branch information
typeid committed May 31, 2024
1 parent eaf97bc commit 3603655
Show file tree
Hide file tree
Showing 2 changed files with 121 additions and 1 deletion.
11 changes: 10 additions & 1 deletion internal/controllers/machine/machine_controller.go
Original file line number Diff line number Diff line change
Expand Up @@ -401,6 +401,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu

// After node draining is completed, and if isNodeVolumeDetachingAllowed returns True, make sure all
// volumes are detached before proceeding to delete the Node.
// In case the node is unreachable, the detachment is skipped.
if r.isNodeVolumeDetachingAllowed(m) {
// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
// so its transition time can be used to record the first time we wait for volume detachment.
Expand Down Expand Up @@ -690,7 +691,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
return ctrl.Result{}, nil
}

// shouldWaitForNodeVolumes returns true if node status still have volumes attached
// shouldWaitForNodeVolumes returns true if node status still have volumes attached and the node is reachable
// pod deletion and volume detach happen asynchronously, so pod could be deleted before volume detached from the node
// this could cause issue for some storage provisioner, for example, vsphere-volume this is problematic
// because if the node is deleted before detach success, then the underline VMDK will be deleted together with the Machine
Expand All @@ -712,6 +713,14 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus
return true, err
}

if noderefutil.IsNodeUnreachable(node) {
// If a node is unreachable, we can't detach the volume.
// We need to skip the detachment as we otherwise block deletions
// of unreachable nodes when a volume is attached.
log.Info("Skipping volume detachment as node is unreachable.")
return true, nil
}

return len(node.Status.VolumesAttached) != 0, nil
}

Expand Down
111 changes: 111 additions & 0 deletions internal/controllers/machine/machine_controller_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1599,6 +1599,117 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
}
}

func TestShouldWaitForNodeVolumes(t *testing.T) {
testCluster := &clusterv1.Cluster{
TypeMeta: metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: "test-cluster"},
}

attachedVolumes := []corev1.AttachedVolume{
{
Name: "test-volume",
DevicePath: "test-path",
},
}

tests := []struct {
name string
node *corev1.Node
expected bool
}{
{
name: "Node has volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
VolumesAttached: attachedVolumes,
},
},
expected: true,
},
{
name: "Node has no volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
expected: false,
},
{
name: "Node is unreachable and has volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "unreachable-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
},
},
VolumesAttached: attachedVolumes,
},
},
expected: true,
},
{
name: "Node is unreachable and has no volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "unreachable-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
},
},
},
},
expected: true,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

Check failure on line 1694 in internal/controllers/machine/machine_controller_test.go

View workflow job for this annotation

GitHub Actions / lint

unknown field UnstructuredCachingClient in struct literal of type Reconciler (typecheck)
var objs []client.Object
objs = append(objs, testCluster, tt.node)

c := fake.NewClientBuilder().WithObjects(objs...).Build()
tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster))
r := &Reconciler{
Client: c,
UnstructuredCachingClient: c,
Tracker: tracker,
}

got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(got).To(Equal(tt.expected))
})
}
}

func TestIsDeleteNodeAllowed(t *testing.T) {
deletionts := metav1.Now()

Expand Down

0 comments on commit 3603655

Please sign in to comment.