🐛 Machine deletion skips waiting for volumes detached for unreachable Nodes #10662

Merged · 1 commit · Jun 17, 2024
11 changes: 10 additions & 1 deletion internal/controllers/machine/machine_controller.go
@@ -400,6 +400,7 @@ func (r *Reconciler) reconcileDelete(ctx context.Context, cluster *clusterv1.Clu

// After node draining is completed, and if isNodeVolumeDetachingAllowed returns True, make sure all
// volumes are detached before proceeding to delete the Node.
// In case the node is unreachable, the detachment is skipped.
if r.isNodeVolumeDetachingAllowed(m) {
// The VolumeDetachSucceededCondition never exists before we wait for volume detachment for the first time,
// so its transition time can be used to record the first time we wait for volume detachment.
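For readers unfamiliar with the pattern referenced in the comment above, here is a minimal, self-contained sketch (not the actual controller code; the condition struct and the volumeDetachTimedOut helper are illustrative only) of how a condition's last transition time can record when the controller first started waiting, so that a detach timeout can later be measured against it:

package main

import (
	"fmt"
	"time"
)

// condition is a simplified stand-in for a status condition: the controller
// sets it the first time it starts waiting for volume detachment, and its
// transition time therefore records when the wait began.
type condition struct {
	Type               string
	LastTransitionTime time.Time
}

// volumeDetachTimedOut reports whether the wait recorded by the condition has
// exceeded the configured timeout. A nil condition means the wait has not
// started yet; a non-positive timeout means "wait forever".
func volumeDetachTimedOut(c *condition, timeout time.Duration, now time.Time) bool {
	if c == nil || timeout <= 0 {
		return false
	}
	return now.Sub(c.LastTransitionTime) >= timeout
}

func main() {
	waitStart := &condition{
		Type:               "VolumeDetachSucceeded",
		LastTransitionTime: time.Now().Add(-10 * time.Minute),
	}
	// Waited 10 minutes against a 5 minute timeout: stop waiting.
	fmt.Println(volumeDetachTimedOut(waitStart, 5*time.Minute, time.Now()))
	// No condition recorded yet: the wait has not started, keep going.
	fmt.Println(volumeDetachTimedOut(nil, 5*time.Minute, time.Now()))
}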
@@ -689,7 +690,7 @@ func (r *Reconciler) drainNode(ctx context.Context, cluster *clusterv1.Cluster,
return ctrl.Result{}, nil
}

// shouldWaitForNodeVolumes returns true if the node status still has volumes attached
// shouldWaitForNodeVolumes returns true if the node status still has volumes attached and the node is reachable
// pod deletion and volume detach happen asynchronously, so a pod could be deleted before its volumes are detached from the node
// this could cause issues for some storage provisioners; for example, for vsphere-volume this is problematic
// because if the node is deleted before detach succeeds, then the underlying VMDK will be deleted together with the Machine
@@ -711,6 +712,14 @@ func (r *Reconciler) shouldWaitForNodeVolumes(ctx context.Context, cluster *clus
return true, err
}

if noderefutil.IsNodeUnreachable(node) {
// If a node is unreachable, we can't detach the volume.
// We need to skip the detachment as we otherwise block deletions
// of unreachable nodes when a volume is attached.
log.Info("Skipping volume detachment as node is unreachable.")
return false, nil
}

return len(node.Status.VolumesAttached) != 0, nil
}
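For context, here is a rough, self-contained approximation of the unreachability check used above. It is not the upstream noderefutil implementation; the helper name isNodeUnreachable and the exact condition handling are inferred from the test cases added below, which mark a node unreachable when its Ready condition is reported as Unknown:

package main

import (
	"fmt"

	corev1 "k8s.io/api/core/v1"
)

// isNodeUnreachable approximates the check performed by noderefutil.IsNodeUnreachable,
// as suggested by the tests below: a node is treated as unreachable when its Ready
// condition is Unknown, i.e. the node controller has stopped hearing from the kubelet.
func isNodeUnreachable(node *corev1.Node) bool {
	if node == nil {
		return false
	}
	for _, c := range node.Status.Conditions {
		if c.Type == corev1.NodeReady {
			return c.Status == corev1.ConditionUnknown
		}
	}
	return false
}

func main() {
	unreachable := &corev1.Node{
		Status: corev1.NodeStatus{
			Conditions: []corev1.NodeCondition{
				{Type: corev1.NodeReady, Status: corev1.ConditionUnknown},
			},
		},
	}
	// true: for such a node, waiting for volume detachment would be skipped.
	fmt.Println(isNodeUnreachable(unreachable))
}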

110 changes: 110 additions & 0 deletions internal/controllers/machine/machine_controller_test.go
@@ -1591,6 +1591,116 @@ func TestIsNodeVolumeDetachingAllowed(t *testing.T) {
}
}

func TestShouldWaitForNodeVolumes(t *testing.T) {
testCluster := &clusterv1.Cluster{
TypeMeta: metav1.TypeMeta{Kind: "Cluster", APIVersion: clusterv1.GroupVersion.String()},
ObjectMeta: metav1.ObjectMeta{Namespace: metav1.NamespaceDefault, Name: "test-cluster"},
}

attachedVolumes := []corev1.AttachedVolume{
{
Name: "test-volume",
DevicePath: "test-path",
},
}

tests := []struct {
name string
node *corev1.Node
expected bool
}{
{
name: "Node has volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
VolumesAttached: attachedVolumes,
},
},
expected: true,
},
{
name: "Node has no volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "test-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionTrue,
},
},
},
},
expected: false,
},
{
name: "Node is unreachable and has volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "unreachable-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
},
},
VolumesAttached: attachedVolumes,
},
},
expected: false,
},
{
name: "Node is unreachable and has no volumes attached",
node: &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "unreachable-node",
},
Status: corev1.NodeStatus{
Conditions: []corev1.NodeCondition{
{
Type: corev1.NodeReady,
Status: corev1.ConditionUnknown,
},
},
},
},
expected: false,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
g := NewWithT(t)

var objs []client.Object
objs = append(objs, testCluster, tt.node)

c := fake.NewClientBuilder().WithObjects(objs...).Build()
tracker := remote.NewTestClusterCacheTracker(ctrl.Log, c, c, fakeScheme, client.ObjectKeyFromObject(testCluster))
r := &Reconciler{
Client: c,
Tracker: tracker,
}

got, err := r.shouldWaitForNodeVolumes(ctx, testCluster, tt.node.Name)
g.Expect(err).ToNot(HaveOccurred())
g.Expect(got).To(Equal(tt.expected))
})
}
}
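The table-driven test above is self-contained, so it can be run on its own with the standard Go tooling, for example with go test -run TestShouldWaitForNodeVolumes ./internal/controllers/machine/ from the repository root.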

func TestIsDeleteNodeAllowed(t *testing.T) {
deletionts := metav1.Now()
