diff --git a/.changelog/14720.txt b/.changelog/14720.txt new file mode 100644 index 00000000000..c6ee35f40bc --- /dev/null +++ b/.changelog/14720.txt @@ -0,0 +1,3 @@ +```release-note:bug +csi: Fixed a bug where volume claims on lost or garbage collected nodes could not be freed +``` diff --git a/nomad/csi_endpoint.go b/nomad/csi_endpoint.go index f63097d8e89..7c0d8a675ca 100644 --- a/nomad/csi_endpoint.go +++ b/nomad/csi_endpoint.go @@ -636,6 +636,22 @@ func (v *CSIVolume) nodeUnpublishVolume(vol *structs.CSIVolume, claim *structs.C return err } + // If the node has been GC'd or is down, we can't send it a node + // unpublish. We need to assume the node has unpublished at its + // end. If it hasn't, any controller unpublish will potentially + // hang or error and need to be retried. + if claim.NodeID != "" { + node, err := snap.NodeByID(memdb.NewWatchSet(), claim.NodeID) + if err != nil { + return err + } + if node == nil || node.Status == structs.NodeStatusDown { + v.logger.Debug("skipping node unpublish for down or GC'd node") + claim.State = structs.CSIVolumeClaimStateNodeDetached + return v.checkpointClaim(vol, claim) + } + } + if claim.AllocationID != "" { err := v.nodeUnpublishVolumeImpl(vol, claim) if err != nil {