csi: move volume claim release into volumewatcher #7794
@@ -0,0 +1,125 @@
package volumewatcher

Note for review: this file draws heavily on https://github.com/hashicorp/nomad/blob/master/nomad/deploymentwatcher/batcher.go

import (
    "context"
    "time"

    "github.com/hashicorp/nomad/nomad/structs"
)

// VolumeUpdateBatcher is used to batch the updates for volume claims
type VolumeUpdateBatcher struct {
    // batch is the batching duration
    batch time.Duration

    // raft is used to actually commit the updates
    raft VolumeRaftEndpoints

    // workCh is used to pass claim updates to the daemon process
    workCh chan *updateWrapper

    // ctx is used to exit the daemon batcher
    ctx context.Context
}

// NewVolumeUpdateBatcher returns a VolumeUpdateBatcher that uses the
// passed raft endpoints to create the updates to volume claims, and
// exits the batcher when the passed context is canceled.
func NewVolumeUpdateBatcher(batchDuration time.Duration, raft VolumeRaftEndpoints, ctx context.Context) *VolumeUpdateBatcher {
    b := &VolumeUpdateBatcher{
        batch:  batchDuration,
        raft:   raft,
        ctx:    ctx,
        workCh: make(chan *updateWrapper, 10),
    }

    go b.batcher()
    return b
}

// CreateUpdate batches the volume claim update and returns a future
// that tracks the completion of the request.
func (b *VolumeUpdateBatcher) CreateUpdate(claims []structs.CSIVolumeClaimRequest) *BatchFuture {
    wrapper := &updateWrapper{
        claims: claims,
        f:      make(chan *BatchFuture, 1),
    }

    b.workCh <- wrapper
    return <-wrapper.f
}

type updateWrapper struct {
    claims []structs.CSIVolumeClaimRequest
    f      chan *BatchFuture
}

// batcher is the long lived batcher goroutine
func (b *VolumeUpdateBatcher) batcher() {
    var timerCh <-chan time.Time
    claims := make(map[string]structs.CSIVolumeClaimRequest)
    future := NewBatchFuture()
    for {
        select {
        case <-b.ctx.Done():
            // note: we can't flush here because we're likely no
            // longer the leader
            return
        case w := <-b.workCh:
            if timerCh == nil {
                timerCh = time.After(b.batch)
            }

            // de-dupe and store the claim update, and attach the future
            for _, upd := range w.claims {
                claims[upd.VolumeID+upd.RequestNamespace()] = upd
            }
            w.f <- future
        case <-timerCh:

Review comment: In general for this pattern, I'd expect that we'd want a batch max size or a timer, where either of those would apply the batch. I know the raft lib has some recent support for splitting large commits, but it seems like we're missing a batch max from this (and from deployments). I think we're good to go ahead, but we should consider using an interface to allow this and deployments to share the same batcher code later.

Reply: Good call. Spun out to #7838
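To make the suggestion concrete, here is a rough sketch of a size-based flush that could sit at the end of the case w := <-b.workCh: branch above. The maxBatchSize field is assumed for illustration and is not part of this change:

    // Hypothetical addition to the workCh case: flush early once the
    // pending set reaches a maximum size, rather than always waiting
    // for the timer. maxBatchSize would be a new field on
    // VolumeUpdateBatcher.
    if len(claims) >= b.maxBatchSize {
        f := future
        future = NewBatchFuture()
        req := structs.CSIVolumeClaimBatchRequest{}
        for _, claim := range claims {
            req.Claims = append(req.Claims, claim)
        }
        go f.Set(b.raft.UpsertVolumeClaims(&req))
        claims = make(map[string]structs.CSIVolumeClaimRequest)
        timerCh = nil
    }

The deploymentwatcher batcher has the same timer-only shape, which is why sharing the batcher code behind an interface was suggested.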
            // Capture the future and create a new one
            f := future
            future = NewBatchFuture()

            // Create the batch request
            req := structs.CSIVolumeClaimBatchRequest{}
            for _, claim := range claims {
                req.Claims = append(req.Claims, claim)
            }

            // Upsert the claims in a go routine
            go f.Set(b.raft.UpsertVolumeClaims(&req))

            // Reset the claims list and timer
            claims = make(map[string]structs.CSIVolumeClaimRequest)
            timerCh = nil
        }
    }
}

// BatchFuture is a future that can be used to retrieve the index for
// the update or any error in the update process
type BatchFuture struct {
    index  uint64
    err    error
    waitCh chan struct{}
}

// NewBatchFuture returns a new BatchFuture
func NewBatchFuture() *BatchFuture {
    return &BatchFuture{
        waitCh: make(chan struct{}),
    }
}

// Set sets the results of the future, unblocking any client.
func (f *BatchFuture) Set(index uint64, err error) {
    f.index = index
    f.err = err
    close(f.waitCh)
}

// Results returns the creation index and any error.
func (f *BatchFuture) Results() (uint64, error) {
    <-f.waitCh
    return f.index, f.err
}
@@ -0,0 +1,85 @@
package volumewatcher

import (
    "context"
    "fmt"
    "sync"
    "testing"

    "github.com/hashicorp/nomad/helper/testlog"
    "github.com/hashicorp/nomad/nomad/mock"
    "github.com/hashicorp/nomad/nomad/state"
    "github.com/hashicorp/nomad/nomad/structs"
    "github.com/stretchr/testify/require"
)

// TestVolumeWatch_Batcher tests the update batching logic
func TestVolumeWatch_Batcher(t *testing.T) {
    t.Parallel()
    require := require.New(t)

    ctx, exitFn := context.WithCancel(context.Background())
    defer exitFn()

    srv := &MockBatchingRPCServer{}
    srv.state = state.TestStateStore(t)
    srv.volumeUpdateBatcher = NewVolumeUpdateBatcher(CrossVolumeUpdateBatchDuration, srv, ctx)

    plugin := mock.CSIPlugin()
    node := testNode(nil, plugin, srv.State())

    // because we wait for the results to return from the batch for each
    // Watcher.updateClaims, we can't test that we're batching except across
    // multiple volume watchers. create 2 volumes and their watchers here.
    alloc0 := mock.Alloc()
    alloc0.ClientStatus = structs.AllocClientStatusComplete
    vol0 := testVolume(nil, plugin, alloc0, node.ID)
    w0 := &volumeWatcher{
        v:            vol0,
        rpc:          srv,
        state:        srv.State(),
        updateClaims: srv.UpdateClaims,
        logger:       testlog.HCLogger(t),
    }

    alloc1 := mock.Alloc()
    alloc1.ClientStatus = structs.AllocClientStatusComplete
    vol1 := testVolume(nil, plugin, alloc1, node.ID)
    w1 := &volumeWatcher{
        v:            vol1,
        rpc:          srv,
        state:        srv.State(),
        updateClaims: srv.UpdateClaims,
        logger:       testlog.HCLogger(t),
    }

    srv.nextCSIControllerDetachError = fmt.Errorf("some controller plugin error")

    var wg sync.WaitGroup
    wg.Add(2)

    go func() {
        w0.volumeReapImpl(vol0)
        wg.Done()
    }()
    go func() {
        w1.volumeReapImpl(vol1)
        wg.Done()
    }()

    wg.Wait()

    require.Equal(structs.CSIVolumeClaimStateNodeDetached, vol0.PastClaims[alloc0.ID].State)
    require.Equal(structs.CSIVolumeClaimStateNodeDetached, vol1.PastClaims[alloc1.ID].State)
    require.Equal(2, srv.countCSINodeDetachVolume)
    require.Equal(2, srv.countCSIControllerDetachVolume)
    require.Equal(2, srv.countUpdateClaims)

    // note: it's technically possible that the volumeReapImpl
    // goroutines get de-scheduled and we don't write both updates in
    // the same batch. but this seems really unlikely, so we're
    // testing for both cases here so that if we start seeing a flake
    // here in the future we have a clear cause for it.
    require.GreaterOrEqual(srv.countUpsertVolumeClaims, 1)
    require.Equal(1, srv.countUpsertVolumeClaims)
}
@@ -0,0 +1,28 @@
package volumewatcher

Note for review: these interfaces exist solely to give us handles for RPC mocks in the tests (so we don't have to set up clients with CSI plugins in unit tests).

import (
    cstructs "github.com/hashicorp/nomad/client/structs"
    "github.com/hashicorp/nomad/nomad/structs"
)

// VolumeRaftEndpoints exposes the volume watcher to a set of functions
// to apply data transforms via Raft.
type VolumeRaftEndpoints interface {

    // UpsertVolumeClaims applies a batch of claims to raft
    UpsertVolumeClaims(*structs.CSIVolumeClaimBatchRequest) (uint64, error)
}

// ClientRPC is a minimal interface of the Server, intended as an aid
// for testing logic surrounding server-to-server or server-to-client
// RPC calls and to avoid circular references between the nomad
// package and the volumewatcher
type ClientRPC interface {
    ControllerDetachVolume(args *cstructs.ClientCSIControllerDetachVolumeRequest, reply *cstructs.ClientCSIControllerDetachVolumeResponse) error
    NodeDetachVolume(args *cstructs.ClientCSINodeDetachVolumeRequest, reply *cstructs.ClientCSINodeDetachVolumeResponse) error
}

// updateClaimsFn is the function used to update claims on behalf of a volume
// (used to wrap batch updates so that we can test
// volumeWatcher methods synchronously without batching)
type updateClaimsFn func(claims []structs.CSIVolumeClaimRequest) (uint64, error)
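Picking up the review note above about RPC mocks, here is a minimal sketch of the kind of test double these interfaces allow. The mockRaft type is illustrative only (the tests in this PR use their own mocks) and assumes the sync and structs imports:

    // mockRaft is a hypothetical VolumeRaftEndpoints implementation that
    // records the batches it receives instead of writing to raft, so tests
    // can assert on batching behavior without a server or CSI plugins.
    type mockRaft struct {
        mu      sync.Mutex
        index   uint64
        batches []*structs.CSIVolumeClaimBatchRequest
    }

    func (m *mockRaft) UpsertVolumeClaims(req *structs.CSIVolumeClaimBatchRequest) (uint64, error) {
        m.mu.Lock()
        defer m.mu.Unlock()
        m.index++
        m.batches = append(m.batches, req)
        return m.index, nil
    }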
Note for review: it's worth looking at this file without the diff... the diff really tangled this up because the applyCSIVolumeBatchClaim is similar to the applyCSIVolumeClaim. This should be entirely an addition.