CSI: improve controller RPC reliability #17996
@@ -0,0 +1,11 @@
```release-note:bug
csi: Fixed a bug in sending concurrent requests to CSI controller plugins by serializing them per plugin
```

```release-note:bug
csi: Fixed a bug where CSI controller requests could be sent to unhealthy plugins
```

```release-note:bug
csi: Fixed a bug where CSI controller requests could not be sent to controllers on nodes ineligible for scheduling
```
```diff
@@ -4,6 +4,7 @@
 package nomad

 import (
+    "context"
     "fmt"
     "net/http"
     "strings"

@@ -549,7 +550,9 @@ func (v *CSIVolume) controllerPublishVolume(req *structs.CSIVolumeClaimRequest,
     cReq.PluginID = plug.ID
     cResp := &cstructs.ClientCSIControllerAttachVolumeResponse{}

-    err = v.srv.RPC(method, cReq, cResp)
+    err = v.serializedControllerRPC(plug.ID, func() error {
+        return v.srv.RPC(method, cReq, cResp)
+    })
     if err != nil {
         if strings.Contains(err.Error(), "FailedPrecondition") {
             return fmt.Errorf("%v: %v", structs.ErrCSIClientRPCRetryable, err)

@@ -586,6 +589,57 @@ func (v *CSIVolume) volAndPluginLookup(namespace, volID string) (*structs.CSIPlu
     return plug, vol, nil
 }

+// serializedControllerRPC ensures we're only sending a single controller RPC to
+// a given plugin if the RPC can cause conflicting state changes.
+//
+// The CSI specification says that we SHOULD send no more than one in-flight
+// request per *volume* at a time, with an allowance for losing state
+// (ex. leadership transitions) which the plugins SHOULD handle gracefully.
+//
+// In practice many CSI plugins rely on k8s-specific sidecars for serializing
+// storage provider API calls globally (ex. concurrently attaching EBS volumes
+// to an EC2 instance results in a race for device names). So we have to be much
+// more conservative about concurrency in Nomad than the spec allows.
+func (v *CSIVolume) serializedControllerRPC(pluginID string, fn func() error) error {
+
+    for {
+        v.srv.volumeControllerLock.Lock()
+        future := v.srv.volumeControllerFutures[pluginID]
+        if future == nil {
+            future, futureDone := context.WithCancel(v.srv.shutdownCtx)
+            v.srv.volumeControllerFutures[pluginID] = future
+            v.srv.volumeControllerLock.Unlock()
+
+            err := fn()
+
+            // close the future while holding the lock and not in a defer so
+            // that we can ensure we've cleared it from the map before allowing
+            // anyone else to take the lock and write a new one
+            v.srv.volumeControllerLock.Lock()
+            futureDone()
```
Comment: I find myself wanting simpler map thread-safety here, and a `sync.Map`.

Reply: Yeah, we're locking around both access to the map and the construction/cancellation of contexts. With a `sync.Map` it would look something like this:

```go
// spin up a context goroutine whether we need it or not!
future, futureDone := context.WithCancel(v.srv.shutdownCtx)
defer futureDone()
for {
    ok := v.srv.volumeControllerFutures.CompareAndSwap(pluginID, nil, future)
    if ok {
        // success path
        err := fn()
        futureDone()
        v.srv.volumeControllerFutures.CompareAndDelete(pluginID, future)
        return err
    } else {
        // wait path
        // racy! what if it's been deleted?
        waitOn, ok := v.srv.volumeControllerFutures.Load(pluginID)
        if !ok {
            continue // lost the race with the delete, start again
        }
        select {
        case <-waitOn.Done():
            continue // other goroutine is done
        case <-v.srv.shutdownCh:
            return structs.ErrNoLeader
        }
    }
}
```

There are a few reasons why this isn't the best solution: we'd spin up a context goroutine on every call whether we need it or not, and the wait path is racy against the delete, as noted in the comments above.

Reply: Nice, thanks for the mock-up and detailed explanation!
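For readers following along outside the Nomad tree, here is a self-contained sketch of the mutex-plus-map approach the PR actually takes. The `perPluginSerializer` type, its field names, and the demo in `main` are stand-ins invented for illustration, not Nomad's real `Server` type; they mirror the `volumeControllerLock`, `volumeControllerFutures`, and `shutdownCtx` fields used in the diff.

```go
package main

import (
    "context"
    "errors"
    "fmt"
    "sync"
    "time"
)

// perPluginSerializer is an illustrative stand-in for the Server fields used in
// the diff: a mutex guarding a map of per-plugin cancellable contexts ("futures").
type perPluginSerializer struct {
    shutdownCtx context.Context
    lock        sync.Mutex
    futures     map[string]context.Context
}

// serialized blocks until no other call for the same pluginID is in flight,
// then runs fn. It mirrors the shape of serializedControllerRPC above.
func (s *perPluginSerializer) serialized(pluginID string, fn func() error) error {
    for {
        s.lock.Lock()
        if future := s.futures[pluginID]; future != nil {
            // another call is in flight for this plugin: wait for it to finish
            s.lock.Unlock()
            select {
            case <-future.Done():
                continue // retry claiming the slot
            case <-s.shutdownCtx.Done():
                return errors.New("shutting down")
            }
        }

        // claim the slot for this plugin
        future, futureDone := context.WithCancel(s.shutdownCtx)
        s.futures[pluginID] = future
        s.lock.Unlock()

        err := fn()

        // cancel and delete under the same lock so waiters never observe a
        // cancelled-but-still-mapped future
        s.lock.Lock()
        futureDone()
        delete(s.futures, pluginID)
        s.lock.Unlock()
        return err
    }
}

func main() {
    s := &perPluginSerializer{
        shutdownCtx: context.Background(),
        futures:     map[string]context.Context{},
    }
    var wg sync.WaitGroup
    for i := 0; i < 2; i++ {
        wg.Add(1)
        go func(i int) {
            defer wg.Done()
            _ = s.serialized("plugin1", func() error {
                time.Sleep(50 * time.Millisecond)
                fmt.Println("call", i, "finished at", time.Now().Format(time.StampMilli))
                return nil
            })
        }(i)
    }
    wg.Wait()
}
```

The property that matters (and the reason the real code avoids a `defer`) is that the future is cancelled and removed from the map under the same lock, so a waiter woken by `Done()` never re-reads a stale entry.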
```diff
+            delete(v.srv.volumeControllerFutures, pluginID)
+            v.srv.volumeControllerLock.Unlock()
+
+            return err
+        } else {
+            v.srv.volumeControllerLock.Unlock()
+
+            select {
+            case <-future.Done():
+                continue
+            case <-v.srv.shutdownCh:
+                // The csi_hook publish workflow on the client will retry if it
+                // gets this error. On unpublish, we don't want to block client
+                // shutdown so we give up on error. The new leader's
+                // volumewatcher will iterate all the claims at startup to
+                // detect this and mop up any claims in the NodeDetached state
+                // (volume GC will run periodically as well)
+                return structs.ErrNoLeader
```
Comment: Worth wrapping this with any more info about where it came from, or does that happen upstream of here?

Reply: The caller's logger will wrap it sufficiently. "No leader" is fairly foundational and there's nothing we can meaningfully add.
```diff
+            }
+        }
+    }
+}
+
 // allowCSIMount is called on Job register to check mount permission
 func allowCSIMount(aclObj *acl.ACL, namespace string) bool {
     return aclObj.AllowPluginRead() &&

@@ -863,8 +917,11 @@ func (v *CSIVolume) controllerUnpublishVolume(vol *structs.CSIVolume, claim *str
         Secrets: vol.Secrets,
     }
     req.PluginID = vol.PluginID
-    err = v.srv.RPC("ClientCSI.ControllerDetachVolume", req,
-        &cstructs.ClientCSIControllerDetachVolumeResponse{})
+
+    err = v.serializedControllerRPC(vol.PluginID, func() error {
+        return v.srv.RPC("ClientCSI.ControllerDetachVolume", req,
+            &cstructs.ClientCSIControllerDetachVolumeResponse{})
+    })
     if err != nil {
         return fmt.Errorf("could not detach from controller: %v", err)
     }

@@ -1139,7 +1196,9 @@ func (v *CSIVolume) deleteVolume(vol *structs.CSIVolume, plugin *structs.CSIPlug
     cReq.PluginID = plugin.ID
     cResp := &cstructs.ClientCSIControllerDeleteVolumeResponse{}

-    return v.srv.RPC(method, cReq, cResp)
+    return v.serializedControllerRPC(plugin.ID, func() error {
+        return v.srv.RPC(method, cReq, cResp)
+    })
 }

 func (v *CSIVolume) ListExternal(args *structs.CSIVolumeExternalListRequest, reply *structs.CSIVolumeExternalListResponse) error {

@@ -1286,7 +1345,9 @@ func (v *CSIVolume) CreateSnapshot(args *structs.CSISnapshotCreateRequest, reply
         }
         cReq.PluginID = pluginID
         cResp := &cstructs.ClientCSIControllerCreateSnapshotResponse{}
-        err = v.srv.RPC(method, cReq, cResp)
+        err = v.serializedControllerRPC(pluginID, func() error {
+            return v.srv.RPC(method, cReq, cResp)
+        })
         if err != nil {
             multierror.Append(&mErr, fmt.Errorf("could not create snapshot: %v", err))
             continue

@@ -1360,7 +1421,9 @@ func (v *CSIVolume) DeleteSnapshot(args *structs.CSISnapshotDeleteRequest, reply
         cReq := &cstructs.ClientCSIControllerDeleteSnapshotRequest{ID: snap.ID}
         cReq.PluginID = plugin.ID
         cResp := &cstructs.ClientCSIControllerDeleteSnapshotResponse{}
-        err = v.srv.RPC(method, cReq, cResp)
+        err = v.serializedControllerRPC(plugin.ID, func() error {
+            return v.srv.RPC(method, cReq, cResp)
+        })
         if err != nil {
             multierror.Append(&mErr, fmt.Errorf("could not delete %q: %v", snap.ID, err))
         }
```
```diff
@@ -6,6 +6,7 @@ package nomad
 import (
     "fmt"
     "strings"
+    "sync"
     "testing"
     "time"

@@ -21,6 +22,7 @@
     cconfig "github.com/hashicorp/nomad/client/config"
     cstructs "github.com/hashicorp/nomad/client/structs"
     "github.com/hashicorp/nomad/helper/uuid"
+    "github.com/hashicorp/nomad/lib/lang"
     "github.com/hashicorp/nomad/nomad/mock"
     "github.com/hashicorp/nomad/nomad/state"
     "github.com/hashicorp/nomad/nomad/structs"

@@ -1971,3 +1973,49 @@ func TestCSI_RPCVolumeAndPluginLookup(t *testing.T) {
     require.Nil(t, vol)
     require.EqualError(t, err, fmt.Sprintf("volume not found: %s", id2))
 }
+
+func TestCSI_SerializedControllerRPC(t *testing.T) {
```
Comment: I've done some end-to-end testing of this new function with reschedules, but I want to follow up with some additional testing of drains, broken plugin configurations, etc., to make sure that everything is recoverable even in the corner cases.
```diff
+    ci.Parallel(t)
+
+    srv, shutdown := TestServer(t, func(c *Config) { c.NumSchedulers = 0 })
+    defer shutdown()
+    testutil.WaitForLeader(t, srv.RPC)
+
+    var wg sync.WaitGroup
+    wg.Add(3)
+
+    timeCh := make(chan lang.Pair[string, time.Duration])
+
+    testFn := func(pluginID string, dur time.Duration) {
+        defer wg.Done()
+        c := NewCSIVolumeEndpoint(srv, nil)
+        now := time.Now()
+        err := c.serializedControllerRPC(pluginID, func() error {
+            time.Sleep(dur)
+            return nil
+        })
+        elapsed := time.Since(now)
+        timeCh <- lang.Pair[string, time.Duration]{pluginID, elapsed}
+        must.NoError(t, err)
+    }
+
+    go testFn("plugin1", 50*time.Millisecond)
+    go testFn("plugin2", 50*time.Millisecond)
+    go testFn("plugin1", 50*time.Millisecond)
+
+    totals := map[string]time.Duration{}
+    for i := 0; i < 3; i++ {
+        pair := <-timeCh
+        totals[pair.First] += pair.Second
+    }
+
+    wg.Wait()
+
+    // plugin1 RPCs should block each other
+    must.GreaterEq(t, 150*time.Millisecond, totals["plugin1"])
```
Comment: I see the test passes, but how does `GreaterEq` get us to at least 150ms here?

Reply: The two plugin1 RPCs serialize: the first takes ~50ms and the second waits behind it, so its elapsed time is ~100ms, which puts the plugin1 total at 150ms or more. The maximum value of 200ms is assuming that any overhead is (far) less than 50ms, so that asserts that we're not accidentally waiting behind the plugin2 RPC as well.
```diff
+    must.Less(t, 200*time.Millisecond, totals["plugin1"])
+
+    // plugin1 RPCs should not block plugin2 RPCs
+    must.GreaterEq(t, 50*time.Millisecond, totals["plugin2"])
+    must.Less(t, 100*time.Millisecond, totals["plugin2"])
+}
```
Comment: I think we're good, but sharing some of my thought process here: for this to work, all such RPCs need to be run on a single server so they can block in memory. Looking at what I think are the relevant RPCs, it seems like they are all(?) `v.srv.forward()`ed to the leader. I suppose this is what you meant by "with an allowance for losing state (ex. leadership transitions)", i.e. this serialization could be broken by switching leaders, and we can't reasonably avoid that caveat.

Reply: Right, that's exactly it.
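For context on that forwarding step: Nomad RPC handlers conventionally start by proxying the request to the current leader, which is why a single in-memory map can serialize these controller RPCs at all. The sketch below is a from-memory illustration of that pattern, not code from this PR; the exact `forward` signature and the `CSIVolume.Claim` handler body shown here are assumptions.

```go
// Illustrative sketch (not from this diff): the leader-forwarding step that
// Nomad RPC handlers, including the CSI volume endpoints, typically begin with.
func (v *CSIVolume) Claim(args *structs.CSIVolumeClaimRequest, reply *structs.CSIVolumeClaimResponse) error {
    // forward reports done=true when the request was proxied to the leader (or
    // errored), so follower servers return here and never run the body below.
    if done, err := v.srv.forward("CSIVolume.Claim", args, args, reply); done {
        return err
    }

    // Leader-only work continues here, eventually reaching
    // serializedControllerRPC. Because only the leader executes this path, the
    // per-plugin futures map on that one server is enough to serialize the
    // controller RPCs; a leadership transition starts over with an empty map,
    // which is the "losing state" caveat discussed in the thread above.
    // ...
    return nil
}
```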