From 91f8126b96bbd6c6b8ff6c6aeeaece009bc5404e Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 9 Mar 2021 10:34:07 -0500 Subject: [PATCH] scheduler/csi: fix early return when multiple volumes are requested When multiple CSI volumes are requested, the feasibility check could return early for read/write volumes with free claims, even if a later volume in the request was not feasible for any other reason (including not existing at all). This can result in random failure to fail feasibility checking, depending on how the map of volumes was being ordered at runtime. Remove the early return from the feasibility check. Add a test to verify that missing volumes in the map will cause a failure; this test will not catch a regression every test run because of the random map ordering, but any failure will be caught over the course of several CI runs. --- CHANGELOG.md | 1 + scheduler/feasible.go | 18 +++++++++--------- scheduler/feasible_test.go | 17 +++++++++++++++++ 3 files changed, 27 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b89de8fc94..5b53b54abb7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,7 @@ BUG FIXES: * cli: Fixed a bug where non-int proxy port would panic CLI [[GH-10072](https://github.com/hashicorp/nomad/issues/10072)] * cli: Fixed a bug where `nomad operator debug` incorrectly parsed https Consul API URLs. [[GH-10082](https://github.com/hashicorp/nomad/pull/10082)] * client: Fixed log formatting when killing tasks. [[GH-10135](https://github.com/hashicorp/nomad/issues/10135)] + * scheduler: Fixed a bug where jobs requesting multiple CSI volumes could be incorrectly scheduled if only one of the volumes passed feasibility checking. [[GH-10143](https://github.com/hashicorp/nomad/issues/10143)] * ui: Fixed the rendering of interstitial components shown after processing a dynamic application sizing recommendation. [[GH-10094](https://github.com/hashicorp/nomad/pull/10094)] IMPROVEMENTS: diff --git a/scheduler/feasible.go b/scheduler/feasible.go index 3ab25292db7..399ab457e86 100644 --- a/scheduler/feasible.go +++ b/scheduler/feasible.go @@ -312,15 +312,15 @@ func (c *CSIVolumeChecker) hasPlugins(n *structs.Node) (bool, string) { if !vol.WriteSchedulable() { return false, fmt.Sprintf(FilterConstraintCSIVolumeNoWriteTemplate, vol.ID) } - if vol.WriteFreeClaims() { - return true, "" - } - - // Check the blocking allocations to see if they belong to this job - for id := range vol.WriteAllocs { - a, err := c.ctx.State().AllocByID(ws, id) - if err != nil || a == nil || a.Namespace != c.namespace || a.JobID != c.jobID { - return false, fmt.Sprintf(FilterConstraintCSIVolumeInUseTemplate, vol.ID) + if !vol.WriteFreeClaims() { + // Check the blocking allocations to see if they belong to this job + for id := range vol.WriteAllocs { + a, err := c.ctx.State().AllocByID(ws, id) + if err != nil || a == nil || + a.Namespace != c.namespace || a.JobID != c.jobID { + return false, fmt.Sprintf( + FilterConstraintCSIVolumeInUseTemplate, vol.ID) + } } } } diff --git a/scheduler/feasible_test.go b/scheduler/feasible_test.go index 9caaa72cf81..0e9ec8d7880 100644 --- a/scheduler/feasible_test.go +++ b/scheduler/feasible_test.go @@ -395,6 +395,23 @@ func TestCSIVolumeChecker(t *testing.T) { t.Fatalf("case(%d) failed: got %v; want %v", i, act, c.Result) } } + + // add a missing volume + volumes["missing"] = &structs.VolumeRequest{ + Type: "csi", + Name: "bar", + Source: "does-not-exist", + } + + checker = NewCSIVolumeChecker(ctx) + checker.SetNamespace(structs.DefaultNamespace) + + for _, node := range nodes { + checker.SetVolumes(volumes) + act := checker.Feasible(node) + require.False(t, act, "request with missing volume should never be feasible") + } + } func TestNetworkChecker(t *testing.T) {