From 50438d85bdc026711646e6d3ed86fd3b8cef6e2c Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 19 Oct 2021 20:22:03 -0400 Subject: [PATCH] Fix preemption panic (#11346) Fix a bug where the scheduler may panic when preemption is enabled. The conditions are a bit complicated: A job with higher priority that schedule multiple allocations that preempt other multiple allocations on the same node, due to port/network/device assignments. The cause of the bug is incidental mutation of internal cached data. `RankedNode` computes and cache proposed allocations in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L42-L53 . But scheduler then mutates the list to remove pre-emptable allocs in https://github.com/hashicorp/nomad/blob/v1.1.6/scheduler/rank.go#L293-L294, and `RemoveAllocs` mutates and sets the tail of cached slice with `nil`s triggering a nil-pointer derefencing case. I fixed the issue by avoiding the mutation in `RemoveAllocs` - the micro-optimization there doesn't seem necessary. Fixes https://github.com/hashicorp/nomad/issues/11342 --- nomad/structs/funcs.go | 19 +++-- scheduler/preemption_test.go | 130 +++++++++++++++++++++++++++++++++++ 2 files changed, 139 insertions(+), 10 deletions(-) diff --git a/nomad/structs/funcs.go b/nomad/structs/funcs.go index 50e55980552..f9ea6f652a4 100644 --- a/nomad/structs/funcs.go +++ b/nomad/structs/funcs.go @@ -44,24 +44,23 @@ func warningsFormatter(es []error) string { // RemoveAllocs is used to remove any allocs with the given IDs // from the list of allocations -func RemoveAllocs(alloc []*Allocation, remove []*Allocation) []*Allocation { +func RemoveAllocs(allocs []*Allocation, remove []*Allocation) []*Allocation { + if len(remove) == 0 { + return allocs + } // Convert remove into a set removeSet := make(map[string]struct{}) for _, remove := range remove { removeSet[remove.ID] = struct{}{} } - n := len(alloc) - for i := 0; i < n; i++ { - if _, ok := removeSet[alloc[i].ID]; ok { - alloc[i], alloc[n-1] = alloc[n-1], nil - i-- - n-- + r := make([]*Allocation, 0, len(allocs)) + for _, alloc := range allocs { + if _, ok := removeSet[alloc.ID]; !ok { + r = append(r, alloc) } } - - alloc = alloc[:n] - return alloc + return r } // FilterTerminalAllocs filters out all allocations in a terminal state and diff --git a/scheduler/preemption_test.go b/scheduler/preemption_test.go index ab2084ff317..0c37844980e 100644 --- a/scheduler/preemption_test.go +++ b/scheduler/preemption_test.go @@ -1381,6 +1381,136 @@ func TestPreemption(t *testing.T) { } } +// TestPreemptionMultiple tests evicting multiple allocations in the same time +func TestPreemptionMultiple(t *testing.T) { + // The test setup: + // * a node with 4 GPUs + // * a low priority job with 4 allocs, each is using 1 GPU + // + // Then schedule a high priority job needing 2 allocs, using 2 GPUs each. + // Expectation: + // All low priority allocs should preempted to accomodate the high priority job + h := NewHarness(t) + + // node with 4 GPUs + node := mock.Node() + node.NodeResources = &structs.NodeResources{ + Cpu: structs.NodeCpuResources{ + CpuShares: 4000, + }, + Memory: structs.NodeMemoryResources{ + MemoryMB: 8192, + }, + Disk: structs.NodeDiskResources{ + DiskMB: 100 * 1024, + }, + Networks: []*structs.NetworkResource{ + { + Device: "eth0", + CIDR: "192.168.0.100/32", + MBits: 1000, + }, + }, + Devices: []*structs.NodeDeviceResource{ + { + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + Attributes: map[string]*psstructs.Attribute{ + "memory": psstructs.NewIntAttribute(11, psstructs.UnitGiB), + "cuda_cores": psstructs.NewIntAttribute(3584, ""), + "graphics_clock": psstructs.NewIntAttribute(1480, psstructs.UnitMHz), + "memory_bandwidth": psstructs.NewIntAttribute(11, psstructs.UnitGBPerS), + }, + Instances: []*structs.NodeDevice{ + { + ID: "dev0", + Healthy: true, + }, + { + ID: "dev1", + Healthy: true, + }, + { + ID: "dev2", + Healthy: true, + }, + { + ID: "dev3", + Healthy: true, + }, + }, + }, + }, + } + + require.NoError(t, h.State.UpsertNode(structs.MsgTypeTestSetup, h.NextIndex(), node)) + + // low priority job with 4 allocs using all 4 GPUs + lowPrioJob := mock.Job() + lowPrioJob.Priority = 5 + lowPrioJob.TaskGroups[0].Count = 4 + lowPrioJob.TaskGroups[0].Networks = nil + lowPrioJob.TaskGroups[0].Tasks[0].Services = nil + lowPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil + lowPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{ + Name: "gpu", + Count: 1, + }} + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), lowPrioJob)) + + allocs := []*structs.Allocation{} + allocIDs := map[string]struct{}{} + for i := 0; i < 4; i++ { + alloc := createAllocWithDevice(uuid.Generate(), lowPrioJob, lowPrioJob.TaskGroups[0].Tasks[0].Resources, &structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{fmt.Sprintf("dev%d", i)}, + }) + alloc.NodeID = node.ID + + allocs = append(allocs, alloc) + allocIDs[alloc.ID] = struct{}{} + } + require.NoError(t, h.State.UpsertAllocs(structs.MsgTypeTestSetup, h.NextIndex(), allocs)) + + // new high priority job with 2 allocs, each using 2 GPUs + highPrioJob := mock.Job() + highPrioJob.Priority = 100 + highPrioJob.TaskGroups[0].Count = 2 + highPrioJob.TaskGroups[0].Networks = nil + highPrioJob.TaskGroups[0].Tasks[0].Services = nil + highPrioJob.TaskGroups[0].Tasks[0].Resources.Networks = nil + highPrioJob.TaskGroups[0].Tasks[0].Resources.Devices = structs.ResourceDevices{{ + Name: "gpu", + Count: 2, + }} + require.NoError(t, h.State.UpsertJob(structs.MsgTypeTestSetup, h.NextIndex(), highPrioJob)) + + // schedule + eval := &structs.Evaluation{ + Namespace: structs.DefaultNamespace, + ID: uuid.Generate(), + Priority: highPrioJob.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: highPrioJob.ID, + Status: structs.EvalStatusPending, + } + require.NoError(t, h.State.UpsertEvals(structs.MsgTypeTestSetup, h.NextIndex(), []*structs.Evaluation{eval})) + + // Process the evaluation + require.NoError(t, h.Process(NewServiceScheduler, eval)) + require.Len(t, h.Plans, 1) + require.Contains(t, h.Plans[0].NodePreemptions, node.ID) + + preempted := map[string]struct{}{} + for _, alloc := range h.Plans[0].NodePreemptions[node.ID] { + preempted[alloc.ID] = struct{}{} + } + require.Equal(t, allocIDs, preempted) +} + // helper method to create allocations with given jobs and resources func createAlloc(id string, job *structs.Job, resource *structs.Resources) *structs.Allocation { return createAllocInner(id, job, resource, nil, nil)