From 53dac68400c8dd4d0e8edaa6fc0b609cf089cad8 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 21 Apr 2020 08:56:05 -0400 Subject: [PATCH 1/3] test for allocated devices on job in-place update When an alloc is updated in-place, test that the allocated devices are preserved in the new alloc struct. --- scheduler/generic_sched_test.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index 0223acebb12..deb6ddecd60 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -2072,6 +2072,15 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) { require.NoError(t, h.State.UpsertJob(h.NextIndex(), job)) require.NoError(t, h.State.UpsertDeployment(h.NextIndex(), d)) + taskName := job.TaskGroups[0].Tasks[0].Name + + adr := structs.AllocatedDeviceResource{ + Type: "gpu", + Vendor: "nvidia", + Name: "1080ti", + DeviceIDs: []string{uuid.Generate()}, + } + // Create allocs that are part of the old deployment var allocs []*structs.Allocation for i := 0; i < 10; i++ { @@ -2082,6 +2091,7 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) { alloc.Name = fmt.Sprintf("my-job.web[%d]", i) alloc.DeploymentID = d.ID alloc.DeploymentStatus = &structs.AllocDeploymentStatus{Healthy: helper.BoolToPtr(true)} + alloc.AllocatedResources.Tasks[taskName].Devices = []*structs.AllocatedDeviceResource{&adr} allocs = append(allocs, alloc) } require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs)) @@ -2155,13 +2165,16 @@ func TestServiceSched_JobModify_InPlace(t *testing.T) { } h.AssertEvalStatus(t, structs.EvalStatusComplete) - // Verify the network did not change + // Verify the allocated networks and devices did not change rp := structs.Port{Label: "admin", Value: 5000} for _, alloc := range out { - for _, resources := range alloc.TaskResources { + for _, resources := range alloc.AllocatedResources.Tasks { if resources.Networks[0].ReservedPorts[0] != 
rp { t.Fatalf("bad: %#v", alloc) } + if len(resources.Devices) == 0 || reflect.DeepEqual(resources.Devices[0], adr) { + t.Fatalf("bad devices has changed: %#v", alloc) + } } } From 04000c9ba9f184d5458deb201b9db51a9a1f7366 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 21 Apr 2020 08:57:15 -0400 Subject: [PATCH 2/3] Ensure that alloc updates preserve device offers When an alloc is updated in-place, ensure that the allocated devices are preserved and carried over to the new alloc. --- scheduler/util.go | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/scheduler/util.go b/scheduler/util.go index b0a3ed7c79c..ee18e30007f 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -614,22 +614,25 @@ func inplaceUpdate(ctx Context, eval *structs.Evaluation, job *structs.Job, continue } - // Restore the network offers from the existing allocation. + // Restore the network and device offers from the existing allocation. // We do not allow network resources (reserved/dynamic ports) // to be updated. This is guarded in taskUpdated, so we can // safely restore those here. for task, resources := range option.TaskResources { var networks structs.Networks + var devices []*structs.AllocatedDeviceResource if update.Alloc.AllocatedResources != nil { if tr, ok := update.Alloc.AllocatedResources.Tasks[task]; ok { networks = tr.Networks + devices = tr.Devices } } else if tr, ok := update.Alloc.TaskResources[task]; ok { networks = tr.Networks } - // Add thhe networks back + // Add the networks and devices back resources.Networks = networks + resources.Devices = devices } // Create a shallow copy @@ -892,15 +895,17 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy return false, true, nil } - // Restore the network offers from the existing allocation. + // Restore the network and device offers from the existing allocation. // We do not allow network resources (reserved/dynamic ports) // to be updated. 
This is guarded in taskUpdated, so we can // safely restore those here. for task, resources := range option.TaskResources { var networks structs.Networks + var devices []*structs.AllocatedDeviceResource if existing.AllocatedResources != nil { if tr, ok := existing.AllocatedResources.Tasks[task]; ok { networks = tr.Networks + devices = tr.Devices } } else if tr, ok := existing.TaskResources[task]; ok { networks = tr.Networks @@ -908,6 +913,7 @@ func genericAllocUpdateFn(ctx Context, stack Stack, evalID string) allocUpdateTy // Add the networks back resources.Networks = networks + resources.Devices = devices } // Create a shallow copy From dacb63448906d20ca07ac5dda9539f91ac1937cc Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 21 Apr 2020 09:27:40 -0400 Subject: [PATCH 3/3] add changelog [ci skip] --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 77197d1ee13..8d37ef797ed 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,7 @@ BUG FIXES: * connect: Fixed a bug where some connect proxy fields would be dropped from 'job inspect' output [[GH-7397](https://github.com/hashicorp/nomad/issues/7397)] * connect: Fixed a bug where an absent connect sidecar_service stanza would trigger panic [[GH-7683](https://github.com/hashicorp/nomad/pull/7683)] * connect: Fixed bugs where some connect parameters would be ignored [[GH-7690](https://github.com/hashicorp/nomad/pull/7690)] [[GH-7684](https://github.com/hashicorp/nomad/pull/7684)] + * scheduler: Fixed a bug in managing allocated devices for a job allocation in in-place update scenarios [[GH-7762](https://github.com/hashicorp/nomad/issues/7762)] ## 0.11.0 (April 8, 2020)