From 5328f3c7e292a376ed2f00bb4d8458fad9dfb3be Mon Sep 17 00:00:00 2001
From: Michael Schurter
Date: Mon, 25 Apr 2022 17:05:08 -0700
Subject: [PATCH] client: fix waiting on preempted alloc

Fixes #10200

**The bug**

A user reported receiving the following error when an alloc was placed
that needed to preempt existing allocs:

```
[ERROR] client.alloc_watcher: error querying previous alloc: alloc_id=28... previous_alloc=8e... error="rpc error: alloc lookup failed: index error: UUID must be 36 characters"
```

The previous alloc (8e) was already complete on the client. This is
possible if an alloc stops *after* the scheduling decision was made to
preempt it, but *before* the node running both allocations was able to
pull and start the preemptor. While that is hopefully a narrow window
of time, you can expect it to occur in systems with heavy high-throughput
batch scheduling.

However, the RPC error made no sense! `previous_alloc` in the logs was a
valid 36 character UUID!

**The fix**

The fix is:

```
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
```

The alloc watcher constructor used for preemption improperly referenced
Alloc.PreviousAllocation instead of the watchedAllocID passed in by the
caller. When multiple allocs are preempted, a watcher is created for each
with watchedAllocID set properly by the caller. In this case
Alloc.PreviousAllocation="" -- which is where the `UUID must be 36
characters` error was coming from! Sadly the log line *did* correctly
reference watchedAllocID, which is why the error looked so nonsensical!
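To make that concrete, here is a rough sketch of how the preemption
caller is expected to wire things up. This is illustrative only: the
exact shape of Config.PreemptedRunners is assumed (a map from preempted
alloc ID to its local runner meta, nil when the alloc is remote or
already gone); the point is that every watcher receives its own
watchedAllocID while c.Alloc.PreviousAllocation stays "".

```
// Illustrative sketch, not the actual caller: build one watcher per
// preempted alloc, keyed by that alloc's ID. Assumes c.PreemptedRunners
// is a map from preempted alloc ID to AllocRunnerMeta (nil for allocs
// that are remote or already terminated).
watchers := make([]PrevAllocWatcher, 0, len(c.PreemptedRunners))
for watchedAllocID, runnerMeta := range c.PreemptedRunners {
	// It is watchedAllocID -- not c.Alloc.PreviousAllocation -- that
	// identifies the alloc being waited on.
	watchers = append(watchers, newWatcherForAlloc(c, watchedAllocID, runnerMeta))
}
```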
**The repro**

I was able to reproduce this with a dev agent with [preemption enabled](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-hcl) and [lowered limits](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-limits-hcl) for ease of repro.

First I started a [low priority count 3 job](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-lo-nomad), then a [high priority job](https://gist.github.com/schmichael/53f79cbd898afdfab76865ad8c7fc6a0#file-preempt-hi-nomad) that preempts 2 of the low priority allocs. Everything worked as expected.

However if I force it to use the [remotePrevAlloc implementation](https://github.com/hashicorp/nomad/blob/v1.3.0-beta.1/client/allocwatcher/alloc_watcher.go#L147), it reproduces the bug because the watcher references PreviousAllocation instead of watchedAllocID.
---
 .changelog/12779.txt                 |  3 +++
 client/allocwatcher/alloc_watcher.go | 23 ++++++++++++++++-------
 2 files changed, 19 insertions(+), 7 deletions(-)
 create mode 100644 .changelog/12779.txt

diff --git a/.changelog/12779.txt b/.changelog/12779.txt
new file mode 100644
index 00000000000..b0d1a3d89e3
--- /dev/null
+++ b/.changelog/12779.txt
@@ -0,0 +1,3 @@
+```release-note:bug
+client: Fixed a bug that could prevent a preempting alloc from ever starting.
+```
diff --git a/client/allocwatcher/alloc_watcher.go b/client/allocwatcher/alloc_watcher.go
index 5cb1dd75fb6..7c4c7d2fa80 100644
--- a/client/allocwatcher/alloc_watcher.go
+++ b/client/allocwatcher/alloc_watcher.go
@@ -102,7 +102,7 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	migrate := tg.EphemeralDisk != nil && tg.EphemeralDisk.Migrate
 
 	if m != nil {
-		// Local Allocation because there's no meta
+		// Local Allocation because there's an alloc runner
 		return &localPrevAlloc{
 			allocID:      c.Alloc.ID,
 			prevAllocID:  watchedAllocID,
@@ -117,7 +117,7 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 
 	return &remotePrevAlloc{
 		allocID:      c.Alloc.ID,
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
 		tasks:        tasks,
 		config:       c.Config,
 		migrate:      migrate,
@@ -127,11 +127,17 @@ func newMigratorForAlloc(c Config, tg *structs.TaskGroup, watchedAllocID string,
 	}
 }
 
+// newWatcherForAlloc uses a local or rpc-based watcher depending on whether
+// AllocRunnerMeta is nil or not.
+//
+// Note that c.Alloc.PreviousAllocation must NOT be used in this func as this
+// func is used for preemption, which has a distinct field. The caller is
+// responsible for passing the allocation to be watched as watchedAllocID.
 func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) PrevAllocWatcher {
 	logger := c.Logger.Named("alloc_watcher").With("alloc_id", c.Alloc.ID).With("previous_alloc", watchedAllocID)
 
 	if m != nil {
-		// Local Allocation because there's no meta
+		// Local Allocation because there's an alloc runner
 		return &localPrevAlloc{
 			allocID:      c.Alloc.ID,
 			prevAllocID:  watchedAllocID,
@@ -144,7 +150,7 @@ func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) Prev
 
 	return &remotePrevAlloc{
 		allocID:      c.Alloc.ID,
-		prevAllocID:  c.Alloc.PreviousAllocation,
+		prevAllocID:  watchedAllocID,
 		config:       c.Config,
 		rpc:          c.RPC,
 		migrateToken: c.MigrateToken,
@@ -152,9 +158,12 @@ func newWatcherForAlloc(c Config, watchedAllocID string, m AllocRunnerMeta) Prev
 	}
 }
 
-// NewAllocWatcher creates a PrevAllocWatcher appropriate for whether this
-// alloc's previous allocation was local or remote. If this alloc has no
-// previous alloc then a noop implementation is returned.
+// NewAllocWatcher creates a PrevAllocWatcher if either PreviousAllocation or
+// PreemptedRunners is set. If any of the allocs to watch have local runners,
+// wait for them to terminate directly.
+// For allocs which are either running on another node or have already
+// terminated their alloc runners, use a remote backend which watches the alloc
+// status via rpc.
 func NewAllocWatcher(c Config) (PrevAllocWatcher, PrevAllocMigrator) {
 	if c.Alloc.PreviousAllocation == "" && c.PreemptedRunners == nil {
 		return NoopPrevAlloc{}, NoopPrevAlloc{}