From 18621799be5db9ec0ba5ded3e52c80dbc5d95844 Mon Sep 17 00:00:00 2001 From: Mahmood Ali Date: Tue, 20 Jul 2021 15:13:28 -0400 Subject: [PATCH] client: avoid acting on stale data after launch (#10907) When the client launches, use a consistent read to fetch its own allocs, but allow stale reads afterwards as long as reads don't revert to an older state. This change addresses an edge case affecting restarting clients. When a client restarts, it may fetch stale data concerning its allocs: allocs that completed prior to the client shutdown may still have a "run/running" desired/client status, causing the client to attempt to run them again. An alternative approach is to track the indices such that the client sets MinQueryIndex to the maximum index the client ever saw, or to compare received allocs against locally restored client state. Garbage collection complicates this approach (local knowledge is not complete), and the approach still risks starting "dead" allocations (e.g. the allocation may have been placed when the client just restarted and have already been rescheduled by the time the client started). The approach here is effective against all kinds of staleness problems with small overhead. 
--- .changelog/10907.txt | 4 ++++ client/client.go | 14 +++++++++++--- 2 files changed, 15 insertions(+), 3 deletions(-) create mode 100644 .changelog/10907.txt diff --git a/.changelog/10907.txt b/.changelog/10907.txt new file mode 100644 index 00000000000..a709b2dc445 --- /dev/null +++ b/.changelog/10907.txt @@ -0,0 +1,4 @@ +```release-note:bug +client: Fixed a bug where a restarted client may start an already completed tasks in rare conditions +``` + diff --git a/client/client.go b/client/client.go index 6252e950045..ee82f4da7e6 100644 --- a/client/client.go +++ b/client/client.go @@ -1980,8 +1980,15 @@ func (c *Client) watchAllocations(updates chan *allocUpdates) { NodeID: c.NodeID(), SecretID: c.secretNodeID(), QueryOptions: structs.QueryOptions{ - Region: c.Region(), - AllowStale: true, + Region: c.Region(), + + // Make a consistent read query when the client starts + // to avoid acting on stale data that predates this + // client state before a client restart. + // + // After the first request, only require monotonically + // increasing state. + AllowStale: false, }, } var resp structs.NodeClientAllocsResponse @@ -2131,7 +2138,8 @@ OUTER: c.logger.Debug("updated allocations", "index", resp.Index, "total", len(resp.Allocs), "pulled", len(allocsResp.Allocs), "filtered", len(filtered)) - // Update the query index. + // After the first request, only require monotonically increasing state. + req.AllowStale = true if resp.Index > req.MinQueryIndex { req.MinQueryIndex = resp.Index }