From d77dc01b68e3a6b7260ce0fc2a0cb408e72245ac Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Tue, 9 Feb 2021 11:34:07 -0500 Subject: [PATCH 1/4] deployments: comment clarifications and more logging --- nomad/deploymentwatcher/deployment_watcher.go | 14 +++++++++----- nomad/structs/structs.go | 8 +++++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go index 02d698848c4..3286fb165da 100644 --- a/nomad/deploymentwatcher/deployment_watcher.go +++ b/nomad/deploymentwatcher/deployment_watcher.go @@ -427,10 +427,13 @@ FAIL: // This is the successful case, and we stop the loop return case <-deadlineTimer.C: - // We have hit the progress deadline so fail the deployment. We need - // to determine whether we should roll back the job by inspecting - // which allocs as part of the deployment are healthy and which - // aren't. + // We have hit the progress deadline, so fail the deployment + // unless we're waiting for manual promotion. We need to determine + // whether we should roll back the job by inspecting which allocs + // as part of the deployment are healthy and which aren't. The + // deadlineHit flag is never reset, so even in the case of a + // manual promotion, we'll describe any failure as a progress + // deadline failure at this point. deadlineHit = true fail, rback, err := w.shouldFail() if err != nil { @@ -475,6 +478,7 @@ FAIL: // to avoid resetting, causing a deployment failure. if !next.IsZero() { deadlineTimer.Reset(time.Until(next)) + w.logger.Trace("resetting deadline") } } @@ -674,7 +678,7 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) { } // Unhealthy allocs and we need to autorevert - return true, true, nil + return fail, true, nil } return fail, false, nil diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 17b2c31083c..8e574cee8a8 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -8776,11 +8776,13 @@ type DeploymentState struct { AutoPromote bool // ProgressDeadline is the deadline by which an allocation must transition - // to healthy before the deployment is considered failed. + // to healthy before the deployment is considered failed. This value is set + // by the jobspec `update.progress_deadline` field. ProgressDeadline time.Duration - // RequireProgressBy is the time by which an allocation must transition - // to healthy before the deployment is considered failed. + // RequireProgressBy is the time by which an allocation must transition to + // healthy before the deployment is considered failed. This value is reset + // to "now" + ProgressDeadline when an allocation updates the deployment. RequireProgressBy time.Time // Promoted marks whether the canaries have been promoted From ce97bc3f8cf88dc4f18e119e8221e38e112bfea2 Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Thu, 11 Feb 2021 16:42:58 -0500 Subject: [PATCH 2/4] deploymentwatcher: failing test Test demonstrating a bug with canary deployments when there are multiple groups. If group A's canary becomes healthy before group B's, the deadline for the overall deployment will be set to that of group A. When the deployment is promoted, if group A is done it will not contribute to the next deadline cutoff. Group B's old deadline will be used instead, which will be in the past and immediately trigger a deployment progress failure. 
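As a rough sketch of the failure mode (illustrative only; groupState and
nextCutoff below are invented names for this example, not the watcher's real
identifiers), the deadline behavior described above amounts to picking the
latest per-group cutoff among groups that still have work to do, so once the
finished group drops out only the stale cutoff remains:

    // Illustrative sketch of the behavior described above; groupState and
    // nextCutoff are invented names, not the watcher's real identifiers.
    package main

    import (
        "fmt"
        "time"
    )

    type groupState struct {
        done              bool      // group has finished its deployment work
        requireProgressBy time.Time // per-group progress cutoff
    }

    // nextCutoff returns the latest cutoff among groups that still have
    // work to do; groups that are done no longer contribute.
    func nextCutoff(groups map[string]groupState) time.Time {
        var next time.Time
        for _, g := range groups {
            if g.done || g.requireProgressBy.IsZero() {
                continue
            }
            if next.IsZero() || g.requireProgressBy.After(next) {
                next = g.requireProgressBy
            }
        }
        return next
    }

    func main() {
        now := time.Now()
        groups := map[string]groupState{
            // group A's canary became healthy, its cutoff was pushed into
            // the future, and the group is now done, so it is excluded.
            "A": {done: true, requireProgressBy: now.Add(10 * time.Minute)},
            // group B's cutoff was set before the late manual promotion and
            // was never reset, so it is already in the past.
            "B": {done: false, requireProgressBy: now.Add(-time.Minute)},
        }
        // Without resetting RequireProgressBy on promotion, the next
        // deadline is in the past and the deployment fails immediately.
        fmt.Println("deadline already passed:", nextCutoff(groups).Before(now))
    }

The follow-up commit addresses this by resetting each group's
RequireProgressBy when the deployment is promoted.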
--- .../deployments_watcher_test.go | 206 ++++++++++++++++++ 1 file changed, 206 insertions(+) diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go index 8777a398f05..71ab0f8a205 100644 --- a/nomad/deploymentwatcher/deployments_watcher_test.go +++ b/nomad/deploymentwatcher/deployments_watcher_test.go @@ -1334,6 +1334,212 @@ func TestDeploymentWatcher_PromotedCanary_UpdatedAllocs(t *testing.T) { }) } +func TestDeploymentWatcher_ProgressDeadline_LatePromote(t *testing.T) { + t.Parallel() + require := require.New(t) + mtype := structs.MsgTypeTestSetup + + w, m := defaultTestDeploymentWatcher(t) + w.SetEnabled(true, m.state) + + m.On("UpdateDeploymentStatus", mocker.MatchedBy(func(args *structs.DeploymentStatusUpdateRequest) bool { + return true + })).Return(nil).Maybe() + + progressTimeout := time.Millisecond * 1000 + j := mock.Job() + j.TaskGroups[0].Name = "group1" + j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy() + j.TaskGroups[0].Update.MaxParallel = 2 + j.TaskGroups[0].Update.AutoRevert = false + j.TaskGroups[0].Update.ProgressDeadline = progressTimeout + j.TaskGroups = append(j.TaskGroups, j.TaskGroups[0].Copy()) + j.TaskGroups[0].Name = "group2" + + d := mock.Deployment() + d.JobID = j.ID + d.TaskGroups = map[string]*structs.DeploymentState{ + "group1": { + ProgressDeadline: progressTimeout, + Promoted: false, + PlacedCanaries: []string{}, + DesiredCanaries: 1, + DesiredTotal: 3, + PlacedAllocs: 0, + HealthyAllocs: 0, + UnhealthyAllocs: 0, + }, + "group2": { + ProgressDeadline: progressTimeout, + Promoted: false, + PlacedCanaries: []string{}, + DesiredCanaries: 1, + DesiredTotal: 1, + PlacedAllocs: 0, + HealthyAllocs: 0, + UnhealthyAllocs: 0, + }, + } + + require.NoError(m.state.UpsertJob(mtype, m.nextIndex(), j)) + require.NoError(m.state.UpsertDeployment(m.nextIndex(), d)) + + // require that we get a call to UpsertDeploymentPromotion + matchConfig := &matchDeploymentPromoteRequestConfig{ + Promotion: &structs.DeploymentPromoteRequest{ + DeploymentID: d.ID, + All: true, + }, + Eval: true, + } + matcher := matchDeploymentPromoteRequest(matchConfig) + m.On("UpdateDeploymentPromotion", mocker.MatchedBy(matcher)).Return(nil) + m1 := matchUpdateAllocDesiredTransitions([]string{d.ID}) + m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil) + + // create canaries + + now := time.Now() + + canary1 := mock.Alloc() + canary1.Job = j + canary1.DeploymentID = d.ID + canary1.TaskGroup = "group1" + canary1.DesiredStatus = structs.AllocDesiredStatusRun + canary1.ModifyTime = now.UnixNano() + + canary2 := mock.Alloc() + canary2.Job = j + canary2.DeploymentID = d.ID + canary2.TaskGroup = "group2" + canary2.DesiredStatus = structs.AllocDesiredStatusRun + canary2.ModifyTime = now.UnixNano() + + allocs := []*structs.Allocation{canary1, canary2} + err := m.state.UpsertAllocs(mtype, m.nextIndex(), allocs) + require.NoError(err) + + // 2nd group's canary becomes healthy + + now = time.Now() + + canary2 = canary2.Copy() + canary2.ModifyTime = now.UnixNano() + canary2.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: true, + Healthy: helper.BoolToPtr(true), + Timestamp: now, + } + + allocs = []*structs.Allocation{canary2} + err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs) + require.NoError(err) + + // wait for long enough to ensure we read deployment update channel + // this sleep creates the race condition associated with #7058 + time.Sleep(50 * time.Millisecond) + + // 1st group's 
canary becomes healthy + now = time.Now() + + canary1 = canary1.Copy() + canary1.ModifyTime = now.UnixNano() + canary1.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: true, + Healthy: helper.BoolToPtr(true), + Timestamp: now, + } + + allocs = []*structs.Allocation{canary1} + err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs) + require.NoError(err) + + // ensure progress_deadline has definitely expired + time.Sleep(progressTimeout) + + // promote the deployment + + req := &structs.DeploymentPromoteRequest{ + DeploymentID: d.ID, + All: true, + } + err = w.PromoteDeployment(req, &structs.DeploymentUpdateResponse{}) + require.NoError(err) + + // wait for long enough to ensure we read deployment update channel + time.Sleep(50 * time.Millisecond) + + // create new allocs for promoted deployment + // (these come from plan_apply, not a client update) + now = time.Now() + + alloc1a := mock.Alloc() + alloc1a.Job = j + alloc1a.DeploymentID = d.ID + alloc1a.TaskGroup = "group1" + alloc1a.ClientStatus = structs.AllocClientStatusPending + alloc1a.DesiredStatus = structs.AllocDesiredStatusRun + alloc1a.ModifyTime = now.UnixNano() + + alloc1b := mock.Alloc() + alloc1b.Job = j + alloc1b.DeploymentID = d.ID + alloc1b.TaskGroup = "group1" + alloc1b.ClientStatus = structs.AllocClientStatusPending + alloc1b.DesiredStatus = structs.AllocDesiredStatusRun + alloc1b.ModifyTime = now.UnixNano() + + allocs = []*structs.Allocation{alloc1a, alloc1b} + err = m.state.UpsertAllocs(mtype, m.nextIndex(), allocs) + require.NoError(err) + + // allocs become healthy + + now = time.Now() + + alloc1a = alloc1a.Copy() + alloc1a.ClientStatus = structs.AllocClientStatusRunning + alloc1a.ModifyTime = now.UnixNano() + alloc1a.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: false, + Healthy: helper.BoolToPtr(true), + Timestamp: now, + } + + alloc1b = alloc1b.Copy() + alloc1b.ClientStatus = structs.AllocClientStatusRunning + alloc1b.ModifyTime = now.UnixNano() + alloc1b.DeploymentStatus = &structs.AllocDeploymentStatus{ + Canary: false, + Healthy: helper.BoolToPtr(true), + Timestamp: now, + } + + allocs = []*structs.Allocation{alloc1a, alloc1b} + err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs) + require.NoError(err) + + // ensure any progress deadline has expired + time.Sleep(progressTimeout) + + // without a scheduler running we'll never mark the deployment as + // successful, so test that healthy == desired and that we haven't failed + deployment, err := m.state.DeploymentByID(nil, d.ID) + require.NoError(err) + require.Equal(structs.DeploymentStatusRunning, deployment.Status) + + group1 := deployment.TaskGroups["group1"] + + require.Equal(group1.DesiredTotal, group1.HealthyAllocs, "not enough healthy") + require.Equal(group1.DesiredTotal, group1.PlacedAllocs, "not enough placed") + require.Equal(0, group1.UnhealthyAllocs) + + group2 := deployment.TaskGroups["group2"] + require.Equal(group2.DesiredTotal, group2.HealthyAllocs, "not enough healthy") + require.Equal(group2.DesiredTotal, group2.PlacedAllocs, "not enough placed") + require.Equal(0, group2.UnhealthyAllocs) +} + // Test scenario where deployment initially has no progress deadline // After the deployment is updated, a failed alloc's DesiredTransition should be set func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) { From a4544bf5078742e314717a64e87b3f4b9244862f Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 17 Feb 2021 11:51:19 -0500 Subject: [PATCH 3/4] deploymentwatcher: reset 
progress deadline on promotion --- nomad/state/state_store.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go index 19f088ed778..2317631e2cb 100644 --- a/nomad/state/state_store.go +++ b/nomad/state/state_store.go @@ -4046,6 +4046,10 @@ func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, inde continue } + // reset the progress deadline + if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() { + status.RequireProgressBy = time.Now().Add(status.ProgressDeadline) + } status.Promoted = true } From 80e28a5c1772a1b5f25c43d85880c4448cf342fa Mon Sep 17 00:00:00 2001 From: Tim Gross Date: Wed, 17 Feb 2021 14:03:06 -0500 Subject: [PATCH 4/4] docs updates and changelog entry --- CHANGELOG.md | 1 + .../content/docs/job-specification/update.mdx | 21 +++++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 5aae22c0a24..a5835f6ac8e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -23,6 +23,7 @@ BUG FIXES: * consul/connect: Fixed a bug preventing more than one connect gateway per Nomad client [[GH-9849](https://github.com/hashicorp/nomad/pull/9849)] * consul/connect: Fixed a bug where connect sidecar services would be re-registered unnecessarily. [[GH-10059](https://github.com/hashicorp/nomad/pull/10059)] * consul/connect: Fixed a bug where the sidecar health checks would fail if `host_network` was defined. [[GH-9975](https://github.com/hashicorp/nomad/issues/9975)] + * deployments: Fixed a bug where deployments with multiple task groups and manual promotion would fail if promoted after the progress deadline. [[GH-10042](https://github.com/hashicorp/nomad/issues/10042)] * drivers/docker: Fixed a bug preventing multiple ports to be mapped to the same container port [[GH-9951](https://github.com/hashicorp/nomad/issues/9951)] * driver/qemu: Fixed a bug where network namespaces were not supported for QEMU workloads [[GH-9861](https://github.com/hashicorp/nomad/pull/9861)] * nomad/structs: Fixed a bug where static ports with the same value but different `host_network` were invalid [[GH-9946](https://github.com/hashicorp/nomad/issues/9946)] diff --git a/website/content/docs/job-specification/update.mdx b/website/content/docs/job-specification/update.mdx index 2f316afd1a3..74a5bd9146f 100644 --- a/website/content/docs/job-specification/update.mdx +++ b/website/content/docs/job-specification/update.mdx @@ -83,20 +83,23 @@ a future release. - `progress_deadline` `(string: "10m")` - Specifies the deadline in which an allocation must be marked as healthy. The deadline begins when the first allocation for the deployment is created and is reset whenever an allocation - as part of the deployment transitions to a healthy state. If no allocation - transitions to the healthy state before the progress deadline, the deployment - is marked as failed. If the `progress_deadline` is set to `0`, the first - allocation to be marked as unhealthy causes the deployment to fail. This is - specified using a label suffix like "2m" or "1h". + as part of the deployment transitions to a healthy state or when a + deployment is manually promoted. If no allocation transitions to the healthy + state before the progress deadline, the deployment is marked as failed. If + the `progress_deadline` is set to `0`, the first allocation to be marked as + unhealthy causes the deployment to fail. 
This is specified using a label + suffix like "2m" or "1h". - `auto_revert` `(bool: false)` - Specifies if the job should auto-revert to the last stable job on deployment failure. A job is marked as stable if all the allocations as part of its deployment were marked healthy. -- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to the - canary version when all canaries become healthy during a deployment. Defaults to - false which means canaries must be manually updated with the `nomad deployment promote` - command. +- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to + the canary version when all canaries become healthy during a + deployment. Defaults to false which means canaries must be manually updated + with the `nomad deployment promote` command. If a job has multiple task + groups, all must be set to `auto_promote = true` in order for the deployment + to be promoted automatically. - `canary` `(int: 0)` - Specifies that changes to the job that would result in destructive updates should create the specified number of canaries without