diff --git a/CHANGELOG.md b/CHANGELOG.md
index 5aae22c0a24..a5835f6ac8e 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -23,6 +23,7 @@ BUG FIXES:
  * consul/connect: Fixed a bug preventing more than one connect gateway per Nomad client [[GH-9849](https://github.com/hashicorp/nomad/pull/9849)]
  * consul/connect: Fixed a bug where connect sidecar services would be re-registered unnecessarily. [[GH-10059](https://github.com/hashicorp/nomad/pull/10059)]
  * consul/connect: Fixed a bug where the sidecar health checks would fail if `host_network` was defined. [[GH-9975](https://github.com/hashicorp/nomad/issues/9975)]
+ * deployments: Fixed a bug where deployments with multiple task groups and manual promotion would fail if promoted after the progress deadline. [[GH-10042](https://github.com/hashicorp/nomad/issues/10042)]
  * drivers/docker: Fixed a bug preventing multiple ports to be mapped to the same container port [[GH-9951](https://github.com/hashicorp/nomad/issues/9951)]
  * driver/qemu: Fixed a bug where network namespaces were not supported for QEMU workloads [[GH-9861](https://github.com/hashicorp/nomad/pull/9861)]
  * nomad/structs: Fixed a bug where static ports with the same value but different `host_network` were invalid [[GH-9946](https://github.com/hashicorp/nomad/issues/9946)]
diff --git a/nomad/deploymentwatcher/deployment_watcher.go b/nomad/deploymentwatcher/deployment_watcher.go
index 02d698848c4..3286fb165da 100644
--- a/nomad/deploymentwatcher/deployment_watcher.go
+++ b/nomad/deploymentwatcher/deployment_watcher.go
@@ -427,10 +427,13 @@ FAIL:
 			// This is the successful case, and we stop the loop
 			return
 		case <-deadlineTimer.C:
-			// We have hit the progress deadline so fail the deployment. We need
-			// to determine whether we should roll back the job by inspecting
-			// which allocs as part of the deployment are healthy and which
-			// aren't.
+			// We have hit the progress deadline, so fail the deployment
+			// unless we're waiting for manual promotion. We need to determine
+			// whether we should roll back the job by inspecting which allocs
+			// as part of the deployment are healthy and which aren't. The
+			// deadlineHit flag is never reset, so even in the case of a
+			// manual promotion, we'll describe any failure as a progress
+			// deadline failure at this point.
 			deadlineHit = true
 			fail, rback, err := w.shouldFail()
 			if err != nil {
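An aside before the second hunk: the timer discipline above is the crux of the bug, so here is a minimal, self-contained sketch of it. `watchProgress`, `progressCh`, and `done` are illustrative stand-ins rather than the watcher's real API; only the `time.Timer` pattern mirrors the code.

```go
package main

import (
	"fmt"
	"time"
)

// watchProgress sketches the deadline-timer loop: the timer fires once at the
// progress deadline, and each piece of progress (a healthy allocation, or,
// after this patch, a manual promotion) pushes the deadline into the future.
func watchProgress(requireProgressBy time.Time, progressCh <-chan time.Time, done <-chan struct{}) (failed bool) {
	deadlineTimer := time.NewTimer(time.Until(requireProgressBy))
	defer deadlineTimer.Stop()
	for {
		select {
		case <-deadlineTimer.C:
			// Deadline hit: the real watcher asks shouldFail() whether to
			// fail the deployment and whether to roll back.
			return true
		case next := <-progressCh:
			// Mirrors the hunk below: only reset to a real, non-zero deadline.
			if !next.IsZero() {
				deadlineTimer.Reset(time.Until(next))
			}
		case <-done:
			return false
		}
	}
}

func main() {
	progress := make(chan time.Time)
	done := make(chan struct{})
	go func() {
		progress <- time.Now().Add(500 * time.Millisecond) // progress lands before the 100ms deadline
		time.Sleep(300 * time.Millisecond)
		close(done) // deployment completes inside the extended deadline
	}()
	fmt.Println("failed:", watchProgress(time.Now().Add(100*time.Millisecond), progress, done))
}
```

Before this patch, a manual promotion never fed progress back into this timer, so promoting after the deadline raced a timer that had already fired.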
@@ -475,6 +478,7 @@ FAIL:
 		// to avoid resetting, causing a deployment failure.
 		if !next.IsZero() {
 			deadlineTimer.Reset(time.Until(next))
+			w.logger.Trace("resetting deadline")
 		}
 	}

@@ -674,7 +678,7 @@ func (w *deploymentWatcher) shouldFail() (fail, rollback bool, err error) {
 		}

 		// Unhealthy allocs and we need to autorevert
-		return true, true, nil
+		return fail, true, nil
 	}

 	return fail, false, nil
diff --git a/nomad/deploymentwatcher/deployments_watcher_test.go b/nomad/deploymentwatcher/deployments_watcher_test.go
index 8777a398f05..71ab0f8a205 100644
--- a/nomad/deploymentwatcher/deployments_watcher_test.go
+++ b/nomad/deploymentwatcher/deployments_watcher_test.go
@@ -1334,6 +1334,212 @@ func TestDeploymentWatcher_PromotedCanary_UpdatedAllocs(t *testing.T) {
 	})
 }

+func TestDeploymentWatcher_ProgressDeadline_LatePromote(t *testing.T) {
+	t.Parallel()
+	require := require.New(t)
+	mtype := structs.MsgTypeTestSetup
+
+	w, m := defaultTestDeploymentWatcher(t)
+	w.SetEnabled(true, m.state)
+
+	m.On("UpdateDeploymentStatus", mocker.MatchedBy(func(args *structs.DeploymentStatusUpdateRequest) bool {
+		return true
+	})).Return(nil).Maybe()
+
+	progressTimeout := time.Millisecond * 1000
+	j := mock.Job()
+	j.TaskGroups[0].Name = "group1"
+	j.TaskGroups[0].Update = structs.DefaultUpdateStrategy.Copy()
+	j.TaskGroups[0].Update.MaxParallel = 2
+	j.TaskGroups[0].Update.AutoRevert = false
+	j.TaskGroups[0].Update.ProgressDeadline = progressTimeout
+	j.TaskGroups = append(j.TaskGroups, j.TaskGroups[0].Copy())
+	j.TaskGroups[1].Name = "group2"
+
+	d := mock.Deployment()
+	d.JobID = j.ID
+	d.TaskGroups = map[string]*structs.DeploymentState{
+		"group1": {
+			ProgressDeadline: progressTimeout,
+			Promoted:         false,
+			PlacedCanaries:   []string{},
+			DesiredCanaries:  1,
+			DesiredTotal:     3,
+			PlacedAllocs:     0,
+			HealthyAllocs:    0,
+			UnhealthyAllocs:  0,
+		},
+		"group2": {
+			ProgressDeadline: progressTimeout,
+			Promoted:         false,
+			PlacedCanaries:   []string{},
+			DesiredCanaries:  1,
+			DesiredTotal:     1,
+			PlacedAllocs:     0,
+			HealthyAllocs:    0,
+			UnhealthyAllocs:  0,
+		},
+	}
+
+	require.NoError(m.state.UpsertJob(mtype, m.nextIndex(), j))
+	require.NoError(m.state.UpsertDeployment(m.nextIndex(), d))
+
+	// require that we get a call to UpsertDeploymentPromotion
+	matchConfig := &matchDeploymentPromoteRequestConfig{
+		Promotion: &structs.DeploymentPromoteRequest{
+			DeploymentID: d.ID,
+			All:          true,
+		},
+		Eval: true,
+	}
+	matcher := matchDeploymentPromoteRequest(matchConfig)
+	m.On("UpdateDeploymentPromotion", mocker.MatchedBy(matcher)).Return(nil)
+	m1 := matchUpdateAllocDesiredTransitions([]string{d.ID})
+	m.On("UpdateAllocDesiredTransition", mocker.MatchedBy(m1)).Return(nil)
+
+	// create canaries
+
+	now := time.Now()
+
+	canary1 := mock.Alloc()
+	canary1.Job = j
+	canary1.DeploymentID = d.ID
+	canary1.TaskGroup = "group1"
+	canary1.DesiredStatus = structs.AllocDesiredStatusRun
+	canary1.ModifyTime = now.UnixNano()
+
+	canary2 := mock.Alloc()
+	canary2.Job = j
+	canary2.DeploymentID = d.ID
+	canary2.TaskGroup = "group2"
+	canary2.DesiredStatus = structs.AllocDesiredStatusRun
+	canary2.ModifyTime = now.UnixNano()
+
+	allocs := []*structs.Allocation{canary1, canary2}
+	err := m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
+	require.NoError(err)
+
+	// 2nd group's canary becomes healthy
+
+	now = time.Now()
+
+	canary2 = canary2.Copy()
+	canary2.ModifyTime = now.UnixNano()
+	canary2.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary:    true,
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: now,
+	}
+
+	allocs = []*structs.Allocation{canary2}
+	err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
+	require.NoError(err)
+
+	// wait for long enough to ensure we read deployment update channel
+	// this sleep creates the race condition associated with #7058
+	time.Sleep(50 * time.Millisecond)
+
+	// 1st group's canary becomes healthy
+	now = time.Now()
+
+	canary1 = canary1.Copy()
+	canary1.ModifyTime = now.UnixNano()
+	canary1.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary:    true,
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: now,
+	}
+
+	allocs = []*structs.Allocation{canary1}
+	err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
+	require.NoError(err)
+
+	// ensure progress_deadline has definitely expired
+	time.Sleep(progressTimeout)
+
+	// promote the deployment
+
+	req := &structs.DeploymentPromoteRequest{
+		DeploymentID: d.ID,
+		All:          true,
+	}
+	err = w.PromoteDeployment(req, &structs.DeploymentUpdateResponse{})
+	require.NoError(err)
+
+	// wait for long enough to ensure we read deployment update channel
+	time.Sleep(50 * time.Millisecond)
+
+	// create new allocs for promoted deployment
+	// (these come from plan_apply, not a client update)
+	now = time.Now()
+
+	alloc1a := mock.Alloc()
+	alloc1a.Job = j
+	alloc1a.DeploymentID = d.ID
+	alloc1a.TaskGroup = "group1"
+	alloc1a.ClientStatus = structs.AllocClientStatusPending
+	alloc1a.DesiredStatus = structs.AllocDesiredStatusRun
+	alloc1a.ModifyTime = now.UnixNano()
+
+	alloc1b := mock.Alloc()
+	alloc1b.Job = j
+	alloc1b.DeploymentID = d.ID
+	alloc1b.TaskGroup = "group1"
+	alloc1b.ClientStatus = structs.AllocClientStatusPending
+	alloc1b.DesiredStatus = structs.AllocDesiredStatusRun
+	alloc1b.ModifyTime = now.UnixNano()
+
+	allocs = []*structs.Allocation{alloc1a, alloc1b}
+	err = m.state.UpsertAllocs(mtype, m.nextIndex(), allocs)
+	require.NoError(err)
+
+	// allocs become healthy
+
+	now = time.Now()
+
+	alloc1a = alloc1a.Copy()
+	alloc1a.ClientStatus = structs.AllocClientStatusRunning
+	alloc1a.ModifyTime = now.UnixNano()
+	alloc1a.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary:    false,
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: now,
+	}
+
+	alloc1b = alloc1b.Copy()
+	alloc1b.ClientStatus = structs.AllocClientStatusRunning
+	alloc1b.ModifyTime = now.UnixNano()
+	alloc1b.DeploymentStatus = &structs.AllocDeploymentStatus{
+		Canary:    false,
+		Healthy:   helper.BoolToPtr(true),
+		Timestamp: now,
+	}
+
+	allocs = []*structs.Allocation{alloc1a, alloc1b}
+	err = m.state.UpdateAllocsFromClient(mtype, m.nextIndex(), allocs)
+	require.NoError(err)
+
+	// ensure any progress deadline has expired
+	time.Sleep(progressTimeout)
+
+	// without a scheduler running we'll never mark the deployment as
+	// successful, so test that healthy == desired and that we haven't failed
+	deployment, err := m.state.DeploymentByID(nil, d.ID)
+	require.NoError(err)
+	require.Equal(structs.DeploymentStatusRunning, deployment.Status)
+
+	group1 := deployment.TaskGroups["group1"]
+
+	require.Equal(group1.DesiredTotal, group1.HealthyAllocs, "not enough healthy")
+	require.Equal(group1.DesiredTotal, group1.PlacedAllocs, "not enough placed")
+	require.Equal(0, group1.UnhealthyAllocs)
+
+	group2 := deployment.TaskGroups["group2"]
+	require.Equal(group2.DesiredTotal, group2.HealthyAllocs, "not enough healthy")
+	require.Equal(group2.DesiredTotal, group2.PlacedAllocs, "not enough placed")
+	require.Equal(0, group2.UnhealthyAllocs)
+}
+
 // Test scenario where deployment initially has no progress deadline
 // After the deployment is updated, a failed alloc's DesiredTransition should be set
 func TestDeploymentWatcher_Watch_StartWithoutProgressDeadline(t *testing.T) {
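The state store hunk below is the heart of the fix: a manual promotion now counts as progress. A standalone sketch of the same reset, where `deploymentState` is a trimmed, illustrative stand-in for `structs.DeploymentState`:

```go
package main

import (
	"fmt"
	"time"
)

// deploymentState carries only the two fields involved in the reset.
type deploymentState struct {
	ProgressDeadline  time.Duration
	RequireProgressBy time.Time
}

// resetOnPromote is what UpdateDeploymentPromotion now does for each task
// group: the deadline moves to now + ProgressDeadline instead of staying at
// its possibly-expired value. Groups that never armed a deadline (zero
// ProgressDeadline or zero RequireProgressBy) are left untouched.
func resetOnPromote(s *deploymentState, now time.Time) {
	if s.ProgressDeadline > 0 && !s.RequireProgressBy.IsZero() {
		s.RequireProgressBy = now.Add(s.ProgressDeadline)
	}
}

func main() {
	s := &deploymentState{
		ProgressDeadline:  10 * time.Minute,
		RequireProgressBy: time.Now().Add(-time.Minute), // already expired
	}
	resetOnPromote(s, time.Now())
	fmt.Println("promote after expiry, new deadline:", s.RequireProgressBy)
}
```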
diff --git a/nomad/state/state_store.go b/nomad/state/state_store.go
index 19f088ed778..2317631e2cb 100644
--- a/nomad/state/state_store.go
+++ b/nomad/state/state_store.go
@@ -4046,6 +4046,10 @@ func (s *StateStore) UpdateDeploymentPromotion(msgType structs.MessageType, inde
 			continue
 		}

+		// reset the progress deadline
+		if status.ProgressDeadline > 0 && !status.RequireProgressBy.IsZero() {
+			status.RequireProgressBy = time.Now().Add(status.ProgressDeadline)
+		}
 		status.Promoted = true
 	}

diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go
index 17b2c31083c..8e574cee8a8 100644
--- a/nomad/structs/structs.go
+++ b/nomad/structs/structs.go
@@ -8776,11 +8776,13 @@ type DeploymentState struct {
 	AutoPromote bool

 	// ProgressDeadline is the deadline by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// to healthy before the deployment is considered failed. This value is set
+	// by the jobspec `update.progress_deadline` field.
 	ProgressDeadline time.Duration

-	// RequireProgressBy is the time by which an allocation must transition
-	// to healthy before the deployment is considered failed.
+	// RequireProgressBy is the time by which an allocation must transition to
+	// healthy before the deployment is considered failed. This value is reset
+	// to "now" + ProgressDeadline when an allocation updates the deployment.
 	RequireProgressBy time.Time

 	// Promoted marks whether the canaries have been promoted
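Bridging to the docs diff below, a rough sketch of how the documented `update` fields land on `structs.UpdateStrategy`. The field names come from the struct and the test above; the helper function and the values are illustrative only:

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/nomad/structs"
)

// updateStrategyForDocsExample builds the in-memory equivalent of the jobspec
// update{} block described in the docs below (illustrative values).
func updateStrategyForDocsExample() *structs.UpdateStrategy {
	upd := structs.DefaultUpdateStrategy.Copy()
	upd.MaxParallel = 2
	upd.Canary = 1                          // jobspec canary = 1
	upd.ProgressDeadline = 10 * time.Minute // jobspec progress_deadline = "10m" (the default)
	upd.AutoRevert = false                  // do not roll back on failure
	upd.AutoPromote = false                 // wait for `nomad deployment promote`
	return upd
}

func main() {
	fmt.Printf("%+v\n", updateStrategyForDocsExample())
}
```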
diff --git a/website/content/docs/job-specification/update.mdx b/website/content/docs/job-specification/update.mdx
index 2f316afd1a3..74a5bd9146f 100644
--- a/website/content/docs/job-specification/update.mdx
+++ b/website/content/docs/job-specification/update.mdx
@@ -83,20 +83,23 @@ a future release.

 - `progress_deadline` `(string: "10m")` - Specifies the deadline in which an
   allocation must be marked as healthy. The deadline begins when the first
   allocation for the deployment is created and is reset whenever an allocation
-  as part of the deployment transitions to a healthy state. If no allocation
-  transitions to the healthy state before the progress deadline, the deployment
-  is marked as failed. If the `progress_deadline` is set to `0`, the first
-  allocation to be marked as unhealthy causes the deployment to fail. This is
-  specified using a label suffix like "2m" or "1h".
+  as part of the deployment transitions to a healthy state or when a
+  deployment is manually promoted. If no allocation transitions to the healthy
+  state before the progress deadline, the deployment is marked as failed. If
+  the `progress_deadline` is set to `0`, the first allocation to be marked as
+  unhealthy causes the deployment to fail. This is specified using a label
+  suffix like "2m" or "1h".

 - `auto_revert` `(bool: false)` - Specifies if the job should auto-revert to the
   last stable job on deployment failure. A job is marked as stable if all the
   allocations as part of its deployment were marked healthy.

-- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to the
-  canary version when all canaries become healthy during a deployment. Defaults to
-  false which means canaries must be manually updated with the `nomad deployment promote`
-  command.
+- `auto_promote` `(bool: false)` - Specifies if the job should auto-promote to
+  the canary version when all canaries become healthy during a
+  deployment. Defaults to false, which means canaries must be manually
+  promoted with the `nomad deployment promote` command. If a job has multiple
+  task groups, all must be set to `auto_promote = true` in order for the
+  deployment to be promoted automatically.

 - `canary` `(int: 0)` - Specifies that changes to the job that would result in
   destructive updates should create the specified number of canaries without
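To make the documented `progress_deadline` semantics concrete, a small illustrative check (not Nomad source): a zero deadline fails on the first unhealthy allocation, while a non-zero deadline fails only once it elapses with no progress.

```go
package main

import (
	"fmt"
	"time"
)

// deploymentFailed models the documented semantics: with a zero deadline the
// first unhealthy allocation fails the deployment; otherwise the deployment
// fails only once the deadline passes without any new healthy allocation.
func deploymentFailed(progressDeadline time.Duration, unhealthyAllocs int, lastProgress, now time.Time) bool {
	if progressDeadline == 0 {
		return unhealthyAllocs > 0
	}
	return now.After(lastProgress.Add(progressDeadline))
}

func main() {
	now := time.Now()
	lastProgress := now.Add(-15 * time.Minute)

	fmt.Println(deploymentFailed(0, 1, lastProgress, now))              // true: zero deadline, one unhealthy alloc
	fmt.Println(deploymentFailed(10*time.Minute, 0, lastProgress, now)) // true: 15m without progress exceeds 10m
	fmt.Println(deploymentFailed(20*time.Minute, 0, lastProgress, now)) // false: still inside the deadline
}
```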