Skip to content

Commit

Permalink
Add the ability to disable rescheduling on lost. Implements issue #10366
Browse files Browse the repository at this point in the history
  • Loading branch information
DominicLavery authored and Juanadelacuesta committed Nov 8, 2023
1 parent 9d075c4 commit 72ad585
Show file tree
Hide file tree
Showing 15 changed files with 231 additions and 58 deletions.
3 changes: 3 additions & 0 deletions .changelog/16867.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:feature
**Reschedule on Lost**: Adds the ability to prevent tasks on down nodes from being rescheduled
```
28 changes: 17 additions & 11 deletions api/jobs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -312,8 +312,9 @@ func TestJobs_Canonicalize(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: pointerOf(""),
Count: pointerOf(1),
Name: pointerOf(""),
Count: pointerOf(1),
RescheduleOnLost: pointerOf(true),
EphemeralDisk: &EphemeralDisk{
Sticky: pointerOf(false),
Migrate: pointerOf(false),
Expand Down Expand Up @@ -399,8 +400,9 @@ func TestJobs_Canonicalize(t *testing.T) {
JobModifyIndex: pointerOf(uint64(0)),
TaskGroups: []*TaskGroup{
{
Name: pointerOf(""),
Count: pointerOf(1),
Name: pointerOf(""),
Count: pointerOf(1),
RescheduleOnLost: pointerOf(true),
EphemeralDisk: &EphemeralDisk{
Sticky: pointerOf(false),
Migrate: pointerOf(false),
Expand Down Expand Up @@ -555,8 +557,9 @@ func TestJobs_Canonicalize(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: pointerOf("cache"),
Count: pointerOf(1),
Name: pointerOf("cache"),
Count: pointerOf(1),
RescheduleOnLost: pointerOf(true),
RestartPolicy: &RestartPolicy{
Interval: pointerOf(5 * time.Minute),
Attempts: pointerOf(10),
Expand Down Expand Up @@ -666,8 +669,9 @@ func TestJobs_Canonicalize(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: pointerOf("cache"),
Count: pointerOf(1),
Name: pointerOf("cache"),
Count: pointerOf(1),
RescheduleOnLost: pointerOf(true),
RestartPolicy: &RestartPolicy{
Interval: pointerOf(5 * time.Minute),
Attempts: pointerOf(10),
Expand Down Expand Up @@ -930,8 +934,9 @@ func TestJobs_Canonicalize(t *testing.T) {
},
TaskGroups: []*TaskGroup{
{
Name: pointerOf("bar"),
Count: pointerOf(1),
Name: pointerOf("bar"),
Count: pointerOf(1),
RescheduleOnLost: pointerOf(true),
EphemeralDisk: &EphemeralDisk{
Sticky: pointerOf(false),
Migrate: pointerOf(false),
Expand Down Expand Up @@ -1038,7 +1043,8 @@ func TestJobs_Canonicalize(t *testing.T) {
ParentID: pointerOf("lol"),
TaskGroups: []*TaskGroup{
{
Name: pointerOf("bar"),
Name: pointerOf("bar"),
RescheduleOnLost: pointerOf(true),
RestartPolicy: &RestartPolicy{
Delay: pointerOf(15 * time.Second),
Attempts: pointerOf(2),
Expand Down
5 changes: 4 additions & 1 deletion api/tasks.go
Original file line number Diff line number Diff line change
Expand Up @@ -459,6 +459,7 @@ type TaskGroup struct {
MaxClientDisconnect *time.Duration `mapstructure:"max_client_disconnect" hcl:"max_client_disconnect,optional"`
Scaling *ScalingPolicy `hcl:"scaling,block"`
Consul *Consul `hcl:"consul,block"`
RescheduleOnLost *bool `hcl:"reschedule_on_lost,optional"`
}

// NewTaskGroup creates a new TaskGroup.
Expand Down Expand Up @@ -577,7 +578,9 @@ func (g *TaskGroup) Canonicalize(job *Job) {
for _, s := range g.Services {
s.Canonicalize(nil, g, job)
}

if g.RescheduleOnLost == nil {
g.RescheduleOnLost = pointerOf(true)
}
}

// These needs to be in sync with DefaultServiceJobRestartPolicy in
Expand Down
6 changes: 6 additions & 0 deletions command/agent/job_endpoint.go
Original file line number Diff line number Diff line change
Expand Up @@ -1135,6 +1135,12 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta
RenderTemplates: *taskGroup.RestartPolicy.RenderTemplates,
}

if taskGroup.RescheduleOnLost == nil {
tg.RescheduleOnLost = true
} else {
tg.RescheduleOnLost = *taskGroup.RescheduleOnLost
}

if taskGroup.ShutdownDelay != nil {
tg.ShutdownDelay = taskGroup.ShutdownDelay
}
Expand Down
6 changes: 4 additions & 2 deletions command/agent/job_endpoint_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -3058,6 +3058,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
Operand: "z",
},
},
RescheduleOnLost: true,
Affinities: []*structs.Affinity{
{
LTarget: "x",
Expand Down Expand Up @@ -3552,8 +3553,9 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
},
TaskGroups: []*structs.TaskGroup{
{
Name: "group1",
Count: 5,
Name: "group1",
Count: 5,
RescheduleOnLost: true,
Constraints: []*structs.Constraint{
{
LTarget: "x",
Expand Down
23 changes: 17 additions & 6 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -629,8 +629,23 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time,
return false
}

// If the job is deleted all allocs can be removed
if job == nil {
return true
}

tg := job.LookupTaskGroup(a.TaskGroup)
if tg == nil {
return true
}

// Don't GC lost allocs when RescheduleOnLost is disabled
if !job.Stop && !tg.RescheduleOnLost && a.ClientStatus == structs.AllocClientStatusLost {
return false
}

// If the job is deleted, stopped or dead all allocs can be removed
if job == nil || job.Stop || job.Status == structs.JobStatusDead {
if job.Stop || job.Status == structs.JobStatusDead {
return true
}

Expand All @@ -647,12 +662,8 @@ func allocGCEligible(a *structs.Allocation, job *structs.Job, gcTime time.Time,
return true
}

var reschedulePolicy *structs.ReschedulePolicy
tg := job.LookupTaskGroup(a.TaskGroup)
reschedulePolicy := tg.ReschedulePolicy

if tg != nil {
reschedulePolicy = tg.ReschedulePolicy
}
// No reschedule policy or rescheduling is disabled
if reschedulePolicy == nil || (!reschedulePolicy.Unlimited && reschedulePolicy.Attempts == 0) {
return true
Expand Down
21 changes: 21 additions & 0 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1946,6 +1946,7 @@ func TestAllocation_GCEligible(t *testing.T) {
DesiredStatus string
JobStatus string
JobStop bool
RescheduleOnLost *bool
AllocJobModifyIndex uint64
JobModifyIndex uint64
ModifyIndex uint64
Expand Down Expand Up @@ -2120,6 +2121,23 @@ func TestAllocation_GCEligible(t *testing.T) {
JobStop: true,
ShouldGC: true,
},
{
Desc: "GC when alloc is lost and eligible for reschedule",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
GCTime: fail,
JobStatus: structs.JobStatusDead,
ShouldGC: true,
},
{
Desc: "Don't GC when alloc is lost and not being rescheduled",
ClientStatus: structs.AllocClientStatusLost,
DesiredStatus: structs.AllocDesiredStatusStop,
RescheduleOnLost: pointer.Of(false),
GCTime: fail,
JobStatus: structs.JobStatusDead,
ShouldGC: false,
},
{
Desc: "GC when job status is dead",
ClientStatus: structs.AllocClientStatusFailed,
Expand Down Expand Up @@ -2166,6 +2184,9 @@ func TestAllocation_GCEligible(t *testing.T) {
alloc.NextAllocation = tc.NextAllocID
job := mock.Job()
alloc.TaskGroup = job.TaskGroups[0].Name
if tc.RescheduleOnLost != nil {
job.TaskGroups[0].RescheduleOnLost = *tc.RescheduleOnLost
}
job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
if tc.JobStatus != "" {
job.Status = tc.JobStatus
Expand Down
5 changes: 3 additions & 2 deletions nomad/mock/job.go
Original file line number Diff line number Diff line change
Expand Up @@ -31,8 +31,9 @@ func Job() *structs.Job {
},
TaskGroups: []*structs.TaskGroup{
{
Name: "web",
Count: 10,
Name: "web",
Count: 10,
RescheduleOnLost: true,
Constraints: []*structs.Constraint{
{
LTarget: "${attr.consul.version}",
Expand Down
67 changes: 55 additions & 12 deletions nomad/structs/diff_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -1253,32 +1253,38 @@ func TestJobDiff(t *testing.T) {
Old: &Job{
TaskGroups: []*TaskGroup{
{
Name: "foo",
Count: 1,
Name: "foo",
Count: 1,
RescheduleOnLost: true,
},
{
Name: "bar",
Count: 1,
Name: "bar",
Count: 1,
RescheduleOnLost: false,
},
{
Name: "baz",
Count: 1,
Name: "baz",
Count: 1,
RescheduleOnLost: true,
},
},
},
New: &Job{
TaskGroups: []*TaskGroup{
{
Name: "bar",
Count: 1,
Name: "bar",
Count: 1,
RescheduleOnLost: false,
},
{
Name: "baz",
Count: 2,
Name: "baz",
Count: 2,
RescheduleOnLost: true,
},
{
Name: "bam",
Count: 1,
Name: "bam",
Count: 1,
RescheduleOnLost: true,
},
},
},
Expand All @@ -1295,6 +1301,12 @@ func TestJobDiff(t *testing.T) {
Old: "",
New: "1",
},
{
Type: DiffTypeAdded,
Name: "RescheduleOnLost",
Old: "",
New: "true",
},
},
},
{
Expand Down Expand Up @@ -1323,6 +1335,12 @@ func TestJobDiff(t *testing.T) {
Old: "1",
New: "",
},
{
Type: DiffTypeDeleted,
Name: "RescheduleOnLost",
Old: "true",
New: "",
},
},
},
},
Expand Down Expand Up @@ -1841,6 +1859,31 @@ func TestTaskGroupDiff(t *testing.T) {
},
},
},
{
TestCase: "Reschedule on lost diff",
Old: &TaskGroup{
Name: "foo",
Count: 100,
RescheduleOnLost: true,
},
New: &TaskGroup{
Name: "foo",
Count: 100,
RescheduleOnLost: false,
},
Expected: &TaskGroupDiff{
Type: DiffTypeEdited,
Name: "foo",
Fields: []*FieldDiff{
{
Type: DiffTypeEdited,
Name: "RescheduleOnLost",
Old: "true",
New: "false",
},
},
},
},
{
TestCase: "Map diff",
Old: &TaskGroup{
Expand Down
5 changes: 5 additions & 0 deletions nomad/structs/structs.go
Original file line number Diff line number Diff line change
Expand Up @@ -6635,6 +6635,11 @@ type TaskGroup struct {
// MaxClientDisconnect, if set, configures the client to allow placed
// allocations for tasks in this group to attempt to resume running without a restart.
MaxClientDisconnect *time.Duration

// RescheduleOnLost is used to control how allocations on disconnected
// nodes are handled. For backwards compatibility, it defaults to true.
// When true, such jobs are rescheduled.
RescheduleOnLost bool
}

func (tg *TaskGroup) Copy() *TaskGroup {
Expand Down
Loading

0 comments on commit 72ad585

Please sign in to comment.