Add a new parameter to avoid starting a replacement for lost allocs #19101

Merged · 50 commits · Dec 6, 2023
Commits
fcf9f1e
Add the ability to disable rescheduling on lost. Implements issue #10366
DominicLavery Apr 12, 2023
ccbdc9b
func: add lost allocs that won't be rescheduled to the count
Juanadelacuesta Nov 3, 2023
d9ea238
fix: update api tests to include new reschedule on lost option
Juanadelacuesta Nov 6, 2023
cd5c88c
func: mark non reschedule allocs as unknown
Juanadelacuesta Nov 6, 2023
fd1f54f
func: remove the garbage collect error
Juanadelacuesta Nov 6, 2023
b72b364
Update core_sched.go
Juanadelacuesta Nov 6, 2023
866d3d3
temp: recognize the lost alloc as reconnecting
Juanadelacuesta Nov 8, 2023
2a028d6
func: allow plans for lost nodes
Juanadelacuesta Nov 8, 2023
4a57aaf
style: rename option
Juanadelacuesta Nov 21, 2023
3d8f652
func: remove the changes to shouldFilter, unknown status should not b…
Juanadelacuesta Nov 22, 2023
967f840
func: remove the changes to shouldFilter, unknown status should not b…
Juanadelacuesta Nov 22, 2023
8d9a2ee
fix: add missing config to test for should filter
Juanadelacuesta Nov 22, 2023
c399f13
style: improve documentation of the new option
Juanadelacuesta Nov 22, 2023
7319000
fix: wrong logic in filterOutByClientStatus from multiple statuses ch…
Juanadelacuesta Nov 22, 2023
a2f7c4d
func: update tests for new feature
Juanadelacuesta Nov 22, 2023
d1c2d7c
func: update tests
Juanadelacuesta Nov 27, 2023
2f7d431
fix: verify the tainted node is not nil before checking the status
Juanadelacuesta Nov 27, 2023
85c9e78
fix: update test after inverting the logic of the new option
Juanadelacuesta Nov 27, 2023
749ca4c
func: add warning for single on lost while reschedule
Juanadelacuesta Nov 27, 2023
6cf3501
style: refactor filter by tainted
Juanadelacuesta Nov 27, 2023
072e5fd
style: linter fix
Juanadelacuesta Nov 27, 2023
b1bf777
func: raise an error when singleInstance and reschedule policy are both…
Juanadelacuesta Nov 27, 2023
f82a2b6
style: remove added new lines
Juanadelacuesta Nov 28, 2023
3ec11a1
style: rename new option
Juanadelacuesta Nov 28, 2023
e4ab2f5
Update scheduler/reconcile_test.go
Juanadelacuesta Nov 28, 2023
08cdc29
style: update documentation
Juanadelacuesta Nov 28, 2023
53d40e0
style: improve documentation, add test for don't GC when desired statu…
Juanadelacuesta Nov 28, 2023
2ef9263
func: expand tests for LostNode_AvoidRescheduleOnLost, add functional…
Juanadelacuesta Nov 29, 2023
cc5c95c
fix: update error message
Juanadelacuesta Nov 29, 2023
1111c35
style: replace switch for else/if block
Juanadelacuesta Nov 29, 2023
46038d3
style: linter fix
Juanadelacuesta Nov 29, 2023
18efe4c
fix: update tests results
Juanadelacuesta Nov 29, 2023
4e7a7f2
style: fix documentation
Juanadelacuesta Nov 29, 2023
9d09e7f
Update website/content/docs/job-specification/group.mdx
Juanadelacuesta Nov 29, 2023
e48c6c6
Update website/content/docs/job-specification/group.mdx
Juanadelacuesta Nov 30, 2023
90e5a6b
style: improve documentation
Juanadelacuesta Nov 30, 2023
d5998eb
Update scheduler/reconcile.go
Juanadelacuesta Nov 30, 2023
7b46d07
func: remove changes of the avoidreschedule policy from destructive u…
Juanadelacuesta Nov 30, 2023
8fda437
style: improve documentation
Juanadelacuesta Nov 30, 2023
9a3f1a6
Update website/content/docs/job-specification/group.mdx
Juanadelacuesta Dec 1, 2023
d0b01be
Update nomad/plan_apply.go
Juanadelacuesta Dec 1, 2023
2a3f8d1
Update nomad/structs/structs.go
Juanadelacuesta Dec 1, 2023
47e267b
style: rename option to prevent_reschedule_on_lost
Juanadelacuesta Dec 1, 2023
ab7ebf2
fix: update test
Juanadelacuesta Dec 4, 2023
17cb0bd
Update nomad/structs/structs.go
Juanadelacuesta Dec 6, 2023
a6d3ada
Update nomad/structs/structs.go
Juanadelacuesta Dec 6, 2023
9267748
Update scheduler/util_test.go
Juanadelacuesta Dec 6, 2023
ee08f4d
Update website/content/docs/job-specification/group.mdx
Juanadelacuesta Dec 6, 2023
85d20b3
Update website/content/docs/job-specification/group.mdx
Juanadelacuesta Dec 6, 2023
f97e280
func: add new section to documentation
Juanadelacuesta Dec 6, 2023
3 changes: 3 additions & 0 deletions .changelog/16867.txt

@@ -0,0 +1,3 @@
+```release-note:feature
+**Reschedule on Lost**: Adds the ability to prevent tasks on down nodes from being rescheduled
+```
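The final commits rename the group-level option to `prevent_reschedule_on_lost`, surfaced in the Go API as the `PreventRescheduleOnLost` pointer field added in `api/tasks.go` below. A minimal sketch of opting a group in from the consumer side, assuming the public `github.com/hashicorp/nomad/api` package (the `ptr` helper is local, since the package's own `pointerOf` is unexported):

```go
package main

import (
    "fmt"

    "github.com/hashicorp/nomad/api"
)

// ptr stands in for the api package's unexported pointerOf helper.
func ptr[T any](v T) *T { return &v }

func main() {
    // Keep this group's allocations in place (marked unknown) rather than
    // scheduling replacements when their node goes down.
    tg := api.NewTaskGroup("cache", 1)
    tg.PreventRescheduleOnLost = ptr(true)
    fmt.Println(*tg.PreventRescheduleOnLost) // true
}
```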
56 changes: 33 additions & 23 deletions api/jobs_test.go

@@ -312,8 +312,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf(""),
-            Count: pointerOf(1),
+            Name:                    pointerOf(""),
+            Count:                   pointerOf(1),
+            PreventRescheduleOnLost: pointerOf(false),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -399,8 +400,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         JobModifyIndex: pointerOf(uint64(0)),
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf(""),
-            Count: pointerOf(1),
+            Name:                    pointerOf(""),
+            Count:                   pointerOf(1),
+            PreventRescheduleOnLost: pointerOf(false),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -491,8 +493,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf("bar"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("bar"),
+            PreventRescheduleOnLost: pointerOf(false),
+            Count:                   pointerOf(1),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -555,8 +558,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf("cache"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("cache"),
+            Count:                   pointerOf(1),
+            PreventRescheduleOnLost: pointerOf(true),
             RestartPolicy: &RestartPolicy{
               Interval: pointerOf(5 * time.Minute),
               Attempts: pointerOf(10),
@@ -666,8 +670,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf("cache"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("cache"),
+            Count:                   pointerOf(1),
+            PreventRescheduleOnLost: pointerOf(true),
             RestartPolicy: &RestartPolicy{
               Interval: pointerOf(5 * time.Minute),
               Attempts: pointerOf(10),
@@ -795,7 +800,6 @@ func TestJobs_Canonicalize(t *testing.T) {
           },
         },
       },
-
     {
       name:  "periodic",
       input: &Job{
@@ -865,7 +869,8 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name: pointerOf("bar"),
+            Name:                    pointerOf("bar"),
+            PreventRescheduleOnLost: pointerOf(true),
             Consul: &Consul{
               Namespace: "",
             },
@@ -885,7 +890,8 @@ func TestJobs_Canonicalize(t *testing.T) {
             },
           },
           {
-            Name: pointerOf("baz"),
+            Name:                    pointerOf("baz"),
+            PreventRescheduleOnLost: pointerOf(false),
             Tasks: []*Task{
               {
                 Name: "task1",
@@ -930,8 +936,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf("bar"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("bar"),
+            Count:                   pointerOf(1),
+            PreventRescheduleOnLost: pointerOf(true),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -979,8 +986,9 @@ func TestJobs_Canonicalize(t *testing.T) {
             },
           },
          {
-            Name:  pointerOf("baz"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("baz"),
+            PreventRescheduleOnLost: pointerOf(false),
+            Count:                   pointerOf(1),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -1038,7 +1046,8 @@ func TestJobs_Canonicalize(t *testing.T) {
         ParentID: pointerOf("lol"),
         TaskGroups: []*TaskGroup{
           {
-            Name: pointerOf("bar"),
+            Name:                    pointerOf("bar"),
+            PreventRescheduleOnLost: pointerOf(true),
             RestartPolicy: &RestartPolicy{
               Delay:    pointerOf(15 * time.Second),
               Attempts: pointerOf(2),
@@ -1111,8 +1120,9 @@ func TestJobs_Canonicalize(t *testing.T) {
         },
         TaskGroups: []*TaskGroup{
           {
-            Name:  pointerOf("bar"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("bar"),
+            PreventRescheduleOnLost: pointerOf(true),
+            Count:                   pointerOf(1),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -1166,8 +1176,9 @@ func TestJobs_Canonicalize(t *testing.T) {
             },
           },
           {
-            Name:  pointerOf("baz"),
-            Count: pointerOf(1),
+            Name:                    pointerOf("baz"),
+            PreventRescheduleOnLost: pointerOf(false),
+            Count:                   pointerOf(1),
             EphemeralDisk: &EphemeralDisk{
               Sticky:  pointerOf(false),
               Migrate: pointerOf(false),
@@ -1223,7 +1234,6 @@ func TestJobs_Canonicalize(t *testing.T) {
           },
         },
       },
-
     {
       name:  "multiregion",
      input: &Job{
5 changes: 4 additions & 1 deletion api/tasks.go

@@ -459,6 +459,7 @@ type TaskGroup struct {
   MaxClientDisconnect *time.Duration `mapstructure:"max_client_disconnect" hcl:"max_client_disconnect,optional"`
   Scaling             *ScalingPolicy `hcl:"scaling,block"`
   Consul              *Consul        `hcl:"consul,block"`
+  PreventRescheduleOnLost *bool      `hcl:"prevent_reschedule_on_lost,optional"`
 }
 
 // NewTaskGroup creates a new TaskGroup.
@@ -577,7 +578,9 @@ func (g *TaskGroup) Canonicalize(job *Job) {
   for _, s := range g.Services {
     s.Canonicalize(nil, g, job)
   }
-
+  if g.PreventRescheduleOnLost == nil {
+    g.PreventRescheduleOnLost = pointerOf(false)
+  }
 }
 
 // These needs to be in sync with DefaultServiceJobRestartPolicy in
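Given the `Canonicalize` hunk above, a field the caller never sets comes back as an explicit false. A small sketch of that defaulting path — assuming `Job.Canonicalize` fans out to each group's `Canonicalize`, as it does for other group defaults; `ptr` is again a local helper:

```go
package main

import (
    "fmt"

    "github.com/hashicorp/nomad/api"
)

func ptr[T any](v T) *T { return &v }

func main() {
    job := &api.Job{ID: ptr("example"), Name: ptr("example")}
    tg := api.NewTaskGroup("web", 1) // PreventRescheduleOnLost deliberately left nil
    job.TaskGroups = []*api.TaskGroup{tg}
    job.Canonicalize()
    fmt.Println(*tg.PreventRescheduleOnLost) // false: the nil pointer was defaulted
}
```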
5 changes: 3 additions & 2 deletions client/client.go

@@ -2666,10 +2666,11 @@ func (c *Client) updateAlloc(update *structs.Allocation) {
     return
   }
 
+  alloc := ar.Alloc()
   // Reconnect unknown allocations if they were updated and are not terminal.
   reconnect := update.ClientStatus == structs.AllocClientStatusUnknown &&
-    update.AllocModifyIndex > ar.Alloc().AllocModifyIndex &&
-    !update.ServerTerminalStatus()
+    update.AllocModifyIndex > alloc.AllocModifyIndex &&
+    (!update.ServerTerminalStatus() || !alloc.PreventRescheduleOnLost())
   if reconnect {
     err = ar.Reconnect(update)
     if err != nil {
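The reworked condition reads more easily with the negations folded together (De Morgan). A sketch of the same predicate as a standalone helper — the name and signature are hypothetical, since the PR keeps the expression inline in `updateAlloc`:

```go
package client

import "github.com/hashicorp/nomad/nomad/structs"

// shouldReconnect restates the reconnect predicate above: an unknown, newer
// update triggers a reconnect unless it is both server-terminal and for an
// alloc whose group set prevent_reschedule_on_lost.
func shouldReconnect(update, current *structs.Allocation) bool {
    return update.ClientStatus == structs.AllocClientStatusUnknown &&
        update.AllocModifyIndex > current.AllocModifyIndex &&
        !(update.ServerTerminalStatus() && current.PreventRescheduleOnLost())
}
```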
6 changes: 6 additions & 0 deletions command/agent/job_endpoint.go

@@ -1135,6 +1135,12 @@ func ApiTgToStructsTG(job *structs.Job, taskGroup *api.TaskGroup, tg *structs.Ta
       RenderTemplates: *taskGroup.RestartPolicy.RenderTemplates,
     }
 
+  if taskGroup.PreventRescheduleOnLost == nil {
+    tg.PreventRescheduleOnLost = false
+  } else {
+    tg.PreventRescheduleOnLost = *taskGroup.PreventRescheduleOnLost
+  }
+
   if taskGroup.ShutdownDelay != nil {
     tg.ShutdownDelay = taskGroup.ShutdownDelay
   }
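As a design note, the nil/else branch above is equivalent to a single deref-with-default — a hypothetical compaction for illustration, not what the PR ships:

```go
// boolFromPtr collapses the branch: a nil pointer and an explicit false both
// map to false; only an explicit true maps to true.
func boolFromPtr(b *bool) bool {
    return b != nil && *b
}

// Usage: tg.PreventRescheduleOnLost = boolFromPtr(taskGroup.PreventRescheduleOnLost)
```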
6 changes: 4 additions & 2 deletions command/agent/job_endpoint_test.go

@@ -3058,6 +3058,7 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
           Operand: "z",
         },
       },
+      PreventRescheduleOnLost: false,
       Affinities: []*structs.Affinity{
         {
           LTarget: "x",
@@ -3552,8 +3553,9 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) {
       },
       TaskGroups: []*structs.TaskGroup{
         {
-          Name:  "group1",
-          Count: 5,
+          Name:                    "group1",
+          Count:                   5,
+          PreventRescheduleOnLost: false,
           Constraints: []*structs.Constraint{
             {
               LTarget: "x",
48 changes: 34 additions & 14 deletions nomad/core_sched_test.go

@@ -1940,20 +1940,21 @@ func TestCoreScheduler_PartitionJobReap(t *testing.T) {
 // Tests various scenarios when allocations are eligible to be GCed
 func TestAllocation_GCEligible(t *testing.T) {
   type testCase struct {
-    Desc                string
-    GCTime              time.Time
-    ClientStatus        string
-    DesiredStatus       string
-    JobStatus           string
-    JobStop             bool
-    AllocJobModifyIndex uint64
-    JobModifyIndex      uint64
-    ModifyIndex         uint64
-    NextAllocID         string
-    ReschedulePolicy    *structs.ReschedulePolicy
-    RescheduleTrackers  []*structs.RescheduleEvent
-    ThresholdIndex      uint64
-    ShouldGC            bool
+    Desc                    string
+    GCTime                  time.Time
+    ClientStatus            string
+    DesiredStatus           string
+    JobStatus               string
+    JobStop                 bool
+    PreventRescheduleOnLost *bool
+    AllocJobModifyIndex     uint64
+    JobModifyIndex          uint64
+    ModifyIndex             uint64
+    NextAllocID             string
+    ReschedulePolicy        *structs.ReschedulePolicy
+    RescheduleTrackers      []*structs.RescheduleEvent
+    ThresholdIndex          uint64
+    ShouldGC                bool
   }
 
   fail := time.Now()
@@ -2120,6 +2121,14 @@ func TestAllocation_GCEligible(t *testing.T) {
       JobStop:      true,
       ShouldGC:     true,
     },
+    {
+      Desc:          "GC when alloc is lost and eligible for reschedule",
+      ClientStatus:  structs.AllocClientStatusLost,
+      DesiredStatus: structs.AllocDesiredStatusStop,
+      GCTime:        fail,
+      JobStatus:     structs.JobStatusDead,
+      ShouldGC:      true,
+    },
     {
       Desc:         "GC when job status is dead",
       ClientStatus: structs.AllocClientStatusFailed,
@@ -2155,6 +2164,14 @@ func TestAllocation_GCEligible(t *testing.T) {
       },
       ShouldGC: true,
     },
+    {
+      Desc:          "GC when alloc is unknown and but desired state is running",
+      ClientStatus:  structs.AllocClientStatusUnknown,
+      DesiredStatus: structs.AllocDesiredStatusRun,
+      GCTime:        fail,
+      JobStatus:     structs.JobStatusRunning,
+      ShouldGC:      false,
+    },
   }
 
   for _, tc := range harness {
@@ -2166,6 +2183,9 @@ func TestAllocation_GCEligible(t *testing.T) {
     alloc.NextAllocation = tc.NextAllocID
     job := mock.Job()
     alloc.TaskGroup = job.TaskGroups[0].Name
+    if tc.PreventRescheduleOnLost != nil {
+      job.TaskGroups[0].PreventRescheduleOnLost = *tc.PreventRescheduleOnLost
+    }
     job.TaskGroups[0].ReschedulePolicy = tc.ReschedulePolicy
     if tc.JobStatus != "" {
       job.Status = tc.JobStatus
5 changes: 3 additions & 2 deletions nomad/mock/job.go

@@ -31,8 +31,9 @@ func Job() *structs.Job {
     },
     TaskGroups: []*structs.TaskGroup{
       {
-        Name:  "web",
-        Count: 10,
+        Name:                    "web",
+        Count:                   10,
+        PreventRescheduleOnLost: false,
         Constraints: []*structs.Constraint{
           {
             LTarget: "${attr.consul.version}",
19 changes: 19 additions & 0 deletions nomad/plan_apply.go

@@ -737,6 +737,11 @@ func evaluateNodePlan(snap *state.StateSnapshot, plan *structs.Plan, nodeID stri
       return true, "", nil
     }
     return false, "node is disconnected and contains invalid updates", nil
+  } else if node.Status == structs.NodeStatusDown {
+    if isValidForDownNode(plan, node.ID) {
+      return true, "", nil
+    }
+    return false, "node is down and contains invalid updates", nil
   } else if node.Status != structs.NodeStatusReady {
     return false, "node is not ready for placements", nil
   }
@@ -790,3 +795,17 @@
 
   return true
 }
+
+// The plan is only valid for a node down if it only contains
+// updates to mark allocations as unknown and those allocations are configured
+// as non reschedulables when lost or if the allocs are being updated to lost.
+func isValidForDownNode(plan *structs.Plan, nodeID string) bool {
+  for _, alloc := range plan.NodeAllocation[nodeID] {
+    if !(alloc.ClientStatus == structs.AllocClientStatusUnknown && alloc.PreventRescheduleOnLost()) &&
+      (alloc.ClientStatus != structs.AllocClientStatusLost) {
+      return false
+    }
+  }
+
+  return true
+}
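`isValidForDownNode` is written as a double negative, so here is its per-alloc rule unfolded (De Morgan) into a runnable sketch — plain values stand in for `*structs.Allocation`, whose `PreventRescheduleOnLost()` method consults the alloc's task group:

```go
package main

import "fmt"

// validForDownNode restates isValidForDownNode's per-alloc condition: a plan
// for a down node is valid only if every allocation is either being marked
// lost, or being marked unknown while its group set prevent_reschedule_on_lost.
func validForDownNode(clientStatus string, preventRescheduleOnLost bool) bool {
    return (clientStatus == "unknown" && preventRescheduleOnLost) ||
        clientStatus == "lost"
}

func main() {
    fmt.Println(validForDownNode("unknown", true))  // true: alloc held in place
    fmt.Println(validForDownNode("lost", false))    // true: normal lost handling
    fmt.Println(validForDownNode("unknown", false)) // false: plan rejected
    fmt.Println(validForDownNode("running", true))  // false: plan rejected
}
```

In short: a plan touching a down node is rejected unless every allocation either transitions to lost (the pre-existing path) or is held as unknown because its group opted out of rescheduling on lost.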