From 691efe13df2be6f104936995b259c245d1f62aa3 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 24 May 2016 17:23:18 -0700 Subject: [PATCH 1/3] Dont restart successfully finished batch allocations --- nomad/structs/structs.go | 32 ++++++++++++++++++++++++++++++++ scheduler/generic_sched.go | 8 ++++---- scheduler/util.go | 15 +++++++++------ 3 files changed, 45 insertions(+), 10 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index dbff03732ca..5b2d328ffd0 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1904,6 +1904,21 @@ func (ts *TaskState) Failed() bool { } } +// Successful returns whether a task finished successfully. +func (ts *TaskState) Successful() bool { + l := len(ts.Events) + if ts.State != TaskStateDead || l == 0 { + return false + } + + e := ts.Events[l-1] + if e.Type != TaskTerminated { + return false + } + + return e.ExitCode == 0 +} + const ( // TaskDriveFailure indicates that the task could not be started due to a // failure in the driver. @@ -2336,6 +2351,23 @@ func (a *Allocation) TerminalStatus() bool { } } +// RanSuccessfully returns whether the client has ran the allocation and all +// tasks finished successfully +func (a *Allocation) RanSuccessfully() bool { + // Handle the case the client hasn't started the allocation. + if len(a.TaskStates) == 0 { + return false + } + + // Check to see if all the tasks finised successfully in the allocation + allSuccess := true + for _, state := range a.TaskStates { + allSuccess = allSuccess && state.Successful() + } + + return allSuccess +} + // Stub returns a list stub for the allocation func (a *Allocation) Stub() *AllocListStub { return &AllocListStub{ diff --git a/scheduler/generic_sched.go b/scheduler/generic_sched.go index 2cce204a115..d589714704e 100644 --- a/scheduler/generic_sched.go +++ b/scheduler/generic_sched.go @@ -235,12 +235,12 @@ func (s *GenericScheduler) filterCompleteAllocs(allocs []*structs.Allocation) [] filter := func(a *structs.Allocation) bool { if s.batch { // Allocs from batch jobs should be filtered when the desired status - // is terminal or when the client status is failed so that they will - // be replaced. If they are complete but not failed, they shouldn't - // be replaced. + // is terminal and the client did not finish or when the client + // status is failed so that they will be replaced. If they are + // complete but not failed, they shouldn't be replaced. switch a.DesiredStatus { case structs.AllocDesiredStatusStop, structs.AllocDesiredStatusEvict, structs.AllocDesiredStatusFailed: - return true + return !a.RanSuccessfully() default: } diff --git a/scheduler/util.go b/scheduler/util.go index 4dc411dc9f6..9c3035ee4aa 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -81,13 +81,16 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool, continue } - // If we are on a tainted node, we must migrate + // If we are on a tainted node, we must migrate if we are a service or + // if the batch allocation did not finish if taintedNodes[exist.NodeID] { - result.migrate = append(result.migrate, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + if exist.Job.Type != structs.JobTypeBatch || !exist.RanSuccessfully() { + result.migrate = append(result.migrate, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) + } continue } From 4a34fe4914596fef91c12201d7181375daef2a4f Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 24 May 2016 17:47:03 -0700 Subject: [PATCH 2/3] Add test to verify drain doesn't restart successful batch and add to ignore list --- scheduler/generic_sched_test.go | 68 +++++++++++++++++++++++++++++++++ scheduler/util.go | 14 ++++--- scheduler/util_test.go | 3 ++ 3 files changed, 79 insertions(+), 6 deletions(-) diff --git a/scheduler/generic_sched_test.go b/scheduler/generic_sched_test.go index e3d5e9a84cb..43853fd2ec6 100644 --- a/scheduler/generic_sched_test.go +++ b/scheduler/generic_sched_test.go @@ -1316,3 +1316,71 @@ func TestBatchSched_Run_FailedAlloc(t *testing.T) { h.AssertEvalStatus(t, structs.EvalStatusComplete) } + +func TestBatchSched_ReRun_SuccessfullyFinishedAlloc(t *testing.T) { + h := NewHarness(t) + + // Create two nodes, one that is drained and has a successfully finished + // alloc and a fresh undrained one + node := mock.Node() + node.Drain = true + node2 := mock.Node() + noErr(t, h.State.UpsertNode(h.NextIndex(), node)) + noErr(t, h.State.UpsertNode(h.NextIndex(), node2)) + + // Create a job + job := mock.Job() + job.Type = structs.JobTypeBatch + job.TaskGroups[0].Count = 1 + noErr(t, h.State.UpsertJob(h.NextIndex(), job)) + + // Create a successful alloc + alloc := mock.Alloc() + alloc.Job = job + alloc.JobID = job.ID + alloc.NodeID = node.ID + alloc.Name = "my-job.web[0]" + alloc.ClientStatus = structs.AllocClientStatusComplete + alloc.TaskStates = map[string]*structs.TaskState{ + "web": &structs.TaskState{ + State: structs.TaskStateDead, + Events: []*structs.TaskEvent{ + { + Type: structs.TaskTerminated, + ExitCode: 0, + }, + }, + }, + } + noErr(t, h.State.UpsertAllocs(h.NextIndex(), []*structs.Allocation{alloc})) + + // Create a mock evaluation to rerun the job + eval := &structs.Evaluation{ + ID: structs.GenerateUUID(), + Priority: job.Priority, + TriggeredBy: structs.EvalTriggerJobRegister, + JobID: job.ID, + } + + // Process the evaluation + err := h.Process(NewBatchScheduler, eval) + if err != nil { + t.Fatalf("err: %v", err) + } + + // Ensure no plan + if len(h.Plans) != 0 { + t.Fatalf("bad: %#v", h.Plans) + } + + // Lookup the allocations by JobID + out, err := h.State.AllocsByJob(job.ID) + noErr(t, err) + + // Ensure no replacement alloc was placed. + if len(out) != 1 { + t.Fatalf("bad: %#v", out) + } + + h.AssertEvalStatus(t, structs.EvalStatusComplete) +} diff --git a/scheduler/util.go b/scheduler/util.go index 9c3035ee4aa..e910f3db117 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -84,13 +84,14 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool, // If we are on a tainted node, we must migrate if we are a service or // if the batch allocation did not finish if taintedNodes[exist.NodeID] { - if exist.Job.Type != structs.JobTypeBatch || !exist.RanSuccessfully() { - result.migrate = append(result.migrate, allocTuple{ - Name: name, - TaskGroup: tg, - Alloc: exist, - }) + if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() { + goto IGNORE } + result.migrate = append(result.migrate, allocTuple{ + Name: name, + TaskGroup: tg, + Alloc: exist, + }) continue } @@ -105,6 +106,7 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool, } // Everything is up-to-date + IGNORE: result.ignore = append(result.ignore, allocTuple{ Name: name, TaskGroup: tg, diff --git a/scheduler/util_test.go b/scheduler/util_test.go index 1d689fe2b38..57e2cb48293 100644 --- a/scheduler/util_test.go +++ b/scheduler/util_test.go @@ -74,6 +74,7 @@ func TestDiffAllocs(t *testing.T) { ID: structs.GenerateUUID(), NodeID: "zip", Name: "my-job.web[10]", + Job: oldJob, }, // Migrate the 3rd @@ -81,6 +82,7 @@ func TestDiffAllocs(t *testing.T) { ID: structs.GenerateUUID(), NodeID: "dead", Name: "my-job.web[2]", + Job: oldJob, }, } @@ -155,6 +157,7 @@ func TestDiffSystemAllocs(t *testing.T) { ID: structs.GenerateUUID(), NodeID: "dead", Name: "my-job.web[0]", + Job: oldJob, }, } From d5fa76ec21bc01f56bb2c3c6d529b9ca82ddb364 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Tue, 24 May 2016 18:18:10 -0700 Subject: [PATCH 3/3] comment --- scheduler/util.go | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/scheduler/util.go b/scheduler/util.go index e910f3db117..6a851ae36bb 100644 --- a/scheduler/util.go +++ b/scheduler/util.go @@ -84,6 +84,11 @@ func diffAllocs(job *structs.Job, taintedNodes map[string]bool, // If we are on a tainted node, we must migrate if we are a service or // if the batch allocation did not finish if taintedNodes[exist.NodeID] { + // If the job is batch and finished succesfully, the fact that the + // node is tainted does not mean it should be migrated as the work + // was already succesfully finished. However for service/system + // jobs, tasks should never complete. The check of batch type, + // defends against client bugs. if exist.Job.Type == structs.JobTypeBatch && exist.RanSuccessfully() { goto IGNORE }