Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update Evicted allocations to lost when lost #6902

Merged
merged 2 commits into from
Jan 7, 2020
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ BUG FIXES:
* agent: Fixed race condition in logging when using `nomad monitor` command [[GH-6872](https://github.com/hashicorp/nomad/issues/6872)]
* cli: Fixed a bug where `nomad monitor -node-id` would cause a cli panic when no nodes where found. [[GH-6828](https://github.com/hashicorp/nomad/issues/6828)]
* consul/connect: Fixed a bug where Connect-enabled jobs failed to validate when service names used interpolation. [[GH-6855](https://github.com/hashicorp/nomad/issues/6855)]
* scheduler: Fixed a bug that caused evicted allocs on a lost node to be stuck in running. [[GH-6902](https://github.com/hashicorp/nomad/issues/6902)]

## 0.10.2 (December 4, 2019)

Expand Down
177 changes: 99 additions & 78 deletions scheduler/generic_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2628,99 +2628,120 @@ func TestServiceSched_JobDeregister_Stopped(t *testing.T) {
}

func TestServiceSched_NodeDown(t *testing.T) {
h := NewHarness(t)

// Register a node
node := mock.Node()
node.Status = structs.NodeStatusDown
require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))

// Generate a fake job with allocations and an update policy.
job := mock.Job()
require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))

var allocs []*structs.Allocation
for i := 0; i < 10; i++ {
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node.ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)
allocs = append(allocs, alloc)
cases := []struct {
desired string
client string
migrate bool
reschedule bool
terminal bool
lost bool
}{
{
desired: structs.AllocDesiredStatusStop,
client: structs.AllocClientStatusRunning,
lost: true,
},
{
desired: structs.AllocDesiredStatusRun,
client: structs.AllocClientStatusPending,
migrate: true,
},
{
desired: structs.AllocDesiredStatusRun,
client: structs.AllocClientStatusRunning,
migrate: true,
},
{
desired: structs.AllocDesiredStatusRun,
client: structs.AllocClientStatusLost,
terminal: true,
},
{
desired: structs.AllocDesiredStatusRun,
client: structs.AllocClientStatusComplete,
terminal: true,
},
{
desired: structs.AllocDesiredStatusRun,
client: structs.AllocClientStatusFailed,
reschedule: true,
},
{
desired: structs.AllocDesiredStatusEvict,
client: structs.AllocClientStatusRunning,
lost: true,
},
}

// Cover each terminal case and ensure it doesn't change to lost
allocs[7].DesiredStatus = structs.AllocDesiredStatusRun
allocs[7].ClientStatus = structs.AllocClientStatusLost
allocs[8].DesiredStatus = structs.AllocDesiredStatusRun
allocs[8].ClientStatus = structs.AllocClientStatusFailed
allocs[9].DesiredStatus = structs.AllocDesiredStatusRun
allocs[9].ClientStatus = structs.AllocClientStatusComplete

toBeRescheduled := map[string]bool{allocs[8].ID: true}
for i, tc := range cases {
t.Run(fmt.Sprintf(""), func(t *testing.T) {
h := NewHarness(t)

// Mark some allocs as running
for i := 0; i < 4; i++ {
out := allocs[i]
out.ClientStatus = structs.AllocClientStatusRunning
}
// Register a node
node := mock.Node()
node.Status = structs.NodeStatusDown
require.NoError(t, h.State.UpsertNode(h.NextIndex(), node))

// Mark appropriate allocs for migration
toBeMigrated := map[string]bool{}
for i := 0; i < 3; i++ {
out := allocs[i]
out.DesiredTransition.Migrate = helper.BoolToPtr(true)
toBeMigrated[out.ID] = true
}
// Generate a fake job with allocations and an update policy.
job := mock.Job()
require.NoError(t, h.State.UpsertJob(h.NextIndex(), job))

toBeLost := map[string]bool{}
for i := len(toBeMigrated); i < 7; i++ {
toBeLost[allocs[i].ID] = true
}
alloc := mock.Alloc()
alloc.Job = job
alloc.JobID = job.ID
alloc.NodeID = node.ID
alloc.Name = fmt.Sprintf("my-job.web[%d]", i)

require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))
alloc.DesiredStatus = tc.desired
alloc.ClientStatus = tc.client

// Create a mock evaluation to deal with drain
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
NodeID: node.ID,
Status: structs.EvalStatusPending,
}
require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))
// Mark for migration if necessary
alloc.DesiredTransition.Migrate = helper.BoolToPtr(tc.migrate)

// Process the evaluation
err := h.Process(NewServiceScheduler, eval)
require.NoError(t, err)
allocs := []*structs.Allocation{alloc}
require.NoError(t, h.State.UpsertAllocs(h.NextIndex(), allocs))

// Ensure a single plan
require.Len(t, h.Plans, 1)
plan := h.Plans[0]

// Test the scheduler marked all non-terminal allocations as lost
require.Len(t, plan.NodeUpdate[node.ID], len(toBeMigrated)+len(toBeLost)+len(toBeRescheduled))
// Create a mock evaluation to deal with drain
eval := &structs.Evaluation{
Namespace: structs.DefaultNamespace,
ID: uuid.Generate(),
Priority: 50,
TriggeredBy: structs.EvalTriggerNodeUpdate,
JobID: job.ID,
NodeID: node.ID,
Status: structs.EvalStatusPending,
}
require.NoError(t, h.State.UpsertEvals(h.NextIndex(), []*structs.Evaluation{eval}))

for _, out := range plan.NodeUpdate[node.ID] {
t.Run("alloc "+out.ID, func(t *testing.T) {
require.Equal(t, structs.AllocDesiredStatusStop, out.DesiredStatus)
// Process the evaluation
err := h.Process(NewServiceScheduler, eval)
require.NoError(t, err)

if toBeMigrated[out.ID] {
// there is no indicator on job itself that marks it as migrated
require.NotEqual(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeLost[out.ID] {
require.Equal(t, structs.AllocClientStatusLost, out.ClientStatus)
} else if toBeRescheduled[out.ID] {
require.Equal(t, structs.AllocClientStatusFailed, out.ClientStatus)
if tc.terminal {
// No plan for terminal state allocs
require.Len(t, h.Plans, 0)
} else {
require.Fail(t, "unexpected alloc update")
require.Len(t, h.Plans, 1)

plan := h.Plans[0]
out := plan.NodeUpdate[node.ID]
require.Len(t, out, 1)

outAlloc := out[0]
if tc.migrate {
require.NotEqual(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
} else if tc.reschedule {
require.Equal(t, structs.AllocClientStatusFailed, outAlloc.ClientStatus)
} else if tc.lost {
require.Equal(t, structs.AllocClientStatusLost, outAlloc.ClientStatus)
} else {
require.Fail(t, "unexpected alloc update")
}
}

h.AssertEvalStatus(t, structs.EvalStatusComplete)
})
}

h.AssertEvalStatus(t, structs.EvalStatusComplete)
}

func TestServiceSched_NodeUpdate(t *testing.T) {
Expand Down
7 changes: 4 additions & 3 deletions scheduler/util.go
Original file line number Diff line number Diff line change
Expand Up @@ -809,9 +809,10 @@ func updateNonTerminalAllocsToLost(plan *structs.Plan, tainted map[string]*struc
continue
}

// If the scheduler has marked it as stop already but the alloc wasn't
// terminal on the client change the status to lost.
if alloc.DesiredStatus == structs.AllocDesiredStatusStop &&
// If the scheduler has marked it as stop or evict already but the alloc
// wasn't terminal on the client change the status to lost.
if (alloc.DesiredStatus == structs.AllocDesiredStatusStop ||
alloc.DesiredStatus == structs.AllocDesiredStatusEvict) &&
(alloc.ClientStatus == structs.AllocClientStatusRunning ||
alloc.ClientStatus == structs.AllocClientStatusPending) {
plan.AppendStoppedAlloc(alloc, allocLost, structs.AllocClientStatusLost)
Expand Down