Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

scheduler: fix panic when preempting and evicting allocs #6792

Merged
merged 4 commits into from
Dec 3, 2019
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@ BUG FIXES:

* core: Ignore `server` config values if `server` is disabled [[GH-6047](https://github.com/hashicorp/nomad/issues/6047)]
* core: Added `semver` constraint for strict Semver 2.0 version comparisons [[GH-6699](https://github.com/hashicorp/nomad/issues/6699)]
* core: Fixed server panic caused by a plan evicting and preempting allocs on a node [[GH-6792](https://github.com/hashicorp/nomad/issues/6792)]
* api: Return a 404 if endpoint not found instead of redirecting to /ui/ [[GH-6658](https://github.com/hashicorp/nomad/issues/6658)]
* api: Decompress web socket response body if gzipped on error responses [[GH-6650](https://github.com/hashicorp/nomad/issues/6650)]
* api: Fixed a bug where some FS/Allocation API endpoints didn't return error messages [[GH-6427](https://github.com/hashicorp/nomad/issues/6427)]
Expand Down
13 changes: 6 additions & 7 deletions scheduler/context.go
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,9 @@ type Context interface {
// Reset is invoked after making a placement
Reset()

// ProposedAllocs returns the proposed allocations for a node
// which is the existing allocations, removing evictions, and
// adding any planned placements.
// ProposedAllocs returns the proposed allocations for a node which are
// the existing allocations, removing evictions, and adding any planned
// placements.
ProposedAllocs(nodeID string) ([]*structs.Allocation, error)

// RegexpCache is a cache of regular expressions
Expand Down Expand Up @@ -120,22 +120,21 @@ func (e *EvalContext) Reset() {
func (e *EvalContext) ProposedAllocs(nodeID string) ([]*structs.Allocation, error) {
// Get the existing allocations that are non-terminal
ws := memdb.NewWatchSet()
existingAlloc, err := e.state.AllocsByNodeTerminal(ws, nodeID, false)
proposed, err := e.state.AllocsByNodeTerminal(ws, nodeID, false)
if err != nil {
return nil, err
}

// Determine the proposed allocation by first removing allocations
// that are planned evictions and adding the new allocations.
proposed := existingAlloc
if update := e.plan.NodeUpdate[nodeID]; len(update) > 0 {
proposed = structs.RemoveAllocs(existingAlloc, update)
proposed = structs.RemoveAllocs(proposed, update)
}

// Remove any allocs that are being preempted
nodePreemptedAllocs := e.plan.NodePreemptions[nodeID]
if len(nodePreemptedAllocs) > 0 {
proposed = structs.RemoveAllocs(existingAlloc, nodePreemptedAllocs)
proposed = structs.RemoveAllocs(proposed, nodePreemptedAllocs)
}

// We create an index of the existing allocations so that if an inplace
Expand Down
116 changes: 113 additions & 3 deletions scheduler/context_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -106,9 +106,9 @@ func TestEvalContext_ProposedAlloc(t *testing.T) {
ClientStatus: structs.AllocClientStatusPending,
TaskGroup: "web",
}
noErr(t, state.UpsertJobSummary(998, mock.JobSummary(alloc1.JobID)))
noErr(t, state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID)))
noErr(t, state.UpsertAllocs(1000, []*structs.Allocation{alloc1, alloc2}))
require.NoError(t, state.UpsertJobSummary(998, mock.JobSummary(alloc1.JobID)))
require.NoError(t, state.UpsertJobSummary(999, mock.JobSummary(alloc2.JobID)))
require.NoError(t, state.UpsertAllocs(1000, []*structs.Allocation{alloc1, alloc2}))

// Add a planned eviction to alloc1
plan := ctx.Plan()
Expand Down Expand Up @@ -149,6 +149,116 @@ func TestEvalContext_ProposedAlloc(t *testing.T) {
}
}

// TestEvalContext_ProposedAlloc_EvictPreempt asserts both Evicted and
// Preempted allocs are removed from the allocs propsed for a node.
//
// See https://github.com/hashicorp/nomad/issues/6787
//
func TestEvalContext_ProposedAlloc_EvictPreempt(t *testing.T) {
t.Parallel()
state, ctx := testContext(t)
nodes := []*RankedNode{
{
Node: &structs.Node{
ID: uuid.Generate(),
NodeResources: &structs.NodeResources{
Cpu: structs.NodeCpuResources{
CpuShares: 1024 * 3,
},
Memory: structs.NodeMemoryResources{
MemoryMB: 1024 * 3,
},
},
},
},
}

// Add existing allocations
j1, j2, j3 := mock.Job(), mock.Job(), mock.Job()
allocEvict := &structs.Allocation{
ID: uuid.Generate(),
Namespace: structs.DefaultNamespace,
EvalID: uuid.Generate(),
NodeID: nodes[0].Node.ID,
JobID: j1.ID,
Job: j1,
AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1024,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 1024,
},
},
},
},
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
TaskGroup: "web",
}
allocPreempt := &structs.Allocation{
ID: uuid.Generate(),
Namespace: structs.DefaultNamespace,
EvalID: uuid.Generate(),
NodeID: nodes[0].Node.ID,
JobID: j2.ID,
Job: j2,
AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1024,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 1024,
},
},
},
},
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
TaskGroup: "web",
}
allocPropose := &structs.Allocation{
ID: uuid.Generate(),
Namespace: structs.DefaultNamespace,
EvalID: uuid.Generate(),
NodeID: nodes[0].Node.ID,
JobID: j3.ID,
Job: j3,
AllocatedResources: &structs.AllocatedResources{
Tasks: map[string]*structs.AllocatedTaskResources{
"web": {
Cpu: structs.AllocatedCpuResources{
CpuShares: 1024,
},
Memory: structs.AllocatedMemoryResources{
MemoryMB: 1024,
},
},
},
},
DesiredStatus: structs.AllocDesiredStatusRun,
ClientStatus: structs.AllocClientStatusPending,
TaskGroup: "web",
}
require.NoError(t, state.UpsertJobSummary(998, mock.JobSummary(allocEvict.JobID)))
require.NoError(t, state.UpsertJobSummary(999, mock.JobSummary(allocPreempt.JobID)))
require.NoError(t, state.UpsertJobSummary(999, mock.JobSummary(allocPropose.JobID)))
require.NoError(t, state.UpsertAllocs(1000, []*structs.Allocation{allocEvict, allocPreempt, allocPropose}))

// Plan to evict one alloc and preempt another
plan := ctx.Plan()
plan.NodePreemptions[nodes[0].Node.ID] = []*structs.Allocation{allocEvict}
plan.NodeUpdate[nodes[0].Node.ID] = []*structs.Allocation{allocPreempt}

proposed, err := ctx.ProposedAllocs(nodes[0].Node.ID)
require.NoError(t, err)
require.Len(t, proposed, 1)
}

func TestEvalEligibility_JobStatus(t *testing.T) {
e := NewEvalEligibility()
cc := "v1:100"
Expand Down
Loading