-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Add CLI and API support for forcing rescheduling of failed allocs #4274
Changes from 7 commits
242cc19
3b7d23f
268a99e
1bad719
4f9d92c
2d0e273
b5e18b6
879c2c9
53c05c5
e2f13d2
b2006cc
ae5d8fd
2ea09b8
a5ca379
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -90,8 +90,25 @@ func (s *HTTPServer) jobForceEvaluate(resp http.ResponseWriter, req *http.Reques | |
if req.Method != "PUT" && req.Method != "POST" { | ||
return nil, CodedError(405, ErrInvalidMethod) | ||
} | ||
args := structs.JobEvaluateRequest{ | ||
JobID: jobName, | ||
var args structs.JobEvaluateRequest | ||
|
||
// TODO(preetha) remove in 0.9 | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Use the COMPAT tag so we can grep our code base |
||
// For backwards compatibility allow using this endpoint without a payload | ||
if req.ContentLength == 0 { | ||
args = structs.JobEvaluateRequest{ | ||
JobID: jobName, | ||
} | ||
} else { | ||
if err := decodeBody(req, &args); err != nil { | ||
return nil, CodedError(400, err.Error()) | ||
} | ||
if args.JobID == "" { | ||
return nil, CodedError(400, "Job ID must be specified") | ||
} | ||
|
||
if jobName != "" && args.JobID != jobName { | ||
return nil, CodedError(400, "JobID not same as job name") | ||
} | ||
} | ||
s.parseWriteRequest(req, &args.WriteRequest) | ||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,106 @@ | ||
package command | ||
|
||
import ( | ||
"fmt" | ||
"strings" | ||
|
||
"github.com/hashicorp/nomad/api" | ||
"github.com/hashicorp/nomad/api/contexts" | ||
"github.com/posener/complete" | ||
) | ||
|
||
type JobEvalCommand struct { | ||
Meta | ||
forceRescheduling bool | ||
} | ||
|
||
func (c *JobEvalCommand) Help() string { | ||
helpText := ` | ||
Usage: nomad job eval [options] <job_id> | ||
|
||
Force an evaluation of the provided job id | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. ...provided job ID. Forcing an evaluation will trigger the scheduler to re-evaluate the job. The force flags allow operators to force the scheduler to create new allocations under certain scenarios. |
||
|
||
General Options: | ||
|
||
` + generalOptionsUsage() + ` | ||
|
||
Eval Options: | ||
|
||
-force-reschedule | ||
Force reschedule any failed allocations even if they are not currently | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. remove any and and add a period to then end. |
||
eligible for rescheduling | ||
` | ||
return strings.TrimSpace(helpText) | ||
} | ||
|
||
func (c *JobEvalCommand) Synopsis() string { | ||
return "Force evaluating a job using its job id" | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Force an evaluation for the job |
||
} | ||
|
||
func (c *JobEvalCommand) AutocompleteFlags() complete.Flags { | ||
return mergeAutocompleteFlags(c.Meta.AutocompleteFlags(FlagSetClient), | ||
complete.Flags{ | ||
"-force-reschedule": complete.PredictNothing, | ||
}) | ||
} | ||
|
||
func (c *JobEvalCommand) AutocompleteArgs() complete.Predictor { | ||
return complete.PredictFunc(func(a complete.Args) []string { | ||
client, err := c.Meta.Client() | ||
if err != nil { | ||
return nil | ||
} | ||
|
||
resp, _, err := client.Search().PrefixSearch(a.Last, contexts.Jobs, nil) | ||
if err != nil { | ||
return []string{} | ||
} | ||
return resp.Matches[contexts.Jobs] | ||
}) | ||
} | ||
|
||
func (c *JobEvalCommand) Name() string { return "eval" } | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
|
||
func (c *JobEvalCommand) Run(args []string) int { | ||
flags := c.Meta.FlagSet(c.Name(), FlagSetClient) | ||
flags.Usage = func() { c.Ui.Output(c.Help()) } | ||
flags.BoolVar(&c.forceRescheduling, "force-reschedule", false, "") | ||
|
||
if err := flags.Parse(args); err != nil { | ||
return 1 | ||
} | ||
|
||
// Check that we either got no jobs or exactly one. | ||
args = flags.Args() | ||
if len(args) > 1 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can we make this != 1 and remove https://github.com/hashicorp/nomad/pull/4274/files#diff-16a42a3ce250660a08779d4eb8f566aeR88 |
||
c.Ui.Error("This command takes either no arguments or one: <job>") | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more.
|
||
c.Ui.Error(commandErrorText(c)) | ||
return 1 | ||
} | ||
|
||
// Get the HTTP client | ||
client, err := c.Meta.Client() | ||
if err != nil { | ||
c.Ui.Error(fmt.Sprintf("Error initializing client: %s", err)) | ||
return 1 | ||
} | ||
|
||
if len(args) == 0 { | ||
c.Ui.Error("Must provide a job ID") | ||
return 1 | ||
} | ||
|
||
// Call eval end point | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. endpoint |
||
jobID := args[0] | ||
|
||
opts := api.EvalOptions{ | ||
ForceReschedule: c.forceRescheduling, | ||
} | ||
evalId, _, err := client.Jobs().ForceEvaluate(jobID, opts, nil) | ||
if err != nil { | ||
c.Ui.Error(fmt.Sprintf("Error evaluating job: %s", err)) | ||
return 1 | ||
} | ||
c.Ui.Output(fmt.Sprintf("Created eval ID: %q ", evalId)) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We should probably have this invoke the eval monitor and provide a detach flag |
||
return 0 | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,65 @@ | ||
package command | ||
|
||
import ( | ||
"strings" | ||
"testing" | ||
|
||
"github.com/hashicorp/nomad/nomad/mock" | ||
"github.com/mitchellh/cli" | ||
"github.com/posener/complete" | ||
"github.com/stretchr/testify/assert" | ||
) | ||
|
||
func TestJobEvalCommand_Implements(t *testing.T) { | ||
t.Parallel() | ||
var _ cli.Command = &JobEvalCommand{} | ||
} | ||
|
||
func TestJobEvalCommand_Fails(t *testing.T) { | ||
t.Parallel() | ||
ui := new(cli.MockUi) | ||
cmd := &JobEvalCommand{Meta: Meta{Ui: ui}} | ||
|
||
// Fails on misuse | ||
if code := cmd.Run([]string{"some", "bad", "args"}); code != 1 { | ||
t.Fatalf("expected exit code 1, got: %d", code) | ||
} | ||
if out := ui.ErrorWriter.String(); !strings.Contains(out, commandErrorText(cmd)) { | ||
t.Fatalf("expected help output, got: %s", out) | ||
} | ||
ui.ErrorWriter.Reset() | ||
|
||
// Fails when job ID is not specified | ||
if code := cmd.Run([]string{}); code != 1 { | ||
t.Fatalf("expect exit 1, got: %d", code) | ||
} | ||
if out := ui.ErrorWriter.String(); !strings.Contains(out, "Must provide a job ID") { | ||
t.Fatalf("unexpected error: %v", out) | ||
} | ||
ui.ErrorWriter.Reset() | ||
|
||
} | ||
|
||
func TestJobEvalCommand_AutocompleteArgs(t *testing.T) { | ||
assert := assert.New(t) | ||
t.Parallel() | ||
|
||
srv, _, url := testServer(t, true, nil) | ||
defer srv.Shutdown() | ||
|
||
ui := new(cli.MockUi) | ||
cmd := &JobEvalCommand{Meta: Meta{Ui: ui, flagAddress: url}} | ||
|
||
// Create a fake job | ||
state := srv.Agent.Server().State() | ||
j := mock.Job() | ||
assert.Nil(state.UpsertJob(1000, j)) | ||
|
||
prefix := j.ID[:len(j.ID)-5] | ||
args := complete.Args{Last: prefix} | ||
predictor := cmd.AutocompleteArgs() | ||
|
||
res := predictor.Predict(args) | ||
assert.Equal(1, len(res)) | ||
assert.Equal(j.ID, res[0]) | ||
} | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Why not add a test that submits a job and runs the force eval against it There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. done |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -39,6 +39,13 @@ var ( | |
RTarget: ">= 0.6.1", | ||
Operand: structs.ConstraintVersion, | ||
} | ||
|
||
// allowRescheduleTransition is the transition that allows failed | ||
// allocations to be force rescheduled. We create a one off | ||
// variable to avoid creating a new object for every request. | ||
allowForceRescheduleTransition = &structs.DesiredTransition{ | ||
ForceReschedule: helper.BoolToPtr(true), | ||
} | ||
) | ||
|
||
// Job endpoint is used for job interactions | ||
|
@@ -538,6 +545,28 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis | |
return fmt.Errorf("can't evaluate parameterized job") | ||
} | ||
|
||
forceRescheduleAllocs := make(map[string]*structs.DesiredTransition) | ||
|
||
if args.EvalOptions.ForceReschedule { | ||
// Find any failed allocs that could be force rescheduled | ||
allocs, err := snap.AllocsByJob(ws, args.RequestNamespace(), args.JobID, false) | ||
if err != nil { | ||
return err | ||
} | ||
|
||
for _, alloc := range allocs { | ||
taskGroup := job.LookupTaskGroup(alloc.TaskGroup) | ||
// Forcing rescheduling is only allowed if task group has rescheduling enabled | ||
if taskGroup == nil || taskGroup.ReschedulePolicy == nil || !taskGroup.ReschedulePolicy.Enabled() { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Can't we remove |
||
continue | ||
} | ||
|
||
if alloc.NextAllocation == "" && alloc.ClientStatus == structs.AllocClientStatusFailed { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Check if the desired transition is already set |
||
forceRescheduleAllocs[alloc.ID] = allowForceRescheduleTransition | ||
} | ||
} | ||
} | ||
|
||
// Create a new evaluation | ||
eval := &structs.Evaluation{ | ||
ID: uuid.Generate(), | ||
|
@@ -549,13 +578,14 @@ func (j *Job) Evaluate(args *structs.JobEvaluateRequest, reply *structs.JobRegis | |
JobModifyIndex: job.ModifyIndex, | ||
Status: structs.EvalStatusPending, | ||
} | ||
update := &structs.EvalUpdateRequest{ | ||
Evals: []*structs.Evaluation{eval}, | ||
WriteRequest: structs.WriteRequest{Region: args.Region}, | ||
|
||
// Create a AllocUpdateDesiredTransitionRequest request with the eval and any forced rescheduled allocs | ||
updateTransitionReq := &structs.AllocUpdateDesiredTransitionRequest{ | ||
Allocs: forceRescheduleAllocs, | ||
Evals: []*structs.Evaluation{eval}, | ||
} | ||
_, evalIndex, err := j.srv.raftApply(structs.AllocUpdateDesiredTransitionRequestType, updateTransitionReq) | ||
|
||
// Commit this evaluation via Raft | ||
_, evalIndex, err := j.srv.raftApply(structs.EvalUpdateRequestType, update) | ||
if err != nil { | ||
j.srv.logger.Printf("[ERR] nomad.job: Eval create failed: %v", err) | ||
return err | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Worth renaming to EvaluateWithOpts?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I modeled this after
JobDeregisterOptions
.There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I meant the name of the function. Since it can be more than just force given it takes a generic options object