-
Notifications
You must be signed in to change notification settings - Fork 2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Restart unhealthy tasks #3105
Restart unhealthy tasks #3105
Changes from 25 commits
a720bb5
bd1a342
1608e59
ebbf87f
555d1e2
c2d895d
78c72f8
7e103f6
850d991
3db835c
526528c
568b963
9fb2865
092057a
8b8c164
237c096
f8e872c
40ed262
5cd1d57
10dc1c7
3c0a42b
6f72270
a508bb9
5141c95
1564e1c
8014762
cde908e
fa836d8
924813d
6bcf019
10ae18c
967825d
3d7446d
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -10,8 +10,8 @@ import ( | |
// ConsulServiceAPI is the interface the Nomad Client uses to register and | ||
// remove services and checks from Consul. | ||
type ConsulServiceAPI interface { | ||
RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error | ||
RegisterTask(allocID string, task *structs.Task, restarter consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error | ||
RemoveTask(allocID string, task *structs.Task) | ||
UpdateTask(allocID string, existing, newTask *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error | ||
UpdateTask(allocID string, existing, newTask *structs.Task, restart consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I would advise against adding anything else at this point. I would convert to a config struct. No action required in this PR, but it's getting a bit much. There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. I was thinking the same thing! |
||
AllocRegistrations(allocID string) (*consul.AllocRegistration, error) | ||
} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -49,7 +49,7 @@ var ( | |
// TaskHooks is an interface which provides hooks into the tasks life-cycle | ||
type TaskHooks interface { | ||
// Restart is used to restart the task | ||
Restart(source, reason string) | ||
Restart(source, reason string, failure bool) | ||
|
||
// Signal is used to signal the task | ||
Signal(source, reason string, s os.Signal) error | ||
|
@@ -439,7 +439,8 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time | |
} | ||
|
||
if restart { | ||
tm.config.Hooks.Restart(consulTemplateSourceName, "template with change_mode restart re-rendered") | ||
const failure = false | ||
tm.config.Hooks.Restart(consulTemplateSourceName, "template with change_mode restart re-rendered", failure) | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. My little const pattern is pretty funky, and I'd be happy to remove it. I do it because I hate seeing method calls with literal booleans in them and having no idea what those booleans do without looking at the method signature and/or docs. |
||
} else if len(signals) != 0 { | ||
var mErr multierror.Error | ||
for signal := range signals { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -37,6 +37,7 @@ type RestartTracker struct { | |
waitRes *dstructs.WaitResult | ||
startErr error | ||
restartTriggered bool // Whether the task has been signalled to be restarted | ||
failure bool // Whether a failure triggered the restart | ||
count int // Current number of attempts. | ||
onSuccess bool // Whether to restart on successful exit code. | ||
startTime time.Time // When the interval began | ||
|
@@ -59,6 +60,7 @@ func (r *RestartTracker) SetStartError(err error) *RestartTracker { | |
r.lock.Lock() | ||
defer r.lock.Unlock() | ||
r.startErr = err | ||
r.failure = true | ||
return r | ||
} | ||
|
||
|
@@ -67,15 +69,20 @@ func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker | |
r.lock.Lock() | ||
defer r.lock.Unlock() | ||
r.waitRes = res | ||
r.failure = true | ||
return r | ||
} | ||
|
||
// SetRestartTriggered is used to mark that the task has been signalled to be | ||
// restarted | ||
func (r *RestartTracker) SetRestartTriggered() *RestartTracker { | ||
func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Comment on the param |
||
r.lock.Lock() | ||
defer r.lock.Unlock() | ||
r.restartTriggered = true | ||
if failure { | ||
r.failure = true | ||
} else { | ||
r.restartTriggered = true | ||
} | ||
return r | ||
} | ||
|
||
|
@@ -106,6 +113,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) { | |
r.startErr = nil | ||
r.waitRes = nil | ||
r.restartTriggered = false | ||
r.failure = false | ||
}() | ||
|
||
// Hot path if a restart was triggered | ||
|
@@ -134,66 +142,40 @@ func (r *RestartTracker) GetState() (string, time.Duration) { | |
r.startTime = now | ||
} | ||
|
||
if r.startErr != nil { | ||
return r.handleStartError() | ||
} else if r.waitRes != nil { | ||
return r.handleWaitResult() | ||
} | ||
|
||
return "", 0 | ||
} | ||
|
||
// handleStartError returns the new state and potential wait duration for | ||
// restarting the task after it was not successfully started. On start errors, | ||
// the restart policy is always treated as fail mode to ensure we don't | ||
// infinitely try to start a task. | ||
func (r *RestartTracker) handleStartError() (string, time.Duration) { | ||
// If the error is not recoverable, do not restart. | ||
if !structs.IsRecoverable(r.startErr) { | ||
r.reason = ReasonUnrecoverableErrror | ||
return structs.TaskNotRestarting, 0 | ||
} | ||
|
||
if r.count > r.policy.Attempts { | ||
if r.policy.Mode == structs.RestartPolicyModeFail { | ||
r.reason = fmt.Sprintf( | ||
`Exceeded allowed attempts %d in interval %v and mode is "fail"`, | ||
r.policy.Attempts, r.policy.Interval) | ||
return structs.TaskNotRestarting, 0 | ||
} else { | ||
r.reason = ReasonDelay | ||
return structs.TaskRestarting, r.getDelay() | ||
// Handle restarts due to failures | ||
if r.failure { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Invert it:
|
||
if r.startErr != nil { | ||
// If the error is not recoverable, do not restart. | ||
if !structs.IsRecoverable(r.startErr) { | ||
r.reason = ReasonUnrecoverableErrror | ||
return structs.TaskNotRestarting, 0 | ||
} | ||
} else if r.waitRes != nil { | ||
// If the task started successfully and restart on success isn't specified, | ||
// don't restart but don't mark as failed. | ||
if r.waitRes.Successful() && !r.onSuccess { | ||
r.reason = "Restart unnecessary as task terminated successfully" | ||
return structs.TaskTerminated, 0 | ||
} | ||
} | ||
} | ||
|
||
r.reason = ReasonWithinPolicy | ||
return structs.TaskRestarting, r.jitter() | ||
} | ||
|
||
// handleWaitResult returns the new state and potential wait duration for | ||
// restarting the task after it has exited. | ||
func (r *RestartTracker) handleWaitResult() (string, time.Duration) { | ||
// If the task started successfully and restart on success isn't specified, | ||
// don't restart but don't mark as failed. | ||
if r.waitRes.Successful() && !r.onSuccess { | ||
r.reason = "Restart unnecessary as task terminated successfully" | ||
return structs.TaskTerminated, 0 | ||
} | ||
|
||
if r.count > r.policy.Attempts { | ||
if r.policy.Mode == structs.RestartPolicyModeFail { | ||
r.reason = fmt.Sprintf( | ||
`Exceeded allowed attempts %d in interval %v and mode is "fail"`, | ||
r.policy.Attempts, r.policy.Interval) | ||
return structs.TaskNotRestarting, 0 | ||
} else { | ||
r.reason = ReasonDelay | ||
return structs.TaskRestarting, r.getDelay() | ||
if r.count > r.policy.Attempts { | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you add a comment explaining how you get to this case? |
||
if r.policy.Mode == structs.RestartPolicyModeFail { | ||
r.reason = fmt.Sprintf( | ||
`Exceeded allowed attempts %d in interval %v and mode is "fail"`, | ||
r.policy.Attempts, r.policy.Interval) | ||
return structs.TaskNotRestarting, 0 | ||
} else { | ||
r.reason = ReasonDelay | ||
return structs.TaskRestarting, r.getDelay() | ||
} | ||
} | ||
|
||
r.reason = ReasonWithinPolicy | ||
return structs.TaskRestarting, r.jitter() | ||
} | ||
|
||
r.reason = ReasonWithinPolicy | ||
return structs.TaskRestarting, r.jitter() | ||
return "", 0 | ||
} | ||
|
||
// getDelay returns the delay time to enter the next interval. | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -99,7 +99,7 @@ func TestClient_RestartTracker_RestartTriggered(t *testing.T) { | |
p := testPolicy(true, structs.RestartPolicyModeFail) | ||
p.Attempts = 0 | ||
rt := newRestartTracker(p, structs.JobTypeService) | ||
if state, when := rt.SetRestartTriggered().GetState(); state != structs.TaskRestarting && when != 0 { | ||
if state, when := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting && when != 0 { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Unit test the new case There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Added cde908e |
||
t.Fatalf("expect restart immediately, got %v %v", state, when) | ||
} | ||
} | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Place a comment