From 6e779b863e991819c160ee8ac96a1018330fa9c5 Mon Sep 17 00:00:00 2001 From: Alex Dadgar Date: Thu, 24 Mar 2016 15:43:55 -0700 Subject: [PATCH] Explain restart decision and display in alloc-status --- api/tasks.go | 3 ++- client/restarts.go | 32 ++++++++++++++++++++++++++++++-- client/task_runner.go | 10 ++++++++-- command/alloc_status.go | 14 ++++++++++++-- nomad/structs/structs.go | 10 +++++++++- 5 files changed, 61 insertions(+), 8 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index f357c78dbf4..ee0cc492025 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -163,7 +163,7 @@ const ( TaskTerminated = "Terminated" TaskKilled = "Killed" TaskRestarting = "Restarting" - TaskNotRestarting = "Restarts Exceeded" + TaskNotRestarting = "Not Restarting" TaskDownloadingArtifacts = "Downloading Artifacts" TaskArtifactDownloadFailed = "Failed Artifact Download" ) @@ -173,6 +173,7 @@ const ( type TaskEvent struct { Type string Time int64 + RestartReason string DriverError string ExitCode int Signal int diff --git a/client/restarts.go b/client/restarts.go index 66d94bf9295..c794c0414af 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -1,6 +1,7 @@ package client import ( + "fmt" "math/rand" "sync" "time" @@ -9,8 +10,15 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) -// jitter is the percent of jitter added to restart delays. -const jitter = 0.25 +const ( + // jitter is the percent of jitter added to restart delays. + jitter = 0.25 + + ReasonNoRestartsAllowed = "Policy allows no restarts" + ReasonUnrecoverableErrror = "Error was unrecoverable" + ReasonWithinPolicy = "Restart within policy" + ReasonDelay = "Exceeded allowed attempts, applying a delay" +) func newRestartTracker(policy *structs.RestartPolicy, jobType string) *RestartTracker { onSuccess := true @@ -31,6 +39,7 @@ type RestartTracker struct { count int // Current number of attempts. onSuccess bool // Whether to restart on successful exit code. 
startTime time.Time // When the interval began + reason string // The reason for the last state policy *structs.RestartPolicy rand *rand.Rand lock sync.Mutex @@ -60,6 +69,14 @@ func (r *RestartTracker) SetWaitResult(res *cstructs.WaitResult) *RestartTracker return r } +// GetReason returns a human-readable description for the last state returned by +// GetState. +func (r *RestartTracker) GetReason() string { + r.lock.Lock() + defer r.lock.Unlock() + return r.reason +} + // GetState returns the tasks next state given the set exit code and start // error. One of the following states are returned: // * TaskRestarting - Task should be restarted @@ -76,6 +93,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) { // Hot path if no attempts are expected if r.policy.Attempts == 0 { + r.reason = ReasonNoRestartsAllowed if r.waitRes != nil && r.waitRes.Successful() { return structs.TaskTerminated, 0 } @@ -109,13 +127,17 @@ func (r *RestartTracker) GetState() (string, time.Duration) { func (r *RestartTracker) handleStartError() (string, time.Duration) { // If the error is not recoverable, do not restart. if rerr, ok := r.startErr.(*cstructs.RecoverableError); !(ok && rerr.Recoverable) { + r.reason = ReasonUnrecoverableErrror return structs.TaskNotRestarting, 0 } if r.count > r.policy.Attempts { + r.reason = fmt.Sprintf("Exceeded allowed attempts %d in interval %v", + r.policy.Attempts, r.policy.Interval) return structs.TaskNotRestarting, 0 } + r.reason = ReasonWithinPolicy return structs.TaskRestarting, r.jitter() } @@ -125,17 +147,23 @@ func (r *RestartTracker) handleWaitResult() (string, time.Duration) { // If the task started successfully and restart on success isn't specified, // don't restart but don't mark as failed. 
if r.waitRes.Successful() && !r.onSuccess { + r.reason = "Restart unnecessary as task terminated successfully" return structs.TaskTerminated, 0 } if r.count > r.policy.Attempts { if r.policy.Mode == structs.RestartPolicyModeFail { + r.reason = fmt.Sprintf( + `Exceeded allowed attempts %d in interval %v and mode is "fail"`, + r.policy.Attempts, r.policy.Interval) return structs.TaskNotRestarting, 0 } else { + r.reason = ReasonDelay return structs.TaskRestarting, r.getDelay() } } + r.reason = ReasonWithinPolicy return structs.TaskRestarting, r.jitter() } diff --git a/client/task_runner.go b/client/task_runner.go index 6689c7903b2..7c50ba8e0f5 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -345,16 +345,22 @@ func (r *TaskRunner) run() { RESTART: state, when := r.restartTracker.GetState() r.restartTracker.SetStartError(nil).SetWaitResult(nil) + reason := r.restartTracker.GetReason() switch state { case structs.TaskNotRestarting, structs.TaskTerminated: r.logger.Printf("[INFO] client: Not restarting task: %v for alloc: %v ", r.task.Name, r.alloc.ID) if state == structs.TaskNotRestarting { - r.setState(structs.TaskStateDead, structs.NewTaskEvent(structs.TaskNotRestarting)) + r.setState(structs.TaskStateDead, + structs.NewTaskEvent(structs.TaskNotRestarting). + SetRestartReason(reason)) } return case structs.TaskRestarting: r.logger.Printf("[INFO] client: Restarting task %q for alloc %q in %v", r.task.Name, r.alloc.ID, when) - r.setState(structs.TaskStatePending, structs.NewTaskEvent(structs.TaskRestarting).SetRestartDelay(when)) + r.setState(structs.TaskStatePending, + structs.NewTaskEvent(structs.TaskRestarting). + SetRestartDelay(when).
+ SetRestartReason(reason)) default: r.logger.Printf("[ERR] client: restart tracker returned unknown state: %q", state) return diff --git a/command/alloc_status.go b/command/alloc_status.go index ead7ab45404..fdd3879d9e6 100644 --- a/command/alloc_status.go +++ b/command/alloc_status.go @@ -7,6 +7,7 @@ import ( "time" "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/client" ) type AllocStatusCommand struct { @@ -240,9 +241,18 @@ func (c *AllocStatusCommand) taskStatus(alloc *api.Allocation) { } desc = strings.Join(parts, ", ") case api.TaskRestarting: - desc = fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay)) + in := fmt.Sprintf("Task restarting in %v", time.Duration(event.StartDelay)) + if event.RestartReason != "" && event.RestartReason != client.ReasonWithinPolicy { + desc = fmt.Sprintf("%s - %s", event.RestartReason, in) + } else { + desc = in + } case api.TaskNotRestarting: - desc = "Task exceeded restart policy" + if event.RestartReason != "" { + desc = event.RestartReason + } else { + desc = "Task exceeded restart policy" + } } // Reverse order so we are sorted by time diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 0904afcdbf5..ef57eb882b4 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -1830,7 +1830,7 @@ const ( // TaskNotRestarting indicates that the task has failed and is not being // restarted because it has exceeded its restart policy. - TaskNotRestarting = "Restarts Exceeded" + TaskNotRestarting = "Not Restarting" // TaskDownloadingArtifacts means the task is downloading the artifacts // specified in the task. @@ -1847,6 +1847,9 @@ type TaskEvent struct { Type string Time int64 // Unix Nanosecond timestamp + // Restart fields. + RestartReason string + // Driver Failure fields. DriverError string // A driver error occured while starting the task. 
@@ -1924,6 +1927,11 @@ func (e *TaskEvent) SetRestartDelay(delay time.Duration) *TaskEvent { return e } +func (e *TaskEvent) SetRestartReason(reason string) *TaskEvent { + e.RestartReason = reason + return e +} + func (e *TaskEvent) SetDownloadError(err error) *TaskEvent { if err != nil { e.DownloadError = err.Error()