From a720bb5a916fec22c839fefb0b5d1030cad3182d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 24 Aug 2017 10:23:23 -0700 Subject: [PATCH 01/33] Add restart fields --- api/tasks.go | 31 +++++++++++++++++-------------- command/agent/job_endpoint.go | 29 ++++++++++++++++------------- jobspec/parse.go | 3 +++ nomad/structs/structs.go | 29 ++++++++++++++++------------- 4 files changed, 52 insertions(+), 40 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 76c1be65889..d3804850c62 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -82,20 +82,23 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { // The ServiceCheck data model represents the consul health check that // Nomad registers for a Task type ServiceCheck struct { - Id string - Name string - Type string - Command string - Args []string - Path string - Protocol string - PortLabel string `mapstructure:"port"` - Interval time.Duration - Timeout time.Duration - InitialStatus string `mapstructure:"initial_status"` - TLSSkipVerify bool `mapstructure:"tls_skip_verify"` - Header map[string][]string - Method string + Id string + Name string + Type string + Command string + Args []string + Path string + Protocol string + PortLabel string `mapstructure:"port"` + Interval time.Duration + Timeout time.Duration + InitialStatus string `mapstructure:"initial_status"` + TLSSkipVerify bool `mapstructure:"tls_skip_verify"` + Header map[string][]string + Method string + RestartAfter int + RestartGrace time.Duration + RestartWarning bool } // The Service model represents a Consul service definition diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 75a77f0fcf7..ea1cfdcc68b 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -690,19 +690,22 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { structsTask.Services[i].Checks = make([]*structs.ServiceCheck, l) for j, check := range service.Checks { structsTask.Services[i].Checks[j] = &structs.ServiceCheck{ - Name: check.Name, - Type: check.Type, - Command: check.Command, - Args: check.Args, - Path: check.Path, - Protocol: check.Protocol, - PortLabel: check.PortLabel, - Interval: check.Interval, - Timeout: check.Timeout, - InitialStatus: check.InitialStatus, - TLSSkipVerify: check.TLSSkipVerify, - Header: check.Header, - Method: check.Method, + Name: check.Name, + Type: check.Type, + Command: check.Command, + Args: check.Args, + Path: check.Path, + Protocol: check.Protocol, + PortLabel: check.PortLabel, + Interval: check.Interval, + Timeout: check.Timeout, + InitialStatus: check.InitialStatus, + TLSSkipVerify: check.TLSSkipVerify, + Header: check.Header, + Method: check.Method, + RestartAfter: check.RestartAfter, + RestartGrace: check.RestartGrace, + RestartWarning: check.RestartWarning, } } } diff --git a/jobspec/parse.go b/jobspec/parse.go index efda8f95f4e..dd3105e3f06 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -965,6 +965,9 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { "tls_skip_verify", "header", "method", + "restart_grace_period", + "restart_on_warning", + "restart_after_unhealthy", } if err := checkHCLKeys(co.Val, valid); err != nil { return multierror.Prefix(err, "check ->") diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 8175024a32e..cf53c6db271 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2775,19 +2775,22 @@ const ( // The ServiceCheck data model represents the consul health check that // Nomad registers for 
a Task type ServiceCheck struct { - Name string // Name of the check, defaults to id - Type string // Type of the check - tcp, http, docker and script - Command string // Command is the command to run for script checks - Args []string // Args is a list of argumes for script checks - Path string // path of the health check url for http type check - Protocol string // Protocol to use if check is http, defaults to http - PortLabel string // The port to use for tcp/http checks - Interval time.Duration // Interval of the check - Timeout time.Duration // Timeout of the response from the check before consul fails the check - InitialStatus string // Initial status of the check - TLSSkipVerify bool // Skip TLS verification when Protocol=https - Method string // HTTP Method to use (GET by default) - Header map[string][]string // HTTP Headers for Consul to set when making HTTP checks + Name string // Name of the check, defaults to id + Type string // Type of the check - tcp, http, docker and script + Command string // Command is the command to run for script checks + Args []string // Args is a list of argumes for script checks + Path string // path of the health check url for http type check + Protocol string // Protocol to use if check is http, defaults to http + PortLabel string // The port to use for tcp/http checks + Interval time.Duration // Interval of the check + Timeout time.Duration // Timeout of the response from the check before consul fails the check + InitialStatus string // Initial status of the check + TLSSkipVerify bool // Skip TLS verification when Protocol=https + Method string // HTTP Method to use (GET by default) + Header map[string][]string // HTTP Headers for Consul to set when making HTTP checks + RestartAfter int // Restart task after this many unhealthy intervals + RestartGrace time.Duration // Grace time to give tasks after starting to get healthy + RestartWarning bool // If true treat checks in `warning` as unhealthy } func (sc *ServiceCheck) Copy() *ServiceCheck { From bd1a342a92ee70c94a1f9d5cab715d9c348c1fc4 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 24 Aug 2017 17:18:06 -0700 Subject: [PATCH 02/33] Nest restart fields in CheckRestart --- api/tasks.go | 53 +++++++++------- command/agent/job_endpoint.go | 43 ++++++++----- nomad/structs/structs.go | 114 +++++++++++++++++++++++++++++----- 3 files changed, 154 insertions(+), 56 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index d3804850c62..01f5b96d4d4 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -79,36 +79,43 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { } } +// CheckRestart describes if and when a task should be restarted based on +// failing health checks. 
+type CheckRestart struct { + Limit int `mapstructure:"limit"` + Grace time.Duration `mapstructure:"grace_period"` + OnWarning bool `mapstructure:"on_warning"` +} + // The ServiceCheck data model represents the consul health check that // Nomad registers for a Task type ServiceCheck struct { - Id string - Name string - Type string - Command string - Args []string - Path string - Protocol string - PortLabel string `mapstructure:"port"` - Interval time.Duration - Timeout time.Duration - InitialStatus string `mapstructure:"initial_status"` - TLSSkipVerify bool `mapstructure:"tls_skip_verify"` - Header map[string][]string - Method string - RestartAfter int - RestartGrace time.Duration - RestartWarning bool + Id string + Name string + Type string + Command string + Args []string + Path string + Protocol string + PortLabel string `mapstructure:"port"` + Interval time.Duration + Timeout time.Duration + InitialStatus string `mapstructure:"initial_status"` + TLSSkipVerify bool `mapstructure:"tls_skip_verify"` + Header map[string][]string + Method string + CheckRestart *CheckRestart `mapstructure:"check_restart"` } // The Service model represents a Consul service definition type Service struct { - Id string - Name string - Tags []string - PortLabel string `mapstructure:"port"` - AddressMode string `mapstructure:"address_mode"` - Checks []ServiceCheck + Id string + Name string + Tags []string + PortLabel string `mapstructure:"port"` + AddressMode string `mapstructure:"address_mode"` + Checks []ServiceCheck + CheckRestart *CheckRestart `mapstructure:"check_restart"` } func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index ea1cfdcc68b..a3efb7d181a 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -685,27 +685,38 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { Tags: service.Tags, AddressMode: service.AddressMode, } + if service.CheckRestart != nil { + structsTask.Services[i].CheckRestart = &structs.CheckRestart{ + Limit: service.CheckRestart.Limit, + Grace: service.CheckRestart.Grace, + OnWarning: service.CheckRestart.OnWarning, + } + } if l := len(service.Checks); l != 0 { structsTask.Services[i].Checks = make([]*structs.ServiceCheck, l) for j, check := range service.Checks { structsTask.Services[i].Checks[j] = &structs.ServiceCheck{ - Name: check.Name, - Type: check.Type, - Command: check.Command, - Args: check.Args, - Path: check.Path, - Protocol: check.Protocol, - PortLabel: check.PortLabel, - Interval: check.Interval, - Timeout: check.Timeout, - InitialStatus: check.InitialStatus, - TLSSkipVerify: check.TLSSkipVerify, - Header: check.Header, - Method: check.Method, - RestartAfter: check.RestartAfter, - RestartGrace: check.RestartGrace, - RestartWarning: check.RestartWarning, + Name: check.Name, + Type: check.Type, + Command: check.Command, + Args: check.Args, + Path: check.Path, + Protocol: check.Protocol, + PortLabel: check.PortLabel, + Interval: check.Interval, + Timeout: check.Timeout, + InitialStatus: check.InitialStatus, + TLSSkipVerify: check.TLSSkipVerify, + Header: check.Header, + Method: check.Method, + } + if check.CheckRestart != nil { + structsTask.Services[i].Checks[j].CheckRestart = &structs.CheckRestart{ + Limit: check.CheckRestart.Limit, + Grace: check.CheckRestart.Grace, + OnWarning: check.CheckRestart.OnWarning, + } } } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index cf53c6db271..e637dac7594 100644 --- 
a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2757,6 +2757,70 @@ func (tg *TaskGroup) GoString() string { return fmt.Sprintf("*%#v", *tg) } +// CheckRestart describes if and when a task should be restarted based on +// failing health checks. +type CheckRestart struct { + Limit int // Restart task after this many unhealthy intervals + Grace time.Duration // Grace time to give tasks after starting to get healthy + OnWarning bool // If true treat checks in `warning` as unhealthy +} + +func (c *CheckRestart) Copy() *CheckRestart { + if c == nil { + return nil + } + + nc := new(CheckRestart) + *nc = *c + return nc +} + +// Merge non-zero values from other CheckRestart into a copy of this +// CheckRestart. Returns nil iff both are nil. +func (c *CheckRestart) Merge(o *CheckRestart) *CheckRestart { + if c == nil { + // Just return other + return o + } + + nc := c.Copy() + + if o == nil { + // Nothing to merge + return nc.Copy() + } + + if nc.Limit == 0 { + nc.Limit = o.Limit + } + + if nc.Grace == 0 { + nc.Grace = o.Grace + } + + if !nc.OnWarning { + nc.OnWarning = o.OnWarning + } + + return nc +} + +func (c *CheckRestart) Validate() error { + if c == nil { + return nil + } + + if c.Limit < 0 { + return fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit) + } + + if c.Grace < 0 { + return fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace) + } + + return nil +} + const ( ServiceCheckHTTP = "http" ServiceCheckTCP = "tcp" @@ -2775,22 +2839,20 @@ const ( // The ServiceCheck data model represents the consul health check that // Nomad registers for a Task type ServiceCheck struct { - Name string // Name of the check, defaults to id - Type string // Type of the check - tcp, http, docker and script - Command string // Command is the command to run for script checks - Args []string // Args is a list of argumes for script checks - Path string // path of the health check url for http type check - Protocol string // Protocol to use if check is http, defaults to http - PortLabel string // The port to use for tcp/http checks - Interval time.Duration // Interval of the check - Timeout time.Duration // Timeout of the response from the check before consul fails the check - InitialStatus string // Initial status of the check - TLSSkipVerify bool // Skip TLS verification when Protocol=https - Method string // HTTP Method to use (GET by default) - Header map[string][]string // HTTP Headers for Consul to set when making HTTP checks - RestartAfter int // Restart task after this many unhealthy intervals - RestartGrace time.Duration // Grace time to give tasks after starting to get healthy - RestartWarning bool // If true treat checks in `warning` as unhealthy + Name string // Name of the check, defaults to id + Type string // Type of the check - tcp, http, docker and script + Command string // Command is the command to run for script checks + Args []string // Args is a list of argumes for script checks + Path string // path of the health check url for http type check + Protocol string // Protocol to use if check is http, defaults to http + PortLabel string // The port to use for tcp/http checks + Interval time.Duration // Interval of the check + Timeout time.Duration // Timeout of the response from the check before consul fails the check + InitialStatus string // Initial status of the check + TLSSkipVerify bool // Skip TLS verification when Protocol=https + Method string // HTTP Method to use (GET by default) + Header map[string][]string // HTTP 
Headers for Consul to set when making HTTP checks + CheckRestart *CheckRestart // If and when a task should be restarted based on checks } func (sc *ServiceCheck) Copy() *ServiceCheck { @@ -2801,6 +2863,7 @@ func (sc *ServiceCheck) Copy() *ServiceCheck { *nsc = *sc nsc.Args = helper.CopySliceString(sc.Args) nsc.Header = helper.CopyMapStringSliceString(sc.Header) + nsc.CheckRestart = sc.CheckRestart.Copy() return nsc } @@ -2866,7 +2929,7 @@ func (sc *ServiceCheck) validate() error { } - return nil + return sc.CheckRestart.Validate() } // RequiresPort returns whether the service check requires the task has a port. @@ -2939,6 +3002,9 @@ type Service struct { Tags []string // List of tags for the service Checks []*ServiceCheck // List of checks associated with the service + + // CheckRestart will be propagated to Checks if set. + CheckRestart *CheckRestart } func (s *Service) Copy() *Service { @@ -2948,6 +3014,7 @@ func (s *Service) Copy() *Service { ns := new(Service) *ns = *s ns.Tags = helper.CopySliceString(ns.Tags) + ns.CheckRestart = s.CheckRestart.Copy() if s.Checks != nil { checks := make([]*ServiceCheck, len(ns.Checks)) @@ -2983,6 +3050,14 @@ func (s *Service) Canonicalize(job string, taskGroup string, task string) { for _, check := range s.Checks { check.Canonicalize(s.Name) } + + // If CheckRestart is set propagate it to checks + if s.CheckRestart != nil { + for _, check := range s.Checks { + // Merge Service CheckRestart into Check's so Check's takes precedence + check.CheckRestart = check.CheckRestart.Merge(s.CheckRestart) + } + } } // Validate checks if the Check definition is valid @@ -3016,6 +3091,11 @@ func (s *Service) Validate() error { mErr.Errors = append(mErr.Errors, fmt.Errorf("check %s invalid: %v", c.Name, err)) } } + + if s.CheckRestart != nil && len(s.Checks) == 0 { + mErr.Errors = append(mErr.Errors, fmt.Errorf("check_restart specified but no checks")) + } + return mErr.ErrorOrNil() } From 1608e59415e5e7060cadb00c5abd79f5c4ae8610 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 25 Aug 2017 22:40:18 -0700 Subject: [PATCH 03/33] Add check watcher for restarting unhealthy tasks --- client/consul.go | 4 +- client/task_runner.go | 42 ++++- command/agent/consul/check_watcher.go | 244 ++++++++++++++++++++++++++ command/agent/consul/client.go | 74 +++++++- jobspec/parse.go | 70 +++++++- nomad/structs/structs.go | 6 + 6 files changed, 426 insertions(+), 14 deletions(-) create mode 100644 command/agent/consul/check_watcher.go diff --git a/client/consul.go b/client/consul.go index 89666e41e45..02e40ef0f09 100644 --- a/client/consul.go +++ b/client/consul.go @@ -10,8 +10,8 @@ import ( // ConsulServiceAPI is the interface the Nomad Client uses to register and // remove services and checks from Consul. 
type ConsulServiceAPI interface { - RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error + RegisterTask(allocID string, task *structs.Task, restarter consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error RemoveTask(allocID string, task *structs.Task) - UpdateTask(allocID string, existing, newTask *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error + UpdateTask(allocID string, existing, newTask *structs.Task, restart consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error AllocRegistrations(allocID string) (*consul.AllocRegistration, error) } diff --git a/client/task_runner.go b/client/task_runner.go index cd5afbd9197..36326448991 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -141,6 +141,10 @@ type TaskRunner struct { // restartCh is used to restart a task restartCh chan *structs.TaskEvent + // lastStart tracks the last time this task was started or restarted + lastStart time.Time + lastStartMu sync.Mutex + // signalCh is used to send a signal to a task signalCh chan SignalEvent @@ -1362,6 +1366,11 @@ func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) { // startTask creates the driver, task dir, and starts the task. func (r *TaskRunner) startTask() error { + // Update lastStart + r.lastStartMu.Lock() + r.lastStart = time.Now() + r.lastStartMu.Unlock() + // Create a driver drv, err := r.createDriver() if err != nil { @@ -1439,7 +1448,7 @@ func (r *TaskRunner) registerServices(d driver.Driver, h driver.DriverHandle, n exec = h } interpolatedTask := interpolateServices(r.envBuilder.Build(), r.task) - return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, exec, n) + return r.consul.RegisterTask(r.alloc.ID, interpolatedTask, r, exec, n) } // interpolateServices interpolates tags in a service and checks with values from the @@ -1641,7 +1650,7 @@ func (r *TaskRunner) updateServices(d driver.Driver, h driver.ScriptExecutor, ol r.driverNetLock.Lock() net := r.driverNet.Copy() r.driverNetLock.Unlock() - return r.consul.UpdateTask(r.alloc.ID, oldInterpolatedTask, newInterpolatedTask, exec, net) + return r.consul.UpdateTask(r.alloc.ID, oldInterpolatedTask, newInterpolatedTask, r, exec, net) } // handleDestroy kills the task handle. In the case that killing fails, @@ -1671,6 +1680,16 @@ func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, // Restart will restart the task func (r *TaskRunner) Restart(source, reason string) { + r.lastStartMu.Lock() + defer r.lastStartMu.Unlock() + + r.restart(source, reason) +} + +// restart is the internal task restart method. Callers must hold lastStartMu. +func (r *TaskRunner) restart(source, reason string) { + r.lastStart = time.Now() + reasonStr := fmt.Sprintf("%s: %s", source, reason) event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr) @@ -1680,6 +1699,25 @@ func (r *TaskRunner) Restart(source, reason string) { } } +// RestartBy deadline. Restarts a task iff the last time it was started was +// before the deadline. Returns true if restart occurs; false if skipped. +func (r *TaskRunner) RestartBy(deadline time.Time, source, reason string) { + r.lastStartMu.Lock() + defer r.lastStartMu.Unlock() + + if r.lastStart.Before(deadline) { + r.restart(source, reason) + } +} + +// LastStart returns the last time this task was started (including restarts). 
+func (r *TaskRunner) LastStart() time.Time {
+    r.lastStartMu.Lock()
+    ls := r.lastStart
+    r.lastStartMu.Unlock()
+    return ls
+}
+
 // Signal will send a signal to the task
 func (r *TaskRunner) Signal(source, reason string, s os.Signal) error {

diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go
new file mode 100644
index 00000000000..46967ef5f30
--- /dev/null
+++ b/command/agent/consul/check_watcher.go
@@ -0,0 +1,244 @@
+package consul
+
+import (
+    "context"
+    "fmt"
+    "log"
+    "time"
+
+    "github.com/hashicorp/consul/api"
+    "github.com/hashicorp/nomad/nomad/structs"
+)
+
+const (
+    // defaultPollFreq is the default rate to poll the Consul Checks API
+    defaultPollFreq = 900 * time.Millisecond
+)
+
+type ConsulChecks interface {
+    Checks() (map[string]*api.AgentCheck, error)
+}
+
+type TaskRestarter interface {
+    LastStart() time.Time
+    RestartBy(deadline time.Time, source, reason string)
+}
+
+// checkRestart handles restarting a task if a check is unhealthy.
+type checkRestart struct {
+    allocID   string
+    taskName  string
+    checkID   string
+    checkName string
+
+    // remove this checkID (if true only checkID will be set)
+    remove bool
+
+    task      TaskRestarter
+    grace     time.Duration
+    interval  time.Duration
+    timeLimit time.Duration
+    warning   bool
+
+    // unhealthyStart is the time a check first went unhealthy. Set to the
+    // zero value if the check passes before timeLimit.
+    // This is the only mutable field on checkRestart.
+    unhealthyStart time.Time
+
+    logger *log.Logger
+}
+
+// update restart state for check and restart task if necessary. Current
+// timestamp is passed in so all check updates have the same view of time (and
+// to ease testing).
+func (c *checkRestart) update(now time.Time, status string) {
+    switch status {
+    case api.HealthCritical:
+    case api.HealthWarning:
+        if !c.warning {
+            // Warnings are ok, reset state and exit
+            c.unhealthyStart = time.Time{}
+            return
+        }
+    default:
+        // All other statuses are ok, reset state and exit
+        c.unhealthyStart = time.Time{}
+        return
+    }
+
+    if now.Before(c.task.LastStart().Add(c.grace)) {
+        // In grace period, reset state and exit
+        c.unhealthyStart = time.Time{}
+        return
+    }
+
+    if c.unhealthyStart.IsZero() {
+        // First failure, set restart deadline
+        c.unhealthyStart = now
+    }
+
+    // restart timeLimit after start of this check becoming unhealthy
+    restartAt := c.unhealthyStart.Add(c.timeLimit)
+
+    // Must test >= because if limit=1, restartAt == first failure
+    if now.UnixNano() >= restartAt.UnixNano() {
+        // hasn't become healthy by deadline, restart!
+        c.logger.Printf("[DEBUG] consul.health: restarting alloc %q task %q due to unhealthy check %q", c.allocID, c.taskName, c.checkName)
+        c.task.RestartBy(now, "healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName))
+    }
+}
+
+// checkWatcher watches Consul checks and restarts tasks when they're
+// unhealthy.
+type checkWatcher struct {
+    consul ConsulChecks
+
+    pollFreq time.Duration
+
+    watchCh chan *checkRestart
+
+    // done is closed when Run has exited
+    done chan struct{}
+
+    // lastErr is true if the last Consul call failed. It is used to
+    // squelch repeated error messages.
+    lastErr bool
+
+    logger *log.Logger
+}
+
+// newCheckWatcher creates a new checkWatcher but does not call its Run method.
+func newCheckWatcher(logger *log.Logger, consul ConsulChecks) *checkWatcher { + return &checkWatcher{ + consul: consul, + pollFreq: defaultPollFreq, + watchCh: make(chan *checkRestart, 8), + done: make(chan struct{}), + logger: logger, + } +} + +// Run the main Consul checks watching loop to restart tasks when their checks +// fail. Blocks until context is canceled. +func (w *checkWatcher) Run(ctx context.Context) { + defer close(w.done) + + // map of check IDs to their metadata + checks := map[string]*checkRestart{} + + // timer for check polling + checkTimer := time.NewTimer(0) + defer checkTimer.Stop() // ensure timer is never leaked + resetTimer := func(d time.Duration) { + if !checkTimer.Stop() { + <-checkTimer.C + } + checkTimer.Reset(d) + } + + // Main watch loop + for { + // Don't start watching until we actually have checks that + // trigger restarts. + for len(checks) == 0 { + select { + case c := <-w.watchCh: + if c.remove { + // should not happen + w.logger.Printf("[DEBUG] consul.health: told to stop watching an unwatched check: %q", c.checkID) + } else { + checks[c.checkID] = c + + // First check should be after grace period + resetTimer(c.grace) + } + case <-ctx.Done(): + return + } + } + + // As long as there are checks to be watched, keep watching + for len(checks) > 0 { + select { + case c := <-w.watchCh: + if c.remove { + delete(checks, c.checkID) + } else { + checks[c.checkID] = c + w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", c.allocID, c.taskName, c.checkName) + } + case <-ctx.Done(): + return + case <-checkTimer.C: + checkTimer.Reset(w.pollFreq) + + // Set "now" as the point in time the following check results represent + now := time.Now() + + results, err := w.consul.Checks() + if err != nil { + if !w.lastErr { + w.lastErr = true + w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err) + } + continue + } + + w.lastErr = false + + // Loop over watched checks and update their status from results + for cid, check := range checks { + result, ok := results[cid] + if !ok { + w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) + continue + } + + check.update(now, result.Status) + } + } + } + } +} + +// Watch a task and restart it if unhealthy. +func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) { + if !check.Watched() { + // Not watched, noop + return + } + + c := checkRestart{ + allocID: allocID, + taskName: taskName, + checkID: checkID, + checkName: check.Name, + task: restarter, + interval: check.Interval, + grace: check.CheckRestart.Grace, + timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), + warning: check.CheckRestart.OnWarning, + logger: w.logger, + } + + select { + case w.watchCh <- &c: + // sent watch + case <-w.done: + // exited; nothing to do + } +} + +// Unwatch a task. +func (w *checkWatcher) Unwatch(cid string) { + c := checkRestart{ + checkID: cid, + remove: true, + } + select { + case w.watchCh <- &c: + // sent remove watch + case <-w.done: + // exited; nothing to do + } +} diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 8285785fbde..911ba628e56 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -1,6 +1,7 @@ package consul import ( + "context" "fmt" "log" "net" @@ -223,6 +224,9 @@ type ServiceClient struct { // seen is 1 if Consul has ever been seen; otherise 0. Accessed with // atomics. 
seen int32 + + // checkWatcher restarts checks that are unhealthy. + checkWatcher *checkWatcher } // NewServiceClient creates a new Consul ServiceClient from an existing Consul API @@ -245,6 +249,7 @@ func NewServiceClient(consulClient AgentAPI, skipVerifySupport bool, logger *log allocRegistrations: make(map[string]*AllocRegistration), agentServices: make(map[string]struct{}), agentChecks: make(map[string]struct{}), + checkWatcher: newCheckWatcher(logger, consulClient), } } @@ -267,6 +272,12 @@ func (c *ServiceClient) hasSeen() bool { // be called exactly once. func (c *ServiceClient) Run() { defer close(c.exitCh) + + // start checkWatcher + ctx, cancelWatcher := context.WithCancel(context.Background()) + defer cancelWatcher() + go c.checkWatcher.Run(ctx) + retryTimer := time.NewTimer(0) <-retryTimer.C // disabled by default failures := 0 @@ -274,6 +285,7 @@ func (c *ServiceClient) Run() { select { case <-retryTimer.C: case <-c.shutdownCh: + cancelWatcher() case ops := <-c.opCh: c.merge(ops) } @@ -656,7 +668,7 @@ func (c *ServiceClient) checkRegs(ops *operations, allocID, serviceID string, se // Checks will always use the IP from the Task struct (host's IP). // // Actual communication with Consul is done asynchrously (see Run). -func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { +func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { // Fast path numServices := len(task.Services) if numServices == 0 { @@ -679,6 +691,18 @@ func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec dr c.addTaskRegistration(allocID, task.Name, t) c.commit(ops) + + // Start watching checks. Done after service registrations are built + // since an error building them could leak watches. + for _, service := range task.Services { + serviceID := makeTaskServiceID(allocID, task.Name, service) + for _, check := range service.Checks { + if check.Watched() { + checkID := makeCheckID(serviceID, check) + c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter) + } + } + } return nil } @@ -686,7 +710,7 @@ func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, exec dr // changed. // // DriverNetwork must not change between invocations for the same allocation. 
-func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { +func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Task, restarter TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { ops := &operations{} t := new(TaskRegistration) @@ -709,7 +733,13 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta // Existing service entry removed ops.deregServices = append(ops.deregServices, existingID) for _, check := range existingSvc.Checks { - ops.deregChecks = append(ops.deregChecks, makeCheckID(existingID, check)) + cid := makeCheckID(existingID, check) + ops.deregChecks = append(ops.deregChecks, cid) + + // Unwatch watched checks + if check.Watched() { + c.checkWatcher.Unwatch(cid) + } } continue } @@ -730,9 +760,9 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta } // Check to see what checks were updated - existingChecks := make(map[string]struct{}, len(existingSvc.Checks)) + existingChecks := make(map[string]*structs.ServiceCheck, len(existingSvc.Checks)) for _, check := range existingSvc.Checks { - existingChecks[makeCheckID(existingID, check)] = struct{}{} + existingChecks[makeCheckID(existingID, check)] = check } // Register new checks @@ -748,15 +778,28 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta if err != nil { return err } + for _, checkID := range newCheckIDs { sreg.checkIDs[checkID] = struct{}{} + } + + } + + // Update all watched checks as CheckRestart fields aren't part of ID + if check.Watched() { + c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) } } // Remove existing checks not in updated service - for cid := range existingChecks { + for cid, check := range existingChecks { ops.deregChecks = append(ops.deregChecks, cid) + + // Unwatch checks + if check.Watched() { + c.checkWatcher.Unwatch(cid) + } } } @@ -774,6 +817,18 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta c.addTaskRegistration(allocID, newTask.Name, t) c.commit(ops) + + // Start watching checks. Done after service registrations are built + // since an error building them could leak watches. 
+ for _, service := range newIDs { + serviceID := makeTaskServiceID(allocID, newTask.Name, service) + for _, check := range service.Checks { + if check.Watched() { + checkID := makeCheckID(serviceID, check) + c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) + } + } + } return nil } @@ -788,7 +843,12 @@ func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { ops.deregServices = append(ops.deregServices, id) for _, check := range service.Checks { - ops.deregChecks = append(ops.deregChecks, makeCheckID(id, check)) + cid := makeCheckID(id, check) + ops.deregChecks = append(ops.deregChecks, cid) + + if check.Watched() { + c.checkWatcher.Unwatch(cid) + } } } diff --git a/jobspec/parse.go b/jobspec/parse.go index dd3105e3f06..1c90963aed7 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -922,6 +922,7 @@ func parseServices(jobName string, taskGroupName string, task *api.Task, service } delete(m, "check") + delete(m, "check_restart") if err := mapstructure.WeakDecode(m, &service); err != nil { return err @@ -941,6 +942,18 @@ func parseServices(jobName string, taskGroupName string, task *api.Task, service } } + // Filter check_restart + if cro := checkList.Filter("check_restart"); len(cro.Items) > 0 { + if len(cro.Items) > 1 { + return fmt.Errorf("check_restart '%s': cannot have more than 1 check_restart", service.Name) + } + if cr, err := parseCheckRestart(cro.Items[0]); err != nil { + return multierror.Prefix(err, fmt.Sprintf("service: '%s',", service.Name)) + } else { + service.CheckRestart = cr + } + } + task.Services[idx] = &service } @@ -965,9 +978,7 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { "tls_skip_verify", "header", "method", - "restart_grace_period", - "restart_on_warning", - "restart_after_unhealthy", + "check_restart", } if err := checkHCLKeys(co.Val, valid); err != nil { return multierror.Prefix(err, "check ->") @@ -1009,6 +1020,8 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { delete(cm, "header") } + delete(cm, "check_restart") + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ DecodeHook: mapstructure.StringToTimeDurationHookFunc(), WeaklyTypedInput: true, @@ -1021,12 +1034,63 @@ func parseChecks(service *api.Service, checkObjs *ast.ObjectList) error { return err } + // Filter check_restart + var checkRestartList *ast.ObjectList + if ot, ok := co.Val.(*ast.ObjectType); ok { + checkRestartList = ot.List + } else { + return fmt.Errorf("check_restart '%s': should be an object", check.Name) + } + + if cro := checkRestartList.Filter("check_restart"); len(cro.Items) > 0 { + if len(cro.Items) > 1 { + return fmt.Errorf("check_restart '%s': cannot have more than 1 check_restart", check.Name) + } + if cr, err := parseCheckRestart(cro.Items[0]); err != nil { + return multierror.Prefix(err, fmt.Sprintf("check: '%s',", check.Name)) + } else { + check.CheckRestart = cr + } + } + service.Checks[idx] = check } return nil } +func parseCheckRestart(cro *ast.ObjectItem) (*api.CheckRestart, error) { + valid := []string{ + "limit", + "grace_period", + "on_warning", + } + + if err := checkHCLKeys(cro.Val, valid); err != nil { + return nil, multierror.Prefix(err, "check_restart ->") + } + + var checkRestart api.CheckRestart + var crm map[string]interface{} + if err := hcl.DecodeObject(&crm, cro.Val); err != nil { + return nil, err + } + + dec, err := mapstructure.NewDecoder(&mapstructure.DecoderConfig{ + DecodeHook: mapstructure.StringToTimeDurationHookFunc(), + WeaklyTypedInput: 
true, + Result: &checkRestart, + }) + if err != nil { + return nil, err + } + if err := dec.Decode(crm); err != nil { + return nil, err + } + + return &checkRestart, nil +} + func parseResources(result *api.Resources, list *ast.ObjectList) error { list = list.Elem() if len(list.Items) == 0 { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index e637dac7594..a598d0b1496 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2942,6 +2942,12 @@ func (sc *ServiceCheck) RequiresPort() bool { } } +// Watched returns true if this check should be watched and trigger a restart +// on failure. +func (sc *ServiceCheck) Watched() bool { + return sc.CheckRestart != nil && sc.CheckRestart.Limit > 0 +} + // Hash all ServiceCheck fields and the check's corresponding service ID to // create an identifier. The identifier is not guaranteed to be unique as if // the PortLabel is blank, the Service's PortLabel will be used after Hash is From ebbf87f979444d9aca747f6f74a2f275c491b7ae Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 16:25:13 -0700 Subject: [PATCH 04/33] Use existing restart policy infrastructure --- client/alloc_runner.go | 3 +- client/consul_template.go | 5 +- client/restarts.go | 32 ++++++++++ client/task_runner.go | 87 +++++++++++++++------------ command/agent/consul/check_watcher.go | 65 ++++++++++++-------- 5 files changed, 125 insertions(+), 67 deletions(-) diff --git a/client/alloc_runner.go b/client/alloc_runner.go index d6486734e54..c4725c6e4fb 100644 --- a/client/alloc_runner.go +++ b/client/alloc_runner.go @@ -336,7 +336,8 @@ func (r *AllocRunner) RestoreState() error { // Restart task runner if RestoreState gave a reason if restartReason != "" { r.logger.Printf("[INFO] client: restarting alloc %s task %s: %v", r.allocID, name, restartReason) - tr.Restart("upgrade", restartReason) + const failure = false + tr.Restart("upgrade", restartReason, failure) } } else { tr.Destroy(taskDestroyEvent) diff --git a/client/consul_template.go b/client/consul_template.go index 2e1b1ac60e1..2f7a629357f 100644 --- a/client/consul_template.go +++ b/client/consul_template.go @@ -49,7 +49,7 @@ var ( // TaskHooks is an interface which provides hooks into the tasks life-cycle type TaskHooks interface { // Restart is used to restart the task - Restart(source, reason string) + Restart(source, reason string, failure bool) // Signal is used to signal the task Signal(source, reason string, s os.Signal) error @@ -439,7 +439,8 @@ func (tm *TaskTemplateManager) handleTemplateRerenders(allRenderedTime time.Time } if restart { - tm.config.Hooks.Restart(consulTemplateSourceName, "template with change_mode restart re-rendered") + const failure = false + tm.config.Hooks.Restart(consulTemplateSourceName, "template with change_mode restart re-rendered", failure) } else if len(signals) != 0 { var mErr multierror.Error for signal := range signals { diff --git a/client/restarts.go b/client/restarts.go index b6e49e31c70..93f2a8fd0cc 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -37,6 +37,7 @@ type RestartTracker struct { waitRes *dstructs.WaitResult startErr error restartTriggered bool // Whether the task has been signalled to be restarted + failure bool // Whether a failure triggered the restart count int // Current number of attempts. onSuccess bool // Whether to restart on successful exit code. 
startTime time.Time // When the interval began @@ -79,6 +80,15 @@ func (r *RestartTracker) SetRestartTriggered() *RestartTracker { return r } +// SetFailure is used to mark that a task should be restarted due to failure +// such as a failed Consul healthcheck. +func (r *RestartTracker) SetFailure() *RestartTracker { + r.lock.Lock() + defer r.lock.Unlock() + r.failure = true + return r +} + // GetReason returns a human-readable description for the last state returned by // GetState. func (r *RestartTracker) GetReason() string { @@ -106,6 +116,7 @@ func (r *RestartTracker) GetState() (string, time.Duration) { r.startErr = nil r.waitRes = nil r.restartTriggered = false + r.failure = false }() // Hot path if a restart was triggered @@ -138,6 +149,8 @@ func (r *RestartTracker) GetState() (string, time.Duration) { return r.handleStartError() } else if r.waitRes != nil { return r.handleWaitResult() + } else if r.failure { + return r.handleFailure() } return "", 0 @@ -196,6 +209,25 @@ func (r *RestartTracker) handleWaitResult() (string, time.Duration) { return structs.TaskRestarting, r.jitter() } +// handleFailure returns the new state and potential wait duration for +// restarting the task due to a failure like an unhealthy Consul check. +func (r *RestartTracker) handleFailure() (string, time.Duration) { + if r.count > r.policy.Attempts { + if r.policy.Mode == structs.RestartPolicyModeFail { + r.reason = fmt.Sprintf( + `Exceeded allowed attempts %d in interval %v and mode is "fail"`, + r.policy.Attempts, r.policy.Interval) + return structs.TaskNotRestarting, 0 + } else { + r.reason = ReasonDelay + return structs.TaskRestarting, r.getDelay() + } + } + + r.reason = ReasonWithinPolicy + return structs.TaskRestarting, r.jitter() +} + // getDelay returns the delay time to enter the next interval. func (r *RestartTracker) getDelay() time.Duration { end := r.startTime.Add(r.policy.Interval) diff --git a/client/task_runner.go b/client/task_runner.go index 36326448991..e31371026b9 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -65,13 +65,29 @@ var ( taskRunnerStateAllKey = []byte("simple-all") ) +// taskRestartEvent wraps a TaskEvent with additional metadata to control +// restart behavior. +type taskRestartEvent struct { + // taskEvent to report + taskEvent *structs.TaskEvent + + // if false, don't count against restart count + failure bool +} + +func newTaskRestartEvent(source, reason string, failure bool) *taskRestartEvent { + return &taskRestartEvent{ + taskEvent: structs.NewTaskEvent(source).SetRestartReason(reason), + failure: failure, + } +} + // TaskRunner is used to wrap a task within an allocation and provide the execution context. type TaskRunner struct { stateDB *bolt.DB config *config.Config updater TaskStateUpdater logger *log.Logger - alloc *structs.Allocation restartTracker *RestartTracker consul ConsulServiceAPI @@ -82,9 +98,14 @@ type TaskRunner struct { resourceUsage *cstructs.TaskResourceUsage resourceUsageLock sync.RWMutex + alloc *structs.Allocation task *structs.Task taskDir *allocdir.TaskDir + // Synchronizes access to alloc and task since the main Run loop may + // update them concurrent with reads in exported methods. 
+ allocLock sync.Mutex + // envBuilder is used to build the task's environment envBuilder *env.Builder @@ -139,7 +160,7 @@ type TaskRunner struct { unblockLock sync.Mutex // restartCh is used to restart a task - restartCh chan *structs.TaskEvent + restartCh chan *taskRestartEvent // lastStart tracks the last time this task was started or restarted lastStart time.Time @@ -251,7 +272,7 @@ func NewTaskRunner(logger *log.Logger, config *config.Config, waitCh: make(chan struct{}), startCh: make(chan struct{}, 1), unblockCh: make(chan struct{}), - restartCh: make(chan *structs.TaskEvent), + restartCh: make(chan *taskRestartEvent), signalCh: make(chan SignalEvent), } @@ -776,7 +797,8 @@ OUTER: return } case structs.VaultChangeModeRestart: - r.Restart("vault", "new Vault token acquired") + const failure = false + r.Restart("vault", "new Vault token acquired", failure) case structs.VaultChangeModeNoop: fallthrough default: @@ -1141,7 +1163,7 @@ func (r *TaskRunner) run() { res := r.handle.Signal(se.s) se.result <- res - case event := <-r.restartCh: + case restartEvent := <-r.restartCh: r.runningLock.Lock() running := r.running r.runningLock.Unlock() @@ -1151,8 +1173,8 @@ func (r *TaskRunner) run() { continue } - r.logger.Printf("[DEBUG] client: restarting %s: %v", common, event.RestartReason) - r.setState(structs.TaskStateRunning, event, false) + r.logger.Printf("[DEBUG] client: restarting %s: %v", common, restartEvent.taskEvent.RestartReason) + r.setState(structs.TaskStateRunning, restartEvent.taskEvent, false) r.killTask(nil) close(stopCollection) @@ -1161,9 +1183,13 @@ func (r *TaskRunner) run() { <-handleWaitCh } - // Since the restart isn't from a failure, restart immediately - // and don't count against the restart policy - r.restartTracker.SetRestartTriggered() + if restartEvent.failure { + r.restartTracker.SetFailure() + } else { + // Since the restart isn't from a failure, restart immediately + // and don't count against the restart policy + r.restartTracker.SetRestartTriggered() + } break WAIT case <-r.destroyCh: @@ -1593,6 +1619,7 @@ func (r *TaskRunner) handleUpdate(update *structs.Allocation) error { for _, t := range tg.Tasks { if t.Name == r.task.Name { updatedTask = t.Copy() + break } } if updatedTask == nil { @@ -1678,20 +1705,10 @@ func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, return } -// Restart will restart the task -func (r *TaskRunner) Restart(source, reason string) { - r.lastStartMu.Lock() - defer r.lastStartMu.Unlock() - - r.restart(source, reason) -} - -// restart is the internal task restart method. Callers must hold lastStartMu. -func (r *TaskRunner) restart(source, reason string) { - r.lastStart = time.Now() - +// Restart will restart the task. +func (r *TaskRunner) Restart(source, reason string, failure bool) { reasonStr := fmt.Sprintf("%s: %s", source, reason) - event := structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reasonStr) + event := newTaskRestartEvent(source, reasonStr, failure) select { case r.restartCh <- event: @@ -1699,23 +1716,10 @@ func (r *TaskRunner) restart(source, reason string) { } } -// RestartBy deadline. Restarts a task iff the last time it was started was -// before the deadline. Returns true if restart occurs; false if skipped. 
-func (r *TaskRunner) RestartBy(deadline time.Time, source, reason string) { - r.lastStartMu.Lock() - defer r.lastStartMu.Unlock() - - if r.lastStart.Before(deadline) { - r.restart(source, reason) - } -} - -// LastStart returns the last time this task was started (including restarts). -func (r *TaskRunner) LastStart() time.Time { - r.lastStartMu.Lock() - ls := r.lastStart - r.lastStartMu.Unlock() - return ls +// RestartDelay returns the value of the delay for this task's restart policy +// for use by the healtcheck watcher. +func (r *TaskRunner) RestartDelay() time.Duration { + return r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup).RestartPolicy.Delay } // Signal will send a signal to the task @@ -1742,6 +1746,9 @@ func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { // Kill will kill a task and store the error, no longer restarting the task. If // fail is set, the task is marked as having failed. func (r *TaskRunner) Kill(source, reason string, fail bool) { + r.allocLock.Lock() + defer r.allocLock.Unlock() + reasonStr := fmt.Sprintf("%s: %s", source, reason) event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr) if fail { diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 46967ef5f30..feefbd52094 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -20,8 +20,8 @@ type ConsulChecks interface { } type TaskRestarter interface { - LastStart() time.Time - RestartBy(deadline time.Time, source, reason string) + RestartDelay() time.Duration + Restart(source, reason string, failure bool) } // checkRestart handles restarting a task if a check is unhealthy. @@ -34,17 +34,23 @@ type checkRestart struct { // remove this checkID (if true only checkID will be set) remove bool - task TaskRestarter - grace time.Duration - interval time.Duration - timeLimit time.Duration - warning bool + task TaskRestarter + restartDelay time.Duration + grace time.Duration + interval time.Duration + timeLimit time.Duration + warning bool + + // Mutable fields // unhealthyStart is the time a check first went unhealthy. Set to the // zero value if the check passes before timeLimit. - // This is the only mutable field on checkRestart. unhealthyStart time.Time + // graceUntil is when the check's grace period expires and unhealthy + // checks should be counted. + graceUntil time.Time + logger *log.Logger } @@ -66,9 +72,8 @@ func (c *checkRestart) update(now time.Time, status string) { return } - if now.Before(c.task.LastStart().Add(c.grace)) { - // In grace period, reset state and exit - c.unhealthyStart = time.Time{} + if now.Before(c.graceUntil) { + // In grace period exit return } @@ -81,10 +86,17 @@ func (c *checkRestart) update(now time.Time, status string) { restartAt := c.unhealthyStart.Add(c.timeLimit) // Must test >= because if limit=1, restartAt == first failure - if now.UnixNano() >= restartAt.UnixNano() { + if now.Equal(restartAt) || now.After(restartAt) { // hasn't become healthy by deadline, restart! 
c.logger.Printf("[DEBUG] consul.health: restarting alloc %q task %q due to unhealthy check %q", c.allocID, c.taskName, c.checkName) - c.task.RestartBy(now, "healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName)) + + // Tell TaskRunner to restart due to failure + const failure = true + c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure) + + // Reset grace time to grace + restart.delay + (restart.delay * 25%) (the max jitter) + c.graceUntil = now.Add(c.grace + c.restartDelay + time.Duration(float64(c.restartDelay)*0.25)) + c.unhealthyStart = time.Time{} } } @@ -190,7 +202,10 @@ func (w *checkWatcher) Run(ctx context.Context) { for cid, check := range checks { result, ok := results[cid] if !ok { - w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) + // Only warn if outside grace period to avoid races with check registration + if now.After(check.graceUntil) { + w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) + } continue } @@ -209,16 +224,18 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S } c := checkRestart{ - allocID: allocID, - taskName: taskName, - checkID: checkID, - checkName: check.Name, - task: restarter, - interval: check.Interval, - grace: check.CheckRestart.Grace, - timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), - warning: check.CheckRestart.OnWarning, - logger: w.logger, + allocID: allocID, + taskName: taskName, + checkID: checkID, + checkName: check.Name, + task: restarter, + restartDelay: restarter.RestartDelay(), + interval: check.Interval, + grace: check.CheckRestart.Grace, + graceUntil: time.Now().Add(check.CheckRestart.Grace), + timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), + warning: check.CheckRestart.OnWarning, + logger: w.logger, } select { From 555d1e24dc42e3b26563a76157936b36b65943b8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 17:00:25 -0700 Subject: [PATCH 05/33] on_warning=false -> ignore_warnings=false Treat warnings as unhealthy by default --- api/tasks.go | 6 ++-- command/agent/consul/check_watcher.go | 42 ++++++++++++++------------- command/agent/job_endpoint.go | 12 ++++---- jobspec/parse.go | 2 +- nomad/structs/structs.go | 10 +++---- 5 files changed, 37 insertions(+), 35 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 01f5b96d4d4..3e0bc40af72 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -82,9 +82,9 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { // CheckRestart describes if and when a task should be restarted based on // failing health checks. 
type CheckRestart struct { - Limit int `mapstructure:"limit"` - Grace time.Duration `mapstructure:"grace_period"` - OnWarning bool `mapstructure:"on_warning"` + Limit int `mapstructure:"limit"` + Grace time.Duration `mapstructure:"grace_period"` + IgnoreWarnings bool `mapstructure:"ignore_warnings"` } // The ServiceCheck data model represents the consul health check that diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index feefbd52094..6261c7d8015 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -34,12 +34,12 @@ type checkRestart struct { // remove this checkID (if true only checkID will be set) remove bool - task TaskRestarter - restartDelay time.Duration - grace time.Duration - interval time.Duration - timeLimit time.Duration - warning bool + task TaskRestarter + restartDelay time.Duration + grace time.Duration + interval time.Duration + timeLimit time.Duration + ignoreWarnings bool // Mutable fields @@ -61,8 +61,8 @@ func (c *checkRestart) update(now time.Time, status string) { switch status { case api.HealthCritical: case api.HealthWarning: - if !c.warning { - // Warnings are ok, reset state and exit + if c.ignoreWarnings { + // Warnings are ignored, reset state and exit c.unhealthyStart = time.Time{} return } @@ -79,6 +79,8 @@ func (c *checkRestart) update(now time.Time, status string) { if c.unhealthyStart.IsZero() { // First failure, set restart deadline + c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. Restarting in %s if not healthy", + c.allocID, c.taskName, c.checkName, c.timeLimit) c.unhealthyStart = now } @@ -224,18 +226,18 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S } c := checkRestart{ - allocID: allocID, - taskName: taskName, - checkID: checkID, - checkName: check.Name, - task: restarter, - restartDelay: restarter.RestartDelay(), - interval: check.Interval, - grace: check.CheckRestart.Grace, - graceUntil: time.Now().Add(check.CheckRestart.Grace), - timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), - warning: check.CheckRestart.OnWarning, - logger: w.logger, + allocID: allocID, + taskName: taskName, + checkID: checkID, + checkName: check.Name, + task: restarter, + restartDelay: restarter.RestartDelay(), + interval: check.Interval, + grace: check.CheckRestart.Grace, + graceUntil: time.Now().Add(check.CheckRestart.Grace), + timeLimit: check.Interval * time.Duration(check.CheckRestart.Limit-1), + ignoreWarnings: check.CheckRestart.IgnoreWarnings, + logger: w.logger, } select { diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index a3efb7d181a..8bfc8a1895a 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -687,9 +687,9 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { } if service.CheckRestart != nil { structsTask.Services[i].CheckRestart = &structs.CheckRestart{ - Limit: service.CheckRestart.Limit, - Grace: service.CheckRestart.Grace, - OnWarning: service.CheckRestart.OnWarning, + Limit: service.CheckRestart.Limit, + Grace: service.CheckRestart.Grace, + IgnoreWarnings: service.CheckRestart.IgnoreWarnings, } } @@ -713,9 +713,9 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { } if check.CheckRestart != nil { structsTask.Services[i].Checks[j].CheckRestart = &structs.CheckRestart{ - Limit: check.CheckRestart.Limit, - Grace: check.CheckRestart.Grace, - OnWarning: 
check.CheckRestart.OnWarning, + Limit: check.CheckRestart.Limit, + Grace: check.CheckRestart.Grace, + IgnoreWarnings: check.CheckRestart.IgnoreWarnings, } } } diff --git a/jobspec/parse.go b/jobspec/parse.go index 1c90963aed7..61e5b9968b3 100644 --- a/jobspec/parse.go +++ b/jobspec/parse.go @@ -1063,7 +1063,7 @@ func parseCheckRestart(cro *ast.ObjectItem) (*api.CheckRestart, error) { valid := []string{ "limit", "grace_period", - "on_warning", + "ignore_warnings", } if err := checkHCLKeys(cro.Val, valid); err != nil { diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index a598d0b1496..91c3a2b76e6 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2760,9 +2760,9 @@ func (tg *TaskGroup) GoString() string { // CheckRestart describes if and when a task should be restarted based on // failing health checks. type CheckRestart struct { - Limit int // Restart task after this many unhealthy intervals - Grace time.Duration // Grace time to give tasks after starting to get healthy - OnWarning bool // If true treat checks in `warning` as unhealthy + Limit int // Restart task after this many unhealthy intervals + Grace time.Duration // Grace time to give tasks after starting to get healthy + IgnoreWarnings bool // If true treat checks in `warning` as passing } func (c *CheckRestart) Copy() *CheckRestart { @@ -2798,8 +2798,8 @@ func (c *CheckRestart) Merge(o *CheckRestart) *CheckRestart { nc.Grace = o.Grace } - if !nc.OnWarning { - nc.OnWarning = o.OnWarning + if nc.IgnoreWarnings { + nc.IgnoreWarnings = o.IgnoreWarnings } return nc From c2d895d47a36e8a606c320124b2c117db845c943 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 17:22:03 -0700 Subject: [PATCH 06/33] Add comments and move delay calc to TaskRunner --- client/task_runner.go | 7 ++++--- command/agent/consul/check_watcher.go | 17 ++++++++++++----- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index e31371026b9..cd7f40b3908 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -1716,10 +1716,11 @@ func (r *TaskRunner) Restart(source, reason string, failure bool) { } } -// RestartDelay returns the value of the delay for this task's restart policy -// for use by the healtcheck watcher. +// RestartDelay returns the *max* value of the delay for this task's restart +// policy for use by the healtcheck watcher. func (r *TaskRunner) RestartDelay() time.Duration { - return r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup).RestartPolicy.Delay + delay := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup).RestartPolicy.Delay + return delay + time.Duration(float64(delay)*jitter) } // Signal will send a signal to the task diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 6261c7d8015..af19bb01984 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -15,10 +15,14 @@ const ( defaultPollFreq = 900 * time.Millisecond ) -type ConsulChecks interface { +// ChecksAPI is the part of the Consul API the checkWatcher requires. +type ChecksAPI interface { + // Checks returns a list of all checks. Checks() (map[string]*api.AgentCheck, error) } +// TaskRestarter allows the checkWatcher to restart tasks and how long the +// grace period should be afterward. 
type TaskRestarter interface { RestartDelay() time.Duration Restart(source, reason string, failure bool) @@ -96,8 +100,8 @@ func (c *checkRestart) update(now time.Time, status string) { const failure = true c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure) - // Reset grace time to grace + restart.delay + (restart.delay * 25%) (the max jitter) - c.graceUntil = now.Add(c.grace + c.restartDelay + time.Duration(float64(c.restartDelay)*0.25)) + // Reset grace time to grace + restart.delay + c.graceUntil = now.Add(c.grace + c.restartDelay) c.unhealthyStart = time.Time{} } } @@ -105,10 +109,13 @@ func (c *checkRestart) update(now time.Time, status string) { // checkWatcher watches Consul checks and restarts tasks when they're // unhealthy. type checkWatcher struct { - consul ConsulChecks + consul ChecksAPI + // pollFreq is how often to poll the checks API and defaults to + // defaultPollFreq pollFreq time.Duration + // watchCh is how watches (and removals) are sent to the main watching loop watchCh chan *checkRestart // done is closed when Run has exited @@ -122,7 +129,7 @@ type checkWatcher struct { } // newCheckWatcher creates a new checkWatcher but does not call its Run method. -func newCheckWatcher(logger *log.Logger, consul ConsulChecks) *checkWatcher { +func newCheckWatcher(logger *log.Logger, consul ChecksAPI) *checkWatcher { return &checkWatcher{ consul: consul, pollFreq: defaultPollFreq, From 78c72f872536f1d9767490158297f6dfe6ac331c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 17:31:55 -0700 Subject: [PATCH 07/33] Default grace period to 1s --- api/tasks.go | 18 +++++++++++++++--- command/agent/job_endpoint.go | 4 ++-- 2 files changed, 17 insertions(+), 5 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 3e0bc40af72..291971b719f 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -82,9 +82,9 @@ func (r *RestartPolicy) Merge(rp *RestartPolicy) { // CheckRestart describes if and when a task should be restarted based on // failing health checks. 
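Patch 06 above moves the jitter arithmetic out of the check watcher and into `TaskRunner.RestartDelay`, so the watcher can simply add the returned delay to the grace period after a restart. Below is a minimal standalone sketch of the resulting timing, assuming the client's `jitter` constant is 0.25 (the factor used by the removed watcher code and by the documentation example added later in this series); it is an illustration only, not the TaskRunner implementation.

```go
package main

import (
	"fmt"
	"time"
)

// jitter is assumed to match the client's restart jitter factor (0.25).
const jitter = 0.25

// maxRestartDelay mirrors what RestartDelay now returns: the restart
// policy's delay plus the largest jitter that could be added to it.
func maxRestartDelay(policyDelay time.Duration) time.Duration {
	return policyDelay + time.Duration(float64(policyDelay)*jitter)
}

func main() {
	grace := 90 * time.Second // check_restart grace period
	delay := 8 * time.Second  // restart policy delay

	// After a check-triggered restart the watcher waits
	// grace + delay + (delay * jitter) before counting failures again.
	fmt.Println(grace + maxRestartDelay(delay)) // 1m40s
}
```

This reproduces the 100s figure the `service.html.md` example below arrives at via `grace + delay + (delay * 0.25)`.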
type CheckRestart struct { - Limit int `mapstructure:"limit"` - Grace time.Duration `mapstructure:"grace_period"` - IgnoreWarnings bool `mapstructure:"ignore_warnings"` + Limit int `mapstructure:"limit"` + Grace *time.Duration `mapstructure:"grace_period"` + IgnoreWarnings bool `mapstructure:"ignore_warnings"` } // The ServiceCheck data model represents the consul health check that @@ -107,6 +107,14 @@ type ServiceCheck struct { CheckRestart *CheckRestart `mapstructure:"check_restart"` } +func (c *ServiceCheck) Canonicalize() { + if c.CheckRestart != nil { + if c.CheckRestart.Grace == nil { + c.CheckRestart.Grace = helper.TimeToPtr(1 * time.Second) + } + } +} + // The Service model represents a Consul service definition type Service struct { Id string @@ -127,6 +135,10 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { if s.AddressMode == "" { s.AddressMode = "auto" } + + for _, c := range s.Checks { + c.Canonicalize() + } } // EphemeralDisk is an ephemeral disk object diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 8bfc8a1895a..7c7b0089b76 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -688,7 +688,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { if service.CheckRestart != nil { structsTask.Services[i].CheckRestart = &structs.CheckRestart{ Limit: service.CheckRestart.Limit, - Grace: service.CheckRestart.Grace, + Grace: *service.CheckRestart.Grace, IgnoreWarnings: service.CheckRestart.IgnoreWarnings, } } @@ -714,7 +714,7 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { if check.CheckRestart != nil { structsTask.Services[i].Checks[j].CheckRestart = &structs.CheckRestart{ Limit: check.CheckRestart.Limit, - Grace: check.CheckRestart.Grace, + Grace: *check.CheckRestart.Grace, IgnoreWarnings: check.CheckRestart.IgnoreWarnings, } } From 7e103f69cb55b93b7c7913cd6f2e7ba16295afe9 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 21:38:31 -0700 Subject: [PATCH 08/33] Document new check_restart stanza --- website/source/api/json-jobs.html.md | 21 +++++- .../docs/job-specification/service.html.md | 74 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 8851df1d5cd..8e52d3ea43e 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -66,7 +66,12 @@ Below is the JSON representation of the job outputed by `$ nomad init`: "Interval": 10000000000, "Timeout": 2000000000, "InitialStatus": "", - "TLSSkipVerify": false + "TLSSkipVerify": false, + "CheckRestart": { + "Limit": 3, + "Grace": "30s", + "IgnoreWarnings": false + } }] }], "Resources": { @@ -377,6 +382,20 @@ The `Task` object supports the following keys: - `TLSSkipVerify`: If true, Consul will not attempt to verify the certificate when performing HTTPS checks. Requires Consul >= 0.7.2. + - `CheckRestart`: `CheckRestart` is an object which enables + restarting of tasks based upon Consul health checks. + + - `Limit`: The number of unhealthy checks allowed before the + service is restarted. Defaults to `0` which disables + health-based restarts. + + - `Grace`: The duration to wait after a task starts or restarts + before counting unhealthy checks count against the limit. + Defaults to "1s". + + - `IgnoreWarnings`: Treat checks that are warning as passing. + Defaults to false which means warnings are considered unhealthy. 
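For callers of the Go `api` package, the same object is built with the `api.CheckRestart` struct changed above (note that `Grace` is now a `*time.Duration`). The following is a hedged sketch of constructing the check from the JSON sample programmatically; the check name and type are invented for illustration and the remaining service fields are omitted.

```go
package main

import (
	"time"

	"github.com/hashicorp/nomad/api"
)

func exampleCheck() *api.ServiceCheck {
	grace := 30 * time.Second

	return &api.ServiceCheck{
		Name:     "alive", // illustrative name
		Type:     "tcp",
		Interval: 10 * time.Second,
		Timeout:  2 * time.Second,
		// Equivalent of the CheckRestart JSON object above: restart the
		// task after 3 consecutive unhealthy checks, ignore failures for
		// 30s after a (re)start, and treat warnings as unhealthy.
		CheckRestart: &api.CheckRestart{
			Limit:          3,
			Grace:          &grace,
			IgnoreWarnings: false,
		},
	}
}

func main() { _ = exampleCheck() }
```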
+ - `ShutdownDelay` - Specifies the duration to wait when killing a task between removing it from Consul and sending it a shutdown signal. Ideally services would fail healthchecks once they receive a shutdown signal. Alternatively diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index cef747b0dd6..2cee91b9d0e 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -47,6 +47,12 @@ job "docs" { args = ["--verbose"] interval = "60s" timeout = "5s" + + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } } } } @@ -162,6 +168,72 @@ scripts. - `tls_skip_verify` `(bool: false)` - Skip verifying TLS certificates for HTTPS checks. Requires Consul >= 0.7.2. +#### `check_restart` Stanza + +As of Nomad 0.7 `check` stanzas may include a `check_restart` stanza to restart +tasks with unhealthy checks. Restarts use the parameters from the +[`restart`][restart_stanza] stanza, so if a task group has the default `15s` +delay, tasks won't be restarted for an extra 15 seconds after the +`check_restart` block considers it failed. `check_restart` stanzas have the +follow parameters: + +- `limit` `(int: 0)` - Restart task after `limit` failing health checks. For + example 1 causes a restart on the first failure. The default, `0`, disables + healtcheck based restarts. Failures must be consecutive. A single passing + check will reset the count, so flapping services may not be restarted. + +- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts + before checking its health. On restarts the `delay` and max jitter is added + to the grace period to prevent checking a task's health before it has + restarted. + +- `ignore_warnings` `(bool: false)` - By default checks with both `critical` + and `warning` statuses are considered unhealthy. Setting `ignore_warnings = + true` treats a `warning` status like `passing` and will not trigger a restart. + +For example: + +```hcl +restart { + delay = "8s" +} + +task "mysqld" { + service { + # ... + check { + type = "script" + name = "check_table" + command = "/usr/local/bin/check_mysql_table_status" + args = ["--verbose"] + interval = "20s" + timeout = "5s" + + check_restart { + # Restart the task after 3 consecutive failed checks (180s) + limit = 3 + + # Ignore failed checks for 90s after a service starts or restarts + grace = "90s" + + # Treat warnings as unhealthy (the default) + ignore_warnings = false + } + } + } +} +``` + +In this example the `mysqld` task has `90s` from startup to begin passing +healthchecks. After the grace period if `mysqld` would remain unhealthy for +`60s` (as determined by `limit * interval`) it would be restarted after `8s` +(as determined by the `restart.delay`). Nomad would then wait `100s` (as +determined by `grace + delay + (delay * 0.25)`) before checking `mysqld`'s +health again. + +~> `check_restart` stanzas may also be placed in `service` stanzas to apply the + same restart logic to multiple checks. + #### `header` Stanza HTTP checks may include a `header` stanza to set HTTP headers. The `header` @@ -170,6 +242,7 @@ the header to be set multiple times, once for each value. ```hcl service { + # ... check { type = "http" port = "lb" @@ -319,3 +392,4 @@ system of a task for that driver. 
[interpolation]: /docs/runtime/interpolation.html "Nomad Runtime Interpolation" [network]: /docs/job-specification/network.html "Nomad network Job Specification" [qemu]: /docs/drivers/qemu.html "Nomad qemu Driver" +[restart_stanza]: /docs/job-specification/restart.html "restart stanza" From 850d991286838fa41def8d1f0c4805cf920d3e18 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 21:40:06 -0700 Subject: [PATCH 09/33] Add changelog entry for #3105 --- CHANGELOG.md | 1 + 1 file changed, 1 insertion(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 2e9996940fc..34ab35e5c8d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,7 @@ IMPROVEMENTS: * api: Metrics endpoint exposes Prometheus formatted metrics [GH-3171] + * discovery: Allow restarting unhealthy tasks with `check_restart` [GH-3105] * telemetry: Add support for tagged metrics for Nomad clients [GH-3147] BUG FIXES: From 3db835cb8fc34cd2c3cee3b245d516a3e260377d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Tue, 12 Sep 2017 23:15:46 -0700 Subject: [PATCH 10/33] Improve check watcher logging and add tests Also expose a mock Consul Agent to allow testing ServiceClient and checkWatcher from TaskRunner without actually talking to a real Consul. --- client/consul_template_test.go | 2 +- client/consul_test.go | 4 +- client/task_runner.go | 6 +- client/task_runner_test.go | 143 ++++++++++-- client/task_runner_unix_test.go | 2 +- command/agent/consul/catalog_testing.go | 118 ++++++++++ command/agent/consul/check_watcher.go | 32 ++- command/agent/consul/check_watcher_test.go | 252 +++++++++++++++++++++ command/agent/consul/unit_test.go | 200 ++++++---------- nomad/structs/structs.go | 2 +- 10 files changed, 595 insertions(+), 166 deletions(-) create mode 100644 command/agent/consul/check_watcher_test.go diff --git a/client/consul_template_test.go b/client/consul_template_test.go index 88ee17b0e50..f8368520407 100644 --- a/client/consul_template_test.go +++ b/client/consul_template_test.go @@ -57,7 +57,7 @@ func NewMockTaskHooks() *MockTaskHooks { EmitEventCh: make(chan struct{}, 1), } } -func (m *MockTaskHooks) Restart(source, reason string) { +func (m *MockTaskHooks) Restart(source, reason string, failure bool) { m.Restarts++ select { case m.RestartCh <- struct{}{}: diff --git a/client/consul_test.go b/client/consul_test.go index 10d1ebe10d0..8703cdd215a 100644 --- a/client/consul_test.go +++ b/client/consul_test.go @@ -60,7 +60,7 @@ func newMockConsulServiceClient() *mockConsulServiceClient { return &m } -func (m *mockConsulServiceClient) UpdateTask(allocID string, old, new *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { +func (m *mockConsulServiceClient) UpdateTask(allocID string, old, new *structs.Task, restarter consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { m.mu.Lock() defer m.mu.Unlock() m.logger.Printf("[TEST] mock_consul: UpdateTask(%q, %v, %v, %T, %x)", allocID, old, new, exec, net.Hash()) @@ -68,7 +68,7 @@ func (m *mockConsulServiceClient) UpdateTask(allocID string, old, new *structs.T return nil } -func (m *mockConsulServiceClient) RegisterTask(allocID string, task *structs.Task, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { +func (m *mockConsulServiceClient) RegisterTask(allocID string, task *structs.Task, restarter consul.TaskRestarter, exec driver.ScriptExecutor, net *cstructs.DriverNetwork) error { m.mu.Lock() defer m.mu.Unlock() m.logger.Printf("[TEST] mock_consul: RegisterTask(%q, %q, %T, %x)", allocID, 
task.Name, exec, net.Hash()) diff --git a/client/task_runner.go b/client/task_runner.go index cd7f40b3908..cc1bf5aa61e 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -75,9 +75,9 @@ type taskRestartEvent struct { failure bool } -func newTaskRestartEvent(source, reason string, failure bool) *taskRestartEvent { +func newTaskRestartEvent(reason string, failure bool) *taskRestartEvent { return &taskRestartEvent{ - taskEvent: structs.NewTaskEvent(source).SetRestartReason(reason), + taskEvent: structs.NewTaskEvent(structs.TaskRestartSignal).SetRestartReason(reason), failure: failure, } } @@ -1708,7 +1708,7 @@ func (r *TaskRunner) handleDestroy(handle driver.DriverHandle) (destroyed bool, // Restart will restart the task. func (r *TaskRunner) Restart(source, reason string, failure bool) { reasonStr := fmt.Sprintf("%s: %s", source, reason) - event := newTaskRestartEvent(source, reasonStr, failure) + event := newTaskRestartEvent(reasonStr, failure) select { case r.restartCh <- event: diff --git a/client/task_runner_test.go b/client/task_runner_test.go index 6894115e337..f532e77df45 100644 --- a/client/task_runner_test.go +++ b/client/task_runner_test.go @@ -21,6 +21,7 @@ import ( "github.com/hashicorp/nomad/client/driver/env" cstructs "github.com/hashicorp/nomad/client/structs" "github.com/hashicorp/nomad/client/vaultclient" + "github.com/hashicorp/nomad/command/agent/consul" "github.com/hashicorp/nomad/nomad/mock" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/nomad/testutil" @@ -56,10 +57,21 @@ func (m *MockTaskStateUpdater) Update(name, state string, event *structs.TaskEve } } +// String for debugging purposes. +func (m *MockTaskStateUpdater) String() string { + s := fmt.Sprintf("Updates:\n state=%q\n failed=%t\n events=\n", m.state, m.failed) + for _, e := range m.events { + s += fmt.Sprintf(" %#v\n", e) + } + return s +} + type taskRunnerTestCtx struct { upd *MockTaskStateUpdater tr *TaskRunner allocDir *allocdir.AllocDir + vault *vaultclient.MockVaultClient + consul *mockConsulServiceClient } // Cleanup calls Destroy on the task runner and alloc dir @@ -130,7 +142,13 @@ func testTaskRunnerFromAlloc(t *testing.T, restarts bool, alloc *structs.Allocat if !restarts { tr.restartTracker = noRestartsTracker() } - return &taskRunnerTestCtx{upd, tr, allocDir} + return &taskRunnerTestCtx{ + upd: upd, + tr: tr, + allocDir: allocDir, + vault: vclient, + consul: cclient, + } } // testWaitForTaskToStart waits for the task to or fails the test @@ -657,7 +675,7 @@ func TestTaskRunner_RestartTask(t *testing.T) { // Wait for it to start go func() { testWaitForTaskToStart(t, ctx) - ctx.tr.Restart("test", "restart") + ctx.tr.Restart("test", "restart", false) // Wait for it to restart then kill go func() { @@ -1251,8 +1269,7 @@ func TestTaskRunner_Template_NewVaultToken(t *testing.T) { }) // Error the token renewal - vc := ctx.tr.vaultClient.(*vaultclient.MockVaultClient) - renewalCh, ok := vc.RenewTokens[token] + renewalCh, ok := ctx.vault.RenewTokens[token] if !ok { t.Fatalf("no renewal channel") } @@ -1279,13 +1296,12 @@ func TestTaskRunner_Template_NewVaultToken(t *testing.T) { }) // Check the token was revoked - m := ctx.tr.vaultClient.(*vaultclient.MockVaultClient) testutil.WaitForResult(func() (bool, error) { - if len(m.StoppedTokens) != 1 { - return false, fmt.Errorf("Expected a stopped token: %v", m.StoppedTokens) + if len(ctx.vault.StoppedTokens) != 1 { + return false, fmt.Errorf("Expected a stopped token: %v", 
ctx.vault.StoppedTokens) } - if a := m.StoppedTokens[0]; a != token { + if a := ctx.vault.StoppedTokens[0]; a != token { return false, fmt.Errorf("got stopped token %q; want %q", a, token) } return true, nil @@ -1317,8 +1333,7 @@ func TestTaskRunner_VaultManager_Restart(t *testing.T) { testWaitForTaskToStart(t, ctx) // Error the token renewal - vc := ctx.tr.vaultClient.(*vaultclient.MockVaultClient) - renewalCh, ok := vc.RenewTokens[ctx.tr.vaultFuture.Get()] + renewalCh, ok := ctx.vault.RenewTokens[ctx.tr.vaultFuture.Get()] if !ok { t.Fatalf("no renewal channel") } @@ -1394,8 +1409,7 @@ func TestTaskRunner_VaultManager_Signal(t *testing.T) { testWaitForTaskToStart(t, ctx) // Error the token renewal - vc := ctx.tr.vaultClient.(*vaultclient.MockVaultClient) - renewalCh, ok := vc.RenewTokens[ctx.tr.vaultFuture.Get()] + renewalCh, ok := ctx.vault.RenewTokens[ctx.tr.vaultFuture.Get()] if !ok { t.Fatalf("no renewal channel") } @@ -1726,20 +1740,19 @@ func TestTaskRunner_ShutdownDelay(t *testing.T) { // Service should get removed quickly; loop until RemoveTask is called found := false - mockConsul := ctx.tr.consul.(*mockConsulServiceClient) deadline := destroyed.Add(task.ShutdownDelay) for time.Now().Before(deadline) { time.Sleep(5 * time.Millisecond) - mockConsul.mu.Lock() - n := len(mockConsul.ops) + ctx.consul.mu.Lock() + n := len(ctx.consul.ops) if n < 2 { - mockConsul.mu.Unlock() + ctx.consul.mu.Unlock() continue } - lastOp := mockConsul.ops[n-1].op - mockConsul.mu.Unlock() + lastOp := ctx.consul.ops[n-1].op + ctx.consul.mu.Unlock() if lastOp == "remove" { found = true @@ -1762,3 +1775,97 @@ func TestTaskRunner_ShutdownDelay(t *testing.T) { t.Fatalf("task exited before shutdown delay") } } + +// TestTaskRunner_CheckWatcher_Restart asserts that when enabled an unhealthy +// Consul check will cause a task to restart following restart policy rules. +func TestTaskRunner_CheckWatcher_Restart(t *testing.T) { + t.Parallel() + + alloc := mock.Alloc() + + // Make the restart policy fail within this test + tg := alloc.Job.TaskGroups[0] + tg.RestartPolicy.Attempts = 2 + tg.RestartPolicy.Interval = 1 * time.Minute + tg.RestartPolicy.Delay = 10 * time.Millisecond + tg.RestartPolicy.Mode = structs.RestartPolicyModeFail + + task := tg.Tasks[0] + task.Driver = "mock_driver" + task.Config = map[string]interface{}{ + "exit_code": "0", + "run_for": "100s", + } + + // Make the task register a check that fails + task.Services[0].Checks[0] = &structs.ServiceCheck{ + Name: "test-restarts", + Type: structs.ServiceCheckTCP, + Interval: 50 * time.Millisecond, + CheckRestart: &structs.CheckRestart{ + Limit: 2, + Grace: 100 * time.Millisecond, + }, + } + + ctx := testTaskRunnerFromAlloc(t, true, alloc) + + // Replace mock Consul ServiceClient, with the real ServiceClient + // backed by a mock consul whose checks are always unhealthy. 
+ consulAgent := consul.NewMockAgent() + consulAgent.SetStatus("critical") + consulClient := consul.NewServiceClient(consulAgent, true, ctx.tr.logger) + go consulClient.Run() + defer consulClient.Shutdown() + + ctx.tr.consul = consulClient + ctx.consul = nil // prevent accidental use of old mock + + ctx.tr.MarkReceived() + go ctx.tr.Run() + defer ctx.Cleanup() + + select { + case <-ctx.tr.WaitCh(): + case <-time.After(time.Duration(testutil.TestMultiplier()*15) * time.Second): + t.Fatalf("timeout") + } + + expected := []string{ + "Received", + "Task Setup", + "Started", + "Restart Signaled", + "Killing", + "Killed", + "Restarting", + "Started", + "Restart Signaled", + "Killing", + "Killed", + "Restarting", + "Started", + "Restart Signaled", + "Killing", + "Killed", + "Not Restarting", + } + + if n := len(ctx.upd.events); n != len(expected) { + t.Fatalf("should have %d ctx.updates found %d: %s", len(expected), n, ctx.upd) + } + + if ctx.upd.state != structs.TaskStateDead { + t.Fatalf("TaskState %v; want %v", ctx.upd.state, structs.TaskStateDead) + } + + if !ctx.upd.failed { + t.Fatalf("expected failed") + } + + for i, actual := range ctx.upd.events { + if actual.Type != expected[i] { + t.Errorf("%.2d - Expected %q but found %q", i, expected[i], actual.Type) + } + } +} diff --git a/client/task_runner_unix_test.go b/client/task_runner_unix_test.go index bed7c956d79..b7c2aa4412b 100644 --- a/client/task_runner_unix_test.go +++ b/client/task_runner_unix_test.go @@ -53,7 +53,7 @@ func TestTaskRunner_RestartSignalTask_NotRunning(t *testing.T) { } // Send a restart - ctx.tr.Restart("test", "don't panic") + ctx.tr.Restart("test", "don't panic", false) if len(ctx.upd.events) != 2 { t.Fatalf("should have 2 ctx.updates: %#v", ctx.upd.events) diff --git a/command/agent/consul/catalog_testing.go b/command/agent/consul/catalog_testing.go index f0dd0326ce0..6b28940f114 100644 --- a/command/agent/consul/catalog_testing.go +++ b/command/agent/consul/catalog_testing.go @@ -1,7 +1,9 @@ package consul import ( + "fmt" "log" + "sync" "github.com/hashicorp/consul/api" ) @@ -25,3 +27,119 @@ func (m *MockCatalog) Service(service, tag string, q *api.QueryOptions) ([]*api. m.logger.Printf("[DEBUG] mock_consul: Service(%q, %q, %#v) -> (nil, nil, nil)", service, tag, q) return nil, nil, nil } + +// MockAgent is a fake in-memory Consul backend for ServiceClient. +type MockAgent struct { + // maps of what services and checks have been registered + services map[string]*api.AgentServiceRegistration + checks map[string]*api.AgentCheckRegistration + mu sync.Mutex + + // when UpdateTTL is called the check ID will have its counter inc'd + checkTTLs map[string]int + + // What check status to return from Checks() + checkStatus string +} + +// NewMockAgent that returns all checks as passing. +func NewMockAgent() *MockAgent { + return &MockAgent{ + services: make(map[string]*api.AgentServiceRegistration), + checks: make(map[string]*api.AgentCheckRegistration), + checkTTLs: make(map[string]int), + checkStatus: api.HealthPassing, + } +} + +// SetStatus that Checks() should return. Returns old status value. 
+func (c *MockAgent) SetStatus(s string) string { + c.mu.Lock() + old := c.checkStatus + c.checkStatus = s + c.mu.Unlock() + return old +} + +func (c *MockAgent) Services() (map[string]*api.AgentService, error) { + c.mu.Lock() + defer c.mu.Unlock() + + r := make(map[string]*api.AgentService, len(c.services)) + for k, v := range c.services { + r[k] = &api.AgentService{ + ID: v.ID, + Service: v.Name, + Tags: make([]string, len(v.Tags)), + Port: v.Port, + Address: v.Address, + EnableTagOverride: v.EnableTagOverride, + } + copy(r[k].Tags, v.Tags) + } + return r, nil +} + +func (c *MockAgent) Checks() (map[string]*api.AgentCheck, error) { + c.mu.Lock() + defer c.mu.Unlock() + + r := make(map[string]*api.AgentCheck, len(c.checks)) + for k, v := range c.checks { + r[k] = &api.AgentCheck{ + CheckID: v.ID, + Name: v.Name, + Status: c.checkStatus, + Notes: v.Notes, + ServiceID: v.ServiceID, + ServiceName: c.services[v.ServiceID].Name, + } + } + return r, nil +} + +func (c *MockAgent) CheckRegister(check *api.AgentCheckRegistration) error { + c.mu.Lock() + defer c.mu.Unlock() + c.checks[check.ID] = check + + // Be nice and make checks reachable-by-service + scheck := check.AgentServiceCheck + c.services[check.ServiceID].Checks = append(c.services[check.ServiceID].Checks, &scheck) + return nil +} + +func (c *MockAgent) CheckDeregister(checkID string) error { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.checks, checkID) + delete(c.checkTTLs, checkID) + return nil +} + +func (c *MockAgent) ServiceRegister(service *api.AgentServiceRegistration) error { + c.mu.Lock() + defer c.mu.Unlock() + c.services[service.ID] = service + return nil +} + +func (c *MockAgent) ServiceDeregister(serviceID string) error { + c.mu.Lock() + defer c.mu.Unlock() + delete(c.services, serviceID) + return nil +} + +func (c *MockAgent) UpdateTTL(id string, output string, status string) error { + c.mu.Lock() + defer c.mu.Unlock() + check, ok := c.checks[id] + if !ok { + return fmt.Errorf("unknown check id: %q", id) + } + // Flip initial status to passing + check.Status = "passing" + c.checkTTLs[id]++ + return nil +} diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index af19bb01984..ee28c87fb37 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -62,17 +62,25 @@ type checkRestart struct { // timestamp is passed in so all check updates have the same view of time (and // to ease testing). func (c *checkRestart) update(now time.Time, status string) { + healthy := func() { + if !c.unhealthyStart.IsZero() { + c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart", + c.allocID, c.taskName, c.checkName) + c.unhealthyStart = time.Time{} + } + return + } switch status { case api.HealthCritical: case api.HealthWarning: if c.ignoreWarnings { // Warnings are ignored, reset state and exit - c.unhealthyStart = time.Time{} + healthy() return } default: // All other statuses are ok, reset state and exit - c.unhealthyStart = time.Time{} + healthy() return } @@ -83,8 +91,10 @@ func (c *checkRestart) update(now time.Time, status string) { if c.unhealthyStart.IsZero() { // First failure, set restart deadline - c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. Restarting in %s if not healthy", - c.allocID, c.taskName, c.checkName, c.timeLimit) + if c.timeLimit != 0 { + c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. 
Restarting in %s if not healthy", + c.allocID, c.taskName, c.checkName, c.timeLimit) + } c.unhealthyStart = now } @@ -150,12 +160,6 @@ func (w *checkWatcher) Run(ctx context.Context) { // timer for check polling checkTimer := time.NewTimer(0) defer checkTimer.Stop() // ensure timer is never leaked - resetTimer := func(d time.Duration) { - if !checkTimer.Stop() { - <-checkTimer.C - } - checkTimer.Reset(d) - } // Main watch loop for { @@ -169,9 +173,13 @@ func (w *checkWatcher) Run(ctx context.Context) { w.logger.Printf("[DEBUG] consul.health: told to stop watching an unwatched check: %q", c.checkID) } else { checks[c.checkID] = c + w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", c.allocID, c.taskName, c.checkName) - // First check should be after grace period - resetTimer(c.grace) + // Begin polling + if !checkTimer.Stop() { + <-checkTimer.C + } + checkTimer.Reset(w.pollFreq) } case <-ctx.Done(): return diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go new file mode 100644 index 00000000000..7e4947f51b9 --- /dev/null +++ b/command/agent/consul/check_watcher_test.go @@ -0,0 +1,252 @@ +package consul + +import ( + "context" + "testing" + "time" + + "github.com/hashicorp/consul/api" + "github.com/hashicorp/nomad/nomad/structs" +) + +// checkResponse is a response returned by the fakeChecksAPI after the given +// time. +type checkResponse struct { + at time.Time + id string + status string +} + +// fakeChecksAPI implements the Checks() method for testing Consul. +type fakeChecksAPI struct { + // responses is a map of check ids to their status at a particular + // time. checkResponses must be in chronological order. + responses map[string][]checkResponse +} + +func newFakeChecksAPI() *fakeChecksAPI { + return &fakeChecksAPI{responses: make(map[string][]checkResponse)} +} + +// add a new check status to Consul at the given time. +func (c *fakeChecksAPI) add(id, status string, at time.Time) { + c.responses[id] = append(c.responses[id], checkResponse{at, id, status}) +} + +func (c *fakeChecksAPI) Checks() (map[string]*api.AgentCheck, error) { + now := time.Now() + result := make(map[string]*api.AgentCheck, len(c.responses)) + + // Use the latest response for each check + for k, vs := range c.responses { + for _, v := range vs { + if v.at.After(now) { + break + } + result[k] = &api.AgentCheck{ + CheckID: k, + Name: k, + Status: v.status, + } + } + } + + return result, nil +} + +// testWatcherSetup sets up a fakeChecksAPI and a real checkWatcher with a test +// logger and faster poll frequency. +func testWatcherSetup() (*fakeChecksAPI, *checkWatcher) { + fakeAPI := newFakeChecksAPI() + cw := newCheckWatcher(testLogger(), fakeAPI) + cw.pollFreq = 10 * time.Millisecond + return fakeAPI, cw +} + +func testCheck() *structs.ServiceCheck { + return &structs.ServiceCheck{ + Name: "testcheck", + Interval: 100 * time.Millisecond, + Timeout: 100 * time.Millisecond, + CheckRestart: &structs.CheckRestart{ + Limit: 3, + Grace: 100 * time.Millisecond, + IgnoreWarnings: false, + }, + } +} + +// TestCheckWatcher_Skip asserts unwatched checks are ignored. 
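The deadline the watcher enforces comes straight from the `timeLimit` computed in `Watch`: `interval * (limit - 1)` past the first failing poll, so roughly `limit` consecutive failures are needed before a restart. A throwaway sketch of that arithmetic follows; it is not part of the patch and the names are made up.

```go
package main

import (
	"fmt"
	"time"
)

// restartDeadline sketches the watcher's deadline: the first failing poll
// starts the clock, and interval*(limit-1) later (i.e. after roughly
// `limit` consecutive failures) the task is restarted.
func restartDeadline(firstFailure time.Time, interval time.Duration, limit int) time.Time {
	timeLimit := interval * time.Duration(limit-1)
	return firstFailure.Add(timeLimit)
}

func main() {
	first := time.Date(2017, 9, 13, 10, 0, 0, 0, time.UTC)
	// limit=3, interval=20s: restart about 40s after the first failure.
	fmt.Println(restartDeadline(first, 20*time.Second, 3)) // 2017-09-13 10:00:40 +0000 UTC
}
```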
+func TestCheckWatcher_Skip(t *testing.T) { + t.Parallel() + + // Create a check with restarting disabled + check := testCheck() + check.CheckRestart = nil + + cw := newCheckWatcher(testLogger(), newFakeChecksAPI()) + restarter1 := newFakeCheckRestarter() + cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) + + // Check should have been dropped as it's not watched + if n := len(cw.watchCh); n != 0 { + t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n) + } +} + +// TestCheckWatcher_Healthy asserts healthy tasks are not restarted. +func TestCheckWatcher_Healthy(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + check1 := testCheck() + restarter1 := newFakeCheckRestarter() + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + check2 := testCheck() + check2.CheckRestart.Limit = 1 + check2.CheckRestart.Grace = 0 + restarter2 := newFakeCheckRestarter() + cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) + + // Make both checks healthy from the beginning + fakeAPI.add("testcheck1", "passing", time.Time{}) + fakeAPI.add("testcheck2", "passing", time.Time{}) + + // Run for 1 second + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + cw.Run(ctx) + + // Assert Restart was never called + if n := len(restarter1.restarts); n > 0 { + t.Errorf("expected check 1 to not be restarted but found %d", n) + } + if n := len(restarter2.restarts); n > 0 { + t.Errorf("expected check 2 to not be restarted but found %d", n) + } +} + +// TestCheckWatcher_Unhealthy asserts unhealthy tasks are not restarted. +func TestCheckWatcher_Unhealthy(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + check1 := testCheck() + restarter1 := newFakeCheckRestarter() + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + check2 := testCheck() + check2.CheckRestart.Limit = 1 + check2.CheckRestart.Grace = 0 + restarter2 := newFakeCheckRestarter() + restarter2.restartDelay = 600 * time.Millisecond + cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) + + // Check 1 always passes, check 2 always fails + fakeAPI.add("testcheck1", "passing", time.Time{}) + fakeAPI.add("testcheck2", "critical", time.Time{}) + + // Run for 1 second + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + cw.Run(ctx) + + // Ensure restart was never called on check 1 + if n := len(restarter1.restarts); n > 0 { + t.Errorf("expected check 1 to not be restarted but found %d", n) + } + + // Ensure restart was called twice on check 2 + if n := len(restarter2.restarts); n != 2 { + t.Errorf("expected check 2 to be restarted 2 times but found %d:\n%s", n, restarter2) + } +} + +// TestCheckWatcher_HealthyWarning asserts checks in warning with +// ignore_warnings=true do not restart tasks. 
+func TestCheckWatcher_HealthyWarning(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + check1 := testCheck() + check1.CheckRestart.Limit = 1 + check1.CheckRestart.Grace = 0 + check1.CheckRestart.IgnoreWarnings = true + restarter1 := newFakeCheckRestarter() + restarter1.restartDelay = 1100 * time.Millisecond + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + // Check is always in warning but that's ok + fakeAPI.add("testcheck1", "warning", time.Time{}) + + // Run for 1 second + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + cw.Run(ctx) + + // Ensure restart was never called on check 1 + if n := len(restarter1.restarts); n > 0 { + t.Errorf("expected check 1 to not be restarted but found %d", n) + } +} + +// TestCheckWatcher_Flapping asserts checks that flap from healthy to unhealthy +// before the unhealthy limit is reached do not restart tasks. +func TestCheckWatcher_Flapping(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + check1 := testCheck() + check1.CheckRestart.Grace = 0 + restarter1 := newFakeCheckRestarter() + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + // Check flaps and is never failing for the full 200ms needed to restart + now := time.Now() + fakeAPI.add("testcheck1", "passing", now) + fakeAPI.add("testcheck1", "critical", now.Add(100*time.Millisecond)) + fakeAPI.add("testcheck1", "passing", now.Add(250*time.Millisecond)) + fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond)) + fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond)) + + ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + defer cancel() + cw.Run(ctx) + + // Ensure restart was never called on check 1 + if n := len(restarter1.restarts); n > 0 { + t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) + } +} + +// TestCheckWatcher_Unwatch asserts unwatching checks prevents restarts. +func TestCheckWatcher_Unwatch(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + // Unwatch immediately + check1 := testCheck() + check1.CheckRestart.Limit = 1 + check1.CheckRestart.Grace = 100 * time.Millisecond + restarter1 := newFakeCheckRestarter() + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + cw.Unwatch("testcheck1") + + // Always failing + fakeAPI.add("testcheck1", "critical", time.Time{}) + + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + cw.Run(ctx) + + // Ensure restart was never called on check 1 + if n := len(restarter1.restarts); n > 0 { + t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) + } +} diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 2a83d2989e9..3e2837c75a7 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -8,7 +8,6 @@ import ( "os" "reflect" "strings" - "sync" "testing" "time" @@ -54,12 +53,62 @@ func testTask() *structs.Task { } } +// checkRestartRecord is used by a testFakeCtx to record when restarts occur +// due to a watched check. 
+type checkRestartRecord struct { + timestamp time.Time + source string + reason string + failure bool +} + +// fakeCheckRestarter is a test implementation of +type fakeCheckRestarter struct { + + // restartDelay is returned by RestartDelay to control the behavior of + // the checkWatcher + restartDelay time.Duration + + // restarts is a slice of all of the restarts triggered by the checkWatcher + restarts []checkRestartRecord +} + +func newFakeCheckRestarter() *fakeCheckRestarter { + return &fakeCheckRestarter{} +} + +// RestartDelay implements part of the TaskRestarter interface needed for check +// watching and is normally fulfilled by a task runner. +// +// The return value is determined by the restartDelay field. +func (c *fakeCheckRestarter) RestartDelay() time.Duration { + return c.restartDelay +} + +// Restart implements part of the TaskRestarter interface needed for check +// watching and is normally fulfilled by a TaskRunner. +// +// Restarts are recorded in the []restarts field. +func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { + c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure}) +} + +// String for debugging +func (c *fakeCheckRestarter) String() string { + s := "" + for _, r := range c.restarts { + s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) + } + return s +} + // testFakeCtx contains a fake Consul AgentAPI and implements the Exec // interface to allow testing without running Consul. type testFakeCtx struct { ServiceClient *ServiceClient - FakeConsul *fakeConsul + FakeConsul *MockAgent Task *structs.Task + Restarter *fakeCheckRestarter // Ticked whenever a script is called execs chan int @@ -99,126 +148,21 @@ func (t *testFakeCtx) syncOnce() error { // setupFake creates a testFakeCtx with a ServiceClient backed by a fakeConsul. // A test Task is also provided. func setupFake() *testFakeCtx { - fc := newFakeConsul() + fc := NewMockAgent() return &testFakeCtx{ ServiceClient: NewServiceClient(fc, true, testLogger()), FakeConsul: fc, Task: testTask(), + Restarter: newFakeCheckRestarter(), execs: make(chan int, 100), } } -// fakeConsul is a fake in-memory Consul backend for ServiceClient. 
-type fakeConsul struct { - // maps of what services and checks have been registered - services map[string]*api.AgentServiceRegistration - checks map[string]*api.AgentCheckRegistration - mu sync.Mutex - - // when UpdateTTL is called the check ID will have its counter inc'd - checkTTLs map[string]int - - // What check status to return from Checks() - checkStatus string -} - -func newFakeConsul() *fakeConsul { - return &fakeConsul{ - services: make(map[string]*api.AgentServiceRegistration), - checks: make(map[string]*api.AgentCheckRegistration), - checkTTLs: make(map[string]int), - checkStatus: api.HealthPassing, - } -} - -func (c *fakeConsul) Services() (map[string]*api.AgentService, error) { - c.mu.Lock() - defer c.mu.Unlock() - - r := make(map[string]*api.AgentService, len(c.services)) - for k, v := range c.services { - r[k] = &api.AgentService{ - ID: v.ID, - Service: v.Name, - Tags: make([]string, len(v.Tags)), - Port: v.Port, - Address: v.Address, - EnableTagOverride: v.EnableTagOverride, - } - copy(r[k].Tags, v.Tags) - } - return r, nil -} - -func (c *fakeConsul) Checks() (map[string]*api.AgentCheck, error) { - c.mu.Lock() - defer c.mu.Unlock() - - r := make(map[string]*api.AgentCheck, len(c.checks)) - for k, v := range c.checks { - r[k] = &api.AgentCheck{ - CheckID: v.ID, - Name: v.Name, - Status: c.checkStatus, - Notes: v.Notes, - ServiceID: v.ServiceID, - ServiceName: c.services[v.ServiceID].Name, - } - } - return r, nil -} - -func (c *fakeConsul) CheckRegister(check *api.AgentCheckRegistration) error { - c.mu.Lock() - defer c.mu.Unlock() - c.checks[check.ID] = check - - // Be nice and make checks reachable-by-service - scheck := check.AgentServiceCheck - c.services[check.ServiceID].Checks = append(c.services[check.ServiceID].Checks, &scheck) - return nil -} - -func (c *fakeConsul) CheckDeregister(checkID string) error { - c.mu.Lock() - defer c.mu.Unlock() - delete(c.checks, checkID) - delete(c.checkTTLs, checkID) - return nil -} - -func (c *fakeConsul) ServiceRegister(service *api.AgentServiceRegistration) error { - c.mu.Lock() - defer c.mu.Unlock() - c.services[service.ID] = service - return nil -} - -func (c *fakeConsul) ServiceDeregister(serviceID string) error { - c.mu.Lock() - defer c.mu.Unlock() - delete(c.services, serviceID) - return nil -} - -func (c *fakeConsul) UpdateTTL(id string, output string, status string) error { - c.mu.Lock() - defer c.mu.Unlock() - check, ok := c.checks[id] - if !ok { - return fmt.Errorf("unknown check id: %q", id) - } - // Flip initial status to passing - check.Status = "passing" - c.checkTTLs[id]++ - return nil -} - func TestConsul_ChangeTags(t *testing.T) { ctx := setupFake() allocID := "allocid" - if err := ctx.ServiceClient.RegisterTask(allocID, ctx.Task, nil, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask(allocID, ctx.Task, ctx.Restarter, nil, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -260,7 +204,7 @@ func TestConsul_ChangeTags(t *testing.T) { origTask := ctx.Task ctx.Task = testTask() ctx.Task.Services[0].Tags[0] = "newtag" - if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, nil); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, nil, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } if err := ctx.syncOnce(); err != nil { @@ -342,7 +286,7 @@ func TestConsul_ChangePorts(t *testing.T) { }, } - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, nil); err != nil { + if err := 
ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -430,7 +374,7 @@ func TestConsul_ChangePorts(t *testing.T) { // Removed PortLabel; should default to service's (y) }, } - if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } if err := ctx.syncOnce(); err != nil { @@ -509,7 +453,7 @@ func TestConsul_ChangeChecks(t *testing.T) { } allocID := "allocid" - if err := ctx.ServiceClient.RegisterTask(allocID, ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask(allocID, ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -576,7 +520,7 @@ func TestConsul_ChangeChecks(t *testing.T) { PortLabel: "x", }, } - if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } if err := ctx.syncOnce(); err != nil { @@ -650,7 +594,7 @@ func TestConsul_ChangeChecks(t *testing.T) { func TestConsul_RegServices(t *testing.T) { ctx := setupFake() - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, nil, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -676,7 +620,7 @@ func TestConsul_RegServices(t *testing.T) { // Make a change which will register a new service ctx.Task.Services[0].Name = "taskname-service2" ctx.Task.Services[0].Tags[0] = "tag3" - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, nil, nil); err != nil { t.Fatalf("unpexpected error registering task: %v", err) } @@ -750,7 +694,7 @@ func TestConsul_ShutdownOK(t *testing.T) { go ctx.ServiceClient.Run() // Register a task and agent - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -823,7 +767,7 @@ func TestConsul_ShutdownSlow(t *testing.T) { go ctx.ServiceClient.Run() // Register a task and agent - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -894,7 +838,7 @@ func TestConsul_ShutdownBlocked(t *testing.T) { go ctx.ServiceClient.Run() // Register a task and agent - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -951,7 +895,7 @@ func TestConsul_NoTLSSkipVerifySupport(t *testing.T) { }, } - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, nil, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, nil, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -991,7 +935,7 
@@ func TestConsul_CancelScript(t *testing.T) { }, } - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -1028,7 +972,7 @@ func TestConsul_CancelScript(t *testing.T) { }, } - if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, ctx, nil); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, ctx.Restarter, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -1115,7 +1059,7 @@ func TestConsul_DriverNetwork_AutoUse(t *testing.T) { AutoAdvertise: true, } - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, net); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, net); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -1218,7 +1162,7 @@ func TestConsul_DriverNetwork_NoAutoUse(t *testing.T) { AutoAdvertise: false, } - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, net); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, net); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -1304,7 +1248,7 @@ func TestConsul_DriverNetwork_Change(t *testing.T) { } // Initial service should advertise host port x - if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx, net); err != nil { + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, ctx, net); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -1314,7 +1258,7 @@ func TestConsul_DriverNetwork_Change(t *testing.T) { orig := ctx.Task.Copy() ctx.Task.Services[0].AddressMode = structs.AddressModeHost - if err := ctx.ServiceClient.UpdateTask("allocid", orig, ctx.Task, ctx, net); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", orig, ctx.Task, ctx.Restarter, ctx, net); err != nil { t.Fatalf("unexpected error updating task: %v", err) } @@ -1324,7 +1268,7 @@ func TestConsul_DriverNetwork_Change(t *testing.T) { orig = ctx.Task.Copy() ctx.Task.Services[0].AddressMode = structs.AddressModeDriver - if err := ctx.ServiceClient.UpdateTask("allocid", orig, ctx.Task, ctx, net); err != nil { + if err := ctx.ServiceClient.UpdateTask("allocid", orig, ctx.Task, ctx.Restarter, ctx, net); err != nil { t.Fatalf("unexpected error updating task: %v", err) } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 91c3a2b76e6..5335a71056c 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -3842,7 +3842,7 @@ type TaskEvent struct { } func (te *TaskEvent) GoString() string { - return fmt.Sprintf("%v at %v", te.Type, te.Time) + return fmt.Sprintf("%v - %v", te.Time, te.Type) } // SetMessage sets the message of TaskEvent From 526528c7c9629d267dec20242066c15270a0aedd Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 09:51:34 -0700 Subject: [PATCH 11/33] Removed partially implemented allocLock --- client/task_runner.go | 7 ------- 1 file changed, 7 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index cc1bf5aa61e..57953d1bc94 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -102,10 +102,6 @@ type TaskRunner struct { task *structs.Task taskDir *allocdir.TaskDir - // Synchronizes access to alloc and task since the main Run loop may - // update them concurrent with reads in 
exported methods. - allocLock sync.Mutex - // envBuilder is used to build the task's environment envBuilder *env.Builder @@ -1747,9 +1743,6 @@ func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { // Kill will kill a task and store the error, no longer restarting the task. If // fail is set, the task is marked as having failed. func (r *TaskRunner) Kill(source, reason string, fail bool) { - r.allocLock.Lock() - defer r.allocLock.Unlock() - reasonStr := fmt.Sprintf("%s: %s", source, reason) event := structs.NewTaskEvent(structs.TaskKilling).SetKillReason(reasonStr) if fail { From 568b963270a71e97a848de632bd6071b571f5420 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 09:52:20 -0700 Subject: [PATCH 12/33] Remove unused lastStart field --- client/task_runner.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index 57953d1bc94..383106a0252 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -158,10 +158,6 @@ type TaskRunner struct { // restartCh is used to restart a task restartCh chan *taskRestartEvent - // lastStart tracks the last time this task was started or restarted - lastStart time.Time - lastStartMu sync.Mutex - // signalCh is used to send a signal to a task signalCh chan SignalEvent @@ -1388,11 +1384,6 @@ func (r *TaskRunner) killTask(killingEvent *structs.TaskEvent) { // startTask creates the driver, task dir, and starts the task. func (r *TaskRunner) startTask() error { - // Update lastStart - r.lastStartMu.Lock() - r.lastStart = time.Now() - r.lastStartMu.Unlock() - // Create a driver drv, err := r.createDriver() if err != nil { From 9fb28656c907108ffe807f3504b9a8c1928477f9 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 14:57:23 -0700 Subject: [PATCH 13/33] Fix whitespace --- command/agent/consul/unit_test.go | 1 - 1 file changed, 1 deletion(-) diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 3e2837c75a7..011fa0065af 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -64,7 +64,6 @@ type checkRestartRecord struct { // fakeCheckRestarter is a test implementation of type fakeCheckRestarter struct { - // restartDelay is returned by RestartDelay to control the behavior of // the checkWatcher restartDelay time.Duration From 092057a32b413591e2e8350c183c97ae3ebdd84d Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 17:27:08 -0700 Subject: [PATCH 14/33] Canonicalize and Merge CheckRestart in api --- api/tasks.go | 70 ++++++++++++++++++++++++++++++----- command/agent/job_endpoint.go | 7 ---- nomad/structs/structs.go | 46 ----------------------- 3 files changed, 61 insertions(+), 62 deletions(-) diff --git a/api/tasks.go b/api/tasks.go index 291971b719f..9b7886384c4 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -87,6 +87,63 @@ type CheckRestart struct { IgnoreWarnings bool `mapstructure:"ignore_warnings"` } +// Canonicalize CheckRestart fields if not nil. +func (c *CheckRestart) Canonicalize() { + if c == nil { + return + } + + if c.Grace == nil { + c.Grace = helper.TimeToPtr(1 * time.Second) + } +} + +// Copy returns a copy of CheckRestart or nil if unset. 
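The canonicalization that previously lived in `ServiceCheck.Canonicalize` is now expressed through `Canonicalize`, `Copy`, and `Merge` helpers on `CheckRestart` itself (`Copy` and `Merge` follow just below). Here is a rough, self-contained sketch of the resulting behavior, using only the helpers added in this patch; the values are illustrative.

```go
package main

import (
	"fmt"
	"time"

	"github.com/hashicorp/nomad/api"
)

func main() {
	// Service-level defaults, as they would appear under a service stanza.
	grace := 90 * time.Second
	serviceLevel := &api.CheckRestart{Limit: 3, Grace: &grace}

	// A check with no check_restart of its own inherits the service-level
	// settings wholesale (Merge on a nil receiver returns the other value).
	var unset *api.CheckRestart
	merged := unset.Merge(serviceLevel)
	fmt.Println(merged.Limit, *merged.Grace) // 3 1m30s

	// A check that only sets a limit gets the 1s default grace period
	// filled in by Canonicalize.
	checkLevel := &api.CheckRestart{Limit: 1}
	checkLevel.Canonicalize()
	fmt.Println(checkLevel.Limit, *checkLevel.Grace) // 1 1s
}
```

In short, a nil check-level block adopts the service-level settings wholesale; a non-nil one keeps whatever it set and falls back to the service level only for an unset limit or grace period.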
+func (c *CheckRestart) Copy() *CheckRestart { + if c == nil { + return nil + } + + nc := new(CheckRestart) + nc.Limit = c.Limit + if c.Grace != nil { + g := *c.Grace + nc.Grace = &g + } + nc.IgnoreWarnings = c.IgnoreWarnings + return nc +} + +// Merge values from other CheckRestart over default values on this +// CheckRestart and return merged copy. +func (c *CheckRestart) Merge(o *CheckRestart) *CheckRestart { + if c == nil { + // Just return other + return o + } + + nc := c.Copy() + + if o == nil { + // Nothing to merge + return nc + } + + if nc.Limit == 0 { + nc.Limit = o.Limit + } + + if nc.Grace == nil { + nc.Grace = o.Grace + } + + if nc.IgnoreWarnings { + nc.IgnoreWarnings = o.IgnoreWarnings + } + + return nc +} + // The ServiceCheck data model represents the consul health check that // Nomad registers for a Task type ServiceCheck struct { @@ -107,14 +164,6 @@ type ServiceCheck struct { CheckRestart *CheckRestart `mapstructure:"check_restart"` } -func (c *ServiceCheck) Canonicalize() { - if c.CheckRestart != nil { - if c.CheckRestart.Grace == nil { - c.CheckRestart.Grace = helper.TimeToPtr(1 * time.Second) - } - } -} - // The Service model represents a Consul service definition type Service struct { Id string @@ -136,8 +185,11 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { s.AddressMode = "auto" } + s.CheckRestart.Canonicalize() + for _, c := range s.Checks { - c.Canonicalize() + c.CheckRestart.Canonicalize() + c.CheckRestart = c.CheckRestart.Merge(s.CheckRestart) } } diff --git a/command/agent/job_endpoint.go b/command/agent/job_endpoint.go index 7c7b0089b76..5fcf6516165 100644 --- a/command/agent/job_endpoint.go +++ b/command/agent/job_endpoint.go @@ -685,13 +685,6 @@ func ApiTaskToStructsTask(apiTask *api.Task, structsTask *structs.Task) { Tags: service.Tags, AddressMode: service.AddressMode, } - if service.CheckRestart != nil { - structsTask.Services[i].CheckRestart = &structs.CheckRestart{ - Limit: service.CheckRestart.Limit, - Grace: *service.CheckRestart.Grace, - IgnoreWarnings: service.CheckRestart.IgnoreWarnings, - } - } if l := len(service.Checks); l != 0 { structsTask.Services[i].Checks = make([]*structs.ServiceCheck, l) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 5335a71056c..728e3b14511 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2775,36 +2775,6 @@ func (c *CheckRestart) Copy() *CheckRestart { return nc } -// Merge non-zero values from other CheckRestart into a copy of this -// CheckRestart. Returns nil iff both are nil. -func (c *CheckRestart) Merge(o *CheckRestart) *CheckRestart { - if c == nil { - // Just return other - return o - } - - nc := c.Copy() - - if o == nil { - // Nothing to merge - return nc.Copy() - } - - if nc.Limit == 0 { - nc.Limit = o.Limit - } - - if nc.Grace == 0 { - nc.Grace = o.Grace - } - - if nc.IgnoreWarnings { - nc.IgnoreWarnings = o.IgnoreWarnings - } - - return nc -} - func (c *CheckRestart) Validate() error { if c == nil { return nil @@ -3008,9 +2978,6 @@ type Service struct { Tags []string // List of tags for the service Checks []*ServiceCheck // List of checks associated with the service - - // CheckRestart will be propagated to Checks if set. 
- CheckRestart *CheckRestart } func (s *Service) Copy() *Service { @@ -3020,7 +2987,6 @@ func (s *Service) Copy() *Service { ns := new(Service) *ns = *s ns.Tags = helper.CopySliceString(ns.Tags) - ns.CheckRestart = s.CheckRestart.Copy() if s.Checks != nil { checks := make([]*ServiceCheck, len(ns.Checks)) @@ -3056,14 +3022,6 @@ func (s *Service) Canonicalize(job string, taskGroup string, task string) { for _, check := range s.Checks { check.Canonicalize(s.Name) } - - // If CheckRestart is set propagate it to checks - if s.CheckRestart != nil { - for _, check := range s.Checks { - // Merge Service CheckRestart into Check's so Check's takes precedence - check.CheckRestart = check.CheckRestart.Merge(s.CheckRestart) - } - } } // Validate checks if the Check definition is valid @@ -3098,10 +3056,6 @@ func (s *Service) Validate() error { } } - if s.CheckRestart != nil && len(s.Checks) == 0 { - mErr.Errors = append(mErr.Errors, fmt.Errorf("check_restart specified but no checks")) - } - return mErr.ErrorOrNil() } From 8b8c164622667584851160b9b07909819405a2c9 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 21:35:12 -0700 Subject: [PATCH 15/33] Wrap check watch updates in a struct Reusing checkRestart for both adds/removes and the main check restarting logic was confusing. --- command/agent/consul/check_watcher.go | 60 +++++++++++++--------- command/agent/consul/check_watcher_test.go | 2 +- 2 files changed, 37 insertions(+), 25 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index ee28c87fb37..ea045ddf670 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -35,9 +35,6 @@ type checkRestart struct { checkID string checkName string - // remove this checkID (if true only checkID will be set) - remove bool - task TaskRestarter restartDelay time.Duration grace time.Duration @@ -116,6 +113,13 @@ func (c *checkRestart) update(now time.Time, status string) { } } +// checkWatchUpdates add or remove checks from the watcher +type checkWatchUpdate struct { + checkID string + remove bool + checkRestart *checkRestart +} + // checkWatcher watches Consul checks and restarts tasks when they're // unhealthy. type checkWatcher struct { @@ -125,8 +129,9 @@ type checkWatcher struct { // defaultPollFreq pollFreq time.Duration - // watchCh is how watches (and removals) are sent to the main watching loop - watchCh chan *checkRestart + // checkUpdateCh is how watches (and removals) are sent to the main + // watching loop + checkUpdateCh chan checkWatchUpdate // done is closed when Run has exited done chan struct{} @@ -141,11 +146,11 @@ type checkWatcher struct { // newCheckWatcher creates a new checkWatcher but does not call its Run method. func newCheckWatcher(logger *log.Logger, consul ChecksAPI) *checkWatcher { return &checkWatcher{ - consul: consul, - pollFreq: defaultPollFreq, - watchCh: make(chan *checkRestart, 8), - done: make(chan struct{}), - logger: logger, + consul: consul, + pollFreq: defaultPollFreq, + checkUpdateCh: make(chan checkWatchUpdate, 8), + done: make(chan struct{}), + logger: logger, } } @@ -167,13 +172,14 @@ func (w *checkWatcher) Run(ctx context.Context) { // trigger restarts. 
for len(checks) == 0 { select { - case c := <-w.watchCh: - if c.remove { + case update := <-w.checkUpdateCh: + if update.remove { // should not happen - w.logger.Printf("[DEBUG] consul.health: told to stop watching an unwatched check: %q", c.checkID) + w.logger.Printf("[DEBUG] consul.health: told to stop watching an unwatched check: %q", update.checkID) } else { - checks[c.checkID] = c - w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", c.allocID, c.taskName, c.checkName) + checks[update.checkID] = update.checkRestart + w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", + update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) // Begin polling if !checkTimer.Stop() { @@ -189,12 +195,13 @@ func (w *checkWatcher) Run(ctx context.Context) { // As long as there are checks to be watched, keep watching for len(checks) > 0 { select { - case c := <-w.watchCh: - if c.remove { - delete(checks, c.checkID) + case update := <-w.checkUpdateCh: + if update.remove { + delete(checks, update.checkID) } else { - checks[c.checkID] = c - w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", c.allocID, c.taskName, c.checkName) + checks[update.checkID] = update.checkRestart + w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", + update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) } case <-ctx.Done(): return @@ -240,7 +247,7 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S return } - c := checkRestart{ + c := &checkRestart{ allocID: allocID, taskName: taskName, checkID: checkID, @@ -255,8 +262,13 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S logger: w.logger, } + update := checkWatchUpdate{ + checkID: checkID, + checkRestart: c, + } + select { - case w.watchCh <- &c: + case w.checkUpdateCh <- update: // sent watch case <-w.done: // exited; nothing to do @@ -265,12 +277,12 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S // Unwatch a task. 
func (w *checkWatcher) Unwatch(cid string) { - c := checkRestart{ + c := checkWatchUpdate{ checkID: cid, remove: true, } select { - case w.watchCh <- &c: + case w.checkUpdateCh <- c: // sent remove watch case <-w.done: // exited; nothing to do diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index 7e4947f51b9..c8544bdc76f 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -89,7 +89,7 @@ func TestCheckWatcher_Skip(t *testing.T) { cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) // Check should have been dropped as it's not watched - if n := len(cw.watchCh); n != 0 { + if n := len(cw.checkUpdateCh); n != 0 { t.Fatalf("expected 0 checks to be enqueued for watching but found %d", n) } } From 237c096661330aecaf12f209e7797d10833c7833 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 21:50:46 -0700 Subject: [PATCH 16/33] Simplify from 2 select loops to one --- command/agent/consul/check_watcher.go | 115 +++++++++++++------------- 1 file changed, 58 insertions(+), 57 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index ea045ddf670..9ae94f28234 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -166,75 +166,76 @@ func (w *checkWatcher) Run(ctx context.Context) { checkTimer := time.NewTimer(0) defer checkTimer.Stop() // ensure timer is never leaked + stopTimer := func() { + checkTimer.Stop() + select { + case <-checkTimer.C: + default: + } + } + + // disable by default + stopTimer() + // Main watch loop for { - // Don't start watching until we actually have checks that - // trigger restarts. - for len(checks) == 0 { - select { - case update := <-w.checkUpdateCh: - if update.remove { - // should not happen - w.logger.Printf("[DEBUG] consul.health: told to stop watching an unwatched check: %q", update.checkID) - } else { - checks[update.checkID] = update.checkRestart - w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", - update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) - - // Begin polling - if !checkTimer.Stop() { - <-checkTimer.C - } - checkTimer.Reset(w.pollFreq) + select { + case update := <-w.checkUpdateCh: + if update.remove { + // Remove a check + delete(checks, update.checkID) + + // disable polling if last check was removed + if len(checks) == 0 { + stopTimer() } - case <-ctx.Done(): - return + + continue } - } - // As long as there are checks to be watched, keep watching - for len(checks) > 0 { - select { - case update := <-w.checkUpdateCh: - if update.remove { - delete(checks, update.checkID) - } else { - checks[update.checkID] = update.checkRestart - w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", - update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) - } - case <-ctx.Done(): - return - case <-checkTimer.C: + // Add/update a check + checks[update.checkID] = update.checkRestart + w.logger.Printf("[DEBUG] consul.health: watching alloc %q task %q check %q", + update.checkRestart.allocID, update.checkRestart.taskName, update.checkRestart.checkName) + + // if first check was added make sure polling is enabled + if len(checks) == 1 { + stopTimer() checkTimer.Reset(w.pollFreq) + } - // Set "now" as the point in time the following check results represent - now := time.Now() + case <-ctx.Done(): + return - results, err 
:= w.consul.Checks() - if err != nil { - if !w.lastErr { - w.lastErr = true - w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err) - } - continue + case <-checkTimer.C: + checkTimer.Reset(w.pollFreq) + + // Set "now" as the point in time the following check results represent + now := time.Now() + + results, err := w.consul.Checks() + if err != nil { + if !w.lastErr { + w.lastErr = true + w.logger.Printf("[ERR] consul.health: error retrieving health checks: %q", err) } + continue + } - w.lastErr = false - - // Loop over watched checks and update their status from results - for cid, check := range checks { - result, ok := results[cid] - if !ok { - // Only warn if outside grace period to avoid races with check registration - if now.After(check.graceUntil) { - w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) - } - continue - } + w.lastErr = false - check.update(now, result.Status) + // Loop over watched checks and update their status from results + for cid, check := range checks { + result, ok := results[cid] + if !ok { + // Only warn if outside grace period to avoid races with check registration + if now.After(check.graceUntil) { + w.logger.Printf("[WARN] consul.health: watched check %q (%s) not found in Consul", check.checkName, cid) + } + continue } + + check.update(now, result.Status) } } } From f8e872c8558b6c30007c3d7d2287e7a683fb453c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 22:42:43 -0700 Subject: [PATCH 17/33] RestartDelay isn't needed as checks are re-added on restarts @dadgar made the excellent observation in #3105 that TaskRunner removes and re-registers checks on restarts. This means checkWatcher doesn't need to do *any* internal restart tracking. Individual checks can just remove themselves and be re-added when the task restarts. --- client/task_runner.go | 7 ---- command/agent/consul/check_watcher.go | 48 +++++++++++----------- command/agent/consul/check_watcher_test.go | 28 ++++++------- command/agent/consul/unit_test.go | 37 +++++++++-------- 4 files changed, 59 insertions(+), 61 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index 383106a0252..a33df2c39a4 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -1703,13 +1703,6 @@ func (r *TaskRunner) Restart(source, reason string, failure bool) { } } -// RestartDelay returns the *max* value of the delay for this task's restart -// policy for use by the healtcheck watcher. -func (r *TaskRunner) RestartDelay() time.Duration { - delay := r.alloc.Job.LookupTaskGroup(r.alloc.TaskGroup).RestartPolicy.Delay - return delay + time.Duration(float64(delay)*jitter) -} - // Signal will send a signal to the task func (r *TaskRunner) Signal(source, reason string, s os.Signal) error { diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 9ae94f28234..05b636a4bc5 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -21,10 +21,8 @@ type ChecksAPI interface { Checks() (map[string]*api.AgentCheck, error) } -// TaskRestarter allows the checkWatcher to restart tasks and how long the -// grace period should be afterward. +// TaskRestarter allows the checkWatcher to restart tasks. 
type TaskRestarter interface { - RestartDelay() time.Duration Restart(source, reason string, failure bool) } @@ -36,7 +34,6 @@ type checkRestart struct { checkName string task TaskRestarter - restartDelay time.Duration grace time.Duration interval time.Duration timeLimit time.Duration @@ -55,17 +52,19 @@ type checkRestart struct { logger *log.Logger } -// update restart state for check and restart task if necessary. Currrent +// apply restart state for check and restart task if necessary. Currrent // timestamp is passed in so all check updates have the same view of time (and // to ease testing). -func (c *checkRestart) update(now time.Time, status string) { +// +// Returns true if a restart was triggered in which case this check should be +// removed (checks are added on task startup). +func (c *checkRestart) apply(now time.Time, status string) bool { healthy := func() { if !c.unhealthyStart.IsZero() { c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart", c.allocID, c.taskName, c.checkName) c.unhealthyStart = time.Time{} } - return } switch status { case api.HealthCritical: @@ -73,17 +72,17 @@ func (c *checkRestart) update(now time.Time, status string) { if c.ignoreWarnings { // Warnings are ignored, reset state and exit healthy() - return + return false } default: // All other statuses are ok, reset state and exit healthy() - return + return false } if now.Before(c.graceUntil) { - // In grace period exit - return + // In grace period, exit + return false } if c.unhealthyStart.IsZero() { @@ -106,11 +105,10 @@ func (c *checkRestart) update(now time.Time, status string) { // Tell TaskRunner to restart due to failure const failure = true c.task.Restart("healthcheck", fmt.Sprintf("check %q unhealthy", c.checkName), failure) - - // Reset grace time to grace + restart.delay - c.graceUntil = now.Add(c.grace + c.restartDelay) - c.unhealthyStart = time.Time{} + return true } + + return false } // checkWatchUpdates add or remove checks from the watcher @@ -179,17 +177,16 @@ func (w *checkWatcher) Run(ctx context.Context) { // Main watch loop for { + // disable polling if there are no checks + if len(checks) == 0 { + stopTimer() + } + select { case update := <-w.checkUpdateCh: if update.remove { // Remove a check delete(checks, update.checkID) - - // disable polling if last check was removed - if len(checks) == 0 { - stopTimer() - } - continue } @@ -235,7 +232,13 @@ func (w *checkWatcher) Run(ctx context.Context) { continue } - check.update(now, result.Status) + restarted := check.apply(now, result.Status) + if restarted { + // Checks are registered+watched on + // startup, so it's safe to remove them + // whenever they're restarted + delete(checks, cid) + } } } } @@ -254,7 +257,6 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S checkID: checkID, checkName: check.Name, task: restarter, - restartDelay: restarter.RestartDelay(), interval: check.Interval, grace: check.CheckRestart.Grace, graceUntil: time.Now().Add(check.CheckRestart.Grace), diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index c8544bdc76f..3098182eb1c 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -85,7 +85,7 @@ func TestCheckWatcher_Skip(t *testing.T) { check.CheckRestart = nil cw := newCheckWatcher(testLogger(), newFakeChecksAPI()) - restarter1 := newFakeCheckRestarter() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", 
"testcheck1", check) cw.Watch("testalloc1", "testtask1", "testcheck1", check, restarter1) // Check should have been dropped as it's not watched @@ -101,13 +101,13 @@ func TestCheckWatcher_Healthy(t *testing.T) { fakeAPI, cw := testWatcherSetup() check1 := testCheck() - restarter1 := newFakeCheckRestarter() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) check2 := testCheck() check2.CheckRestart.Limit = 1 check2.CheckRestart.Grace = 0 - restarter2 := newFakeCheckRestarter() + restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2) cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) // Make both checks healthy from the beginning @@ -135,14 +135,13 @@ func TestCheckWatcher_Unhealthy(t *testing.T) { fakeAPI, cw := testWatcherSetup() check1 := testCheck() - restarter1 := newFakeCheckRestarter() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) check2 := testCheck() check2.CheckRestart.Limit = 1 - check2.CheckRestart.Grace = 0 - restarter2 := newFakeCheckRestarter() - restarter2.restartDelay = 600 * time.Millisecond + check2.CheckRestart.Grace = 200 * time.Millisecond + restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2) cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) // Check 1 always passes, check 2 always fails @@ -150,7 +149,7 @@ func TestCheckWatcher_Unhealthy(t *testing.T) { fakeAPI.add("testcheck2", "critical", time.Time{}) // Run for 1 second - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) defer cancel() cw.Run(ctx) @@ -176,15 +175,14 @@ func TestCheckWatcher_HealthyWarning(t *testing.T) { check1.CheckRestart.Limit = 1 check1.CheckRestart.Grace = 0 check1.CheckRestart.IgnoreWarnings = true - restarter1 := newFakeCheckRestarter() - restarter1.restartDelay = 1100 * time.Millisecond + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) // Check is always in warning but that's ok fakeAPI.add("testcheck1", "warning", time.Time{}) // Run for 1 second - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) defer cancel() cw.Run(ctx) @@ -203,7 +201,7 @@ func TestCheckWatcher_Flapping(t *testing.T) { check1 := testCheck() check1.CheckRestart.Grace = 0 - restarter1 := newFakeCheckRestarter() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) // Check flaps and is never failing for the full 200ms needed to restart @@ -214,7 +212,7 @@ func TestCheckWatcher_Flapping(t *testing.T) { fakeAPI.add("testcheck1", "critical", now.Add(300*time.Millisecond)) fakeAPI.add("testcheck1", "passing", now.Add(450*time.Millisecond)) - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) + ctx, cancel := context.WithTimeout(context.Background(), 600*time.Millisecond) defer cancel() cw.Run(ctx) @@ -234,14 +232,14 @@ func TestCheckWatcher_Unwatch(t *testing.T) { check1 := testCheck() check1.CheckRestart.Limit = 1 check1.CheckRestart.Grace = 100 * 
time.Millisecond - restarter1 := newFakeCheckRestarter() + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) cw.Unwatch("testcheck1") // Always failing fakeAPI.add("testcheck1", "critical", time.Time{}) - ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + ctx, cancel := context.WithTimeout(context.Background(), 300*time.Millisecond) defer cancel() cw.Run(ctx) diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index 011fa0065af..ddb7e2dfc48 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -64,37 +64,43 @@ type checkRestartRecord struct { // fakeCheckRestarter is a test implementation of type fakeCheckRestarter struct { - // restartDelay is returned by RestartDelay to control the behavior of - // the checkWatcher - restartDelay time.Duration - // restarts is a slice of all of the restarts triggered by the checkWatcher restarts []checkRestartRecord -} -func newFakeCheckRestarter() *fakeCheckRestarter { - return &fakeCheckRestarter{} + // need the checkWatcher to re-Watch restarted tasks like TaskRunner + watcher *checkWatcher + + // check to re-Watch on restarts + check *structs.ServiceCheck + allocID string + taskName string + checkName string } -// RestartDelay implements part of the TaskRestarter interface needed for check -// watching and is normally fulfilled by a task runner. -// -// The return value is determined by the restartDelay field. -func (c *fakeCheckRestarter) RestartDelay() time.Duration { - return c.restartDelay +func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter { + return &fakeCheckRestarter{ + watcher: w, + check: c, + allocID: allocID, + taskName: taskName, + checkName: checkName, + } } // Restart implements part of the TaskRestarter interface needed for check // watching and is normally fulfilled by a TaskRunner. // -// Restarts are recorded in the []restarts field. +// Restarts are recorded in the []restarts field and re-Watch the check. func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure}) + + // Re-Watch the check just like TaskRunner + c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c) } // String for debugging func (c *fakeCheckRestarter) String() string { - s := "" + s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName) for _, r := range c.restarts { s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) } @@ -152,7 +158,6 @@ func setupFake() *testFakeCtx { ServiceClient: NewServiceClient(fc, true, testLogger()), FakeConsul: fc, Task: testTask(), - Restarter: newFakeCheckRestarter(), execs: make(chan int, 100), } } From 40ed2625f0a5897032cc60f2fd55f3d5d94ed442 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Wed, 13 Sep 2017 23:13:34 -0700 Subject: [PATCH 18/33] Handle multiple failing checks on a single task Before this commit if a task had 2 checks cause restarts at the same time, both would trigger restarts of the task! This change removes all checks for a task whenever one of them is restarted. 
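As a rough illustration of that de-duplication idea (a standalone sketch, not the patched Nomad code — `watchedCheck`, `applyResults`, and the `restart` callback are hypothetical names), each polling pass restarts a failing task at most once and then drops every check keyed to that task, relying on the checks being re-registered and re-watched when the task comes back up:

```go
package main

import "fmt"

// watchedCheck is a hypothetical, trimmed-down stand-in for the watcher's
// per-check state: just enough to demonstrate the de-duplication idea.
type watchedCheck struct {
	checkID string
	taskKey string // composite of allocID + taskName
	failing bool
}

// applyResults restarts each failing task at most once per polling pass and
// removes every check belonging to a restarted task; the checks are expected
// to be watched again when the restarted task re-registers them.
func applyResults(checks map[string]*watchedCheck, restart func(taskKey string)) {
	restarted := map[string]struct{}{}

	for cid, c := range checks {
		if _, done := restarted[c.taskKey]; done {
			// Task already restarted this pass; just drop the check.
			delete(checks, cid)
			continue
		}
		if c.failing {
			restart(c.taskKey)
			restarted[c.taskKey] = struct{}{}
			delete(checks, cid)
		}
	}

	// Remove any remaining checks (even passing ones) for restarted tasks.
	for cid, c := range checks {
		if _, done := restarted[c.taskKey]; done {
			delete(checks, cid)
		}
	}
}

func main() {
	checks := map[string]*watchedCheck{
		"c1": {checkID: "c1", taskKey: "alloc1-web", failing: true},
		"c2": {checkID: "c2", taskKey: "alloc1-web", failing: true},
		"c3": {checkID: "c3", taskKey: "alloc1-web", failing: false},
	}
	applyResults(checks, func(taskKey string) { fmt.Println("restarting", taskKey) })
	fmt.Println("checks still watched:", len(checks)) // 0: restart triggered exactly once
}
```

Keying checks by a composite alloc+task identifier is what lets the watcher collapse "any failing check" into a single restart decision per task.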
--- command/agent/consul/check_watcher.go | 24 ++++++ command/agent/consul/check_watcher_test.go | 98 ++++++++++++---------- 2 files changed, 80 insertions(+), 42 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 05b636a4bc5..b2b3534a69e 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -32,6 +32,7 @@ type checkRestart struct { taskName string checkID string checkName string + taskKey string // composite of allocID + taskName for uniqueness task TaskRestarter grace time.Duration @@ -221,8 +222,19 @@ func (w *checkWatcher) Run(ctx context.Context) { w.lastErr = false + // Keep track of tasks restarted this period so they + // are only restarted once and all of their checks are + // removed. + restartedTasks := map[string]struct{}{} + // Loop over watched checks and update their status from results for cid, check := range checks { + if _, ok := restartedTasks[check.taskKey]; ok { + // Check for this task already restarted; remove and skip check + delete(checks, cid) + continue + } + result, ok := results[cid] if !ok { // Only warn if outside grace period to avoid races with check registration @@ -238,6 +250,17 @@ func (w *checkWatcher) Run(ctx context.Context) { // startup, so it's safe to remove them // whenever they're restarted delete(checks, cid) + + restartedTasks[check.taskKey] = struct{}{} + } + } + + // Ensure even passing checks for restartedTasks are removed + if len(restartedTasks) > 0 { + for cid, check := range checks { + if _, ok := restartedTasks[check.taskKey]; ok { + delete(checks, cid) + } } } } @@ -256,6 +279,7 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S taskName: taskName, checkID: checkID, checkName: check.Name, + taskKey: fmt.Sprintf("%s%s", allocID, taskName), // unique task ID task: restarter, interval: check.Interval, grace: check.CheckRestart.Grace, diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index 3098182eb1c..f94e4995d0f 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -114,53 +114,17 @@ func TestCheckWatcher_Healthy(t *testing.T) { fakeAPI.add("testcheck1", "passing", time.Time{}) fakeAPI.add("testcheck2", "passing", time.Time{}) - // Run for 1 second - ctx, cancel := context.WithTimeout(context.Background(), 1*time.Second) - defer cancel() - cw.Run(ctx) - - // Assert Restart was never called - if n := len(restarter1.restarts); n > 0 { - t.Errorf("expected check 1 to not be restarted but found %d", n) - } - if n := len(restarter2.restarts); n > 0 { - t.Errorf("expected check 2 to not be restarted but found %d", n) - } -} - -// TestCheckWatcher_Unhealthy asserts unhealthy tasks are not restarted. 
-func TestCheckWatcher_Unhealthy(t *testing.T) { - t.Parallel() - - fakeAPI, cw := testWatcherSetup() - - check1 := testCheck() - restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) - cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) - - check2 := testCheck() - check2.CheckRestart.Limit = 1 - check2.CheckRestart.Grace = 200 * time.Millisecond - restarter2 := newFakeCheckRestarter(cw, "testalloc2", "testtask2", "testcheck2", check2) - cw.Watch("testalloc2", "testtask2", "testcheck2", check2, restarter2) - - // Check 1 always passes, check 2 always fails - fakeAPI.add("testcheck1", "passing", time.Time{}) - fakeAPI.add("testcheck2", "critical", time.Time{}) - - // Run for 1 second + // Run ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) defer cancel() cw.Run(ctx) - // Ensure restart was never called on check 1 + // Ensure restart was never called if n := len(restarter1.restarts); n > 0 { - t.Errorf("expected check 1 to not be restarted but found %d", n) + t.Errorf("expected check 1 to not be restarted but found %d:\n%s", n, restarter1) } - - // Ensure restart was called twice on check 2 - if n := len(restarter2.restarts); n != 2 { - t.Errorf("expected check 2 to be restarted 2 times but found %d:\n%s", n, restarter2) + if n := len(restarter2.restarts); n > 0 { + t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2) } } @@ -181,7 +145,7 @@ func TestCheckWatcher_HealthyWarning(t *testing.T) { // Check is always in warning but that's ok fakeAPI.add("testcheck1", "warning", time.Time{}) - // Run for 1 second + // Run ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond) defer cancel() cw.Run(ctx) @@ -248,3 +212,53 @@ func TestCheckWatcher_Unwatch(t *testing.T) { t.Errorf("expected check 1 to not be restarted but found %d\n%s", n, restarter1) } } + +// TestCheckWatcher_MultipleChecks asserts that when there are multiple checks +// for a single task, all checks should be removed when any of them restart the +// task to avoid multiple restarts. 
+func TestCheckWatcher_MultipleChecks(t *testing.T) { + t.Parallel() + + fakeAPI, cw := testWatcherSetup() + + check1 := testCheck() + check1.CheckRestart.Limit = 1 + restarter1 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck1", check1) + cw.Watch("testalloc1", "testtask1", "testcheck1", check1, restarter1) + + check2 := testCheck() + check2.CheckRestart.Limit = 1 + restarter2 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck2", check2) + cw.Watch("testalloc1", "testtask1", "testcheck2", check2, restarter2) + + check3 := testCheck() + check3.CheckRestart.Limit = 1 + restarter3 := newFakeCheckRestarter(cw, "testalloc1", "testtask1", "testcheck3", check3) + cw.Watch("testalloc1", "testtask1", "testcheck3", check3, restarter3) + + // check 2 & 3 fail long enough to cause 1 restart, but only 1 should restart + now := time.Now() + fakeAPI.add("testcheck1", "critical", now) + fakeAPI.add("testcheck1", "passing", now.Add(150*time.Millisecond)) + fakeAPI.add("testcheck2", "critical", now) + fakeAPI.add("testcheck2", "passing", now.Add(150*time.Millisecond)) + fakeAPI.add("testcheck3", "passing", time.Time{}) + + // Run + ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond) + defer cancel() + cw.Run(ctx) + + // Ensure restart was only called on restarter1 + if n := len(restarter1.restarts); n != 1 { + t.Errorf("expected check 1 to be restarted 1 time but found %d\n%s", n, restarter1) + } + + if n := len(restarter2.restarts); n != 0 { + t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2) + } + + if n := len(restarter3.restarts); n != 0 { + t.Errorf("expected check 3 to not be restarted but found %d:\n%s", n, restarter3) + } +} From 5cd1d57218dc9afc1c60344804cb9c35c34604d7 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 09:58:35 -0700 Subject: [PATCH 19/33] Watched -> TriggersRestart Watched was a silly name --- command/agent/consul/check_watcher.go | 2 +- command/agent/consul/client.go | 12 ++++++------ nomad/structs/structs.go | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index b2b3534a69e..3d69906eb30 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -269,7 +269,7 @@ func (w *checkWatcher) Run(ctx context.Context) { // Watch a task and restart it if unhealthy. 
func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) { - if !check.Watched() { + if !check.TriggersRestarts() { // Not watched, noop return } diff --git a/command/agent/consul/client.go b/command/agent/consul/client.go index 911ba628e56..5116a366512 100644 --- a/command/agent/consul/client.go +++ b/command/agent/consul/client.go @@ -697,7 +697,7 @@ func (c *ServiceClient) RegisterTask(allocID string, task *structs.Task, restart for _, service := range task.Services { serviceID := makeTaskServiceID(allocID, task.Name, service) for _, check := range service.Checks { - if check.Watched() { + if check.TriggersRestarts() { checkID := makeCheckID(serviceID, check) c.checkWatcher.Watch(allocID, task.Name, checkID, check, restarter) } @@ -737,7 +737,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta ops.deregChecks = append(ops.deregChecks, cid) // Unwatch watched checks - if check.Watched() { + if check.TriggersRestarts() { c.checkWatcher.Unwatch(cid) } } @@ -787,7 +787,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta } // Update all watched checks as CheckRestart fields aren't part of ID - if check.Watched() { + if check.TriggersRestarts() { c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) } } @@ -797,7 +797,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta ops.deregChecks = append(ops.deregChecks, cid) // Unwatch checks - if check.Watched() { + if check.TriggersRestarts() { c.checkWatcher.Unwatch(cid) } } @@ -823,7 +823,7 @@ func (c *ServiceClient) UpdateTask(allocID string, existing, newTask *structs.Ta for _, service := range newIDs { serviceID := makeTaskServiceID(allocID, newTask.Name, service) for _, check := range service.Checks { - if check.Watched() { + if check.TriggersRestarts() { checkID := makeCheckID(serviceID, check) c.checkWatcher.Watch(allocID, newTask.Name, checkID, check, restarter) } @@ -846,7 +846,7 @@ func (c *ServiceClient) RemoveTask(allocID string, task *structs.Task) { cid := makeCheckID(id, check) ops.deregChecks = append(ops.deregChecks, cid) - if check.Watched() { + if check.TriggersRestarts() { c.checkWatcher.Unwatch(cid) } } diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 728e3b14511..3b4a9436667 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2912,9 +2912,9 @@ func (sc *ServiceCheck) RequiresPort() bool { } } -// Watched returns true if this check should be watched and trigger a restart +// TriggersRestarts returns true if this check should be watched and trigger a restart // on failure. -func (sc *ServiceCheck) Watched() bool { +func (sc *ServiceCheck) TriggersRestarts() bool { return sc.CheckRestart != nil && sc.CheckRestart.Limit > 0 } From 10dc1c790043121020e43f8966b7de4989c17736 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 10:21:49 -0700 Subject: [PATCH 20/33] DRY up restart handling a bit. All 3 error/failure cases share restart logic, but 2 of them have special cased conditions. 
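Sketched very loosely (hypothetical `outcome`/`decide` names, and a fixed delay standing in for the policy's delay-plus-jitter calculation), the unified flow handles the two special cases up front — unrecoverable start errors and successful exits that should not restart — and then falls through to the shared attempt-limit logic:

```go
package main

import (
	"errors"
	"fmt"
	"time"
)

// outcome is a hypothetical, simplified bundle of the inputs the tracker
// inspects; the real RestartTracker carries more state (intervals, modes,
// jitter, counters, ...).
type outcome struct {
	startErr         error // non-nil if the task failed to start
	recoverable      bool  // whether startErr is considered recoverable
	exited           bool  // true if the task ran and then exited
	exitedOK         bool  // exit status, only meaningful when exited is true
	restartOnSuccess bool  // restart even after a clean exit?
	attemptsLeft     int   // restart budget remaining in the current interval
}

// decide collapses the three failure paths (start error, exit result, other
// failures such as an unhealthy check) into one flow: the two special cases
// are handled first, then everything shares the attempt-limit logic.
func decide(o outcome) (state string, delay time.Duration) {
	// Special case 1: unrecoverable start errors never restart.
	if o.startErr != nil && !o.recoverable {
		return "not-restarting", 0
	}
	// Special case 2: a successful exit only restarts if explicitly requested.
	if o.exited && o.exitedOK && !o.restartOnSuccess {
		return "terminated", 0
	}
	// Shared logic: restart while attempts remain, otherwise give up.
	if o.attemptsLeft <= 0 {
		return "not-restarting", 0
	}
	return "restarting", 2 * time.Second // fixed delay stands in for policy delay + jitter
}

func main() {
	fmt.Println(decide(outcome{startErr: errors.New("boom")}))
	fmt.Println(decide(outcome{exited: true, exitedOK: true}))
	fmt.Println(decide(outcome{exited: true, attemptsLeft: 2}))
}
```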
--- client/restarts.go | 105 +++++++++++++-------------------------------- 1 file changed, 30 insertions(+), 75 deletions(-) diff --git a/client/restarts.go b/client/restarts.go index 93f2a8fd0cc..022c1cac3b6 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -60,6 +60,7 @@ func (r *RestartTracker) SetStartError(err error) *RestartTracker { r.lock.Lock() defer r.lock.Unlock() r.startErr = err + r.failure = true return r } @@ -68,6 +69,7 @@ func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker r.lock.Lock() defer r.lock.Unlock() r.waitRes = res + r.failure = true return r } @@ -145,87 +147,40 @@ func (r *RestartTracker) GetState() (string, time.Duration) { r.startTime = now } - if r.startErr != nil { - return r.handleStartError() - } else if r.waitRes != nil { - return r.handleWaitResult() - } else if r.failure { - return r.handleFailure() - } - - return "", 0 -} - -// handleStartError returns the new state and potential wait duration for -// restarting the task after it was not successfully started. On start errors, -// the restart policy is always treated as fail mode to ensure we don't -// infinitely try to start a task. -func (r *RestartTracker) handleStartError() (string, time.Duration) { - // If the error is not recoverable, do not restart. - if !structs.IsRecoverable(r.startErr) { - r.reason = ReasonUnrecoverableErrror - return structs.TaskNotRestarting, 0 - } - - if r.count > r.policy.Attempts { - if r.policy.Mode == structs.RestartPolicyModeFail { - r.reason = fmt.Sprintf( - `Exceeded allowed attempts %d in interval %v and mode is "fail"`, - r.policy.Attempts, r.policy.Interval) - return structs.TaskNotRestarting, 0 - } else { - r.reason = ReasonDelay - return structs.TaskRestarting, r.getDelay() + // Handle restarts due to failures + if r.failure { + if r.startErr != nil { + // If the error is not recoverable, do not restart. + if !structs.IsRecoverable(r.startErr) { + r.reason = ReasonUnrecoverableErrror + return structs.TaskNotRestarting, 0 + } + } else if r.waitRes != nil { + // If the task started successfully and restart on success isn't specified, + // don't restart but don't mark as failed. + if r.waitRes.Successful() && !r.onSuccess { + r.reason = "Restart unnecessary as task terminated successfully" + return structs.TaskTerminated, 0 + } } - } - - r.reason = ReasonWithinPolicy - return structs.TaskRestarting, r.jitter() -} - -// handleWaitResult returns the new state and potential wait duration for -// restarting the task after it has exited. -func (r *RestartTracker) handleWaitResult() (string, time.Duration) { - // If the task started successfully and restart on success isn't specified, - // don't restart but don't mark as failed. 
- if r.waitRes.Successful() && !r.onSuccess { - r.reason = "Restart unnecessary as task terminated successfully" - return structs.TaskTerminated, 0 - } - if r.count > r.policy.Attempts { - if r.policy.Mode == structs.RestartPolicyModeFail { - r.reason = fmt.Sprintf( - `Exceeded allowed attempts %d in interval %v and mode is "fail"`, - r.policy.Attempts, r.policy.Interval) - return structs.TaskNotRestarting, 0 - } else { - r.reason = ReasonDelay - return structs.TaskRestarting, r.getDelay() + if r.count > r.policy.Attempts { + if r.policy.Mode == structs.RestartPolicyModeFail { + r.reason = fmt.Sprintf( + `Exceeded allowed attempts %d in interval %v and mode is "fail"`, + r.policy.Attempts, r.policy.Interval) + return structs.TaskNotRestarting, 0 + } else { + r.reason = ReasonDelay + return structs.TaskRestarting, r.getDelay() + } } - } - - r.reason = ReasonWithinPolicy - return structs.TaskRestarting, r.jitter() -} -// handleFailure returns the new state and potential wait duration for -// restarting the task due to a failure like an unhealthy Consul check. -func (r *RestartTracker) handleFailure() (string, time.Duration) { - if r.count > r.policy.Attempts { - if r.policy.Mode == structs.RestartPolicyModeFail { - r.reason = fmt.Sprintf( - `Exceeded allowed attempts %d in interval %v and mode is "fail"`, - r.policy.Attempts, r.policy.Interval) - return structs.TaskNotRestarting, 0 - } else { - r.reason = ReasonDelay - return structs.TaskRestarting, r.getDelay() - } + r.reason = ReasonWithinPolicy + return structs.TaskRestarting, r.jitter() } - r.reason = ReasonWithinPolicy - return structs.TaskRestarting, r.jitter() + return "", 0 } // getDelay returns the delay time to enter the next interval. From 3c0a42ba8e758314a01b166525a4fb1b0f049a6f Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 10:36:32 -0700 Subject: [PATCH 21/33] Rename unhealthy var and fix test indeterminism --- command/agent/consul/check_watcher.go | 14 +++++++------- command/agent/consul/check_watcher_test.go | 13 ++++++------- 2 files changed, 13 insertions(+), 14 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 3d69906eb30..29a158eaa05 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -42,9 +42,9 @@ type checkRestart struct { // Mutable fields - // unhealthyStart is the time a check first went unhealthy. Set to the + // unhealthyState is the time a check first went unhealthy. Set to the // zero value if the check passes before timeLimit. - unhealthyStart time.Time + unhealthyState time.Time // graceUntil is when the check's grace period expires and unhealthy // checks should be counted. @@ -61,10 +61,10 @@ type checkRestart struct { // removed (checks are added on task startup). func (c *checkRestart) apply(now time.Time, status string) bool { healthy := func() { - if !c.unhealthyStart.IsZero() { + if !c.unhealthyState.IsZero() { c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became healthy; canceling restart", c.allocID, c.taskName, c.checkName) - c.unhealthyStart = time.Time{} + c.unhealthyState = time.Time{} } } switch status { @@ -86,17 +86,17 @@ func (c *checkRestart) apply(now time.Time, status string) bool { return false } - if c.unhealthyStart.IsZero() { + if c.unhealthyState.IsZero() { // First failure, set restart deadline if c.timeLimit != 0 { c.logger.Printf("[DEBUG] consul.health: alloc %q task %q check %q became unhealthy. 
Restarting in %s if not healthy", c.allocID, c.taskName, c.checkName, c.timeLimit) } - c.unhealthyStart = now + c.unhealthyState = now } // restart timeLimit after start of this check becoming unhealthy - restartAt := c.unhealthyStart.Add(c.timeLimit) + restartAt := c.unhealthyState.Add(c.timeLimit) // Must test >= because if limit=1, restartAt == first failure if now.Equal(restartAt) || now.After(restartAt) { diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index f94e4995d0f..93abed25ab5 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -249,13 +249,12 @@ func TestCheckWatcher_MultipleChecks(t *testing.T) { defer cancel() cw.Run(ctx) - // Ensure restart was only called on restarter1 - if n := len(restarter1.restarts); n != 1 { - t.Errorf("expected check 1 to be restarted 1 time but found %d\n%s", n, restarter1) - } - - if n := len(restarter2.restarts); n != 0 { - t.Errorf("expected check 2 to not be restarted but found %d:\n%s", n, restarter2) + // Ensure that restart was only called once on check 1 or 2. Since + // checks are in a map it's random which check triggers the restart + // first. + if n := len(restarter1.restarts) + len(restarter2.restarts); n != 1 { + t.Errorf("expected check 1 & 2 to be restarted 1 time but found %d\ncheck 1:\n%s\ncheck 2:%s", + n, restarter1, restarter2) } if n := len(restarter3.restarts); n != 0 { From 6f72270d13bcd110631a6fb6ec5f7d1bdf184a3c Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 15:17:21 -0700 Subject: [PATCH 22/33] Test check watch updates --- command/agent/consul/check_watcher_test.go | 57 ++++++ command/agent/consul/unit_test.go | 196 +++++++++++++++------ 2 files changed, 202 insertions(+), 51 deletions(-) diff --git a/command/agent/consul/check_watcher_test.go b/command/agent/consul/check_watcher_test.go index 93abed25ab5..ccc06b5e6fa 100644 --- a/command/agent/consul/check_watcher_test.go +++ b/command/agent/consul/check_watcher_test.go @@ -2,6 +2,7 @@ package consul import ( "context" + "fmt" "testing" "time" @@ -9,6 +10,62 @@ import ( "github.com/hashicorp/nomad/nomad/structs" ) +// checkRestartRecord is used by a testFakeCtx to record when restarts occur +// due to a watched check. +type checkRestartRecord struct { + timestamp time.Time + source string + reason string + failure bool +} + +// fakeCheckRestarter is a test implementation of TaskRestarter. +type fakeCheckRestarter struct { + // restarts is a slice of all of the restarts triggered by the checkWatcher + restarts []checkRestartRecord + + // need the checkWatcher to re-Watch restarted tasks like TaskRunner + watcher *checkWatcher + + // check to re-Watch on restarts + check *structs.ServiceCheck + allocID string + taskName string + checkName string +} + +// newFakeCheckRestart creates a new TaskRestarter. It needs all of the +// parameters checkWatcher.Watch expects. +func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter { + return &fakeCheckRestarter{ + watcher: w, + check: c, + allocID: allocID, + taskName: taskName, + checkName: checkName, + } +} + +// Restart implements part of the TaskRestarter interface needed for check +// watching and is normally fulfilled by a TaskRunner. +// +// Restarts are recorded in the []restarts field and re-Watch the check. 
+func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { + c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure}) + + // Re-Watch the check just like TaskRunner + c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c) +} + +// String for debugging +func (c *fakeCheckRestarter) String() string { + s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName) + for _, r := range c.restarts { + s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) + } + return s +} + // checkResponse is a response returned by the fakeChecksAPI after the given // time. type checkResponse struct { diff --git a/command/agent/consul/unit_test.go b/command/agent/consul/unit_test.go index ddb7e2dfc48..8bd3e08a5d8 100644 --- a/command/agent/consul/unit_test.go +++ b/command/agent/consul/unit_test.go @@ -8,6 +8,7 @@ import ( "os" "reflect" "strings" + "sync/atomic" "testing" "time" @@ -53,58 +54,14 @@ func testTask() *structs.Task { } } -// checkRestartRecord is used by a testFakeCtx to record when restarts occur -// due to a watched check. -type checkRestartRecord struct { - timestamp time.Time - source string - reason string - failure bool +// restartRecorder is a minimal TaskRestarter implementation that simply +// counts how many restarts were triggered. +type restartRecorder struct { + restarts int64 } -// fakeCheckRestarter is a test implementation of -type fakeCheckRestarter struct { - // restarts is a slice of all of the restarts triggered by the checkWatcher - restarts []checkRestartRecord - - // need the checkWatcher to re-Watch restarted tasks like TaskRunner - watcher *checkWatcher - - // check to re-Watch on restarts - check *structs.ServiceCheck - allocID string - taskName string - checkName string -} - -func newFakeCheckRestarter(w *checkWatcher, allocID, taskName, checkName string, c *structs.ServiceCheck) *fakeCheckRestarter { - return &fakeCheckRestarter{ - watcher: w, - check: c, - allocID: allocID, - taskName: taskName, - checkName: checkName, - } -} - -// Restart implements part of the TaskRestarter interface needed for check -// watching and is normally fulfilled by a TaskRunner. -// -// Restarts are recorded in the []restarts field and re-Watch the check. 
-func (c *fakeCheckRestarter) Restart(source, reason string, failure bool) { - c.restarts = append(c.restarts, checkRestartRecord{time.Now(), source, reason, failure}) - - // Re-Watch the check just like TaskRunner - c.watcher.Watch(c.allocID, c.taskName, c.checkName, c.check, c) -} - -// String for debugging -func (c *fakeCheckRestarter) String() string { - s := fmt.Sprintf("%s %s %s restarts:\n", c.allocID, c.taskName, c.checkName) - for _, r := range c.restarts { - s += fmt.Sprintf("%s - %s: %s (failure: %t)\n", r.timestamp, r.source, r.reason, r.failure) - } - return s +func (r *restartRecorder) Restart(source, reason string, failure bool) { + atomic.AddInt64(&r.restarts, 1) } // testFakeCtx contains a fake Consul AgentAPI and implements the Exec @@ -113,7 +70,7 @@ type testFakeCtx struct { ServiceClient *ServiceClient FakeConsul *MockAgent Task *structs.Task - Restarter *fakeCheckRestarter + Restarter *restartRecorder // Ticked whenever a script is called execs chan int @@ -158,6 +115,7 @@ func setupFake() *testFakeCtx { ServiceClient: NewServiceClient(fc, true, testLogger()), FakeConsul: fc, Task: testTask(), + Restarter: &restartRecorder{}, execs: make(chan int, 100), } } @@ -453,6 +411,9 @@ func TestConsul_ChangeChecks(t *testing.T) { Interval: time.Second, Timeout: time.Second, PortLabel: "x", + CheckRestart: &structs.CheckRestart{ + Limit: 3, + }, }, } @@ -469,6 +430,13 @@ func TestConsul_ChangeChecks(t *testing.T) { t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) } + // Assert a check restart watch update was enqueued and clear it + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 1 { + t.Fatalf("expected 1 check restart update but found %d", n) + } + upd := <-ctx.ServiceClient.checkWatcher.checkUpdateCh + c1ID := upd.checkID + // Query the allocs registrations and then again when we update. 
The IDs // should change reg1, err := ctx.ServiceClient.AllocRegistrations(allocID) @@ -514,6 +482,9 @@ func TestConsul_ChangeChecks(t *testing.T) { Interval: 2 * time.Second, Timeout: time.Second, PortLabel: "x", + CheckRestart: &structs.CheckRestart{ + Limit: 3, + }, }, { Name: "c2", @@ -527,6 +498,26 @@ func TestConsul_ChangeChecks(t *testing.T) { if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, ctx, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } + + // Assert 2 check restart watch updates was enqueued + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 2 { + t.Fatalf("expected 2 check restart updates but found %d", n) + } + + // First the new watch + upd = <-ctx.ServiceClient.checkWatcher.checkUpdateCh + if upd.checkID == c1ID || upd.remove { + t.Fatalf("expected check watch update to be an add of checkID=%q but found remove=%t checkID=%q", + c1ID, upd.remove, upd.checkID) + } + + // Then remove the old watch + upd = <-ctx.ServiceClient.checkWatcher.checkUpdateCh + if upd.checkID != c1ID || !upd.remove { + t.Fatalf("expected check watch update to be a removal of checkID=%q but found remove=%t checkID=%q", + c1ID, upd.remove, upd.checkID) + } + if err := ctx.syncOnce(); err != nil { t.Fatalf("unexpected error syncing task: %v", err) } @@ -549,6 +540,9 @@ func TestConsul_ChangeChecks(t *testing.T) { if expected := fmt.Sprintf(":%d", xPort); v.TCP != expected { t.Errorf("expected Port x=%v but found: %v", expected, v.TCP) } + + // update id + c1ID = k case "c2": if expected := fmt.Sprintf("http://:%d/", xPort); v.HTTP != expected { t.Errorf("expected Port x=%v but found: %v", expected, v.HTTP) @@ -592,12 +586,72 @@ func TestConsul_ChangeChecks(t *testing.T) { } } } + + // Alter a CheckRestart and make sure the watcher is updated but nothing else + origTask = ctx.Task.Copy() + ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ + { + Name: "c1", + Type: "tcp", + Interval: 2 * time.Second, + Timeout: time.Second, + PortLabel: "x", + CheckRestart: &structs.CheckRestart{ + Limit: 11, + }, + }, + { + Name: "c2", + Type: "http", + Path: "/", + Interval: time.Second, + Timeout: time.Second, + PortLabel: "x", + }, + } + if err := ctx.ServiceClient.UpdateTask("allocid", origTask, ctx.Task, nil, ctx, nil); err != nil { + t.Fatalf("unexpected error registering task: %v", err) + } + if err := ctx.syncOnce(); err != nil { + t.Fatalf("unexpected error syncing task: %v", err) + } + + if n := len(ctx.FakeConsul.checks); n != 2 { + t.Fatalf("expected 2 check but found %d:\n%#v", n, ctx.FakeConsul.checks) + } + + for k, v := range ctx.FakeConsul.checks { + if v.Name == "c1" { + if k != c1ID { + t.Errorf("expected c1 to still have id %q but found %q", c1ID, k) + } + break + } + } + + // Assert a check restart watch update was enqueued for a removal and an add + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 1 { + t.Fatalf("expected 1 check restart update but found %d", n) + } + <-ctx.ServiceClient.checkWatcher.checkUpdateCh } // TestConsul_RegServices tests basic service registration. 
func TestConsul_RegServices(t *testing.T) { ctx := setupFake() + // Add a check w/restarting + ctx.Task.Services[0].Checks = []*structs.ServiceCheck{ + { + Name: "testcheck", + Type: "tcp", + Interval: 100 * time.Millisecond, + CheckRestart: &structs.CheckRestart{ + Limit: 3, + }, + }, + } + if err := ctx.ServiceClient.RegisterTask("allocid", ctx.Task, ctx.Restarter, nil, nil); err != nil { t.Fatalf("unexpected error registering task: %v", err) } @@ -609,6 +663,7 @@ func TestConsul_RegServices(t *testing.T) { if n := len(ctx.FakeConsul.services); n != 1 { t.Fatalf("expected 1 service but found %d:\n%#v", n, ctx.FakeConsul.services) } + for _, v := range ctx.FakeConsul.services { if v.Name != ctx.Task.Services[0].Name { t.Errorf("expected Name=%q != %q", ctx.Task.Services[0].Name, v.Name) @@ -621,6 +676,20 @@ func TestConsul_RegServices(t *testing.T) { } } + // Assert the check update is pending + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 1 { + t.Fatalf("expected 1 check restart update but found %d", n) + } + + // Assert the check update is properly formed + checkUpd := <-ctx.ServiceClient.checkWatcher.checkUpdateCh + if checkUpd.checkRestart.allocID != "allocid" { + t.Fatalf("expected check's allocid to be %q but found %q", "allocid", checkUpd.checkRestart.allocID) + } + if expected := 200 * time.Millisecond; checkUpd.checkRestart.timeLimit != expected { + t.Fatalf("expected check's time limit to be %v but found %v", expected, checkUpd.checkRestart.timeLimit) + } + // Make a change which will register a new service ctx.Task.Services[0].Name = "taskname-service2" ctx.Task.Services[0].Tags[0] = "tag3" @@ -628,6 +697,17 @@ func TestConsul_RegServices(t *testing.T) { t.Fatalf("unpexpected error registering task: %v", err) } + // Assert check update is pending + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 1 { + t.Fatalf("expected 1 check restart update but found %d", n) + } + + // Assert the check update's id has changed + checkUpd2 := <-ctx.ServiceClient.checkWatcher.checkUpdateCh + if checkUpd.checkID == checkUpd2.checkID { + t.Fatalf("expected new check update to have a new ID both both have: %q", checkUpd.checkID) + } + // Make sure changes don't take affect until sync() is called (since // Run() isn't running) if n := len(ctx.FakeConsul.services); n != 1 { @@ -675,6 +755,20 @@ func TestConsul_RegServices(t *testing.T) { t.Errorf("expected original task to survive not %q", v.Name) } } + + // Assert check update is pending + if n := len(ctx.ServiceClient.checkWatcher.checkUpdateCh); n != 1 { + t.Fatalf("expected 1 check restart update but found %d", n) + } + + // Assert the check update's id is correct and that it's a removal + checkUpd3 := <-ctx.ServiceClient.checkWatcher.checkUpdateCh + if checkUpd2.checkID != checkUpd3.checkID { + t.Fatalf("expected checkid %q but found %q", checkUpd2.checkID, checkUpd3.checkID) + } + if !checkUpd3.remove { + t.Fatalf("expected check watch removal update but found: %#v", checkUpd3) + } } // TestConsul_ShutdownOK tests the ok path for the shutdown logic in From a508bb970935043259e4285c2a4844883ab5c2be Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 15:27:39 -0700 Subject: [PATCH 23/33] Fold SetFailure into SetRestartTriggered --- client/restarts.go | 17 ++++++----------- client/restarts_test.go | 2 +- client/task_runner.go | 8 +------- 3 files changed, 8 insertions(+), 19 deletions(-) diff --git a/client/restarts.go b/client/restarts.go index 022c1cac3b6..ccf344fd10d 100644 --- 
a/client/restarts.go +++ b/client/restarts.go @@ -75,19 +75,14 @@ func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker // SetRestartTriggered is used to mark that the task has been signalled to be // restarted -func (r *RestartTracker) SetRestartTriggered() *RestartTracker { +func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { r.lock.Lock() defer r.lock.Unlock() - r.restartTriggered = true - return r -} - -// SetFailure is used to mark that a task should be restarted due to failure -// such as a failed Consul healthcheck. -func (r *RestartTracker) SetFailure() *RestartTracker { - r.lock.Lock() - defer r.lock.Unlock() - r.failure = true + if failure { + r.failure = true + } else { + r.restartTriggered = true + } return r } diff --git a/client/restarts_test.go b/client/restarts_test.go index 851052576e6..0d4c4e2068b 100644 --- a/client/restarts_test.go +++ b/client/restarts_test.go @@ -99,7 +99,7 @@ func TestClient_RestartTracker_RestartTriggered(t *testing.T) { p := testPolicy(true, structs.RestartPolicyModeFail) p.Attempts = 0 rt := newRestartTracker(p, structs.JobTypeService) - if state, when := rt.SetRestartTriggered().GetState(); state != structs.TaskRestarting && when != 0 { + if state, when := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting && when != 0 { t.Fatalf("expect restart immediately, got %v %v", state, when) } } diff --git a/client/task_runner.go b/client/task_runner.go index a33df2c39a4..06acb6dc6fe 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -1175,13 +1175,7 @@ func (r *TaskRunner) run() { <-handleWaitCh } - if restartEvent.failure { - r.restartTracker.SetFailure() - } else { - // Since the restart isn't from a failure, restart immediately - // and don't count against the restart policy - r.restartTracker.SetRestartTriggered() - } + r.restartTracker.SetRestartTriggered(restartEvent.failure) break WAIT case <-r.destroyCh: From 5141c957f7b1c0609d184a28ccaccaeb0106aac4 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 15:55:37 -0700 Subject: [PATCH 24/33] Add check_restart to jobspec tests --- jobspec/parse_test.go | 5 +++++ jobspec/test-fixtures/basic.hcl | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/jobspec/parse_test.go b/jobspec/parse_test.go index bf3d2082329..c0cb4d902b9 100644 --- a/jobspec/parse_test.go +++ b/jobspec/parse_test.go @@ -130,6 +130,11 @@ func TestParse(t *testing.T) { PortLabel: "admin", Interval: 10 * time.Second, Timeout: 2 * time.Second, + CheckRestart: &api.CheckRestart{ + Limit: 3, + Grace: helper.TimeToPtr(10 * time.Second), + IgnoreWarnings: true, + }, }, }, }, diff --git a/jobspec/test-fixtures/basic.hcl b/jobspec/test-fixtures/basic.hcl index 843bd9b016a..a9dab9e7262 100644 --- a/jobspec/test-fixtures/basic.hcl +++ b/jobspec/test-fixtures/basic.hcl @@ -95,6 +95,12 @@ job "binstore-storagelocker" { interval = "10s" timeout = "2s" port = "admin" + + check_restart { + limit = 3 + grace_period = "10s" + ignore_warnings = true + } } } From 1564e1c4b3f666d8798b4e2277b2146ed9aef4ee Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Thu, 14 Sep 2017 16:44:27 -0700 Subject: [PATCH 25/33] Move check_restart to its own section. 
--- .../job-specification/check_restart.html.md | 151 ++++++++++++++++++ .../docs/job-specification/service.html.md | 69 +------- website/source/layouts/docs.erb | 3 + 3 files changed, 157 insertions(+), 66 deletions(-) create mode 100644 website/source/docs/job-specification/check_restart.html.md diff --git a/website/source/docs/job-specification/check_restart.html.md b/website/source/docs/job-specification/check_restart.html.md new file mode 100644 index 00000000000..344f8e80579 --- /dev/null +++ b/website/source/docs/job-specification/check_restart.html.md @@ -0,0 +1,151 @@ +--- +layout: "docs" +page_title: "check_restart Stanza - Job Specification" +sidebar_current: "docs-job-specification-check_restart" +description: |- + The "check_restart" stanza instructs Nomad when to restart tasks with + unhealthy service checks. +--- + +# `check_restart` Stanza + + + + + + + + + + +
Placement + job -> group -> task -> service -> **check_restart** +
Placement + job -> group -> task -> service -> check -> **check_restart** +
+ +As of Nomad 0.7 the `check_restart` stanza instructs Nomad when to restart +tasks with unhealthy service checks. When a health check in Consul has been +unhealthy for the `limit` specified in a `check_restart` stanza, it is +restarted according to the task group's [`restart` policy][restart_stanza]. The +`check_restart` settings apply to [`check`s][check_stanza], but may also be +placed on [`service`s][service_stanza] to apply to all checks on a service. + +```hcl +job "mysql" { + group "mysqld" { + + restart { + attempts = 3 + delay = "10s" + interval = "10m" + mode = "fail" + } + + task "server" { + service { + tags = ["leader", "mysql"] + + port = "db" + + check { + type = "tcp" + port = "db" + interval = "10s" + timeout = "2s" + } + + check { + type = "script" + name = "check_table" + command = "/usr/local/bin/check_mysql_table_status" + args = ["--verbose"] + interval = "60s" + timeout = "5s" + + check_restart { + limit = 3 + grace = "90s" + + ignore_warnings = false + } + } + } + } + } +} +``` + +- `limit` `(int: 0)` - Restart task when a health check has failed `limit` + times. For example 1 causes a restart on the first failure. The default, + `0`, disables healtcheck based restarts. Failures must be consecutive. A + single passing check will reset the count, so flapping services may not be + restarted. + +- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts + before checking its health. + +- `ignore_warnings` `(bool: false)` - By default checks with both `critical` + and `warning` statuses are considered unhealthy. Setting `ignore_warnings = + true` treats a `warning` status like `passing` and will not trigger a restart. + +## Example Behavior + +Using the example `mysql` above would have the following behavior: + +```hcl +check_restart { + # ... + grace = "90s" + # ... +} +``` + +When the `server` task first starts and is registered in Consul, its health +will not be checked for 90 seconds. This gives the server time to startup. + +```hcl +check_restart { + limit = 3 + # ... +} +``` + +After the grace period if the script check fails, it has 180 seconds (`60s +interval * 3 limit`) to pass before a restart is triggered. Once a restart is +triggered the task group's [`restart` policy][restart_stanza] takes control: + +```hcl +restart { + # ... + delay = "10s" + # ... +} +``` + +The [`restart` stanza][restart_stanza] controls the restart behavior of the +task. In this case it will wait 10 seconds before restarting. Note that even if +the check passes in this time the restart will still occur. + +Once the task restarts Nomad waits the `grace` period again before starting to +check the task's health. + + +```hcl +restart { + attempts = 3 + # ... + interval = "10m" + mode = "fail" +} +``` + +If the check continues to fail, the task will be restarted up to `attempts` +times within an `interval`. If the `restart` attempts are reached within the +`limit` then the `mode` controls the behavior. In this case the task would fail +and not be restarted again. See the [`restart` stanza][restart_stanza] for +details. 
+ +[check_stanza]: /docs/job-specification/service.html#check-parameters "check stanza" +[restart_stanza]: /docs/job-specification/restart.html "restart stanza" +[service_stanza]: /docs/job-specification/service.html "service stanza" diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index 2cee91b9d0e..5d4fc677585 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -117,6 +117,8 @@ scripts. - `args` `(array: [])` - Specifies additional arguments to the `command`. This only applies to script-based health checks. +- `check_restart` - See [`check_restart` stanza][check_restart_stanza]. + - `command` `(string: )` - Specifies the command to run for performing the health check. The script must exit: 0 for passing, 1 for warning, or any other value for a failing health check. This is required for script-based @@ -168,72 +170,6 @@ scripts. - `tls_skip_verify` `(bool: false)` - Skip verifying TLS certificates for HTTPS checks. Requires Consul >= 0.7.2. -#### `check_restart` Stanza - -As of Nomad 0.7 `check` stanzas may include a `check_restart` stanza to restart -tasks with unhealthy checks. Restarts use the parameters from the -[`restart`][restart_stanza] stanza, so if a task group has the default `15s` -delay, tasks won't be restarted for an extra 15 seconds after the -`check_restart` block considers it failed. `check_restart` stanzas have the -follow parameters: - -- `limit` `(int: 0)` - Restart task after `limit` failing health checks. For - example 1 causes a restart on the first failure. The default, `0`, disables - healtcheck based restarts. Failures must be consecutive. A single passing - check will reset the count, so flapping services may not be restarted. - -- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts - before checking its health. On restarts the `delay` and max jitter is added - to the grace period to prevent checking a task's health before it has - restarted. - -- `ignore_warnings` `(bool: false)` - By default checks with both `critical` - and `warning` statuses are considered unhealthy. Setting `ignore_warnings = - true` treats a `warning` status like `passing` and will not trigger a restart. - -For example: - -```hcl -restart { - delay = "8s" -} - -task "mysqld" { - service { - # ... - check { - type = "script" - name = "check_table" - command = "/usr/local/bin/check_mysql_table_status" - args = ["--verbose"] - interval = "20s" - timeout = "5s" - - check_restart { - # Restart the task after 3 consecutive failed checks (180s) - limit = 3 - - # Ignore failed checks for 90s after a service starts or restarts - grace = "90s" - - # Treat warnings as unhealthy (the default) - ignore_warnings = false - } - } - } -} -``` - -In this example the `mysqld` task has `90s` from startup to begin passing -healthchecks. After the grace period if `mysqld` would remain unhealthy for -`60s` (as determined by `limit * interval`) it would be restarted after `8s` -(as determined by the `restart.delay`). Nomad would then wait `100s` (as -determined by `grace + delay + (delay * 0.25)`) before checking `mysqld`'s -health again. - -~> `check_restart` stanzas may also be placed in `service` stanzas to apply the - same restart logic to multiple checks. - #### `header` Stanza HTTP checks may include a `header` stanza to set HTTP headers. 
The `header` @@ -388,6 +324,7 @@ service { [qemu driver][qemu] since the Nomad client does not have access to the file system of a task for that driver. +[check_restart_stanza]: /docs/job-specification/check_restart.html "check_restart stanza" [service-discovery]: /docs/service-discovery/index.html "Nomad Service Discovery" [interpolation]: /docs/runtime/interpolation.html "Nomad Runtime Interpolation" [network]: /docs/job-specification/network.html "Nomad network Job Specification" diff --git a/website/source/layouts/docs.erb b/website/source/layouts/docs.erb index cb98350fdc5..3cbe604f91f 100644 --- a/website/source/layouts/docs.erb +++ b/website/source/layouts/docs.erb @@ -26,6 +26,9 @@ > artifact + > + check_restart + > constraint From 80147622c566ed175d3e5babf308bebb215e329f Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 14:34:36 -0700 Subject: [PATCH 26/33] Add comments --- api/tasks.go | 2 ++ client/restarts.go | 7 ++++++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/api/tasks.go b/api/tasks.go index 9b7886384c4..a3d10831e82 100644 --- a/api/tasks.go +++ b/api/tasks.go @@ -187,6 +187,8 @@ func (s *Service) Canonicalize(t *Task, tg *TaskGroup, job *Job) { s.CheckRestart.Canonicalize() + // Canonicallize CheckRestart on Checks and merge Service.CheckRestart + // into each check. for _, c := range s.Checks { c.CheckRestart.Canonicalize() c.CheckRestart = c.CheckRestart.Merge(s.CheckRestart) diff --git a/client/restarts.go b/client/restarts.go index ccf344fd10d..ca0456f32d2 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -74,7 +74,9 @@ func (r *RestartTracker) SetWaitResult(res *dstructs.WaitResult) *RestartTracker } // SetRestartTriggered is used to mark that the task has been signalled to be -// restarted +// restarted. Setting the failure to true restarts according to the restart +// policy. When failure is false the task is restarted without considering the +// restart policy. func (r *RestartTracker) SetRestartTriggered(failure bool) *RestartTracker { r.lock.Lock() defer r.lock.Unlock() @@ -159,6 +161,9 @@ func (r *RestartTracker) GetState() (string, time.Duration) { } } + // If this task has been restarted due to failures more times + // than the restart policy allows within an interval fail + // according to the restart policy's mode. if r.count > r.policy.Attempts { if r.policy.Mode == structs.RestartPolicyModeFail { r.reason = fmt.Sprintf( From cde908e35d8b3dedc2ed68d2d7b2eb832aaf06ad Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 14:54:37 -0700 Subject: [PATCH 27/33] Cleanup and test restart failure code --- client/restarts.go | 62 ++++++++++++++++++++--------------------- client/restarts_test.go | 13 +++++++++ 2 files changed, 44 insertions(+), 31 deletions(-) diff --git a/client/restarts.go b/client/restarts.go index ca0456f32d2..c403b6f05d8 100644 --- a/client/restarts.go +++ b/client/restarts.go @@ -145,42 +145,42 @@ func (r *RestartTracker) GetState() (string, time.Duration) { } // Handle restarts due to failures - if r.failure { - if r.startErr != nil { - // If the error is not recoverable, do not restart. - if !structs.IsRecoverable(r.startErr) { - r.reason = ReasonUnrecoverableErrror - return structs.TaskNotRestarting, 0 - } - } else if r.waitRes != nil { - // If the task started successfully and restart on success isn't specified, - // don't restart but don't mark as failed. 
- if r.waitRes.Successful() && !r.onSuccess { - r.reason = "Restart unnecessary as task terminated successfully" - return structs.TaskTerminated, 0 - } - } + if !r.failure { + return "", 0 + } - // If this task has been restarted due to failures more times - // than the restart policy allows within an interval fail - // according to the restart policy's mode. - if r.count > r.policy.Attempts { - if r.policy.Mode == structs.RestartPolicyModeFail { - r.reason = fmt.Sprintf( - `Exceeded allowed attempts %d in interval %v and mode is "fail"`, - r.policy.Attempts, r.policy.Interval) - return structs.TaskNotRestarting, 0 - } else { - r.reason = ReasonDelay - return structs.TaskRestarting, r.getDelay() - } + if r.startErr != nil { + // If the error is not recoverable, do not restart. + if !structs.IsRecoverable(r.startErr) { + r.reason = ReasonUnrecoverableErrror + return structs.TaskNotRestarting, 0 + } + } else if r.waitRes != nil { + // If the task started successfully and restart on success isn't specified, + // don't restart but don't mark as failed. + if r.waitRes.Successful() && !r.onSuccess { + r.reason = "Restart unnecessary as task terminated successfully" + return structs.TaskTerminated, 0 } + } - r.reason = ReasonWithinPolicy - return structs.TaskRestarting, r.jitter() + // If this task has been restarted due to failures more times + // than the restart policy allows within an interval fail + // according to the restart policy's mode. + if r.count > r.policy.Attempts { + if r.policy.Mode == structs.RestartPolicyModeFail { + r.reason = fmt.Sprintf( + `Exceeded allowed attempts %d in interval %v and mode is "fail"`, + r.policy.Attempts, r.policy.Interval) + return structs.TaskNotRestarting, 0 + } else { + r.reason = ReasonDelay + return structs.TaskRestarting, r.getDelay() + } } - return "", 0 + r.reason = ReasonWithinPolicy + return structs.TaskRestarting, r.jitter() } // getDelay returns the delay time to enter the next interval. 
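The two call modes of `SetRestartTriggered` documented in patch 26 are easy to miss in this refactor: `failure=false` is a user or Vault triggered restart that bypasses the policy, while `failure=true` counts against `restart.attempts`. Below is a test-style sketch of the difference, assuming it sits alongside `client/restarts_test.go` so the unexported `newRestartTracker` and `testPolicy` helpers are in scope; the policy values are chosen only for illustration.

```go
func TestClient_RestartTracker_TriggeredModes_Sketch(t *testing.T) {
	t.Parallel()
	p := testPolicy(true, structs.RestartPolicyModeFail)
	p.Attempts = 1

	// failure=false: a requested restart; the policy is not consulted and
	// the task is simply restarted.
	rt := newRestartTracker(p, structs.JobTypeService)
	if state, _ := rt.SetRestartTriggered(false).GetState(); state != structs.TaskRestarting {
		t.Fatalf("expected restart, got %v", state)
	}

	// failure=true: each call counts against the policy; with Attempts=1 the
	// second failure exceeds the policy and mode "fail" stops restarting.
	rt = newRestartTracker(p, structs.JobTypeService)
	if state, _ := rt.SetRestartTriggered(true).GetState(); state != structs.TaskRestarting {
		t.Fatalf("expected policy-driven restart, got %v", state)
	}
	if state, _ := rt.SetRestartTriggered(true).GetState(); state != structs.TaskNotRestarting {
		t.Fatalf("expected no further restarts, got %v", state)
	}
}
```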
diff --git a/client/restarts_test.go b/client/restarts_test.go index 0d4c4e2068b..b0cad5b1a3c 100644 --- a/client/restarts_test.go +++ b/client/restarts_test.go @@ -104,6 +104,19 @@ func TestClient_RestartTracker_RestartTriggered(t *testing.T) { } } +func TestClient_RestartTracker_RestartTriggered_Failure(t *testing.T) { + t.Parallel() + p := testPolicy(true, structs.RestartPolicyModeFail) + p.Attempts = 1 + rt := newRestartTracker(p, structs.JobTypeService) + if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskRestarting || when == 0 { + t.Fatalf("expect restart got %v %v", state, when) + } + if state, when := rt.SetRestartTriggered(true).GetState(); state != structs.TaskNotRestarting || when != 0 { + t.Fatalf("expect failed got %v %v", state, when) + } +} + func TestClient_RestartTracker_StartError_Recoverable_Fail(t *testing.T) { t.Parallel() p := testPolicy(true, structs.RestartPolicyModeFail) From fa836d8ca556933d9f9e3af6d848bcd0693aeee3 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 14:57:18 -0700 Subject: [PATCH 28/33] Name const after what it represents --- client/task_runner.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/client/task_runner.go b/client/task_runner.go index 06acb6dc6fe..a5d96726e42 100644 --- a/client/task_runner.go +++ b/client/task_runner.go @@ -789,8 +789,8 @@ OUTER: return } case structs.VaultChangeModeRestart: - const failure = false - r.Restart("vault", "new Vault token acquired", failure) + const noFailure = false + r.Restart("vault", "new Vault token acquired", noFailure) case structs.VaultChangeModeNoop: fallthrough default: From 924813d57e06ca5bf6510c998a950f3118900fc7 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 15:01:56 -0700 Subject: [PATCH 29/33] Test converting CheckRestart from api->structs --- command/agent/job_endpoint_test.go | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/command/agent/job_endpoint_test.go b/command/agent/job_endpoint_test.go index f7ad1d398d7..9cf62f5e6d8 100644 --- a/command/agent/job_endpoint_test.go +++ b/command/agent/job_endpoint_test.go @@ -1216,6 +1216,11 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Interval: 4 * time.Second, Timeout: 2 * time.Second, InitialStatus: "ok", + CheckRestart: &api.CheckRestart{ + Limit: 3, + Grace: helper.TimeToPtr(10 * time.Second), + IgnoreWarnings: true, + }, }, }, }, @@ -1406,6 +1411,11 @@ func TestJobs_ApiJobToStructsJob(t *testing.T) { Interval: 4 * time.Second, Timeout: 2 * time.Second, InitialStatus: "ok", + CheckRestart: &structs.CheckRestart{ + Limit: 3, + Grace: 10 * time.Second, + IgnoreWarnings: true, + }, }, }, }, From 6bcf0199e9d262d30def66a694d388027f166ccd Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 15:12:47 -0700 Subject: [PATCH 30/33] Test CheckRestart.Validate --- nomad/structs/structs.go | 7 ++++--- nomad/structs/structs_test.go | 18 ++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/nomad/structs/structs.go b/nomad/structs/structs.go index 3b4a9436667..c4a66dd9be0 100644 --- a/nomad/structs/structs.go +++ b/nomad/structs/structs.go @@ -2780,15 +2780,16 @@ func (c *CheckRestart) Validate() error { return nil } + var mErr multierror.Error if c.Limit < 0 { - return fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit) + mErr.Errors = append(mErr.Errors, fmt.Errorf("limit must be greater than or equal to 0 but found %d", c.Limit)) } if c.Grace < 0 { - return 
fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace) + mErr.Errors = append(mErr.Errors, fmt.Errorf("grace period must be greater than or equal to 0 but found %d", c.Grace)) } - return nil + return mErr.ErrorOrNil() } const ( diff --git a/nomad/structs/structs_test.go b/nomad/structs/structs_test.go index 1ae8b782947..d4858cc7880 100644 --- a/nomad/structs/structs_test.go +++ b/nomad/structs/structs_test.go @@ -1154,6 +1154,24 @@ func TestTask_Validate_Service_Check(t *testing.T) { } } +func TestTask_Validate_Service_Check_CheckRestart(t *testing.T) { + invalidCheckRestart := &CheckRestart{ + Limit: -1, + Grace: -1, + } + + err := invalidCheckRestart.Validate() + assert.NotNil(t, err, "invalidateCheckRestart.Validate()") + assert.Len(t, err.(*multierror.Error).Errors, 2) + + validCheckRestart := &CheckRestart{} + assert.Nil(t, validCheckRestart.Validate()) + + validCheckRestart.Limit = 1 + validCheckRestart.Grace = 1 + assert.Nil(t, validCheckRestart.Validate()) +} + func TestTask_Validate_LogConfig(t *testing.T) { task := &Task{ LogConfig: DefaultLogConfig(), From 10ae18c7eb434af5f57b21a3db3d2acc11328433 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 15:18:32 -0700 Subject: [PATCH 31/33] Minor corrections to check_restart docs --- .../source/docs/job-specification/check_restart.html.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/website/source/docs/job-specification/check_restart.html.md b/website/source/docs/job-specification/check_restart.html.md index 344f8e80579..c39393219b6 100644 --- a/website/source/docs/job-specification/check_restart.html.md +++ b/website/source/docs/job-specification/check_restart.html.md @@ -30,6 +30,8 @@ unhealthy for the `limit` specified in a `check_restart` stanza, it is restarted according to the task group's [`restart` policy][restart_stanza]. The `check_restart` settings apply to [`check`s][check_stanza], but may also be placed on [`service`s][service_stanza] to apply to all checks on a service. +`check_restart` settings on `service` will only overwrite unset `check_restart` +settings on `checks.` ```hcl job "mysql" { @@ -66,7 +68,6 @@ job "mysql" { check_restart { limit = 3 grace = "90s" - ignore_warnings = false } } @@ -78,7 +79,7 @@ job "mysql" { - `limit` `(int: 0)` - Restart task when a health check has failed `limit` times. For example 1 causes a restart on the first failure. The default, - `0`, disables healtcheck based restarts. Failures must be consecutive. A + `0`, disables health check based restarts. Failures must be consecutive. A single passing check will reset the count, so flapping services may not be restarted. @@ -124,8 +125,8 @@ restart { ``` The [`restart` stanza][restart_stanza] controls the restart behavior of the -task. In this case it will wait 10 seconds before restarting. Note that even if -the check passes in this time the restart will still occur. +task. In this case it will stop the task and then wait 10 seconds before +starting it again. Once the task restarts Nomad waits the `grace` period again before starting to check the task's health. 
From 967825d341d0b5236bd6a7f5ca7090c843fc04b8 Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 15:19:53 -0700 Subject: [PATCH 32/33] Fix comments: task -> check --- command/agent/consul/check_watcher.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/command/agent/consul/check_watcher.go b/command/agent/consul/check_watcher.go index 29a158eaa05..4b0656765d6 100644 --- a/command/agent/consul/check_watcher.go +++ b/command/agent/consul/check_watcher.go @@ -267,7 +267,7 @@ func (w *checkWatcher) Run(ctx context.Context) { } } -// Watch a task and restart it if unhealthy. +// Watch a check and restart its task if unhealthy. func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.ServiceCheck, restarter TaskRestarter) { if !check.TriggersRestarts() { // Not watched, noop @@ -302,7 +302,7 @@ func (w *checkWatcher) Watch(allocID, taskName, checkID string, check *structs.S } } -// Unwatch a task. +// Unwatch a check. func (w *checkWatcher) Unwatch(cid string) { c := checkWatchUpdate{ checkID: cid, From 3d7446d99c1ef0170d3784615639a77152cf25ad Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Fri, 15 Sep 2017 15:53:22 -0700 Subject: [PATCH 33/33] @dadgar is better at words than me --- website/source/docs/job-specification/check_restart.html.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/website/source/docs/job-specification/check_restart.html.md b/website/source/docs/job-specification/check_restart.html.md index c39393219b6..bac3cec6197 100644 --- a/website/source/docs/job-specification/check_restart.html.md +++ b/website/source/docs/job-specification/check_restart.html.md @@ -30,8 +30,8 @@ unhealthy for the `limit` specified in a `check_restart` stanza, it is restarted according to the task group's [`restart` policy][restart_stanza]. The `check_restart` settings apply to [`check`s][check_stanza], but may also be placed on [`service`s][service_stanza] to apply to all checks on a service. -`check_restart` settings on `service` will only overwrite unset `check_restart` -settings on `checks.` +If `check_restart` is set on both the check and service, the stanza's are +merged with the check values taking precedence. ```hcl job "mysql" {
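The reworded sentence in this last patch pins down the merge rule: when `check_restart` appears on both the service and a check, the check's values win and the service only fills in what the check left unset. The `Merge` helper on `api.CheckRestart` referenced in the earlier canonicalization comment is what implements this; the sketch below only illustrates the precedence rule, with the field handling assumed rather than copied from the PR.

```go
// Illustrative only: how a check-level CheckRestart might be merged with a
// service-level one so that set check values take precedence and unset ones
// fall back to the service. See api/tasks.go in this PR for the real Merge.
package main

import (
	"fmt"
	"time"
)

type CheckRestart struct {
	Limit          int
	Grace          *time.Duration // pointer so "unset" is distinguishable from 0
	IgnoreWarnings bool
}

func mergeCheckRestart(check, service *CheckRestart) *CheckRestart {
	if check == nil {
		return service // nothing set on the check: use the service's settings
	}
	if service == nil {
		return check
	}
	merged := *check
	if merged.Limit == 0 { // assumed: zero value means "unset" for the sketch
		merged.Limit = service.Limit
	}
	if merged.Grace == nil {
		merged.Grace = service.Grace
	}
	if !merged.IgnoreWarnings {
		merged.IgnoreWarnings = service.IgnoreWarnings
	}
	return &merged
}

func main() {
	grace := 90 * time.Second
	service := &CheckRestart{Limit: 3, Grace: &grace, IgnoreWarnings: true}
	check := &CheckRestart{Limit: 5}

	m := mergeCheckRestart(check, service)
	fmt.Println(m.Limit, *m.Grace, m.IgnoreWarnings) // 5 1m30s true
}
```

Here the check's `limit = 5` overrides the service's `3`, while `grace` and `ignore_warnings` fall back to the service-level values.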