From a6fbbdd5440c1b19511d1753d0565b88d15d14fa Mon Sep 17 00:00:00 2001 From: Michael Schurter Date: Sun, 10 Sep 2017 21:38:31 -0700 Subject: [PATCH] Document new check_restart stanza --- website/source/api/json-jobs.html.md | 21 +++++- .../docs/job-specification/service.html.md | 74 +++++++++++++++++++ 2 files changed, 94 insertions(+), 1 deletion(-) diff --git a/website/source/api/json-jobs.html.md b/website/source/api/json-jobs.html.md index 8851df1d5cd..8e52d3ea43e 100644 --- a/website/source/api/json-jobs.html.md +++ b/website/source/api/json-jobs.html.md @@ -66,7 +66,12 @@ Below is the JSON representation of the job outputed by `$ nomad init`: "Interval": 10000000000, "Timeout": 2000000000, "InitialStatus": "", - "TLSSkipVerify": false + "TLSSkipVerify": false, + "CheckRestart": { + "Limit": 3, + "Grace": "30s", + "IgnoreWarnings": false + } }] }], "Resources": { @@ -377,6 +382,20 @@ The `Task` object supports the following keys: - `TLSSkipVerify`: If true, Consul will not attempt to verify the certificate when performing HTTPS checks. Requires Consul >= 0.7.2. + - `CheckRestart`: `CheckRestart` is an object which enables + restarting of tasks based upon Consul health checks. + + - `Limit`: The number of unhealthy checks allowed before the + service is restarted. Defaults to `0` which disables + health-based restarts. + + - `Grace`: The duration to wait after a task starts or restarts + before unhealthy checks count against the limit. + Defaults to "1s". + + - `IgnoreWarnings`: Treat checks that are warning as passing. + Defaults to false which means warnings are considered unhealthy. + - `ShutdownDelay` - Specifies the duration to wait when killing a task between removing it from Consul and sending it a shutdown signal. Ideally services would fail healthchecks once they receive a shutdown signal.
Alternatively diff --git a/website/source/docs/job-specification/service.html.md b/website/source/docs/job-specification/service.html.md index cef747b0dd6..2cee91b9d0e 100644 --- a/website/source/docs/job-specification/service.html.md +++ b/website/source/docs/job-specification/service.html.md @@ -47,6 +47,12 @@ job "docs" { args = ["--verbose"] interval = "60s" timeout = "5s" + + check_restart { + limit = 3 + grace = "90s" + ignore_warnings = false + } } } } @@ -162,6 +168,72 @@ scripts. - `tls_skip_verify` `(bool: false)` - Skip verifying TLS certificates for HTTPS checks. Requires Consul >= 0.7.2. +#### `check_restart` Stanza + +As of Nomad 0.7 `check` stanzas may include a `check_restart` stanza to restart +tasks with unhealthy checks. Restarts use the parameters from the +[`restart`][restart_stanza] stanza, so if a task group has the default `15s` +delay, tasks won't be restarted for an extra 15 seconds after the +`check_restart` block considers it failed. `check_restart` stanzas have the +following parameters: + +- `limit` `(int: 0)` - Restart task after `limit` failing health checks. For + example 1 causes a restart on the first failure. The default, `0`, disables + healthcheck-based restarts. Failures must be consecutive. A single passing + check will reset the count, so flapping services may not be restarted. + +- `grace` `(string: "1s")` - Duration to wait after a task starts or restarts + before checking its health. On restarts the `delay` and max jitter are added + to the grace period to prevent checking a task's health before it has + restarted. + +- `ignore_warnings` `(bool: false)` - By default checks with both `critical` + and `warning` statuses are considered unhealthy. Setting `ignore_warnings = + true` treats a `warning` status like `passing` and will not trigger a restart. + +For example: + +```hcl +restart { + delay = "8s" +} + +task "mysqld" { + service { + # ...
+ check { + type = "script" + name = "check_table" + command = "/usr/local/bin/check_mysql_table_status" + args = ["--verbose"] + interval = "20s" + timeout = "5s" + + check_restart { + # Restart the task after 3 consecutive failed checks (60s) + limit = 3 + + # Ignore failed checks for 90s after a service starts or restarts + grace = "90s" + + # Treat warnings as unhealthy (the default) + ignore_warnings = false + } + } + } +} +``` + +In this example the `mysqld` task has `90s` from startup to begin passing +healthchecks. After the grace period if `mysqld` remained unhealthy for +`60s` (as determined by `limit * interval`) it would be restarted after `8s` +(as determined by the `restart.delay`). Nomad would then wait `100s` (as +determined by `grace + delay + (delay * 0.25)`) before checking `mysqld`'s +health again. + +~> `check_restart` stanzas may also be placed in `service` stanzas to apply the + same restart logic to multiple checks. + #### `header` Stanza HTTP checks may include a `header` stanza to set HTTP headers. The `header` @@ -170,6 +242,7 @@ the header to be set multiple times, once for each value. ```hcl service { + # ... check { type = "http" port = "lb" @@ -319,3 +392,4 @@ system of a task for that driver. [interpolation]: /docs/runtime/interpolation.html "Nomad Runtime Interpolation" [network]: /docs/job-specification/network.html "Nomad network Job Specification" [qemu]: /docs/drivers/qemu.html "Nomad qemu Driver" +[restart_stanza]: /docs/job-specification/restart.html "restart stanza"