From 71fab46950e89341df8c9cab493569e8807abcbb Mon Sep 17 00:00:00 2001 From: Lee Hinman Date: Tue, 8 Oct 2019 08:32:15 -0600 Subject: [PATCH] Add Snapshot Lifecycle Retention documentation (#47545) * Add Snapshot Lifecycle Retention documentation This commits adds API and general purpose documentation for SLM retention. Relates to #43663 * Fix docs tests * Update default now that #47604 has been merged * Update docs/reference/ilm/apis/slm-api.asciidoc Co-Authored-By: Gordon Brown * Update docs/reference/ilm/apis/slm-api.asciidoc Co-Authored-By: Gordon Brown * Update docs with feedback --- docs/reference/ilm/apis/slm-api.asciidoc | 75 ++++++++--- .../ilm/getting-started-slm.asciidoc | 30 ++++- docs/reference/ilm/index.asciidoc | 2 + docs/reference/ilm/slm-retention.asciidoc | 119 ++++++++++++++++++ 4 files changed, 205 insertions(+), 21 deletions(-) create mode 100644 docs/reference/ilm/slm-retention.asciidoc diff --git a/docs/reference/ilm/apis/slm-api.asciidoc b/docs/reference/ilm/apis/slm-api.asciidoc index 4ac7a0b45331b..9522ccb7b76c4 100644 --- a/docs/reference/ilm/apis/slm-api.asciidoc +++ b/docs/reference/ilm/apis/slm-api.asciidoc @@ -7,7 +7,9 @@ The Snapshot Lifecycle Management APIs are used to manage policies for the time and frequency of automatic snapshots. Snapshot Lifecycle Management is related to <>, however, instead of managing a lifecycle of actions that are performed on a single index, SLM -allows configuring policies spanning multiple indices. +allows configuring policies spanning multiple indices. Snapshot Lifecycle +Management can also perform deletion of older snapshots based on a configurable +retention policy. SLM policy management is split into three different CRUD APIs, a way to put or update policies, a way to retrieve policies, and a way to delete unwanted policies, as @@ -62,7 +64,11 @@ PUT /_slm/policy/daily-snapshots "ignore_unavailable": false, "include_global_state": false }, - "retention": {} + "retention": { <6> + "expire_after": "30d", <7> + "min_count": 5, <8> + "max_count": 50 <9> + } } -------------------------------------------------- // TEST[setup:setup-repository] @@ -72,6 +78,10 @@ PUT /_slm/policy/daily-snapshots <3> Which repository to take the snapshot in <4> Any extra snapshot configuration <5> Which indices the snapshot should contain +<6> Optional retention configuration +<7> Keep snapshots for 30 days +<8> Always keep at least 5 successful snapshots, even if they're more than 30 days old +<9> Keep no more than 50 successful snapshots, even if they're less than 30 days old The top-level keys that the policy supports are described below: @@ -139,7 +149,11 @@ The output looks similar to the following: "ignore_unavailable": false, "include_global_state": false }, - "retention": {} + "retention": { + "expire_after": "30d", + "min_count": 5, + "max_count": 50 + } }, "stats": { "policy": "daily-snapshots", @@ -229,7 +243,11 @@ Which, in this case shows an error because the index did not exist: "ignore_unavailable": false, "include_global_state": false }, - "retention": {} + "retention": { + "expire_after": "30d", + "min_count": 5, + "max_count": 50 + } }, "stats": { "policy": "daily-snapshots", @@ -270,6 +288,11 @@ PUT /_slm/policy/daily-snapshots "indices": ["data-*", "important"], "ignore_unavailable": true, "include_global_state": false + }, + "retention": { + "expire_after": "30d", + "min_count": 5, + "max_count": 50 } } -------------------------------------------------- @@ -318,7 +341,11 @@ Which now includes the successful snapshot 
information: "ignore_unavailable": true, "include_global_state": false }, - "retention": {} + "retention": { + "expire_after": "30d", + "min_count": 5, + "max_count": 50 + } }, "stats": { "policy": "daily-snapshots", @@ -374,22 +401,14 @@ Which returns a response similar to: "retention_timed_out": 0, "retention_deletion_time": "1.4s", "retention_deletion_time_millis": 1404, - "policy_metrics": [ - { - "policy": "daily-snapshots", - "snapshots_taken": 1, - "snapshots_failed": 1, - "snapshots_deleted": 0, - "snapshot_deletion_failures": 0 - } - ], + "policy_stats": [ ], "total_snapshots_taken": 1, "total_snapshots_failed": 1, "total_snapshots_deleted": 0, "total_snapshot_deletion_failures": 0 } -------------------------------------------------- -// TESTRESPONSE[s/runs": 13/runs": $body.retention_runs/ s/_failed": 0/_failed": $body.retention_failed/ s/_timed_out": 0/_timed_out": $body.retention_timed_out/ s/"1.4s"/$body.retention_deletion_time/ s/1404/$body.retention_deletion_time_millis/] +// TESTRESPONSE[s/runs": 13/runs": $body.retention_runs/ s/_failed": 0/_failed": $body.retention_failed/ s/_timed_out": 0/_timed_out": $body.retention_timed_out/ s/"1.4s"/$body.retention_deletion_time/ s/1404/$body.retention_deletion_time_millis/ s/total_snapshots_taken": 1/total_snapshots_taken": $body.total_snapshots_taken/ s/total_snapshots_failed": 1/total_snapshots_failed": $body.total_snapshots_failed/ s/"policy_stats": [.*]/"policy_stats": $body.policy_stats/] [[slm-api-delete]] === Delete Snapshot Lifecycle Policy API @@ -410,3 +429,29 @@ any currently ongoing snapshots or remove any previously taken snapshots. DELETE /_slm/policy/daily-snapshots -------------------------------------------------- // TEST[continued] + +[[slm-api-execute-retention]] +=== Execute Snapshot Lifecycle Retention API + +While Snapshot Lifecycle Management retention is usually invoked through the global cluster settings +for its schedule, it can sometimes be useful to invoke a retention run to expunge expired snapshots +immediately. This API allows you to run a one-off retention run. + +==== Example + +To immediately start snapshot retention, use the following + +[source,console] +-------------------------------------------------- +POST /_slm/_execute_retention +-------------------------------------------------- + +This API will immediately return, as retention will be run asynchronously in the background: + +[source,console-result] +-------------------------------------------------- +{ + "acknowledged": true +} +-------------------------------------------------- + diff --git a/docs/reference/ilm/getting-started-slm.asciidoc b/docs/reference/ilm/getting-started-slm.asciidoc index c41d4da4092e2..54ebef9a8dd3b 100644 --- a/docs/reference/ilm/getting-started-slm.asciidoc +++ b/docs/reference/ilm/getting-started-slm.asciidoc @@ -6,7 +6,8 @@ Let's get started with snapshot lifecycle management (SLM) by working through a hands-on scenario. The goal of this example is to automatically back up {es} indices using the <> every day at a particular -time. +time. Once these snapshots have been created, they are kept for a configured +amount of time and then deleted per a configured retention policy. [float] [[slm-and-security]] @@ -14,8 +15,9 @@ time. Before starting, it's important to understand the privileges that are needed when configuring SLM if you are using the security plugin. There are two built-in cluster privileges that can be used to assist: `manage_slm` and -`read_slm`. 
It's also good to note that the `create_snapshot` permission
-allows taking snapshots even for indices the role may not have access to.
+`read_slm`. It's also good to note that the `cluster:admin/snapshot/*`
+permission allows taking and deleting snapshots even for indices the role may
+not have access to.

An example of configuring an administrator role for SLM follows:

@@ -23,7 +25,7 @@ An example of configuring an administrator role for SLM follows:
-----------------------------------
POST /_security/role/slm-admin
{
-  "cluster": ["manage_slm", "create_snapshot"],
+  "cluster": ["manage_slm", "cluster:admin/snapshot/*"],
  "indices": [
    {
      "names": [".slm-history-*"],
@@ -82,6 +84,10 @@ snapshots, what the snapshots should be named, and which indices should be
included, among other things. We'll use the <> API to create the
policy.

+When configuring a policy, retention can also optionally be configured. See
+the <> documentation for full details of how retention works.
+
[source,console]
--------------------------------------------------
PUT /_slm/policy/nightly-snapshots
{
  "schedule": "0 30 1 * * ?", <1>
  "name": "", <2>
  "repository": "my_repository", <3>
  "config": { <4>
    "indices": ["*"] <5>
  },
-  "retention": {}
+  "retention": { <6>
+    "expire_after": "30d", <7>
+    "min_count": 5, <8>
+    "max_count": 50 <9>
+  }
}
--------------------------------------------------
// TEST[continued]
@@ -105,6 +115,10 @@ PUT /_slm/policy/nightly-snapshots
<3> the repository the snapshot should be stored in
<4> the configuration to be used for the snapshot requests (see below)
<5> which indices should be included in the snapshot, in this case, every index
+<6> Optional retention configuration
+<7> Keep snapshots for 30 days
+<8> Always keep at least 5 successful snapshots
+<9> Keep no more than 50 successful snapshots, even if they're less than 30 days old

This policy will take a snapshot of every index each day at 1:30AM UTC.
Snapshots are incremental, allowing frequent snapshots to be stored efficiently,
@@ -166,7 +180,11 @@ next time the policy will be executed.
  "config": {
    "indices": ["*"],
  },
-  "retention": {}
+  "retention": {
+    "expire_after": "30d",
+    "min_count": 5,
+    "max_count": 50
+  }
},
"last_success": { <1>
  "snapshot_name": "nightly-snap-2019.04.24-tmtnyjtrsxkhbrrdcgg18a", <2>
diff --git a/docs/reference/ilm/index.asciidoc b/docs/reference/ilm/index.asciidoc
index 3ace2efe95bfd..10af04f8a14b1 100644
--- a/docs/reference/ilm/index.asciidoc
+++ b/docs/reference/ilm/index.asciidoc
@@ -87,3 +87,5 @@ include::start-stop-ilm.asciidoc[]
include::ilm-with-existing-indices.asciidoc[]

include::getting-started-slm.asciidoc[]
+
+include::slm-retention.asciidoc[]
diff --git a/docs/reference/ilm/slm-retention.asciidoc b/docs/reference/ilm/slm-retention.asciidoc
new file mode 100644
index 0000000000000..6362af3e3d5b7
--- /dev/null
+++ b/docs/reference/ilm/slm-retention.asciidoc
@@ -0,0 +1,119 @@
+[role="xpack"]
+[testenv="basic"]
+[[slm-retention]]
+== Snapshot lifecycle management retention
+
+Automatic deletion of older snapshots is an optional feature of snapshot lifecycle management.
+Retention is run as a cluster-level task that is not associated with a particular policy's schedule
+(though the configuration of which snapshots to keep is done on a per-policy basis). Retention
+configuration consists of two parts: the first is a cluster-level configuration for when retention
+is run and how long it may run; the second is configured on each policy and determines which
+snapshots are eligible for retention.
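+
+As an illustrative sketch of the cluster-level part, both settings (described in the table that
+follows) can be adjusted at runtime through the cluster settings API; the values shown here are
+simply the defaults:
+
+[source,console]
+--------------------------------------------------
+PUT /_cluster/settings
+{
+  "persistent": {
+    "slm.retention_schedule": "0 30 1 * * ?", <1>
+    "slm.retention_duration": "1h" <2>
+  }
+}
+--------------------------------------------------
+<1> When the retention task should run, expressed as a cron schedule
+<2> How long a single retention run is allowed to take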
+
+The cluster-level settings for retention are shown below, and can be changed dynamically using the
+<> API:
+
+|=====================================
+| Setting | Default value | Description
+
+| `slm.retention_schedule` | `0 30 1 * * ?` | A periodic or absolute time schedule for when
+    retention should be run. Supports all values supported by the cron scheduler: <>. Retention can also be manually run using the
+    <>. Defaults to daily at 1:30am in the master
+    node's timezone.
+
+| `slm.retention_duration` | `"1h"` | A limit of how long SLM should spend deleting old snapshots.
+|=====================================
+
+Policy-level configuration for retention is done inside the `retention` object when creating or
+updating a policy. All of the retention configuration options are optional.
+
+[source,console]
+--------------------------------------------------
+PUT /_slm/policy/daily-snapshots
+{
+  "schedule": "0 30 1 * * ?",
+  "name": "",
+  "repository": "my_repository",
+  "retention": { <1>
+    "expire_after": "30d", <2>
+    "min_count": 5, <3>
+    "max_count": 50 <4>
+  }
+}
+--------------------------------------------------
+// TEST[setup:setup-repository]
+<1> Optional retention configuration
+<2> Keep snapshots for 30 days
+<3> Always keep at least 5 successful snapshots
+<4> Keep no more than 50 successful snapshots
+
+The supported retention configuration options within a policy are as follows. The default value
+for each is unset unless specified by the user in the policy configuration.
+
+NOTE: The oldest snapshots are always deleted first. For example, if a policy has a `max_count` of
+5 and there are 6 snapshots, the oldest snapshot is deleted.
+
+|=====================================
+| Setting | Description
+| `expire_after` | A time value for how old a snapshot must be in order to be eligible for deletion.
+| `min_count` | A minimum number of snapshots to keep, regardless of age.
+| `max_count` | The maximum number of snapshots to keep, regardless of age.
+|=====================================
+
+As an example, the retention configuration in the policy above would read in English as:
+
+____
+Remove snapshots older than thirty days, but always keep the latest five snapshots. If there are
+more than fifty snapshots, remove the oldest surplus snapshots until there are no more than fifty
+successful snapshots.
+____
+
+Snapshots taken by other policies that use the same repository, or manual snapshots taken without
+using the <>, are not eligible for a policy's retention and do not count towards its
+limits. This allows multiple policies to have differing retention configurations while using the
+same snapshot repository.
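+
+For instance, using the `my_repository` repository from the example above and a hypothetical
+snapshot name, a snapshot created directly through the snapshot API is never deleted by the
+`daily-snapshots` retention rules:
+
+[source,console]
+--------------------------------------------------
+PUT /_snapshot/my_repository/manual-backup-1?wait_for_completion=true <1>
+
+GET /_snapshot/my_repository/_all <2>
+--------------------------------------------------
+<1> Takes a snapshot outside of SLM; retention ignores it and it does not count towards `min_count` or `max_count`
+<2> Lists every snapshot in the repository, whether taken manually or by a policy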
+
+Statistics for snapshot retention can be retrieved using the <>:
+
+[source,console]
+--------------------------------------------------
+GET /_slm/stats
+--------------------------------------------------
+// TEST[continued]
+
+Which returns a response similar to:
+
+[source,js]
+--------------------------------------------------
+{
+  "retention_runs": 13, <1>
+  "retention_failed": 0, <2>
+  "retention_timed_out": 0, <3>
+  "retention_deletion_time": "1.4s", <4>
+  "retention_deletion_time_millis": 1404,
+  "policy_stats": [
+    {
+      "policy": "daily-snapshots",
+      "snapshots_taken": 1,
+      "snapshots_failed": 1,
+      "snapshots_deleted": 0, <5>
+      "snapshot_deletion_failures": 0 <6>
+    }
+  ],
+  "total_snapshots_taken": 1,
+  "total_snapshots_failed": 1,
+  "total_snapshots_deleted": 0, <7>
+  "total_snapshot_deletion_failures": 0 <8>
+}
+--------------------------------------------------
+// TESTRESPONSE[skip:this is not actually running retention]
+<1> Number of times retention has been run
+<2> Number of times retention failed while running
+<3> Number of times retention hit the `slm.retention_duration` time limit and had to stop before deleting all eligible snapshots
+<4> Total time spent deleting snapshots by the retention process
+<5> Number of snapshots created by the "daily-snapshots" policy that have been deleted
+<6> Number of snapshots that failed to be deleted
+<7> Total number of snapshots deleted across all policies
+<8> Total number of snapshot deletion failures across all policies
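+
+To see the effect of retention without waiting for the next scheduled run, a run can be triggered
+manually and the statistics checked again afterwards. This usage sketch simply combines the
+execute-retention and stats APIs shown above:
+
+[source,console]
+--------------------------------------------------
+POST /_slm/_execute_retention <1>
+
+GET /_slm/stats <2>
+--------------------------------------------------
+<1> Starts a retention run immediately; the run itself executes asynchronously in the background
+<2> Once the run has finished, `retention_runs` and the deletion counters reflect its results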