diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 91c9fd1cc6c76..f81dc6ae1a1c8 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -604,7 +604,7 @@ threshold has been breached: logger.warn( "flood stage disk watermark [{}] exceeded on {}, all indices on this node will be marked read-only", - diskThresholdSettings.describeFloodStageThreshold(), + diskThresholdSettings.describeFloodStageThreshold(total, false), usage ); diff --git a/docs/reference/how-to/fix-common-cluster-issues.asciidoc b/docs/reference/how-to/fix-common-cluster-issues.asciidoc index 28d79f63761eb..dc3da98002190 100644 --- a/docs/reference/how-to/fix-common-cluster-issues.asciidoc +++ b/docs/reference/how-to/fix-common-cluster-issues.asciidoc @@ -51,8 +51,13 @@ PUT _cluster/settings { "persistent": { "cluster.routing.allocation.disk.watermark.low": "90%", + "cluster.routing.allocation.disk.watermark.low.max_headroom": "100gb", "cluster.routing.allocation.disk.watermark.high": "95%", - "cluster.routing.allocation.disk.watermark.flood_stage": "97%" + "cluster.routing.allocation.disk.watermark.high.max_headroom": "20gb", + "cluster.routing.allocation.disk.watermark.flood_stage": "97%", + "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5gb", + "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%", + "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5gb" } } @@ -82,8 +87,13 @@ PUT _cluster/settings { "persistent": { "cluster.routing.allocation.disk.watermark.low": null, + "cluster.routing.allocation.disk.watermark.low.max_headroom": null, "cluster.routing.allocation.disk.watermark.high": null, - "cluster.routing.allocation.disk.watermark.flood_stage": null + "cluster.routing.allocation.disk.watermark.high.max_headroom": null, + "cluster.routing.allocation.disk.watermark.flood_stage": null, + "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": null, + "cluster.routing.allocation.disk.watermark.flood_stage.frozen": null, + "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": null } } ---- @@ -674,8 +684,8 @@ for tips on diagnosing and preventing them. [[task-queue-backlog]] === Task queue backlog -A backlogged task queue can prevent tasks from completing and -put the cluster into an unhealthy state. +A backlogged task queue can prevent tasks from completing and +put the cluster into an unhealthy state. Resource constraints, a large number of tasks being triggered at once, and long running tasks can all contribute to a backlogged task queue. @@ -685,11 +695,11 @@ and long running tasks can all contribute to a backlogged task queue. **Check the thread pool status** -A <> can result in <>. +A <> can result in <>. -You can use the <> to +You can use the <> to see the number of active threads in each thread pool and -how many tasks are queued, how many have been rejected, and how many have completed. +how many tasks are queued, how many have been rejected, and how many have completed. [source,console] ---- @@ -698,9 +708,9 @@ GET /_cat/thread_pool?v&s=t,n&h=type,name,node_name,active,queue,rejected,comple **Inspect the hot threads on each node** -If a particular thread pool queue is backed up, -you can periodically poll the <> API -to determine if the thread has sufficient +If a particular thread pool queue is backed up, +you can periodically poll the <> API +to determine if the thread has sufficient resources to progress and gauge how quickly it is progressing. 
[source,console] @@ -710,9 +720,9 @@ GET /_nodes/hot_threads **Look for long running tasks** -Long-running tasks can also cause a backlog. -You can use the <> API to get information about the tasks that are running. -Check the `running_time_in_nanos` to identify tasks that are taking an excessive amount of time to complete. +Long-running tasks can also cause a backlog. +You can use the <> API to get information about the tasks that are running. +Check the `running_time_in_nanos` to identify tasks that are taking an excessive amount of time to complete. [source,console] ---- @@ -723,10 +733,10 @@ GET /_tasks?filter_path=nodes.*.tasks [[resolve-task-queue-backlog]] ==== Resolve a task queue backlog -**Increase available resources** +**Increase available resources** -If tasks are progressing slowly and the queue is backing up, -you might need to take steps to <>. +If tasks are progressing slowly and the queue is backing up, +you might need to take steps to <>. In some cases, increasing the thread pool size might help. For example, the `force_merge` thread pool defaults to a single thread. @@ -734,5 +744,5 @@ Increasing the size to 2 might help reduce a backlog of force merge requests. **Cancel stuck tasks** -If you find the active task's hot thread isn't progressing and there's a backlog, -consider canceling the task. +If you find the active task's hot thread isn't progressing and there's a backlog, +consider canceling the task. diff --git a/docs/reference/index-modules/blocks.asciidoc b/docs/reference/index-modules/blocks.asciidoc index 3c4c8b05a757a..dc23c75a3c210 100644 --- a/docs/reference/index-modules/blocks.asciidoc +++ b/docs/reference/index-modules/blocks.asciidoc @@ -37,7 +37,8 @@ block and makes resources available almost immediately. + IMPORTANT: {es} adds and removes the read-only index block automatically when the disk utilization falls below the high watermark, controlled by -<>. +<> +and <>. `index.blocks.read`:: diff --git a/docs/reference/modules/cluster/disk_allocator.asciidoc b/docs/reference/modules/cluster/disk_allocator.asciidoc index 11d94b5c59c43..b8606d0d94508 100644 --- a/docs/reference/modules/cluster/disk_allocator.asciidoc +++ b/docs/reference/modules/cluster/disk_allocator.asciidoc @@ -72,16 +72,26 @@ Defaults to `true`. Set to `false` to disable the disk allocation decider. // tag::cluster-routing-watermark-low-tag[] `cluster.routing.allocation.disk.watermark.low` {ess-icon}:: (<>) -Controls the low watermark for disk usage. It defaults to `85%`, meaning that {es} will not allocate shards to nodes that have more than 85% disk used. It can also be set to an absolute byte value (like `500mb`) to prevent {es} from allocating shards if less than the specified amount of space is available. This setting has no effect on the primary shards of newly-created indices but will prevent their replicas from being allocated. +Controls the low watermark for disk usage. It defaults to `85%`, meaning that {es} will not allocate shards to nodes that have more than 85% disk used. It can alternatively be set to a ratio value, e.g., `0.85`. It can also be set to an absolute byte value (like `500mb`) to prevent {es} from allocating shards if less than the specified amount of space is available. This setting has no effect on the primary shards of newly-created indices but will prevent their replicas from being allocated. 
// end::cluster-routing-watermark-low-tag[] +`cluster.routing.allocation.disk.watermark.low.max_headroom` {ess-icon}:: +(<>) Controls the max headroom for the low stage watermark (in case of a percentage/ratio value). +Defaults to 150gb when `cluster.routing.allocation.disk.watermark.low` is not explicitly set. +This caps the amount of free space required. + [[cluster-routing-watermark-high]] // tag::cluster-routing-watermark-high-tag[] `cluster.routing.allocation.disk.watermark.high` {ess-icon}:: (<>) -Controls the high watermark. It defaults to `90%`, meaning that {es} will attempt to relocate shards away from a node whose disk usage is above 90%. It can also be set to an absolute byte value (similarly to the low watermark) to relocate shards away from a node if it has less than the specified amount of free space. This setting affects the allocation of all shards, whether previously allocated or not. +Controls the high watermark. It defaults to `90%`, meaning that {es} will attempt to relocate shards away from a node whose disk usage is above 90%. It can alternatively be set to a ratio value, e.g., `0.9`. It can also be set to an absolute byte value (similarly to the low watermark) to relocate shards away from a node if it has less than the specified amount of free space. This setting affects the allocation of all shards, whether previously allocated or not. // end::cluster-routing-watermark-high-tag[] +`cluster.routing.allocation.disk.watermark.high.max_headroom` {ess-icon}:: +(<>) Controls the max headroom for the high stage watermark (in case of a percentage/ratio value). +Defaults to 100gb when `cluster.routing.allocation.disk.watermark.high` is not explicitly set. +This caps the amount of free space required. + `cluster.routing.allocation.disk.watermark.enable_for_single_data_node`:: (<>) In earlier releases, the default behaviour was to disregard disk watermarks for a single @@ -95,10 +105,16 @@ is now `true`. The setting will be removed in a future release. + -- (<>) -Controls the flood stage watermark, which defaults to 95%. {es} enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node, and that has at least one disk exceeding the flood stage. This setting is a last resort to prevent nodes from running out of disk space. The index block is automatically released when the disk utilization falls below the high watermark. +Controls the flood stage watermark, which defaults to 95%. {es} enforces a read-only index block (`index.blocks.read_only_allow_delete`) on every index that has one or more shards allocated on the node, and that has at least one disk exceeding the flood stage. This setting is a last resort to prevent nodes from running out of disk space. The index block is automatically released when the disk utilization falls below the high watermark. Similarly to the low and high watermark values, it can alternatively be set to a ratio value, e.g., `0.95`, or an absolute byte value. + +`cluster.routing.allocation.disk.watermark.flood_stage.max_headroom` {ess-icon}:: +(<>) Controls the max headroom for the flood stage watermark (in case of a percentage/ratio value). +Defaults to 20gb when +`cluster.routing.allocation.disk.watermark.flood_stage` is not explicitly set. +This caps the amount of free space required. -NOTE: You cannot mix the usage of percentage values and byte values within -these settings. Either all values are set to percentage values, or all are set to byte values. 
This enforcement is so that {es} can validate that the settings are internally consistent, ensuring that the low disk threshold is less than the high disk threshold, and the high disk threshold is less than the flood stage threshold. +NOTE: You cannot mix the usage of percentage/ratio values and byte values within +the watermark settings. Either all values are set to percentage/ratio values, or all are set to byte values. This enforcement is so that {es} can validate that the settings are internally consistent, ensuring that the low disk threshold is less than the high disk threshold, and the high disk threshold is less than the flood stage threshold. A similar check is done for the max headroom values. An example of resetting the read-only index block on the `my-index-000001` index: @@ -123,7 +139,7 @@ Controls the flood stage watermark for dedicated frozen nodes, which defaults to `cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom` {ess-icon}:: (<>) Controls the max headroom for the flood stage watermark for dedicated frozen -nodes. Defaults to 20GB when +nodes. Defaults to 20gb when `cluster.routing.allocation.disk.watermark.flood_stage.frozen` is not explicitly set. This caps the amount of free space required on dedicated frozen nodes. diff --git a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc index 3c13b04015e5c..5a68b2ff1e457 100644 --- a/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc +++ b/docs/reference/troubleshooting/common-issues/disk-usage-exceeded.asciidoc @@ -46,8 +46,13 @@ PUT _cluster/settings { "persistent": { "cluster.routing.allocation.disk.watermark.low": "90%", + "cluster.routing.allocation.disk.watermark.low.max_headroom": "100gb", "cluster.routing.allocation.disk.watermark.high": "95%", - "cluster.routing.allocation.disk.watermark.flood_stage": "97%" + "cluster.routing.allocation.disk.watermark.high.max_headroom": "20gb", + "cluster.routing.allocation.disk.watermark.flood_stage": "97%", + "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": "5gb", + "cluster.routing.allocation.disk.watermark.flood_stage.frozen": "97%", + "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": "5gb" } } @@ -77,8 +82,13 @@ PUT _cluster/settings { "persistent": { "cluster.routing.allocation.disk.watermark.low": null, + "cluster.routing.allocation.disk.watermark.low.max_headroom": null, "cluster.routing.allocation.disk.watermark.high": null, - "cluster.routing.allocation.disk.watermark.flood_stage": null + "cluster.routing.allocation.disk.watermark.high.max_headroom": null, + "cluster.routing.allocation.disk.watermark.flood_stage": null, + "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom": null, + "cluster.routing.allocation.disk.watermark.flood_stage.frozen": null, + "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom": null } } ----- \ No newline at end of file +---- diff --git a/docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc b/docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc index 7433e25a43947..a9407cb611c42 100644 --- a/docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc +++ b/docs/reference/troubleshooting/fix-common-cluster-issues.asciidoc @@ -16,8 +16,8 @@ the operation and returns an error. The most common causes of high CPU usage and their solutions. 
<>:: -High JVM memory usage can degrade cluster performance and trigger circuit -breaker errors. +High JVM memory usage can degrade cluster performance and trigger circuit +breaker errors. <>:: A red or yellow cluster status indicates one or more shards are missing or @@ -29,8 +29,8 @@ When {es} rejects a request, it stops the operation and returns an error with a `429` response code. <>:: -A backlogged task queue can prevent tasks from completing and put the cluster -into an unhealthy state. +A backlogged task queue can prevent tasks from completing and put the cluster +into an unhealthy state. include::common-issues/disk-usage-exceeded.asciidoc[] include::common-issues/circuit-breaker-errors.asciidoc[] @@ -38,4 +38,4 @@ include::common-issues/high-cpu-usage.asciidoc[] include::common-issues/high-jvm-memory-pressure.asciidoc[] include::common-issues/red-yellow-cluster-status.asciidoc[] include::common-issues/rejected-requests.asciidoc[] -include::common-issues/task-queue-backlog.asciidoc[] \ No newline at end of file +include::common-issues/task-queue-backlog.asciidoc[] diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java index 7b91da334e517..3ae64fa2073a8 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/MockDiskUsagesIT.java @@ -34,8 +34,11 @@ import java.util.Map; import java.util.concurrent.atomic.AtomicReference; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING; import static org.elasticsearch.cluster.routing.allocation.decider.EnableAllocationDecider.CLUSTER_ROUTING_REBALANCE_ENABLE_SETTING; @@ -99,8 +102,11 @@ public void testRerouteOccursOnDiskPassingHighWatermark() throws Exception { .setPersistentSettings( Settings.builder() .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%") + .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "10b") .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%") + .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "10b") .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? 
"0b" : "100%") + .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "10b") .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms") ) ); @@ -179,8 +185,11 @@ public void testAutomaticReleaseOfIndexBlock() throws Exception { .setPersistentSettings( Settings.builder() .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%") + .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "10b") .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), watermarkBytes ? "10b" : "90%") + .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "10b") .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), watermarkBytes ? "5b" : "95%") + .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), watermarkBytes ? "-1" : "5b") .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "150ms") ) ); @@ -274,6 +283,7 @@ public void testOnlyMovesEnoughShardsToDropBelowHighWatermark() throws Exception .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%") .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "100%") + .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "0b") .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms") ) ); @@ -366,7 +376,9 @@ public void testDoesNotExceedLowWatermarkWhenRebalancing() throws Exception { Settings.builder() .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "85%") .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "100%") + .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "0b") .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "100%") + .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "0b") ) ); @@ -451,6 +463,7 @@ public void testMovesShardsOffSpecificDataPathAboveWatermark() throws Exception .put(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") .put(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "90%") .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "100%") + .put(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "0b") .put(CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "0ms") ) ); diff --git a/server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java b/server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java index 26119bb174eb1..0783175223f23 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/health/HealthMetadataServiceIT.java @@ -39,12 +39,14 @@ public void testEachMasterPublishesTheirThresholds() throws Exception { try (InternalTestCluster internalCluster = internalCluster()) { int numberOfNodes = 3; Map watermarkByNode = new HashMap<>(); + Map maxHeadroomByNode = new HashMap<>(); for (int i = 0; i < numberOfNodes; i++) { - String customWatermark = percentageMode - ? 
randomIntBetween(86, 94) + "%" - : new ByteSizeValue(randomIntBetween(6, 19)).toString(); - String nodeName = startNode(internalCluster, customWatermark); + ByteSizeValue randomBytes = new ByteSizeValue(randomLongBetween(6, 19)); + String customWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString(); + ByteSizeValue customMaxHeadroom = percentageMode ? randomBytes : new ByteSizeValue(-1L); + String nodeName = startNode(internalCluster, customWatermark, customMaxHeadroom.toString()); watermarkByNode.put(nodeName, customWatermark); + maxHeadroomByNode.put(nodeName, customMaxHeadroom); } ensureStableCluster(numberOfNodes); @@ -53,6 +55,7 @@ public void testEachMasterPublishesTheirThresholds() throws Exception { HealthMetadata.Disk diskMetadata = HealthMetadata.getHealthCustomMetadata(internalCluster.clusterService().state()) .getDiskMetadata(); assertThat(diskMetadata.describeHighWatermark(), equalTo(watermarkByNode.get(electedMaster))); + assertThat(diskMetadata.highMaxHeadroom(), equalTo(maxHeadroomByNode.get(electedMaster))); } // Stop the master to ensure another node will become master with a different watermark @@ -63,6 +66,7 @@ public void testEachMasterPublishesTheirThresholds() throws Exception { HealthMetadata.Disk diskMetadata = HealthMetadata.getHealthCustomMetadata(internalCluster.clusterService().state()) .getDiskMetadata(); assertThat(diskMetadata.describeHighWatermark(), equalTo(watermarkByNode.get(electedMaster))); + assertThat(diskMetadata.highMaxHeadroom(), equalTo(maxHeadroomByNode.get(electedMaster))); } } } @@ -70,28 +74,29 @@ public void testEachMasterPublishesTheirThresholds() throws Exception { public void testWatermarkSettingUpdate() throws Exception { try (InternalTestCluster internalCluster = internalCluster()) { int numberOfNodes = 3; - String initialWatermark = percentageMode - ? randomIntBetween(86, 94) + "%" - : new ByteSizeValue(randomIntBetween(6, 19)).toString(); + ByteSizeValue randomBytes = new ByteSizeValue(randomLongBetween(6, 19)); + String initialWatermark = percentageMode ? randomIntBetween(86, 94) + "%" : randomBytes.toString(); + ByteSizeValue initialMaxHeadroom = percentageMode ? randomBytes : new ByteSizeValue(-1L); for (int i = 0; i < numberOfNodes; i++) { - startNode(internalCluster, initialWatermark); + startNode(internalCluster, initialWatermark, initialMaxHeadroom.toString()); } - String updatedLowWatermark = percentageMode - ? randomIntBetween(40, 59) + "%" - : new ByteSizeValue(randomIntBetween(101, 200)).toString(); - String updatedHighWatermark = percentageMode - ? randomIntBetween(60, 90) + "%" - : new ByteSizeValue(randomIntBetween(50, 100)).toString(); - String updatedFloodStageWatermark = percentageMode - ? randomIntBetween(91, 95) + "%" - : new ByteSizeValue(randomIntBetween(5, 10)).toString(); + randomBytes = new ByteSizeValue(randomLongBetween(101, 200)); + String updatedLowWatermark = percentageMode ? randomIntBetween(40, 59) + "%" : randomBytes.toString(); + ByteSizeValue updatedLowMaxHeadroom = percentageMode ? randomBytes : new ByteSizeValue(-1L); + randomBytes = new ByteSizeValue(randomLongBetween(50, 100)); + String updatedHighWatermark = percentageMode ? randomIntBetween(60, 90) + "%" : randomBytes.toString(); + ByteSizeValue updatedHighMaxHeadroom = percentageMode ? randomBytes : new ByteSizeValue(-1L); + randomBytes = new ByteSizeValue(randomLongBetween(5, 10)); + String updatedFloodStageWatermark = percentageMode ? 
randomIntBetween(91, 95) + "%" : randomBytes.toString(); + ByteSizeValue updatedFloodStageMaxHeadroom = percentageMode ? randomBytes : new ByteSizeValue(-1L); ensureStableCluster(numberOfNodes); { HealthMetadata.Disk diskMetadata = HealthMetadata.getHealthCustomMetadata(internalCluster.clusterService().state()) .getDiskMetadata(); assertThat(diskMetadata.describeHighWatermark(), equalTo(initialWatermark)); + assertThat(diskMetadata.highMaxHeadroom(), equalTo(initialMaxHeadroom)); } internalCluster.client() .admin() @@ -100,14 +105,26 @@ public void testWatermarkSettingUpdate() throws Exception { new ClusterUpdateSettingsRequest().persistentSettings( Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), updatedLowWatermark) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + updatedLowMaxHeadroom + ) .put( DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), updatedHighWatermark ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + updatedHighMaxHeadroom + ) .put( DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), updatedFloodStageWatermark ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), + updatedFloodStageMaxHeadroom + ) ) ) .actionGet(); @@ -115,30 +132,38 @@ public void testWatermarkSettingUpdate() throws Exception { HealthMetadata.Disk diskMetadata = HealthMetadata.getHealthCustomMetadata(internalCluster.clusterService().state()) .getDiskMetadata(); assertThat(diskMetadata.describeHighWatermark(), equalTo(updatedHighWatermark)); + assertThat(diskMetadata.highMaxHeadroom(), equalTo(updatedHighMaxHeadroom)); assertThat(diskMetadata.describeFloodStageWatermark(), equalTo(updatedFloodStageWatermark)); + assertThat(diskMetadata.floodStageMaxHeadroom(), equalTo(updatedFloodStageMaxHeadroom)); }); } } - private String startNode(InternalTestCluster internalCluster, String customWatermark) { + private String startNode(InternalTestCluster internalCluster, String customWatermark, String customMaxHeadroom) { return internalCluster.startNode( Settings.builder() .put(onlyRoles(Set.of(DiscoveryNodeRole.MASTER_ROLE, DiscoveryNodeRole.DATA_ROLE))) - .put(createWatermarkSettings(customWatermark)) + .put(createWatermarkSettings(customWatermark, customMaxHeadroom)) .build() ); } - private Settings createWatermarkSettings(String highWatermark) { + private Settings createWatermarkSettings(String highWatermark, String highMaxHeadroom) { // We define both thresholds to avoid inconsistencies over the type of the thresholds return Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), percentageMode ? "85%" : "20b") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "20b") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), highWatermark) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), highMaxHeadroom) .put( DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), percentageMode ? "95%" : "1b" ) - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING.getKey(), percentageMode ? 
"95%" : "5b") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "1b") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + percentageMode ? "95%" : "5b" + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "5b") .build(); } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java index 04e47b4a58b30..f603578da343a 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitor.java @@ -166,14 +166,13 @@ public void onNewInfo(ClusterInfo info) { final String node = entry.getKey(); final DiskUsage usage = entry.getValue(); final RoutingNode routingNode = routingNodes.node(node); + final ByteSizeValue total = ByteSizeValue.ofBytes(usage.getTotalBytes()); if (isDedicatedFrozenNode(routingNode)) { - ByteSizeValue total = ByteSizeValue.ofBytes(usage.getTotalBytes()); - long frozenFloodStageThreshold = diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(total).getBytes(); - if (usage.getFreeBytes() < frozenFloodStageThreshold) { + if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(total).getBytes()) { logger.warn( "flood stage disk watermark [{}] exceeded on {}", - diskThresholdSettings.describeFrozenFloodStageThreshold(total), + diskThresholdSettings.describeFrozenFloodStageThreshold(total, false), usage ); } @@ -182,9 +181,7 @@ public void onNewInfo(ClusterInfo info) { continue; } - if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage().getBytes() - || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdFloodStage()) { - + if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdFloodStage(total).getBytes()) { nodesOverLowThreshold.add(node); nodesOverHighThreshold.add(node); nodesOverHighThresholdAndRelocating.remove(node); @@ -199,16 +196,14 @@ public void onNewInfo(ClusterInfo info) { logger.warn( "flood stage disk watermark [{}] exceeded on {}, all indices on this node will be marked read-only", - diskThresholdSettings.describeFloodStageThreshold(), + diskThresholdSettings.describeFloodStageThreshold(total, false), usage ); continue; } - if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() - || usage.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) { - + if (usage.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { if (routingNode != null) { // might be temporarily null if the ClusterInfoService and the ClusterService are out of step for (ShardRouting routing : routingNode) { String indexName = routing.index().getName(); @@ -226,9 +221,7 @@ public void onNewInfo(ClusterInfo info) { Math.max(0L, usage.getFreeBytes() - reservedSpace) ); - if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() - || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) { - + if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { nodesOverLowThreshold.add(node); nodesOverHighThreshold.add(node); @@ -245,61 
+238,57 @@ public void onNewInfo(ClusterInfo info) { ); } - } else if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() - || usageWithReservedSpace.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdLow()) { + } else if (usageWithReservedSpace.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdLowStage(total).getBytes()) { + nodesOverHighThresholdAndRelocating.remove(node); + + final boolean wasUnderLowThreshold = nodesOverLowThreshold.add(node); + final boolean wasOverHighThreshold = nodesOverHighThreshold.remove(node); + assert (wasUnderLowThreshold && wasOverHighThreshold) == false; + + if (wasUnderLowThreshold) { + logger.info( + "low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node", + diskThresholdSettings.describeLowThreshold(total, false), + usage + ); + } else if (wasOverHighThreshold) { + logger.info( + "high disk watermark [{}] no longer exceeded on {}, but low disk watermark [{}] is still exceeded", + diskThresholdSettings.describeHighThreshold(total, false), + usage, + diskThresholdSettings.describeLowThreshold(total, false) + ); + } - nodesOverHighThresholdAndRelocating.remove(node); + } else { + nodesOverHighThresholdAndRelocating.remove(node); - final boolean wasUnderLowThreshold = nodesOverLowThreshold.add(node); - final boolean wasOverHighThreshold = nodesOverHighThreshold.remove(node); - assert (wasUnderLowThreshold && wasOverHighThreshold) == false; + if (nodesOverLowThreshold.contains(node)) { + // The node has previously been over the low watermark, but is no longer, so it may be possible to allocate more + // shards if we reroute now. + if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) { + reroute = true; + explanation = "one or more nodes has gone under the high or low watermark"; + nodesOverLowThreshold.remove(node); + nodesOverHighThreshold.remove(node); - if (wasUnderLowThreshold) { logger.info( - "low disk watermark [{}] exceeded on {}, replicas will not be assigned to this node", - diskThresholdSettings.describeLowThreshold(), + "low disk watermark [{}] no longer exceeded on {}", + diskThresholdSettings.describeLowThreshold(total, false), usage ); - } else if (wasOverHighThreshold) { - logger.info( - "high disk watermark [{}] no longer exceeded on {}, but low disk watermark [{}] is still exceeded", - diskThresholdSettings.describeHighThreshold(), - usage, - diskThresholdSettings.describeLowThreshold() - ); - } - - } else { - nodesOverHighThresholdAndRelocating.remove(node); - - if (nodesOverLowThreshold.contains(node)) { - // The node has previously been over the low watermark, but is no longer, so it may be possible to allocate more - // shards - // if we reroute now. 
- if (lastRunTimeMillis.get() <= currentTimeMillis - diskThresholdSettings.getRerouteInterval().millis()) { - reroute = true; - explanation = "one or more nodes has gone under the high or low watermark"; - nodesOverLowThreshold.remove(node); - nodesOverHighThreshold.remove(node); - - logger.info( - "low disk watermark [{}] no longer exceeded on {}", - diskThresholdSettings.describeLowThreshold(), - usage - ); - - } else { - logger.debug( - "{} has gone below a disk threshold, but an automatic reroute has occurred " - + "in the last [{}], skipping reroute", - node, - diskThresholdSettings.getRerouteInterval() - ); - } + } else { + logger.debug( + "{} has gone below a disk threshold, but an automatic reroute has occurred " + + "in the last [{}], skipping reroute", + node, + diskThresholdSettings.getRerouteInterval() + ); } - } + + } } final ActionListener listener = new GroupedActionListener<>(ActionListener.wrap(this::checkFinished), 3); @@ -325,16 +314,15 @@ public void onNewInfo(ClusterInfo info) { usageIncludingRelocations = diskUsage; relocatingShardsSize = 0L; } + final ByteSizeValue total = ByteSizeValue.ofBytes(usageIncludingRelocations.getTotalBytes()); - if (usageIncludingRelocations.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes() - || usageIncludingRelocations.getFreeDiskAsPercentage() < diskThresholdSettings.getFreeDiskThresholdHigh()) { - + if (usageIncludingRelocations.getFreeBytes() < diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { nodesOverHighThresholdAndRelocating.remove(diskUsage.getNodeId()); logger.warn( "high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to continue to exceed " + "the high disk watermark when these relocations are complete", - diskThresholdSettings.describeHighThreshold(), + diskThresholdSettings.describeHighThreshold(total, false), diskUsage, -relocatingShardsSize ); @@ -343,7 +331,7 @@ public void onNewInfo(ClusterInfo info) { "high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes; the node is expected to be below the high " + "disk watermark when these relocations are complete", - diskThresholdSettings.describeHighThreshold(), + diskThresholdSettings.describeHighThreshold(total, false), diskUsage, -relocatingShardsSize ); @@ -351,7 +339,7 @@ public void onNewInfo(ClusterInfo info) { logger.debug( "high disk watermark [{}] exceeded on {}, shards will be relocated away from this node; " + "currently relocating away shards totalling [{}] bytes", - diskThresholdSettings.describeHighThreshold(), + diskThresholdSettings.describeHighThreshold(total, false), diskUsage, -relocatingShardsSize ); diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java index 93ccc8ec446f0..8157727a7ae5f 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettings.java @@ -8,14 +8,11 @@ package org.elasticsearch.cluster.routing.allocation; -import org.elasticsearch.ElasticsearchParseException; import org.elasticsearch.Version; -import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import 
org.elasticsearch.common.settings.Setting; import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; -import org.elasticsearch.common.unit.RatioValue; import org.elasticsearch.common.unit.RelativeByteSizeValue; import org.elasticsearch.core.TimeValue; @@ -34,31 +31,73 @@ public class DiskThresholdSettings { Setting.Property.OperatorDynamic, Setting.Property.NodeScope ); - public static final Setting CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING = new Setting<>( + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING = new Setting<>( "cluster.routing.allocation.disk.watermark.low", "85%", - (s) -> validWatermarkSetting(s, "cluster.routing.allocation.disk.watermark.low"), - new LowDiskWatermarkValidator(), + (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "cluster.routing.allocation.disk.watermark.low"), + new WatermarkValidator(), Setting.Property.Dynamic, Setting.Property.NodeScope ); - public static final Setting CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING = new Setting<>( + public static final Setting CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING = new Setting<>( + "cluster.routing.allocation.disk.watermark.low.max_headroom", + (settings) -> { + if (CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.exists(settings)) { + return "-1"; + } else { + return "150gb"; + } + }, + (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.low.max_headroom"), + new MaxHeadroomValidator(), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + public static final Setting CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING = new Setting<>( "cluster.routing.allocation.disk.watermark.high", "90%", - (s) -> validWatermarkSetting(s, "cluster.routing.allocation.disk.watermark.high"), - new HighDiskWatermarkValidator(), + (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "cluster.routing.allocation.disk.watermark.high"), + new WatermarkValidator(), Setting.Property.Dynamic, Setting.Property.NodeScope ); - public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING = new Setting<>( + public static final Setting CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING = new Setting<>( + "cluster.routing.allocation.disk.watermark.high.max_headroom", + (settings) -> { + if (CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.exists(settings)) { + return "-1"; + } else { + return "100gb"; + } + }, + (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.high.max_headroom"), + new MaxHeadroomValidator(), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING = new Setting<>( "cluster.routing.allocation.disk.watermark.flood_stage", "95%", - (s) -> validWatermarkSetting(s, "cluster.routing.allocation.disk.watermark.flood_stage"), - new FloodStageValidator(), + (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage"), + new WatermarkValidator(), + Setting.Property.Dynamic, + Setting.Property.NodeScope + ); + public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING = new Setting<>( + "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom", + (settings) -> { + if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.exists(settings)) { + return "-1"; + } else { + return 
"20gb"; + } + }, + (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage.max_headroom"), + new MaxHeadroomValidator(), Setting.Property.Dynamic, Setting.Property.NodeScope ); - public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING = new Setting<>( + public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING = new Setting<>( "cluster.routing.allocation.disk.watermark.flood_stage.frozen", "95%", (s) -> RelativeByteSizeValue.parseRelativeByteSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage.frozen"), @@ -68,10 +107,10 @@ public class DiskThresholdSettings { public static final Setting CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING = new Setting<>( "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom", (settings) -> { - if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING.exists(settings)) { + if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.exists(settings)) { return "-1"; } else { - return "20GB"; + return "20gb"; } }, (s) -> ByteSizeValue.parseBytesSizeValue(s, "cluster.routing.allocation.disk.watermark.flood_stage.frozen.max_headroom"), @@ -85,18 +124,16 @@ public class DiskThresholdSettings { Setting.Property.NodeScope ); - private volatile String lowWatermarkRaw; - private volatile String highWatermarkRaw; - private volatile Double freeDiskThresholdLow; - private volatile Double freeDiskThresholdHigh; - private volatile ByteSizeValue freeBytesThresholdLow; - private volatile ByteSizeValue freeBytesThresholdHigh; + private volatile RelativeByteSizeValue lowStageWatermark; + private volatile ByteSizeValue lowStageMaxHeadroom; + private volatile RelativeByteSizeValue highStageWatermark; + private volatile ByteSizeValue highStageMaxHeadroom; + private volatile RelativeByteSizeValue floodStageWatermark; + private volatile ByteSizeValue floodStageMaxHeadroom; + private volatile RelativeByteSizeValue frozenFloodStageWatermark; + private volatile ByteSizeValue frozenFloodStageMaxHeadroom; private volatile boolean enabled; private volatile TimeValue rerouteInterval; - private volatile Double freeDiskThresholdFloodStage; - private volatile ByteSizeValue freeBytesThresholdFloodStage; - private volatile RelativeByteSizeValue frozenFloodStage; - private volatile ByteSizeValue frozenFloodStageMaxHeadroom; static { assert Version.CURRENT.major == Version.V_7_0_0.major + 1; // this check is unnecessary in v9 @@ -108,20 +145,32 @@ public class DiskThresholdSettings { } public DiskThresholdSettings(Settings settings, ClusterSettings clusterSettings) { - final String lowWatermark = CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.get(settings); - final String highWatermark = CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings); - final String floodStage = CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings); - setHighWatermark(highWatermark); - setLowWatermark(lowWatermark); - setFloodStage(floodStage); - setFrozenFloodStage(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING.get(settings)); + setLowWatermark(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.get(settings)); + setLowStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.get(settings)); + setHighWatermark(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings)); + 
setHighStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.get(settings)); + setFloodStageWatermark(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings)); + setFloodStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings)); + setFrozenFloodStageWatermark(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.get(settings)); setFrozenFloodStageMaxHeadroom(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.get(settings)); this.rerouteInterval = CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.get(settings); this.enabled = CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.get(settings); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, this::setLowWatermark); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING, this::setLowStageMaxHeadroom); clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, this::setHighWatermark); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, this::setFloodStage); - clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING, this::setFrozenFloodStage); + clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING, this::setHighStageMaxHeadroom); + clusterSettings.addSettingsUpdateConsumer( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, + this::setFloodStageWatermark + ); + clusterSettings.addSettingsUpdateConsumer( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING, + this::setFloodStageMaxHeadroom + ); + clusterSettings.addSettingsUpdateConsumer( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING, + this::setFrozenFloodStageWatermark + ); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING, this::setFrozenFloodStageMaxHeadroom @@ -130,49 +179,82 @@ public DiskThresholdSettings(Settings settings, ClusterSettings clusterSettings) clusterSettings.addSettingsUpdateConsumer(CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING, this::setEnabled); } - static final class LowDiskWatermarkValidator implements Setting.Validator { + /** + * Validates that low, high and flood stage watermarks are all either percentages or byte values, + * and that their values adhere to the comparison: low < high < flood. Else, throws an exception. 
+ */ + static class WatermarkValidator implements Setting.Validator { @Override - public void validate(String value) { + public void validate(RelativeByteSizeValue value) { } @Override - public void validate(final String value, final Map, Object> settings) { - final String highWatermarkRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING); - final String floodStageRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING); - doValidate(value, highWatermarkRaw, floodStageRaw); - } - - @Override - public Iterator> settings() { - final List> settings = List.of( - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, + public void validate(final RelativeByteSizeValue value, final Map, Object> settings) { + final RelativeByteSizeValue low = (RelativeByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING); + final RelativeByteSizeValue high = (RelativeByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING); + final RelativeByteSizeValue flood = (RelativeByteSizeValue) settings.get( CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING ); - return settings.iterator(); - } - } - - static final class HighDiskWatermarkValidator implements Setting.Validator { - - @Override - public void validate(final String value) { - - } - - @Override - public void validate(final String value, final Map, Object> settings) { - final String lowWatermarkRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING); - final String floodStageRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING); - doValidate(lowWatermarkRaw, value, floodStageRaw); + if (low.isAbsolute() == false && high.isAbsolute() == false && flood.isAbsolute() == false) { // Validate as percentages + final double lowWatermarkThreshold = low.getRatio().getAsPercent(); + final double highWatermarkThreshold = high.getRatio().getAsPercent(); + final double floodThreshold = flood.getRatio().getAsPercent(); + if (lowWatermarkThreshold > highWatermarkThreshold) { + throw new IllegalArgumentException( + "low disk watermark [" + low.getStringRep() + "] more than high disk watermark [" + high.getStringRep() + "]" + ); + } + if (highWatermarkThreshold > floodThreshold) { + throw new IllegalArgumentException( + "high disk watermark [" + + high.getStringRep() + + "] more than flood stage disk watermark [" + + flood.getStringRep() + + "]" + ); + } + } else if (low.isAbsolute() && high.isAbsolute() && flood.isAbsolute()) { // Validate as absolute values + final ByteSizeValue lowWatermarkBytes = low.getAbsolute(); + final ByteSizeValue highWatermarkBytes = high.getAbsolute(); + final ByteSizeValue floodStageBytes = flood.getAbsolute(); + + if (lowWatermarkBytes.getBytes() < highWatermarkBytes.getBytes()) { + throw new IllegalArgumentException( + "low disk watermark [" + low.getStringRep() + "] less than high disk watermark [" + high.getStringRep() + "]" + ); + } + if (highWatermarkBytes.getBytes() < floodStageBytes.getBytes()) { + throw new IllegalArgumentException( + "high disk watermark [" + + high.getStringRep() + + "] less than flood stage disk watermark [" + + flood.getStringRep() + + "]" + ); + } + } else { + final String message = String.format( + Locale.ROOT, + "unable to consistently parse [%s=%s], [%s=%s], and [%s=%s] as percentage or bytes", + CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), + low.getStringRep(), + 
CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), + high.getStringRep(), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + flood.getStringRep() + ); + throw new IllegalArgumentException(message); + } } @Override public Iterator> settings() { final List> settings = List.of( CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING ); return settings.iterator(); @@ -180,183 +262,159 @@ public Iterator> settings() { } - static final class FloodStageValidator implements Setting.Validator { + /** + * Validates that low, high and flood stage max headrooms adhere to the comparison: flood < high < low. Else, throws an exception. + */ + static class MaxHeadroomValidator implements Setting.Validator { @Override - public void validate(final String value) { + public void validate(ByteSizeValue value) { } @Override - public void validate(final String value, final Map, Object> settings) { - final String lowWatermarkRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING); - final String highWatermarkRaw = (String) settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING); - doValidate(lowWatermarkRaw, highWatermarkRaw, value); + public void validate(final ByteSizeValue value, final Map, Object> settings) { + final ByteSizeValue lowHeadroom = (ByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING); + final ByteSizeValue highHeadroom = (ByteSizeValue) settings.get(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING); + final ByteSizeValue floodHeadroom = (ByteSizeValue) settings.get( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING + ); + + // For the comparisons, we need to mind that headroom values can default to -1. + if (floodHeadroom.compareTo(highHeadroom) > 0 && highHeadroom.getBytes() > 0) { + throw new IllegalArgumentException( + "flood disk max headroom [" + + floodHeadroom.getStringRep() + + "] more than high disk max headroom [" + + highHeadroom.getStringRep() + + "]" + ); + } + if (highHeadroom.compareTo(lowHeadroom) > 0 && lowHeadroom.getBytes() > 0) { + throw new IllegalArgumentException( + "high disk max headroom [" + + highHeadroom.getStringRep() + + "] more than low disk max headroom [" + + lowHeadroom.getStringRep() + + "]" + ); + } + // This check is also needed in case high headroom is -1 and relevant check above was skipped. 
+ if (floodHeadroom.compareTo(lowHeadroom) > 0 && lowHeadroom.getBytes() > 0) { + throw new IllegalArgumentException( + "flood disk max headroom [" + + floodHeadroom.getStringRep() + + "] more than low disk max headroom [" + + lowHeadroom.getStringRep() + + "]" + ); + } } @Override public Iterator> settings() { final List> settings = List.of( - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING + CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING, + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING, + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING ); return settings.iterator(); } } - private static void doValidate(String low, String high, String flood) { - if (definitelyNotPercentage(low) == false) { // only try to validate as percentage if it isn't obviously a byte size value - try { - doValidateAsPercentage(low, high, flood); - return; // early return so that we do not try to parse as bytes - } catch (final ElasticsearchParseException e) { - // swallow as we are now going to try to parse as bytes - } - } - try { - doValidateAsBytes(low, high, flood); - } catch (final ElasticsearchParseException e) { - final String message = String.format( - Locale.ROOT, - "unable to consistently parse [%s=%s], [%s=%s], and [%s=%s] as percentage or bytes", - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), - low, - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - high, - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), - flood - ); - throw new IllegalArgumentException(message, e); - } + private void setRerouteInterval(TimeValue rerouteInterval) { + this.rerouteInterval = rerouteInterval; } - private static void doValidateAsPercentage(final String low, final String high, final String flood) { - final double lowWatermarkThreshold = thresholdPercentageFromWatermark(low, false); - final double highWatermarkThreshold = thresholdPercentageFromWatermark(high, false); - final double floodThreshold = thresholdPercentageFromWatermark(flood, false); - if (lowWatermarkThreshold > highWatermarkThreshold) { - throw new IllegalArgumentException("low disk watermark [" + low + "] more than high disk watermark [" + high + "]"); - } - if (highWatermarkThreshold > floodThreshold) { - throw new IllegalArgumentException("high disk watermark [" + high + "] more than flood stage disk watermark [" + flood + "]"); - } + private void setEnabled(boolean enabled) { + this.enabled = enabled; } - private static void doValidateAsBytes(final String low, final String high, final String flood) { - final ByteSizeValue lowWatermarkBytes = thresholdBytesFromWatermark( - low, - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), - false - ); - final ByteSizeValue highWatermarkBytes = thresholdBytesFromWatermark( - high, - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - false - ); - final ByteSizeValue floodStageBytes = thresholdBytesFromWatermark( - flood, - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), - false - ); - if (lowWatermarkBytes.getBytes() < highWatermarkBytes.getBytes()) { - throw new IllegalArgumentException("low disk watermark [" + low + "] less than high disk watermark [" + high + "]"); - } - if (highWatermarkBytes.getBytes() < floodStageBytes.getBytes()) { - throw new IllegalArgumentException("high disk watermark [" + high + "] less than flood stage disk watermark [" + flood + "]"); - } + private void 
setLowWatermark(RelativeByteSizeValue lowWatermark) { + this.lowStageWatermark = lowWatermark; } - private void setRerouteInterval(TimeValue rerouteInterval) { - this.rerouteInterval = rerouteInterval; + private void setLowStageMaxHeadroom(ByteSizeValue maxHeadroom) { + this.lowStageMaxHeadroom = maxHeadroom; } - private void setEnabled(boolean enabled) { - this.enabled = enabled; + private void setHighWatermark(RelativeByteSizeValue highWatermark) { + this.highStageWatermark = highWatermark; } - private void setLowWatermark(String lowWatermark) { - // Watermark is expressed in terms of used data, but we need "free" data watermark - this.lowWatermarkRaw = lowWatermark; - this.freeDiskThresholdLow = 100.0 - thresholdPercentageFromWatermark(lowWatermark); - this.freeBytesThresholdLow = thresholdBytesFromWatermark( - lowWatermark, - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey() - ); + private void setHighStageMaxHeadroom(ByteSizeValue maxHeadroom) { + this.highStageMaxHeadroom = maxHeadroom; } - private void setHighWatermark(String highWatermark) { - // Watermark is expressed in terms of used data, but we need "free" data watermark - this.highWatermarkRaw = highWatermark; - this.freeDiskThresholdHigh = 100.0 - thresholdPercentageFromWatermark(highWatermark); - this.freeBytesThresholdHigh = thresholdBytesFromWatermark( - highWatermark, - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey() - ); + private void setFloodStageWatermark(RelativeByteSizeValue floodStage) { + this.floodStageWatermark = floodStage; } - private void setFloodStage(String floodStageRaw) { - // Watermark is expressed in terms of used data, but we need "free" data watermark - this.freeDiskThresholdFloodStage = 100.0 - thresholdPercentageFromWatermark(floodStageRaw); - this.freeBytesThresholdFloodStage = thresholdBytesFromWatermark( - floodStageRaw, - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey() - ); + private void setFloodStageMaxHeadroom(ByteSizeValue maxHeadroom) { + this.floodStageMaxHeadroom = maxHeadroom; } - private void setFrozenFloodStage(RelativeByteSizeValue floodStage) { - this.frozenFloodStage = floodStage; + private void setFrozenFloodStageWatermark(RelativeByteSizeValue floodStage) { + this.frozenFloodStageWatermark = floodStage; } private void setFrozenFloodStageMaxHeadroom(ByteSizeValue maxHeadroom) { this.frozenFloodStageMaxHeadroom = maxHeadroom; } - /** - * Gets the raw (uninterpreted) low watermark value as found in the settings. - */ - public String getLowWatermarkRaw() { - return lowWatermarkRaw; + private ByteSizeValue getFreeBytesThreshold(ByteSizeValue total, RelativeByteSizeValue watermark, ByteSizeValue maxHeadroom) { + // If bytes are given, they can be readily returned as free bytes. If percentages are given, we need to calculate the free bytes. + if (watermark.isAbsolute()) { + return watermark.getAbsolute(); + } + return ByteSizeValue.ofBytes(total.getBytes() - watermark.calculateValue(total, maxHeadroom).getBytes()); } - /** - * Gets the raw (uninterpreted) high watermark value as found in the settings. 
- */ - public String getHighWatermarkRaw() { - return highWatermarkRaw; + public ByteSizeValue getFreeBytesThresholdLowStage(ByteSizeValue total) { + return getFreeBytesThreshold(total, lowStageWatermark, lowStageMaxHeadroom); } - public Double getFreeDiskThresholdLow() { - return freeDiskThresholdLow; + public ByteSizeValue getFreeBytesThresholdHighStage(ByteSizeValue total) { + return getFreeBytesThreshold(total, highStageWatermark, highStageMaxHeadroom); } - public Double getFreeDiskThresholdHigh() { - return freeDiskThresholdHigh; + public ByteSizeValue getFreeBytesThresholdFloodStage(ByteSizeValue total) { + return getFreeBytesThreshold(total, floodStageWatermark, floodStageMaxHeadroom); } - public ByteSizeValue getFreeBytesThresholdLow() { - return freeBytesThresholdLow; + public ByteSizeValue getFreeBytesThresholdFrozenFloodStage(ByteSizeValue total) { + return getFreeBytesThreshold(total, frozenFloodStageWatermark, frozenFloodStageMaxHeadroom); } - public ByteSizeValue getFreeBytesThresholdHigh() { - return freeBytesThresholdHigh; - } + private ByteSizeValue getMinimumTotalSizeForBelowWatermark( + ByteSizeValue used, + RelativeByteSizeValue watermark, + ByteSizeValue maxHeadroom + ) { + // If watermark is absolute, simply return total disk = used disk + free disk, where free disk bytes is the watermark value. + if (watermark.isAbsolute()) { + return ByteSizeValue.ofBytes(watermark.getAbsolute().getBytes() + used.getBytes()); + } - public Double getFreeDiskThresholdFloodStage() { - return freeDiskThresholdFloodStage; - } + // If watermark is percentage/ratio, calculate the total needed disk space. + // This may not be the minimum, due to the possible max headroom value which can cap the free disk space required. + double ratioThreshold = watermark.getRatio().getAsRatio(); + if (ratioThreshold >= 0.0 && ratioThreshold < 1.0) { + ByteSizeValue totalBytes = ByteSizeValue.ofBytes((long) Math.ceil(used.getBytes() / ratioThreshold)); - public ByteSizeValue getFreeBytesThresholdFloodStage() { - return freeBytesThresholdFloodStage; - } + // Now calculate the minimum free bytes, taking into account the possible max headroom value as well. + ByteSizeValue minimumFreeBytes = getFreeBytesThreshold(totalBytes, watermark, maxHeadroom); - public ByteSizeValue getFreeBytesThresholdFrozenFloodStage(ByteSizeValue total) { - // flood stage bytes are reversed compared to percentage, so we special handle it. - RelativeByteSizeValue frozenFloodStage = this.frozenFloodStage; - if (frozenFloodStage.isAbsolute()) { - return frozenFloodStage.getAbsolute(); + // Finally return used + minimum free bytes + return ByteSizeValue.ofBytes(minimumFreeBytes.getBytes() + used.getBytes()); + } else { + return used; } - return ByteSizeValue.ofBytes(total.getBytes() - frozenFloodStage.calculateValue(total, frozenFloodStageMaxHeadroom).getBytes()); + } + + public ByteSizeValue getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue used) { + return getMinimumTotalSizeForBelowWatermark(used, lowStageWatermark, lowStageMaxHeadroom); } public boolean isEnabled() { @@ -367,128 +425,68 @@ public TimeValue getRerouteInterval() { return rerouteInterval; } - String describeLowThreshold() { - return freeBytesThresholdLow.equals(ByteSizeValue.ZERO) - ? Strings.format1Decimals(100.0 - freeDiskThresholdLow, "%") - : freeBytesThresholdLow.toString(); - } - - String describeHighThreshold() { - return freeBytesThresholdHigh.equals(ByteSizeValue.ZERO) - ? 
Strings.format1Decimals(100.0 - freeDiskThresholdHigh, "%") - : freeBytesThresholdHigh.toString(); - } - - String describeFloodStageThreshold() { - return freeBytesThresholdFloodStage.equals(ByteSizeValue.ZERO) - ? Strings.format1Decimals(100.0 - freeDiskThresholdFloodStage, "%") - : freeBytesThresholdFloodStage.toString(); - } - - String describeFrozenFloodStageThreshold(ByteSizeValue total) { - ByteSizeValue maxHeadroom = this.frozenFloodStageMaxHeadroom; - RelativeByteSizeValue floodStage = this.frozenFloodStage; - if (floodStage.isAbsolute()) { - return floodStage.getStringRep(); - } else if (floodStage.calculateValue(total, maxHeadroom).equals(floodStage.calculateValue(total, null))) { - return Strings.format1Decimals(floodStage.getRatio().getAsPercent(), "%"); + private String describeThreshold( + ByteSizeValue total, + RelativeByteSizeValue watermark, + ByteSizeValue maxHeadroom, + boolean includeSettingKey, + String watermarkSettingKey, + String maxHeadroomSettingKey + ) { + if (watermark.isAbsolute()) { + return includeSettingKey ? watermarkSettingKey + "=" + watermark.getStringRep() : watermark.getStringRep(); + } else if (watermark.calculateValue(total, maxHeadroom).equals(watermark.calculateValue(total, null))) { + String value = watermark.getStringRep(); + return includeSettingKey ? watermarkSettingKey + "=" + value : value; } else { - return "max_headroom=" + maxHeadroom; + return includeSettingKey + ? maxHeadroomSettingKey + "=" + maxHeadroom.getStringRep() + : "max_headroom=" + maxHeadroom.getStringRep(); } } - /** - * Attempts to parse the watermark into a percentage, returning 100.0% if - * it cannot be parsed. - */ - private static double thresholdPercentageFromWatermark(String watermark) { - return thresholdPercentageFromWatermark(watermark, true); - } - - /** - * Attempts to parse the watermark into a percentage, returning 100.0% if it can not be parsed and the specified lenient parameter is - * true, otherwise throwing an {@link ElasticsearchParseException}. - * - * @param watermark the watermark to parse as a percentage - * @param lenient true if lenient parsing should be applied - * @return the parsed percentage - */ - private static double thresholdPercentageFromWatermark(String watermark, boolean lenient) { - if (lenient && definitelyNotPercentage(watermark)) { - // obviously not a percentage so return lenient fallback value like we would below on a parse failure - return 100.0; - } - try { - return RatioValue.parseRatioValue(watermark).getAsPercent(); - } catch (ElasticsearchParseException ex) { - // NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two - // cases separately - if (lenient) { - return 100.0; - } - throw ex; - } + public String describeLowThreshold(ByteSizeValue total, boolean includeSettingKey) { + return describeThreshold( + total, + lowStageWatermark, + lowStageMaxHeadroom, + includeSettingKey, + CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey() + ); } - /** - * Attempts to parse the watermark into a {@link ByteSizeValue}, returning - * a ByteSizeValue of 0 bytes if the value cannot be parsed. 
- */ - private static ByteSizeValue thresholdBytesFromWatermark(String watermark, String settingName) { - return thresholdBytesFromWatermark(watermark, settingName, true); + public String describeHighThreshold(ByteSizeValue total, boolean includeSettingKey) { + return describeThreshold( + total, + highStageWatermark, + highStageMaxHeadroom, + includeSettingKey, + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey() + ); } - /** - * Attempts to parse the watermark into a {@link ByteSizeValue}, returning zero bytes if it can not be parsed and the specified lenient - * parameter is true, otherwise throwing an {@link ElasticsearchParseException}. - * - * @param watermark the watermark to parse as a byte size - * @param settingName the name of the setting - * @param lenient true if lenient parsing should be applied - * @return the parsed byte size value - */ - private static ByteSizeValue thresholdBytesFromWatermark(String watermark, String settingName, boolean lenient) { - try { - return ByteSizeValue.parseBytesSizeValue(watermark, settingName); - } catch (ElasticsearchParseException ex) { - // NOTE: this is not end-user leniency, since up above we check that it's a valid byte or percentage, and then store the two - // cases separately - if (lenient) { - return ByteSizeValue.ZERO; - } - throw ex; - } + public String describeFloodStageThreshold(ByteSizeValue total, boolean includeSettingKey) { + return describeThreshold( + total, + floodStageWatermark, + floodStageMaxHeadroom, + includeSettingKey, + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey() + ); } - /** - * Checks if a watermark string is a valid percentage or byte size value, - * @return the watermark value given - */ - private static String validWatermarkSetting(String watermark, String settingName) { - if (definitelyNotPercentage(watermark)) { - // short circuit to save expensive exception on obvious byte size value below - ByteSizeValue.parseBytesSizeValue(watermark, settingName); - return watermark; - } - try { - RatioValue.parseRatioValue(watermark); - } catch (ElasticsearchParseException e) { - try { - ByteSizeValue.parseBytesSizeValue(watermark, settingName); - } catch (ElasticsearchParseException ex) { - ex.addSuppressed(e); - throw ex; - } - } - return watermark; + public String describeFrozenFloodStageThreshold(ByteSizeValue total, boolean includeSettingKey) { + return describeThreshold( + total, + frozenFloodStageWatermark, + frozenFloodStageMaxHeadroom, + includeSettingKey, + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey() + ); } - // Checks that a value is definitely not a percentage by testing if it ends on `b` which implies that it is probably a byte size value - // instead. This is used to make setting validation skip attempting to parse a value as a percentage/ration for the settings in this - // class that accept either a byte size value. The main motivation of this method is to make tests faster. Some tests call this method - // frequently when starting up internal cluster nodes and using exception throwing and catching when trying to parse as a ratio as a - // means of identifying that a string is not a ratio is quite slow. 
- private static boolean definitelyNotPercentage(String value) { - return value.endsWith("b"); - } } diff --git a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java index 69aae52a20ca4..67f797336e5d3 100644 --- a/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java +++ b/server/src/main/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDecider.java @@ -36,28 +36,33 @@ import java.util.Map; import java.util.Set; -import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING; -import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING; - /** * The {@link DiskThresholdDecider} checks that the node a shard is potentially * being allocated to has enough disk space. * - * It has three configurable settings, all of which can be changed dynamically: + * It has the following configurable settings, all of which can be changed dynamically: * * cluster.routing.allocation.disk.watermark.low is the low disk * watermark. New shards will not allocated to a node with usage higher than this, * although this watermark may be passed by allocating a shard. It defaults to * 0.85 (85.0%). * + * cluster.routing.allocation.disk.watermark.low.max_headroom is the + * max headroom for the low watermark. Defaults to 150GB when the low watermark + * is not explicitly set. This caps the amount of free space required. + * * cluster.routing.allocation.disk.watermark.high is the high disk * watermark. If a node has usage higher than this, shards are not allowed to * remain on the node. In addition, if allocating a shard to a node causes the * node to pass this watermark, it will not be allowed. It defaults to * 0.90 (90.0%). * - * Both watermark settings are expressed in terms of used disk percentage, or - * exact byte values for free space (like "500mb") + * cluster.routing.allocation.disk.watermark.high.max_headroom is the + * max headroom for the high watermark. Defaults to 100GB when the high watermark + * is not explicitly set. This caps the amount of free space required. + * + * The watermark settings are expressed in terms of used disk percentage/ratio, or + * exact byte values for free space (like "500mb"). * * cluster.routing.allocation.disk.threshold_enabled is used to * enable or disable this decider. It defaults to true (enabled). 
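NOTE: (illustration, not part of the patch) The updated javadoc above says each `max_headroom` setting "caps the amount of free space required" by its percentage watermark, and `DiskThresholdSettings#getFreeBytesThreshold` computes that cap as `total - watermark.calculateValue(total, maxHeadroom)`. The sketch below restates the idea with plain longs; the 10tb disk size and 90% ratio are assumed values chosen only to show the cap taking effect.

[source,java]
----
// Illustrative sketch only: plain longs instead of ByteSizeValue / RelativeByteSizeValue.
public class WatermarkHeadroomExample {
    public static void main(String[] args) {
        long totalBytes = 10L * 1024 * 1024 * 1024 * 1024;       // assume a 10tb data path
        double highWatermarkRatio = 0.90;                         // assume a 90% high watermark
        long highMaxHeadroomBytes = 100L * 1024 * 1024 * 1024;    // 100gb high max headroom (see javadoc above)

        // Free space the ratio alone would require: total * (1 - ratio), about 1tb here.
        long freeRequiredByRatio = totalBytes - (long) (totalBytes * highWatermarkRatio);

        // The max headroom caps that requirement, so the effective high-stage
        // threshold on this node is 100gb of free space rather than 1tb.
        long freeBytesThresholdHighStage = Math.min(freeRequiredByRatio, highMaxHeadroomBytes);

        System.out.println("high stage free-bytes threshold: " + freeBytesThresholdHighStage);
    }
}
----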
@@ -183,17 +188,13 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing return YES_DISK_WATERMARKS_IGNORED; } - final double usedDiskThresholdLow = 100.0 - diskThresholdSettings.getFreeDiskThresholdLow(); - final double usedDiskThresholdHigh = 100.0 - diskThresholdSettings.getFreeDiskThresholdHigh(); - // subtractLeavingShards is passed as false here, because they still use disk space, and therefore we should be extra careful // and take the size into account final DiskUsageWithRelocations usage = getDiskUsage(node, allocation, usages, false); - // First, check that the node currently over the low watermark - double freeDiskPercentage = usage.getFreeDiskAsPercentage(); // Cache the used disk percentage for displaying disk percentages consistent with documentation double usedDiskPercentage = usage.getUsedDiskAsPercentage(); long freeBytes = usage.getFreeBytes(); + final ByteSizeValue total = ByteSizeValue.ofBytes(usage.getTotalBytes()); if (freeBytes < 0L) { final long sizeOfRelocatingShards = sizeOfRelocatingShards( node, @@ -233,12 +234,12 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing && shardRouting.recoverySource().getType() == RecoverySource.Type.EMPTY_STORE; // checks for exact byte comparisons - if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLow().getBytes()) { + if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLowStage(total).getBytes()) { if (skipLowThresholdChecks == false) { if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} free) on node {}, preventing allocation", - diskThresholdSettings.getFreeBytesThresholdLow(), + diskThresholdSettings.getFreeBytesThresholdLowStage(total).getBytes(), freeBytesValue, node.nodeId() ); @@ -246,21 +247,21 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing return allocation.decision( Decision.NO, NAME, - "the node is above the low watermark cluster setting [%s=%s], having less than the minimum required [%s] free " - + "space, actual free: [%s]", - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getLowWatermarkRaw(), - diskThresholdSettings.getFreeBytesThresholdLow(), - freeBytesValue + "the node is above the low watermark cluster setting [%s], having less than the minimum required [%s] free " + + "space, actual free: [%s], actual used: [%s]", + diskThresholdSettings.describeLowThreshold(total, true), + diskThresholdSettings.getFreeBytesThresholdLowStage(total), + freeBytesValue, + Strings.format1Decimals(usedDiskPercentage, "%") ); - } else if (freeBytes > diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) { + } else if (freeBytes > diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { // Allow the shard to be allocated because it is primary that // has never been allocated if it's under the high watermark if (logger.isDebugEnabled()) { logger.debug( "less than the required {} free bytes threshold ({} free) on node {}, " + "but allowing allocation because primary has never been allocated", - diskThresholdSettings.getFreeBytesThresholdLow(), + diskThresholdSettings.getFreeBytesThresholdLowStage(total), freeBytesValue, node.nodeId() ); @@ -273,7 +274,7 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing logger.debug( "less than the required {} free bytes threshold ({} free) on node {}, " + "preventing allocation even though primary has never been allocated", - 
diskThresholdSettings.getFreeBytesThresholdHigh(), + diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes(), freeBytesValue, node.nodeId() ); @@ -281,72 +282,12 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing return allocation.decision( Decision.NO, NAME, - "the node is above the high watermark cluster setting [%s=%s], having less than the minimum required [%s] free " - + "space, actual free: [%s]", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - diskThresholdSettings.getFreeBytesThresholdHigh(), - freeBytesValue - ); - } - } - - // checks for percentage comparisons - if (freeDiskPercentage < diskThresholdSettings.getFreeDiskThresholdLow()) { - // If the shard is a replica or is a non-empty primary, check the low threshold - if (skipLowThresholdChecks == false) { - if (logger.isDebugEnabled()) { - logger.debug( - "more than the allowed {} used disk threshold ({} used) on node [{}], preventing allocation", - Strings.format1Decimals(usedDiskThresholdLow, "%"), - Strings.format1Decimals(usedDiskPercentage, "%"), - node.nodeId() - ); - } - return allocation.decision( - Decision.NO, - NAME, - "the node is above the low watermark cluster setting [%s=%s], using more disk space than the maximum allowed " - + "[%s%%], actual free: [%s%%]", - CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getLowWatermarkRaw(), - usedDiskThresholdLow, - freeDiskPercentage - ); - } else if (freeDiskPercentage > diskThresholdSettings.getFreeDiskThresholdHigh()) { - // Allow the shard to be allocated because it is primary that - // has never been allocated if it's under the high watermark - if (logger.isDebugEnabled()) { - logger.debug( - "more than the allowed {} used disk threshold ({} used) on node [{}], " - + "but allowing allocation because primary has never been allocated", - Strings.format1Decimals(usedDiskThresholdLow, "%"), - Strings.format1Decimals(usedDiskPercentage, "%"), - node.nodeId() - ); - } - return YES_UNALLOCATED_PRIMARY_BETWEEN_WATERMARKS; - } else { - // Even though the primary has never been allocated, the node is - // above the high watermark, so don't allow allocating the shard - if (logger.isDebugEnabled()) { - logger.debug( - "less than the required {} free bytes threshold ({} bytes free) on node {}, " - + "preventing allocation even though primary has never been allocated", - Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"), - Strings.format1Decimals(freeDiskPercentage, "%"), - node.nodeId() - ); - } - return allocation.decision( - Decision.NO, - NAME, - "the node is above the high watermark cluster setting [%s=%s], using more disk space than the maximum allowed " - + "[%s%%], actual free: [%s%%]", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - usedDiskThresholdHigh, - freeDiskPercentage + "the node is above the high watermark cluster setting [%s], having less than the minimum required [%s] free " + + "space, actual free: [%s], actual used: [%s]", + diskThresholdSettings.describeHighThreshold(total, true), + diskThresholdSettings.getFreeBytesThresholdHighStage(total), + freeBytesValue, + Strings.format1Decimals(usedDiskPercentage, "%") ); } } @@ -361,57 +302,40 @@ public Decision canAllocate(ShardRouting shardRouting, RoutingNode node, Routing allocation.routingTable() ); assert shardSize >= 0 : shardSize; - double freeSpaceAfterShard = 
freeDiskPercentageAfterShardAssigned(usage, shardSize); long freeBytesAfterShard = freeBytes - shardSize; - if (freeBytesAfterShard < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) { + if (freeBytesAfterShard < diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { logger.warn( - "after allocating [{}] node [{}] would have less than the required threshold of " - + "{} free (currently {} free, estimated shard size is {}), preventing allocation", + "after allocating [{}] node [{}] would be above the high watermark setting [{}], having less than the minimum " + + "required {} of free space (actual free: {}, actual used: {}, estimated shard size: {}), preventing allocation", shardRouting, node.nodeId(), - diskThresholdSettings.getFreeBytesThresholdHigh(), + diskThresholdSettings.describeHighThreshold(total, false), + diskThresholdSettings.getFreeBytesThresholdHighStage(total), freeBytesValue, + Strings.format1Decimals(usedDiskPercentage, "%"), new ByteSizeValue(shardSize) ); return allocation.decision( Decision.NO, NAME, - "allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " - + "and cause it to have less than the minimum required [%s] of free space (free: [%s], estimated shard size: [%s])", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - diskThresholdSettings.getFreeBytesThresholdHigh(), + "allocating the shard to this node will bring the node above the high watermark cluster setting [%s] " + + "and cause it to have less than the minimum required [%s] of free space (free: [%s], used: [%s], estimated " + + "shard size: [%s])", + diskThresholdSettings.describeHighThreshold(total, true), + diskThresholdSettings.getFreeBytesThresholdHighStage(total), freeBytesValue, + Strings.format1Decimals(usedDiskPercentage, "%"), new ByteSizeValue(shardSize) ); } - if (freeSpaceAfterShard < diskThresholdSettings.getFreeDiskThresholdHigh()) { - logger.warn( - "after allocating [{}] node [{}] would have more than the allowed " - + "{} free disk threshold ({} free), preventing allocation", - shardRouting, - node.nodeId(), - Strings.format1Decimals(diskThresholdSettings.getFreeDiskThresholdHigh(), "%"), - Strings.format1Decimals(freeSpaceAfterShard, "%") - ); - return allocation.decision( - Decision.NO, - NAME, - "allocating the shard to this node will bring the node above the high watermark cluster setting [%s=%s] " - + "and cause it to use more disk space than the maximum allowed [%s%%] (free space after shard added: [%s%%])", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - usedDiskThresholdHigh, - freeSpaceAfterShard - ); - } assert freeBytesAfterShard >= 0 : freeBytesAfterShard; return allocation.decision( Decision.YES, NAME, - "enough disk for shard on node, free: [%s], shard size: [%s], free after allocating shard: [%s]", + "enough disk for shard on node, free: [%s], used: [%s], shard size: [%s], free after allocating shard: [%s]", freeBytesValue, + Strings.format1Decimals(usedDiskPercentage, "%"), new ByteSizeValue(shardSize), new ByteSizeValue(freeBytesAfterShard) ); @@ -483,6 +407,8 @@ public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting // If this node is already above the high threshold, the shard cannot remain (get it off!) 
final double freeDiskPercentage = usage.getFreeDiskAsPercentage(); final long freeBytes = usage.getFreeBytes(); + double usedDiskPercentage = usage.getUsedDiskAsPercentage(); + final ByteSizeValue total = ByteSizeValue.ofBytes(usage.getTotalBytes()); if (logger.isTraceEnabled()) { logger.trace("node [{}] has {}% free disk ({} bytes)", node.nodeId(), freeDiskPercentage, freeBytes); } @@ -514,44 +440,27 @@ public Decision canRemain(IndexMetadata indexMetadata, ShardRouting shardRouting sizeOfRelocatingShards ); } - if (freeBytes < diskThresholdSettings.getFreeBytesThresholdHigh().getBytes()) { + if (freeBytes < diskThresholdSettings.getFreeBytesThresholdHighStage(total).getBytes()) { if (logger.isDebugEnabled()) { logger.debug( - "less than the required {} free bytes threshold ({} bytes free) on node {}, shard cannot remain", - diskThresholdSettings.getFreeBytesThresholdHigh(), + "node {} is over the high watermark setting [{}], having less than the required {} free space " + + "(actual free: {}, actual used: {}), shard cannot remain", + node.nodeId(), + diskThresholdSettings.describeHighThreshold(total, false), + diskThresholdSettings.getFreeBytesThresholdHighStage(total), freeBytes, - node.nodeId() + Strings.format1Decimals(usedDiskPercentage, "%") ); } return allocation.decision( Decision.NO, NAME, - "the shard cannot remain on this node because it is above the high watermark cluster setting [%s=%s] " - + "and there is less than the required [%s] free space on node, actual free: [%s]", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - diskThresholdSettings.getFreeBytesThresholdHigh(), - new ByteSizeValue(freeBytes) - ); - } - if (freeDiskPercentage < diskThresholdSettings.getFreeDiskThresholdHigh()) { - if (logger.isDebugEnabled()) { - logger.debug( - "less than the required {}% free disk threshold ({}% free) on node {}, shard cannot remain", - diskThresholdSettings.getFreeDiskThresholdHigh(), - freeDiskPercentage, - node.nodeId() - ); - } - return allocation.decision( - Decision.NO, - NAME, - "the shard cannot remain on this node because it is above the high watermark cluster setting [%s=%s] " - + "and there is less than the required [%s%%] free disk on node, actual free: [%s%%]", - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - diskThresholdSettings.getHighWatermarkRaw(), - diskThresholdSettings.getFreeDiskThresholdHigh(), - freeDiskPercentage + "the shard cannot remain on this node because it is above the high watermark cluster setting [%s] " + + "and there is less than the required [%s] free space on node, actual free: [%s], actual used: [%s]", + diskThresholdSettings.describeHighThreshold(total, true), + diskThresholdSettings.getFreeBytesThresholdHighStage(total), + new ByteSizeValue(freeBytes), + Strings.format1Decimals(usedDiskPercentage, "%") ); } @@ -618,25 +527,6 @@ static DiskUsage averageUsage(RoutingNode node, Map usages) { return new DiskUsage(node.nodeId(), node.node().getName(), "_na_", totalBytes / usages.size(), freeBytes / usages.size()); } - /** - * Given the DiskUsage for a node and the size of the shard, return the - * percentage of free disk if the shard were to be allocated to the node. 
- * @param usage A DiskUsage for the node to have space computed for - * @param shardSize Size in bytes of the shard - * @return Percentage of free space after the shard is assigned to the node - */ - static double freeDiskPercentageAfterShardAssigned(DiskUsageWithRelocations usage, Long shardSize) { - shardSize = (shardSize == null) ? 0 : shardSize; - DiskUsage newUsage = new DiskUsage( - usage.getNodeId(), - usage.getNodeName(), - usage.getPath(), - usage.getTotalBytes(), - usage.getFreeBytes() - shardSize - ); - return newUsage.getFreeDiskAsPercentage(); - } - private static final Decision YES_DISABLED = Decision.single(Decision.Type.YES, NAME, "the disk threshold decider is disabled"); private static final Decision YES_USAGES_UNAVAILABLE = Decision.single(Decision.Type.YES, NAME, "disk usages are unavailable"); diff --git a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java index c4024e0ba543d..c78aa07d46acb 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java +++ b/server/src/main/java/org/elasticsearch/common/settings/ClusterSettings.java @@ -242,9 +242,12 @@ public void apply(Settings value, Settings current, Settings previous) { ThrottlingAllocationDecider.CLUSTER_ROUTING_ALLOCATION_NODE_CONCURRENT_RECOVERIES_SETTING, DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING, + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, - DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING, + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING, + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING, DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING, diff --git a/server/src/main/java/org/elasticsearch/common/settings/Setting.java b/server/src/main/java/org/elasticsearch/common/settings/Setting.java index f0590b98bc034..f55c0e6b96d31 100644 --- a/server/src/main/java/org/elasticsearch/common/settings/Setting.java +++ b/server/src/main/java/org/elasticsearch/common/settings/Setting.java @@ -269,6 +269,24 @@ public Setting(String key, Function defaultValue, Function defaultValue, + Function parser, + Validator validator, + Property... properties + ) { + this(new SimpleKey(key), defaultValue, parser, validator, properties); + } + /** * Creates a new Setting instance * @param key the settings key for this setting. 
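NOTE: (illustration, not part of the patch) The allocation-decision messages above now embed `describeThreshold(...)`, which cites the watermark setting when the percentage/ratio is the binding limit and the corresponding `max_headroom` setting when the headroom cap is what actually constrains free space. The sketch below mimics that branching with hard-coded strings; the setting values shown are assumptions for illustration, not defaults asserted by this change.

[source,java]
----
// Illustrative sketch only: mirrors the includeSettingKey branch of
// DiskThresholdSettings#describeThreshold for a percentage watermark.
public class DescribeThresholdExample {
    public static void main(String[] args) {
        String watermarkKey = "cluster.routing.allocation.disk.watermark.high";
        String headroomKey = "cluster.routing.allocation.disk.watermark.high.max_headroom";
        String watermark = "90%";        // assumed watermark value
        String maxHeadroom = "100gb";    // assumed max headroom value

        // On a large disk, 10% free would exceed 100gb, so the headroom is the binding limit.
        boolean headroomIsBinding = true;

        String description = headroomIsBinding
            ? headroomKey + "=" + maxHeadroom
            : watermarkKey + "=" + watermark;

        // Shape of the new decision text, with the threshold description filled in.
        System.out.println("the node is above the high watermark cluster setting [" + description + "] ...");
    }
}
----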
diff --git a/server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java b/server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java index 7475f89f5910c..316cc4b2dd150 100644 --- a/server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java +++ b/server/src/main/java/org/elasticsearch/common/unit/RelativeByteSizeValue.java @@ -90,7 +90,7 @@ public static RelativeByteSizeValue parseRelativeByteSizeValue(String value, Str public String getStringRep() { if (ratio != null) { - return ratio.toString(); + return RatioValue.formatNoTrailingZerosPercent(ratio.getAsPercent()); } else { return absolute.getStringRep(); } diff --git a/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java b/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java index 4cbe5988e3660..23c1a70623763 100644 --- a/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java +++ b/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadata.java @@ -126,7 +126,9 @@ public int hashCode() { */ public record Disk( RelativeByteSizeValue highWatermark, + ByteSizeValue highMaxHeadroom, RelativeByteSizeValue floodStageWatermark, + ByteSizeValue floodStageMaxHeadroom, RelativeByteSizeValue frozenFloodStageWatermark, ByteSizeValue frozenFloodStageMaxHeadroom ) implements ToXContentFragment, Writeable { @@ -134,7 +136,9 @@ public record Disk( public static final String TYPE = "disk"; private static final ParseField HIGH_WATERMARK_FIELD = new ParseField("high_watermark"); + private static final ParseField HIGH_MAX_HEADROOM_FIELD = new ParseField("high_max_headroom"); private static final ParseField FLOOD_STAGE_WATERMARK_FIELD = new ParseField("flood_stage_watermark"); + private static final ParseField FLOOD_STAGE_MAX_HEADROOM_FIELD = new ParseField("flood_stage_max_headroom"); private static final ParseField FROZEN_FLOOD_STAGE_WATERMARK_FIELD = new ParseField("frozen_flood_stage_watermark"); private static final ParseField FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD = new ParseField("frozen_flood_stage_max_headroom"); @@ -143,15 +147,19 @@ public record Disk( true, (args) -> new Disk( RelativeByteSizeValue.parseRelativeByteSizeValue((String) args[0], HIGH_WATERMARK_FIELD.getPreferredName()), - RelativeByteSizeValue.parseRelativeByteSizeValue((String) args[1], FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), - RelativeByteSizeValue.parseRelativeByteSizeValue((String) args[2], FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), - ByteSizeValue.parseBytesSizeValue((String) args[3], FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName()) + ByteSizeValue.parseBytesSizeValue((String) args[1], HIGH_MAX_HEADROOM_FIELD.getPreferredName()), + RelativeByteSizeValue.parseRelativeByteSizeValue((String) args[2], FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), + ByteSizeValue.parseBytesSizeValue((String) args[3], FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName()), + RelativeByteSizeValue.parseRelativeByteSizeValue((String) args[4], FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), + ByteSizeValue.parseBytesSizeValue((String) args[5], FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName()) ) ); static { PARSER.declareString(ConstructingObjectParser.constructorArg(), HIGH_WATERMARK_FIELD); + PARSER.declareString(ConstructingObjectParser.constructorArg(), HIGH_MAX_HEADROOM_FIELD); PARSER.declareString(ConstructingObjectParser.constructorArg(), FLOOD_STAGE_WATERMARK_FIELD); + 
PARSER.declareString(ConstructingObjectParser.constructorArg(), FLOOD_STAGE_MAX_HEADROOM_FIELD); PARSER.declareString(ConstructingObjectParser.constructorArg(), FROZEN_FLOOD_STAGE_WATERMARK_FIELD); PARSER.declareString(ConstructingObjectParser.constructorArg(), FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD); } @@ -159,7 +167,9 @@ public record Disk( Disk(StreamInput in) throws IOException { this( RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), HIGH_WATERMARK_FIELD.getPreferredName()), + new ByteSizeValue(in), RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), + new ByteSizeValue(in), RelativeByteSizeValue.parseRelativeByteSizeValue(in.readString(), FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName()), new ByteSizeValue(in) ); @@ -172,7 +182,9 @@ static Disk fromXContent(XContentParser parser) throws IOException { @Override public void writeTo(StreamOutput out) throws IOException { out.writeString(describeHighWatermark()); + highMaxHeadroom.writeTo(out); out.writeString(describeFloodStageWatermark()); + floodStageMaxHeadroom.writeTo(out); out.writeString(describeFrozenFloodStageWatermark()); frozenFloodStageMaxHeadroom.writeTo(out); } @@ -185,7 +197,9 @@ public boolean isFragment() { @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.field(HIGH_WATERMARK_FIELD.getPreferredName(), describeHighWatermark()); + builder.field(HIGH_MAX_HEADROOM_FIELD.getPreferredName(), highMaxHeadroom); builder.field(FLOOD_STAGE_WATERMARK_FIELD.getPreferredName(), describeFloodStageWatermark()); + builder.field(FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName(), floodStageMaxHeadroom); builder.field(FROZEN_FLOOD_STAGE_WATERMARK_FIELD.getPreferredName(), describeFrozenFloodStageWatermark()); builder.field(FROZEN_FLOOD_STAGE_MAX_HEADROOM_FIELD.getPreferredName(), frozenFloodStageMaxHeadroom); return builder; @@ -217,7 +231,9 @@ public boolean equals(Object o) { if (o == null || getClass() != o.getClass()) return false; Disk disk = (Disk) o; return Objects.equals(describeHighWatermark(), disk.describeHighWatermark()) + && Objects.equals(highMaxHeadroom, disk.highMaxHeadroom) && Objects.equals(describeFloodStageWatermark(), disk.describeFloodStageWatermark()) + && Objects.equals(floodStageMaxHeadroom, disk.floodStageMaxHeadroom) && Objects.equals(describeFrozenFloodStageWatermark(), disk.describeFrozenFloodStageWatermark()) && Objects.equals(frozenFloodStageMaxHeadroom, disk.frozenFloodStageMaxHeadroom); } @@ -226,7 +242,9 @@ public boolean equals(Object o) { public int hashCode() { return Objects.hash( describeHighWatermark(), + highMaxHeadroom, describeFloodStageWatermark(), + floodStageMaxHeadroom, describeFrozenFloodStageWatermark(), frozenFloodStageMaxHeadroom ); @@ -243,13 +261,17 @@ static Builder newBuilder(Disk disk) { public static class Builder { private RelativeByteSizeValue highWatermark; + private ByteSizeValue highMaxHeadroom; private RelativeByteSizeValue floodStageWatermark; + private ByteSizeValue floodStageMaxHeadroom; private RelativeByteSizeValue frozenFloodStageWatermark; private ByteSizeValue frozenFloodStageMaxHeadroom; private Builder(Disk disk) { this.highWatermark = disk.highWatermark; + this.highMaxHeadroom = disk.highMaxHeadroom; this.floodStageWatermark = disk.floodStageWatermark; + this.floodStageMaxHeadroom = disk.floodStageMaxHeadroom; this.frozenFloodStageWatermark = disk.frozenFloodStageWatermark; this.frozenFloodStageMaxHeadroom = 
disk.frozenFloodStageMaxHeadroom; } @@ -261,6 +283,15 @@ Disk.Builder highWatermark(RelativeByteSizeValue highWatermark) { return this; } + Disk.Builder highMaxHeadroom(ByteSizeValue highMaxHeadroom) { + this.highMaxHeadroom = highMaxHeadroom; + return this; + } + + Disk.Builder highMaxHeadroom(String highMaxHeadroom, String setting) { + return highMaxHeadroom(ByteSizeValue.parseBytesSizeValue(highMaxHeadroom, setting)); + } + Disk.Builder highWatermark(String highWatermark, String setting) { return highWatermark(RelativeByteSizeValue.parseRelativeByteSizeValue(highWatermark, setting)); } @@ -274,6 +305,15 @@ public Disk.Builder floodStageWatermark(String floodStageWatermark, String setti return floodStageWatermark(RelativeByteSizeValue.parseRelativeByteSizeValue(floodStageWatermark, setting)); } + Disk.Builder floodStageMaxHeadroom(ByteSizeValue floodStageMaxHeadroom) { + this.floodStageMaxHeadroom = floodStageMaxHeadroom; + return this; + } + + Disk.Builder floodStageMaxHeadroom(String floodStageMaxHeadroom, String setting) { + return floodStageMaxHeadroom(ByteSizeValue.parseBytesSizeValue(floodStageMaxHeadroom, setting)); + } + Disk.Builder frozenFloodStageWatermark(RelativeByteSizeValue frozenFloodStageWatermark) { this.frozenFloodStageWatermark = frozenFloodStageWatermark; return this; @@ -293,7 +333,14 @@ Disk.Builder frozenFloodStageMaxHeadroom(String frozenFloodStageMaxHeadroom, Str } Disk build() { - return new Disk(highWatermark, floodStageWatermark, frozenFloodStageWatermark, frozenFloodStageMaxHeadroom); + return new Disk( + highWatermark, + highMaxHeadroom, + floodStageWatermark, + floodStageMaxHeadroom, + frozenFloodStageWatermark, + frozenFloodStageMaxHeadroom + ); } } } diff --git a/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java b/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java index 2293ca31c8b7d..3b50368db4e3e 100644 --- a/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java +++ b/server/src/main/java/org/elasticsearch/health/metadata/HealthMetadataService.java @@ -24,7 +24,6 @@ import org.elasticsearch.common.io.stream.NamedWriteableRegistry; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; -import org.elasticsearch.common.unit.RelativeByteSizeValue; import org.elasticsearch.core.Nullable; import org.elasticsearch.xcontent.NamedXContentRegistry; import org.elasticsearch.xcontent.ParseField; @@ -32,8 +31,10 @@ import java.util.List; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING; -import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING; import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING; +import static org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING; import static 
org.elasticsearch.cluster.routing.allocation.DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING; import static org.elasticsearch.health.node.selection.HealthNodeTaskExecutor.ENABLED_SETTING; @@ -69,15 +70,29 @@ public HealthMetadataService(ClusterService clusterService, Settings settings) { ClusterSettings clusterSettings = clusterService.getClusterSettings(); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING, - value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), value) + value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), value.getStringRep()) + ); + clusterSettings.addSettingsUpdateConsumer( + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING, + value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), value.getStringRep()) ); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING, - value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), value) + value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), value.getStringRep()) + ); + clusterSettings.addSettingsUpdateConsumer( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING, + value -> updateOnSettingsUpdated( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), + value.getStringRep() + ) ); clusterSettings.addSettingsUpdateConsumer( - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING, - value -> updateOnSettingsUpdated(CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING.getKey(), value.getStringRep()) + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING, + value -> updateOnSettingsUpdated( + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + value.getStringRep() + ) ); clusterSettings.addSettingsUpdateConsumer( CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING, @@ -200,9 +215,15 @@ ClusterState execute(ClusterState clusterState) { if (CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey().equals(setting)) { builder.highWatermark(value, setting); } + if (CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey().equals(setting)) { + builder.highMaxHeadroom(value, setting); + } if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey().equals(setting)) { builder.floodStageWatermark(value, setting); } + if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey().equals(setting)) { + builder.floodStageMaxHeadroom(value, setting); + } if (CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey().equals(setting)) { builder.frozenFloodStageWatermark(value, setting); } @@ -233,15 +254,11 @@ ClusterState execute(ClusterState clusterState) { HealthMetadata initialHealthMetadata = HealthMetadata.getHealthCustomMetadata(clusterState); final var finalHealthMetadata = new HealthMetadata( new HealthMetadata.Disk( - RelativeByteSizeValue.parseRelativeByteSizeValue( - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings), - CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey() - ), - RelativeByteSizeValue.parseRelativeByteSizeValue( - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings), - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey() - 
), - CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_SETTING.get(settings), + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.get(settings), + CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.get(settings), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.get(settings), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.get(settings), + CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.get(settings), CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.get(settings) ) ); diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java index 1e3be3f54c205..363ecb8d78490 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdMonitorTests.java @@ -60,7 +60,7 @@ public class DiskThresholdMonitorTests extends ESAllocationTestCase { - public void testMarkFloodStageIndicesReadOnly() { + private void doTestMarkFloodStageIndicesReadOnly(boolean testMaxHeadroom) { AllocationService allocation = createAllocationService( Settings.builder().put("cluster.routing.allocation.node_concurrent_recoveries", 10).build() ); @@ -128,10 +128,26 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi } }; + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; Map builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4)); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 30)); - builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 100))); + builder.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(19).getBytes() : 4) + ); + builder.put( + "node2", + new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(200).getBytes() : 30) + ); + builder.put( + "frozen", + new DiskUsage( + "frozen", + "frozen", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); final ClusterInfo initialClusterInfo = clusterInfo(builder); monitor.onNewInfo(initialClusterInfo); assertTrue(reroute.get()); // reroute on new nodes @@ -144,9 +160,24 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi indices.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4)); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5)); - builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(19).getBytes() : 4) + ); + builder.put( + "node2", + new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(20).getBytes() : 5) + ); + builder.put( + "frozen", + new DiskUsage( + "frozen", + "frozen", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); currentTime.addAndGet(randomLongBetween(60000, 120000)); monitor.onNewInfo(clusterInfo(builder)); assertTrue(reroute.get()); @@ -195,15 +226,38 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi indices.set(null); reroute.set(false); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 4)); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 5)); - builder.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(19).getBytes() : 4) + ); + builder.put( + "node2", + new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(20).getBytes() : 5) + ); + builder.put( + "frozen", + new DiskUsage( + "frozen", + "frozen", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder)); assertTrue(reroute.get()); assertEquals(Collections.singleton("test_1"), indices.get()); } - public void testDoesNotSubmitRerouteTaskTooFrequently() { + public void testMarkFloodStageIndicesReadOnlyWithPercentages() { + doTestMarkFloodStageIndicesReadOnly(false); + } + + public void testMarkFloodStageIndicesReadOnlyWithMaxHeadroom() { + doTestMarkFloodStageIndicesReadOnly(true); + } + + private void doTestDoesNotSubmitRerouteTaskTooFrequently(boolean testMaxHeadroom) { final ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) .nodes(DiscoveryNodes.builder().add(newNormalNode("node1")).add(newNormalNode("node2"))) .build(); @@ -227,13 +281,32 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi } }; + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; Map allDisksOk = new HashMap<>(); - allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, 50)); - allDisksOk.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50)); + allDisksOk.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(500).getBytes() : 50) + ); + allDisksOk.put( + "node2", + new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(500).getBytes() : 50) + ); Map oneDiskAboveWatermark = new HashMap<>(); - oneDiskAboveWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9))); - oneDiskAboveWatermark.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, 50)); + oneDiskAboveWatermark.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(21, 99)).getBytes() : between(5, 9) + ) + ); + oneDiskAboveWatermark.put( + "node2", + new DiskUsage("node2", "node2", "/foo/bar", totalBytes, testMaxHeadroom ? 
ByteSizeValue.ofGb(500).getBytes() : 50) + ); // should reroute when receiving info about previously-unknown nodes currentTime.addAndGet(randomLongBetween(0, 120000)); @@ -326,7 +399,10 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi // should reroute again when one disk has reserved space that pushes it over the high watermark Map reservedSpaces = Map.of( new ClusterInfo.NodeAndPath("node1", "/foo/bar"), - new ClusterInfo.ReservedSpace.Builder().add(new ShardId("baz", "quux", 0), between(41, 100)).build() + new ClusterInfo.ReservedSpace.Builder().add( + new ShardId("baz", "quux", 0), + testMaxHeadroom ? ByteSizeValue.ofGb(between(401, 10000)).getBytes() : between(41, 100) + ).build() ); currentTime.addAndGet( @@ -338,10 +414,17 @@ protected void updateIndicesReadOnly(Set indicesToMarkReadOnly, ActionLi monitor.onNewInfo(clusterInfo(allDisksOk, reservedSpaces)); assertNotNull(listenerReference.get()); listenerReference.getAndSet(null).onResponse(null); + } + + public void testDoesNotSubmitRerouteTaskTooFrequentlyWithPercentages() { + doTestDoesNotSubmitRerouteTaskTooFrequently(false); + } + public void testDoesNotSubmitRerouteTaskTooFrequentlyWithMaxHeadroom() { + doTestDoesNotSubmitRerouteTaskTooFrequently(true); } - public void testAutoReleaseIndices() { + private void doTestAutoReleaseIndices(boolean testMaxHeadroom) { AtomicReference> indicesToMarkReadOnly = new AtomicReference<>(); AtomicReference> indicesToRelease = new AtomicReference<>(); AllocationService allocation = createAllocationService( @@ -362,13 +445,15 @@ public void testAutoReleaseIndices() { ); assertThat(shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(8)); + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + Map reservedSpaces = new HashMap<>(); - final int reservedSpaceNode1 = between(0, 10); + final long reservedSpaceNode1 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 100)).getBytes() : between(0, 10); reservedSpaces.put( new ClusterInfo.NodeAndPath("node1", "/foo/bar"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode1).build() ); - final int reservedSpaceNode2 = between(0, 10); + final long reservedSpaceNode2 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 100)).getBytes() : between(0, 10); reservedSpaces.put( new ClusterInfo.NodeAndPath("node2", "/foo/bar"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode2).build() @@ -399,8 +484,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); Map builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indicesToMarkReadOnly.get()); assertNull(indicesToRelease.get()); @@ -409,8 +512,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 90))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 90))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 9900)).getBytes() : between(5, 90) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 9900)).getBytes() : between(5, 90) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertNull(indicesToMarkReadOnly.get()); assertNull(indicesToRelease.get()); @@ -456,8 +577,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 100))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertThat(indicesToMarkReadOnly.get(), contains("test_1")); assertNull(indicesToRelease.get()); @@ -466,8 +605,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertNull(indicesToMarkReadOnly.get()); assertThat(indicesToRelease.get(), contains("test_2")); @@ -476,7 +633,16 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder)); assertThat(indicesToMarkReadOnly.get(), contains("test_1")); assertNull(indicesToRelease.get()); @@ -485,10 +651,37 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 100))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 99)).getBytes() : between(5, 9) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 10000)).getBytes() : between(5, 100) + ) + ); if (randomBoolean()) { - builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100))); + builder.put( + "node3", + new DiskUsage( + "node3", + "node3", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); } monitor.onNewInfo(clusterInfo(builder)); assertNull(indicesToMarkReadOnly.get()); @@ -498,9 +691,27 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 100))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 10000)).getBytes() : between(5, 100) + ) + ); if (randomBoolean()) { - builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100))); + builder.put( + "node3", + new DiskUsage( + "node3", + "node3", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); } monitor.onNewInfo(clusterInfo(builder)); assertNull(indicesToMarkReadOnly.get()); @@ -510,16 +721,42 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); if (randomBoolean()) { - builder.put("node3", new DiskUsage("node3", "node3", "/foo/bar", 100, between(0, 100))); + builder.put( + "node3", + new DiskUsage( + "node3", + "node3", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); } monitor.onNewInfo(clusterInfo(builder)); assertThat(indicesToMarkReadOnly.get(), contains("test_1")); assertNull(indicesToRelease.get()); } - public void testNoAutoReleaseOfIndicesOnReplacementNodes() { + public void testAutoReleaseIndicesWithPercentages() { + doTestAutoReleaseIndices(false); + } + + public void testAutoReleaseIndicesWithMaxHeadroom() { + doTestAutoReleaseIndices(true); + } + + private void doTestNoAutoReleaseOfIndicesOnReplacementNodes(boolean testMaxHeadroom) { AtomicReference> indicesToMarkReadOnly = new AtomicReference<>(); AtomicReference> indicesToRelease = new AtomicReference<>(); AtomicReference currentClusterState = new AtomicReference<>(); @@ -541,13 +778,15 @@ public void testNoAutoReleaseOfIndicesOnReplacementNodes() { ); assertThat(RoutingNodesHelper.shardsWithState(clusterState.getRoutingNodes(), ShardRoutingState.STARTED).size(), equalTo(8)); + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + Map reservedSpaces = new HashMap<>(); - final int reservedSpaceNode1 = between(0, 10); + final long reservedSpaceNode1 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 100)).getBytes() : between(0, 10); reservedSpaces.put( new ClusterInfo.NodeAndPath("node1", "/foo/bar"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode1).build() ); - final int reservedSpaceNode2 = between(0, 10); + final long reservedSpaceNode2 = testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 100)).getBytes() : between(0, 10); reservedSpaces.put( new ClusterInfo.NodeAndPath("node2", "/foo/bar"), new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), reservedSpaceNode2).build() @@ -580,8 +819,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); Map builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertEquals(new HashSet<>(Arrays.asList("test_1", "test_2")), indicesToMarkReadOnly.get()); assertNull(indicesToRelease.get()); @@ -590,8 +847,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 90))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(5, 90))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(20, 9900)).getBytes() : between(5, 90) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(20, 9900)).getBytes() : between(5, 90) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertNull(indicesToMarkReadOnly.get()); assertNull(indicesToRelease.get()); @@ -643,12 +918,30 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener currentClusterState.set(clusterStateWithBlocks); - // When free disk on any of node1 or node2 goes below 5% flood watermark, then apply index block on indices not having the block + // When free disk on any of node1 or node2 goes below the flood watermark, then apply index block on indices not having the block indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 100))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(0, 4))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 10000)).getBytes() : between(0, 100) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(0, 19)).getBytes() : between(0, 4) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertThat(indicesToMarkReadOnly.get(), contains("test_1")); assertNull(indicesToRelease.get()); @@ -657,8 +950,26 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertNull(indicesToMarkReadOnly.get()); assertNull(indicesToRelease.get()); @@ -676,15 +987,40 @@ protected void updateIndicesReadOnly(Set indicesToUpdate, ActionListener indicesToMarkReadOnly.set(null); indicesToRelease.set(null); builder = new HashMap<>(); - builder.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 100))); - builder.put("node2", new DiskUsage("node2", "node2", "/foo/bar", 100, between(10, 100))); + builder.put( + "node1", + new DiskUsage( + "node1", + "node1", + "/foo/bar", + totalBytes, + testMaxHeadroom ? ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); + builder.put( + "node2", + new DiskUsage( + "node2", + "node2", + "/foo/bar", + totalBytes, + testMaxHeadroom ? 
ByteSizeValue.ofGb(between(100, 10000)).getBytes() : between(10, 100) + ) + ); monitor.onNewInfo(clusterInfo(builder, reservedSpaces)); assertNull(indicesToMarkReadOnly.get()); assertThat(indicesToRelease.get(), contains("test_2")); } - @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging") - public void testDiskMonitorLogging() throws IllegalAccessException { + public void testNoAutoReleaseOfIndicesOnReplacementNodesWithPercentages() { + doTestNoAutoReleaseOfIndicesOnReplacementNodes(false); + } + + public void testNoAutoReleaseOfIndicesOnReplacementNodesWithMaxHeadroom() { + doTestNoAutoReleaseOfIndicesOnReplacementNodes(true); + } + + private void doTestDiskMonitorLogging(boolean testHeadroom) throws IllegalAccessException { final ClusterState clusterState = ClusterState.builder(ClusterName.CLUSTER_NAME_SETTING.getDefault(Settings.EMPTY)) .nodes(DiscoveryNodes.builder().add(newNormalNode("node1")).add(newFrozenOnlyNode("frozen"))) .build(); @@ -725,53 +1061,62 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste } }; + long thousandTb = ByteSizeValue.ofTb(1000).getBytes(); + long total = testHeadroom ? thousandTb : 100; + Map allDisksOk = new HashMap<>(); - allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100))); - if (randomBoolean()) { - allDisksOk.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(15, 100))); - } else { - allDisksOk.put( + allDisksOk.put("node1", new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(150, 1000) : between(15, 100))); + allDisksOk.put( + "frozen", + new DiskUsage( "frozen", - new DiskUsage( - "frozen", - "frozen", - "/foo/bar", - ByteSizeValue.ofGb(1000).getBytes(), - (randomBoolean() ? ByteSizeValue.ofGb(between(20, 1000)) : ByteSizeValue.ofGb(between(20, 50))).getBytes() - ) - ); - } + "frozen", + "/foo/bar", + total, + testHeadroom ? (randomBoolean() ? betweenGb(20, 1000) : betweenGb(20, 50)) : between(15, 100) + ) + ); Map aboveLowWatermark = new HashMap<>(); - aboveLowWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(10, 14))); - aboveLowWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(10, 14))); + aboveLowWatermark.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(100, 149) : between(10, 14)) + ); + aboveLowWatermark.put( + "frozen", + new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(100, 149) : between(10, 14)) + ); Map aboveHighWatermark = new HashMap<>(); - aboveHighWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(5, 9))); - aboveHighWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(5, 9))); + aboveHighWatermark.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(20, 99) : between(5, 9)) + ); + aboveHighWatermark.put( + "frozen", + new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(20, 99) : between(5, 9)) + ); Map aboveFloodStageWatermark = new HashMap<>(); - aboveFloodStageWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(0, 4))); + aboveFloodStageWatermark.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(0, 19) : between(0, 4)) + ); // frozen is below flood stage, so no logging from it. 
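    // A minimal sketch of how the free-space ranges in this test line up with the default thresholds,
    // assuming the max headroom simply caps the free space that a percentage watermark would otherwise
    // require; the helper below is illustrative and is not the production API.
    private static long effectiveFreeBytesThreshold(long totalBytes, double usedPercentWatermark, long maxHeadroomBytes) {
        // A watermark of, say, 85% used means at least 15% of the disk must stay free.
        long freeRequiredByPercent = (long) Math.ceil(totalBytes * (100.0 - usedPercentWatermark) / 100.0);
        // On large disks the default max headrooms (150gb/100gb/20gb) cap that requirement; -1 disables the cap.
        return maxHeadroomBytes < 0 ? freeRequiredByPercent : Math.min(freeRequiredByPercent, maxHeadroomBytes);
    }
    // For the 100-byte disk used in percentage mode the defaults resolve to 15/10/5 bytes free, and for the
    // 1000tb disk used in max headroom mode they resolve to 150gb/100gb/20gb free, which is why the between()
    // and betweenGb() ranges above straddle exactly those values.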
- aboveFloodStageWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(5, 9))); + aboveFloodStageWatermark.put( + "frozen", + new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(0, 19) : between(5, 9)) + ); Map frozenAboveFloodStageWatermark = new HashMap<>(); // node1 is below low watermark, so no logging from it. - frozenAboveFloodStageWatermark.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100))); - frozenAboveFloodStageWatermark.put("frozen", new DiskUsage("frozen", "frozen", "/foo/bar", 100, between(0, 4))); - - Map frozenAboveFloodStageMaxHeadroom = new HashMap<>(); - // node1 is below low watermark, so no logging from it. - frozenAboveFloodStageMaxHeadroom.put("node1", new DiskUsage("node1", "node1", "/foo/bar", 100, between(15, 100))); - frozenAboveFloodStageMaxHeadroom.put( + frozenAboveFloodStageWatermark.put( + "node1", + new DiskUsage("node1", "node1", "/foo/bar", total, testHeadroom ? betweenGb(150, 1000) : between(15, 100)) + ); + frozenAboveFloodStageWatermark.put( "frozen", - new DiskUsage( - "frozen", - "frozen", - "/foo/bar", - ByteSizeValue.ofGb(1000).getBytes(), - ByteSizeValue.ofGb(between(0, 19)).getBytes() - ) + new DiskUsage("frozen", "frozen", "/foo/bar", total, testHeadroom ? betweenGb(0, 19) : between(0, 4)) ); advanceTime.set(true); // first check sees new nodes and triggers a reroute @@ -779,17 +1124,23 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste advanceTime.set(randomBoolean()); // no new nodes so no reroute delay needed assertNoLogging(monitor, allDisksOk); + String lowWatermarkString = testHeadroom ? "max_headroom=150gb" : "85%"; + String highWatermarkString = testHeadroom ? "max_headroom=100gb" : "90%"; + String floodWatermarkString = testHeadroom ? 
"max_headroom=20gb" : "95%"; + assertSingleInfoMessage( monitor, aboveLowWatermark, - "low disk watermark [85%] exceeded on *node1* replicas will not be assigned to this node" + "low disk watermark [" + lowWatermarkString + "] exceeded on *node1* replicas will not be assigned to this node" ); advanceTime.set(false); // will do one reroute and emit warnings, but subsequent reroutes and associated messages are delayed assertSingleWarningMessage( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to continue to exceed the high disk watermark when these relocations are complete" ); @@ -797,7 +1148,9 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste assertRepeatedWarningMessages( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to continue to exceed the high disk watermark when these relocations are complete" ); @@ -805,15 +1158,19 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste assertRepeatedWarningMessages( monitor, aboveFloodStageWatermark, - "flood stage disk watermark [95%] exceeded on *node1* all indices on this node will be marked read-only" + "flood stage disk watermark [" + + floodWatermarkString + + "] exceeded on *node1* all indices on this node will be marked read-only" ); - relocatingShardSizeRef.set(-5L); + relocatingShardSizeRef.set(testHeadroom ? 
(-1L) * ByteSizeValue.ofGb(80).getBytes() : -5L); advanceTime.set(true); assertSingleInfoMessage( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to be below the high disk watermark when these relocations are complete" ); @@ -823,7 +1180,9 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste assertSingleWarningMessage( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to continue to exceed the high disk watermark when these relocations are complete" ); @@ -831,7 +1190,9 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste assertRepeatedWarningMessages( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to continue to exceed the high disk watermark when these relocations are complete" ); @@ -839,56 +1200,78 @@ long sizeOfRelocatingShards(RoutingNode routingNode, DiskUsage diskUsage, Cluste assertSingleInfoMessage( monitor, aboveLowWatermark, - "high disk watermark [90%] no longer exceeded on *node1* but low disk watermark [85%] is still exceeded" + "high disk watermark [" + + highWatermarkString + + "] no longer exceeded on *node1* but low disk watermark [" + + lowWatermarkString + + "] is still exceeded" ); advanceTime.set(true); // only log about dropping below the low disk watermark on a reroute - assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*"); + assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*"); advanceTime.set(randomBoolean()); assertRepeatedWarningMessages( monitor, aboveFloodStageWatermark, - "flood stage disk watermark [95%] exceeded on *node1* all indices on this node will be marked read-only" + "flood stage disk watermark [" + + floodWatermarkString + + "] exceeded on *node1* all indices on this node will be marked read-only" ); - assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*"); + assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*"); advanceTime.set(true); assertRepeatedWarningMessages( monitor, aboveHighWatermark, - "high disk watermark [90%] exceeded on *node1* shards will be relocated away from this node* " + "high disk watermark [" + + highWatermarkString + + "] exceeded on *node1* shards will be relocated away from this node* " + "the node is expected to continue to exceed the high disk watermark when these relocations are complete" ); - assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*"); + assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*"); assertRepeatedWarningMessages( monitor, aboveFloodStageWatermark, - "flood stage disk watermark [95%] exceeded on *node1* all indices 
on this node will be marked read-only" + "flood stage disk watermark [" + + floodWatermarkString + + "] exceeded on *node1* all indices on this node will be marked read-only" ); assertSingleInfoMessage( monitor, aboveLowWatermark, - "high disk watermark [90%] no longer exceeded on *node1* but low disk watermark [85%] is still exceeded" + "high disk watermark [" + + highWatermarkString + + "] no longer exceeded on *node1* but low disk watermark [" + + lowWatermarkString + + "] is still exceeded" ); - assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [85%] no longer exceeded on *node1*"); - - assertRepeatedWarningMessages(monitor, frozenAboveFloodStageWatermark, "flood stage disk watermark [95%] exceeded on *frozen*"); + assertSingleInfoMessage(monitor, allDisksOk, "low disk watermark [" + lowWatermarkString + "] no longer exceeded on *node1*"); assertRepeatedWarningMessages( monitor, - frozenAboveFloodStageMaxHeadroom, - "flood stage disk watermark [max_headroom=20gb] exceeded on *frozen*" + frozenAboveFloodStageWatermark, + "flood stage disk watermark [" + floodWatermarkString + "] exceeded on *frozen*" ); assertNoLogging(monitor, allDisksOk); } + @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging") + public void testDiskMonitorLoggingWithPercentages() throws IllegalAccessException { + doTestDiskMonitorLogging(false); + } + + @TestLogging(value = "org.elasticsearch.cluster.routing.allocation.DiskThresholdMonitor:INFO", reason = "testing INFO/WARN logging") + public void testDiskMonitorLoggingWithMaxHeadrooms() throws IllegalAccessException { + doTestDiskMonitorLogging(true); + } + private void assertNoLogging(DiskThresholdMonitor monitor, Map diskUsages) throws IllegalAccessException { MockLogAppender mockAppender = new MockLogAppender(); mockAppender.start(); @@ -956,6 +1339,10 @@ private void assertLogging(DiskThresholdMonitor monitor, Map mockAppender.stop(); } + private static long betweenGb(int min, int max) { + return ByteSizeValue.ofGb(between(min, max)).getBytes(); + } + private static ClusterInfo clusterInfo(Map diskUsages) { return clusterInfo(diskUsages, Map.of()); } @@ -996,4 +1383,5 @@ private static DiscoveryNode newNormalNode(String nodeId, String nodeName) { private static DiscoveryNode newNormalNode(String nodeId) { return newNormalNode(nodeId, ""); } + } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java index f0a2aa507b185..579bd0f66fa93 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/DiskThresholdSettingsTests.java @@ -12,6 +12,7 @@ import org.elasticsearch.common.settings.Settings; import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.test.ESTestCase; +import org.hamcrest.Matchers; import java.util.Locale; @@ -26,40 +27,344 @@ public void testDefaults() { ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); - ByteSizeValue zeroBytes = ByteSizeValue.parseBytesSizeValue("0b", "test"); - assertEquals(zeroBytes, diskThresholdSettings.getFreeBytesThresholdHigh()); - assertEquals(10.0D, 
diskThresholdSettings.getFreeDiskThresholdHigh(), 0.0D); - assertEquals(zeroBytes, diskThresholdSettings.getFreeBytesThresholdLow()); - assertEquals(15.0D, diskThresholdSettings.getFreeDiskThresholdLow(), 0.0D); assertEquals(60L, diskThresholdSettings.getRerouteInterval().seconds()); assertTrue(diskThresholdSettings.isEnabled()); - assertEquals(zeroBytes, diskThresholdSettings.getFreeBytesThresholdFloodStage()); - assertEquals(5.0D, diskThresholdSettings.getFreeDiskThresholdFloodStage(), 0.0D); + + // Test default watermark percentages + ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test"); + assertEquals(ByteSizeValue.ofBytes(15), diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(10), diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes)); + assertEquals("85%", diskThresholdSettings.describeLowThreshold(hundredBytes, false)); + assertEquals("90%", diskThresholdSettings.describeHighThreshold(hundredBytes, false)); + assertEquals("95%", diskThresholdSettings.describeFloodStageThreshold(hundredBytes, false)); + assertEquals("95%", diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, false)); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey() + "=" + "85%", + diskThresholdSettings.describeLowThreshold(hundredBytes, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey() + "=" + "90%", + diskThresholdSettings.describeHighThreshold(hundredBytes, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey() + "=" + "95%", + diskThresholdSettings.describeFloodStageThreshold(hundredBytes, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey() + "=" + "95%", + diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, true) + ); + + // Test default watermark max headroom values + ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test"); + ByteSizeValue lowHeadroom = ByteSizeValue.parseBytesSizeValue("150gb", "test"); + ByteSizeValue highHeadroom = ByteSizeValue.parseBytesSizeValue("100gb", "test"); + ByteSizeValue floodHeadroom = ByteSizeValue.parseBytesSizeValue("20gb", "test"); + ByteSizeValue frozenFloodHeadroom = ByteSizeValue.parseBytesSizeValue("20gb", "test"); + assertEquals(lowHeadroom, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb)); + assertEquals(highHeadroom, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb)); + assertEquals(floodHeadroom, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb)); + assertEquals(frozenFloodHeadroom, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb)); + assertEquals("max_headroom=150gb", diskThresholdSettings.describeLowThreshold(thousandTb, false)); + assertEquals("max_headroom=100gb", diskThresholdSettings.describeHighThreshold(thousandTb, false)); + assertEquals("max_headroom=20gb", diskThresholdSettings.describeFloodStageThreshold(thousandTb, false)); + assertEquals("max_headroom=20gb", diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, false)); + assertEquals( + 
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + "150gb", + diskThresholdSettings.describeLowThreshold(thousandTb, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + "100gb", + diskThresholdSettings.describeHighThreshold(thousandTb, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey() + "=" + "20gb", + diskThresholdSettings.describeFloodStageThreshold(thousandTb, true) + ); + assertEquals( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey() + "=" + "20gb", + diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, true) + ); + } + + public void testMinimumTotalSizeForBelowLowWatermark() { + ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); + + // Test default values + + // For 850 used bytes, we need 850 / 0.85 = 1000 total bytes. + assertEquals( + ByteSizeValue.ofBytes(1000), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850)) + ); + // For 100TB used bytes, the max headroom should cap the minimum required free space to 150GB. So we need 100TB+150GB total bytes. + assertEquals( + ByteSizeValue.ofBytes(ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(150).getBytes()), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100)) + ); + + // Test random factor. Stay in low values so max headroom does not apply. + final long factor = between(1, 1000); + assertThat( + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(85 * factor)), + Matchers.equalTo(ByteSizeValue.ofBytes(100L * factor)) + ); + + // Test absolute values + + Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1gb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "100mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "10mb") + .build(); + nss.applySettings(newSettings); + + // For 850 used bytes, we need 850b + 1GB total bytes. + assertEquals( + ByteSizeValue.ofBytes(ByteSizeValue.ofGb(1).getBytes() + ByteSizeValue.ofBytes(850).getBytes()), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850)) + ); + // For 100TB used bytes, we need 100TB+1GB total bytes. + assertEquals( + ByteSizeValue.ofBytes(ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(1).getBytes()), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100)) + ); + + newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "0.50") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "-1") + .build(); + nss.applySettings(newSettings); + + // For 850 used bytes, we need 850 / 0.5 = 1700 total bytes + assertEquals( + ByteSizeValue.ofBytes(1700), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850)) + ); + // For 100TB used bytes, we need 100TB / 0.5 total bytes. 
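    // A worked sketch of the arithmetic in the comments above (hypothetical helper, for illustration only):
    // with a fractional low watermark the minimum total is used / fraction, while on large disks the max
    // headroom caps the free space that must be kept, so used + maxHeadroom becomes the smaller requirement.
    private static long minimumTotalForBelowLowWatermarkSketch(long usedBytes, double lowWatermarkFraction, long maxHeadroomBytes) {
        long totalFromPercentage = (long) Math.ceil(usedBytes / lowWatermarkFraction);   // e.g. 850 / 0.85 = 1000
        if (maxHeadroomBytes < 0) {
            return totalFromPercentage;                                                  // -1 disables the headroom cap
        }
        long totalFromHeadroom = usedBytes + maxHeadroomBytes;                           // e.g. 100tb + 150gb
        return Math.min(totalFromPercentage, totalFromHeadroom);
    }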
+ assertEquals( + ByteSizeValue.ofBytes((long) (ByteSizeValue.ofTb(100).getBytes() / 0.5)), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100)) + ); + + newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "0.50") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "200gb") + .build(); + nss.applySettings(newSettings); + + // For 850 used bytes, we need 850 / 0.5 = 1700 total bytes + assertEquals( + ByteSizeValue.ofBytes(1700), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(850)) + ); + // For 100TB used bytes, the max headroom should cap the minimum required free space to 200GB. So we need 100TB+200GB total bytes. + assertEquals( + ByteSizeValue.ofBytes(ByteSizeValue.ofTb(100).getBytes() + ByteSizeValue.ofGb(200).getBytes()), + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofTb(100)) + ); + + // Test random percentage + + // to make it easy, stay below high watermark. + final long percentage = between(1, 89); + newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), percentage + "%") + .build(); + nss.applySettings(newSettings); + + assertThat( + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(percentage * factor)), + Matchers.equalTo(ByteSizeValue.ofBytes(100L * factor)) + ); + + // Test random absolute values + + final long absolute = between(1, 1000); + newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), absolute + "b") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), absolute + "b") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), absolute + "b") + .build(); + nss.applySettings(newSettings); + + long needed = between(0, 1000); + assertThat( + diskThresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(needed)), + Matchers.equalTo(ByteSizeValue.ofBytes(needed + absolute)) + ); } - public void testUpdate() { + public void testUpdateWatermarkByteValues() { ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); Settings newSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "500mb") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1000mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "500mb") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "250mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), "150mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s") + .build(); + nss.applySettings(newSettings); + + ByteSizeValue total = ByteSizeValue.parseBytesSizeValue("1000tb", "test"); + assertEquals(ByteSizeValue.parseBytesSizeValue("1000mb", "test"), diskThresholdSettings.getFreeBytesThresholdLowStage(total)); + 
assertEquals(ByteSizeValue.parseBytesSizeValue("500mb", "test"), diskThresholdSettings.getFreeBytesThresholdHighStage(total)); + assertEquals(ByteSizeValue.parseBytesSizeValue("250mb", "test"), diskThresholdSettings.getFreeBytesThresholdFloodStage(total)); + assertEquals( + ByteSizeValue.parseBytesSizeValue("150mb", "test"), + diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(total) + ); + assertEquals(30L, diskThresholdSettings.getRerouteInterval().seconds()); + assertFalse(diskThresholdSettings.isEnabled()); + } + + public void testUpdateWatermarkPercentageValues() { + ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); + + Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), randomBoolean() ? "50%" : "0.50") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), randomBoolean() ? "60%" : "0.60") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + randomBoolean() ? "75%" : "0.75" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + randomBoolean() ? "80%" : "0.80" + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s") .build(); nss.applySettings(newSettings); - assertEquals(ByteSizeValue.parseBytesSizeValue("500mb", "test"), diskThresholdSettings.getFreeBytesThresholdHigh()); - assertEquals(0.0D, diskThresholdSettings.getFreeDiskThresholdHigh(), 0.0D); - assertEquals(ByteSizeValue.parseBytesSizeValue("1000mb", "test"), diskThresholdSettings.getFreeBytesThresholdLow()); - assertEquals(0.0D, diskThresholdSettings.getFreeDiskThresholdLow(), 0.0D); - assertEquals(ByteSizeValue.parseBytesSizeValue("250mb", "test"), diskThresholdSettings.getFreeBytesThresholdFloodStage()); - assertEquals(0.0D, diskThresholdSettings.getFreeDiskThresholdFloodStage(), 0.0D); + ByteSizeValue total = ByteSizeValue.parseBytesSizeValue("100b", "test"); + assertEquals(ByteSizeValue.parseBytesSizeValue("50b", "test"), diskThresholdSettings.getFreeBytesThresholdLowStage(total)); + assertEquals(ByteSizeValue.parseBytesSizeValue("40b", "test"), diskThresholdSettings.getFreeBytesThresholdHighStage(total)); + assertEquals(ByteSizeValue.parseBytesSizeValue("25b", "test"), diskThresholdSettings.getFreeBytesThresholdFloodStage(total)); + assertEquals(ByteSizeValue.parseBytesSizeValue("20b", "test"), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(total)); assertEquals(30L, diskThresholdSettings.getRerouteInterval().seconds()); assertFalse(diskThresholdSettings.isEnabled()); } + public void testUpdateMaxHeadroomValues() { + ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); + + Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "1000mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "500mb") + 
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "250mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "150mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s") + .build(); + nss.applySettings(newSettings); + + // Test that default percentage values apply + ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test"); + assertEquals(ByteSizeValue.ofBytes(15), diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(10), diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(5), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes)); + + // Test that max headroom values apply + ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test"); + ByteSizeValue lowHeadroom = ByteSizeValue.parseBytesSizeValue("1000mb", "test"); + ByteSizeValue highHeadroom = ByteSizeValue.parseBytesSizeValue("500mb", "test"); + ByteSizeValue floodHeadroom = ByteSizeValue.parseBytesSizeValue("250mb", "test"); + ByteSizeValue frozenFloodHeadroom = ByteSizeValue.parseBytesSizeValue("150mb", "test"); + assertEquals(lowHeadroom, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb)); + assertEquals(highHeadroom, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb)); + assertEquals(floodHeadroom, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb)); + assertEquals(frozenFloodHeadroom, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb)); + } + + public void testUpdateWatermarkAndMaxHeadroomValues() { + ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, nss); + + boolean watermarksAbsolute = randomBoolean(); + boolean lowHeadroomEnabled = randomBoolean(); + boolean highHeadroomEnabled = randomBoolean(); + boolean floodHeadroomEnabled = randomBoolean(); + boolean frozenFloodHeadroomEnabled = randomBoolean(); + + Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), false) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), + watermarksAbsolute ? "50b" : randomBoolean() ? "50%" : "0.50" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), + watermarksAbsolute ? "40b" : randomBoolean() ? "60%" : "0.60" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + watermarksAbsolute ? "30b" : randomBoolean() ? "70%" : "0.70" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + watermarksAbsolute ? "15b" : randomBoolean() ? "85%" : "0.85" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + lowHeadroomEnabled ? "1000mb" : "-1" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + highHeadroomEnabled ? 
"500mb" : "-1" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), + floodHeadroomEnabled ? "250mb" : "-1" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), + frozenFloodHeadroomEnabled ? "150mb" : "-1" + ) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_REROUTE_INTERVAL_SETTING.getKey(), "30s") + .build(); + nss.applySettings(newSettings); + + // Test that watermark values apply + ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test"); + assertEquals(ByteSizeValue.ofBytes(50), diskThresholdSettings.getFreeBytesThresholdLowStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(40), diskThresholdSettings.getFreeBytesThresholdHighStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(30), diskThresholdSettings.getFreeBytesThresholdFloodStage(hundredBytes)); + assertEquals(ByteSizeValue.ofBytes(15), diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(hundredBytes)); + + // Test that max headroom values (if enabled) prevail over percentage watermark values + ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test"); + ByteSizeValue lowExpected = ByteSizeValue.parseBytesSizeValue( + watermarksAbsolute ? "50b" : lowHeadroomEnabled ? "1000mb" : "500tb", + "test" + ); + ByteSizeValue highExpected = ByteSizeValue.parseBytesSizeValue( + watermarksAbsolute ? "40b" : highHeadroomEnabled ? "500mb" : "400tb", + "test" + ); + ByteSizeValue floodExpected = ByteSizeValue.parseBytesSizeValue( + watermarksAbsolute ? "30b" : floodHeadroomEnabled ? "250mb" : "300tb", + "test" + ); + ByteSizeValue frozenFloodExpected = ByteSizeValue.parseBytesSizeValue( + watermarksAbsolute ? "15b" : frozenFloodHeadroomEnabled ? 
"150mb" : "150tb", + "test" + ); + assertEquals(lowExpected, diskThresholdSettings.getFreeBytesThresholdLowStage(thousandTb)); + assertEquals(highExpected, diskThresholdSettings.getFreeBytesThresholdHighStage(thousandTb)); + assertEquals(floodExpected, diskThresholdSettings.getFreeBytesThresholdFloodStage(thousandTb)); + assertEquals(frozenFloodExpected, diskThresholdSettings.getFreeBytesThresholdFrozenFloodStage(thousandTb)); + } + public void testInvalidConstruction() { final Settings settings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") @@ -79,7 +384,7 @@ public void testInvalidLowHighPercentageUpdate() { final Settings newSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "80%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "83.45%") .build(); final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); @@ -88,7 +393,7 @@ public void testInvalidLowHighPercentageUpdate() { assertNotNull(e.getCause()); assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); - assertThat(cause, hasToString(containsString("low disk watermark [90%] more than high disk watermark [80%]"))); + assertThat(cause, hasToString(containsString("low disk watermark [90%] more than high disk watermark [83.45%]"))); } public void testInvalidHighFloodPercentageUpdate() { @@ -96,13 +401,13 @@ public void testInvalidHighFloodPercentageUpdate() { new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater final Settings newSettings = Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "50%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "50.1%") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "60%") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "55%") .build(); final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); - final String expected = "illegal value can't update [cluster.routing.allocation.disk.watermark.low] from [85%] to [50%]"; + final String expected = "illegal value can't update [cluster.routing.allocation.disk.watermark.low] from [85%] to [50.1%]"; assertThat(e, hasToString(containsString(expected))); assertNotNull(e.getCause()); assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); @@ -126,7 +431,7 @@ public void testInvalidLowHighBytesUpdate() { assertNotNull(e.getCause()); assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); - assertThat(cause, hasToString(containsString("low disk watermark [500m] less than high disk watermark [1000m]"))); + assertThat(cause, hasToString(containsString("low disk watermark [500mb] less than high disk watermark [1000mb]"))); } public void testInvalidHighFloodBytesUpdate() { @@ -145,7 +450,7 @@ public void testInvalidHighFloodBytesUpdate() { assertNotNull(e.getCause()); 
assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); - assertThat(cause, hasToString(containsString("low disk watermark [500m] less than high disk watermark [1000m]"))); + assertThat(cause, hasToString(containsString("low disk watermark [500mb] less than high disk watermark [1000mb]"))); } public void testIncompatibleThresholdUpdate() { @@ -155,7 +460,7 @@ public void testIncompatibleThresholdUpdate() { final Settings newSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "1000m") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "95%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "95.2%") .build(); final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); @@ -170,9 +475,9 @@ public void testIncompatibleThresholdUpdate() { DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%", DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), - "1000m", + "1000mb", DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), - "95%" + "95.2%" ); assertThat(cause, hasToString(containsString(incompatibleExpected))); } @@ -195,6 +500,65 @@ public void testInvalidHighDiskThreshold() { assertThat(cause, hasToString(containsString("low disk watermark [85%] more than high disk watermark [75%]"))); } + public void testInvalidLowHighMaxHeadroomUpdate() { + final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater + + final Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "300m") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "750m") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "500m") + .build(); + + final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); + final String expected = + "illegal value can't update [cluster.routing.allocation.disk.watermark.low.max_headroom] from [150gb] to [300m]"; + assertThat(e, hasToString(containsString(expected))); + assertNotNull(e.getCause()); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); + assertThat(cause, hasToString(containsString("high disk max headroom [750mb] more than low disk max headroom [300mb]"))); + } + + public void testInvalidHighFloodMaxHeadroomUpdate() { + final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater + + final Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "400m") + 
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "500m") + .build(); + + final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); + final String expected = + "illegal value can't update [cluster.routing.allocation.disk.watermark.high.max_headroom] from [100gb] to [400m]"; + assertThat(e, hasToString(containsString(expected))); + assertNotNull(e.getCause()); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); + assertThat(cause, hasToString(containsString("flood disk max headroom [500mb] more than high disk max headroom [400mb]"))); + } + + public void testInvalidLowFloodMaxHeadroomUpdate() { + final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater + + final Settings newSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "-1") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "300m") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "500m") + .build(); + + final IllegalArgumentException e = expectThrows(IllegalArgumentException.class, () -> clusterSettings.applySettings(newSettings)); + final String expected = + "illegal value can't update [cluster.routing.allocation.disk.watermark.low.max_headroom] from [150gb] to [300m]"; + assertThat(e, hasToString(containsString(expected))); + assertNotNull(e.getCause()); + assertThat(e.getCause(), instanceOf(IllegalArgumentException.class)); + final IllegalArgumentException cause = (IllegalArgumentException) e.getCause(); + assertThat(cause, hasToString(containsString("flood disk max headroom [500mb] more than low disk max headroom [300mb]"))); + } + public void testSequenceOfUpdates() { final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); new DiskThresholdSettings(Settings.EMPTY, clusterSettings); // this has the effect of registering the settings updater @@ -244,39 +608,175 @@ public void testSequenceOfUpdates() { } } - public void testThresholdDescriptions() { + private void doTestDescriptions(boolean includeKey) { final ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); + ByteSizeValue hundredBytes = ByteSizeValue.parseBytesSizeValue("100b", "test"); + ByteSizeValue thousandTb = ByteSizeValue.parseBytesSizeValue("1000tb", "test"); + String lowWatermarkPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey() + "=" + : ""; + String highWatermarkPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey() + "=" + : ""; + String floodWatermarkPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey() + "=" + : ""; + String frozenFloodWatermarkPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey() + "=" + : ""; + String lowMaxHeadroomPrefix = includeKey + ? 
DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + : "max_headroom="; + String highMaxHeadroomPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey() + "=" + : "max_headroom="; + String floodMaxHeadroomPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey() + "=" + : "max_headroom="; + String frozenFloodMaxHeadroomPrefix = includeKey + ? DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey() + "=" + : "max_headroom="; DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(Settings.EMPTY, clusterSettings); - assertThat(diskThresholdSettings.describeLowThreshold(), equalTo("85%")); - assertThat(diskThresholdSettings.describeHighThreshold(), equalTo("90%")); - assertThat(diskThresholdSettings.describeFloodStageThreshold(), equalTo("95%")); + assertThat(diskThresholdSettings.describeLowThreshold(hundredBytes, includeKey), equalTo(lowWatermarkPrefix + "85%")); + assertThat(diskThresholdSettings.describeHighThreshold(hundredBytes, includeKey), equalTo(highWatermarkPrefix + "90%")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(hundredBytes, includeKey), equalTo(floodWatermarkPrefix + "95%")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, includeKey), + equalTo(frozenFloodWatermarkPrefix + "95%") + ); + + assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowMaxHeadroomPrefix + "150gb")); + assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highMaxHeadroomPrefix + "100gb")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodMaxHeadroomPrefix + "20gb")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey), + equalTo(frozenFloodMaxHeadroomPrefix + "20gb") + ); + + // With 1000GB assert max headrooms diskThresholdSettings = new DiskThresholdSettings( Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "91.2%") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "91.3%") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "91.4%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), + randomBoolean() ? "91.2%" : "0.912" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), + randomBoolean() ? "91.3%" : "0.913" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + randomBoolean() ? "91.4%" : "0.914" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + randomBoolean() ? 
"91.5%" : "0.915" + ) .build(), clusterSettings ); - assertThat(diskThresholdSettings.describeLowThreshold(), equalTo("91.2%")); - assertThat(diskThresholdSettings.describeHighThreshold(), equalTo("91.3%")); - assertThat(diskThresholdSettings.describeFloodStageThreshold(), equalTo("91.4%")); + assertThat(diskThresholdSettings.describeLowThreshold(hundredBytes, includeKey), equalTo(lowWatermarkPrefix + "91.2%")); + assertThat(diskThresholdSettings.describeHighThreshold(hundredBytes, includeKey), equalTo(highWatermarkPrefix + "91.3%")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(hundredBytes, includeKey), equalTo(floodWatermarkPrefix + "91.4%")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, includeKey), + equalTo(frozenFloodWatermarkPrefix + "91.5%") + ); + + // Even for 1000TB, the watermarks apply since they are set (the default max headrooms do not apply) + assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowWatermarkPrefix + "91.2%")); + assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highWatermarkPrefix + "91.3%")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodWatermarkPrefix + "91.4%")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey), + equalTo(frozenFloodWatermarkPrefix + "91.5%") + ); diskThresholdSettings = new DiskThresholdSettings( Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "1GB") .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "10MB") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "1B") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), "2B") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), "1B") + // Max headroom values should be ignored since the watermark values are set to absolute values + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "100mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "50mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "10mb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "10mb") .build(), clusterSettings ); - assertThat(diskThresholdSettings.describeLowThreshold(), equalTo("1gb")); - assertThat(diskThresholdSettings.describeHighThreshold(), equalTo("10mb")); - assertThat(diskThresholdSettings.describeFloodStageThreshold(), equalTo("1b")); + assertThat(diskThresholdSettings.describeLowThreshold(hundredBytes, includeKey), equalTo(lowWatermarkPrefix + "1gb")); + assertThat(diskThresholdSettings.describeHighThreshold(hundredBytes, includeKey), equalTo(highWatermarkPrefix + "10mb")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(hundredBytes, includeKey), equalTo(floodWatermarkPrefix + "2b")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, includeKey), + equalTo(frozenFloodWatermarkPrefix + "1b") + ); + + // Even for 1000TB, the watermarks apply since they are set to absolute values (max headroom values should be ignored) + 
assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowWatermarkPrefix + "1gb")); + assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highWatermarkPrefix + "10mb")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodWatermarkPrefix + "2b")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey), + equalTo(frozenFloodWatermarkPrefix + "1b") + ); + + // Test a mixture of percentages and max headroom values + diskThresholdSettings = new DiskThresholdSettings( + Settings.builder() + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), + randomBoolean() ? "31.2%" : "0.312" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), + randomBoolean() ? "31.3%" : "0.313" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), + randomBoolean() ? "31.4%" : "0.314" + ) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_WATERMARK_SETTING.getKey(), + randomBoolean() ? "31.5%" : "0.315" + ) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), "100gb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), "50gb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_MAX_HEADROOM_SETTING.getKey(), "10gb") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_FROZEN_MAX_HEADROOM_SETTING.getKey(), "10gb") + .build(), + clusterSettings + ); + + assertThat(diskThresholdSettings.describeLowThreshold(hundredBytes, includeKey), equalTo(lowWatermarkPrefix + "31.2%")); + assertThat(diskThresholdSettings.describeHighThreshold(hundredBytes, includeKey), equalTo(highWatermarkPrefix + "31.3%")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(hundredBytes, includeKey), equalTo(floodWatermarkPrefix + "31.4%")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(hundredBytes, includeKey), + equalTo(frozenFloodWatermarkPrefix + "31.5%") + ); + + assertThat(diskThresholdSettings.describeLowThreshold(thousandTb, includeKey), equalTo(lowMaxHeadroomPrefix + "100gb")); + assertThat(diskThresholdSettings.describeHighThreshold(thousandTb, includeKey), equalTo(highMaxHeadroomPrefix + "50gb")); + assertThat(diskThresholdSettings.describeFloodStageThreshold(thousandTb, includeKey), equalTo(floodMaxHeadroomPrefix + "10gb")); + assertThat( + diskThresholdSettings.describeFrozenFloodStageThreshold(thousandTb, includeKey), + equalTo(frozenFloodMaxHeadroomPrefix + "10gb") + ); + } + + public void testDescriptionsWithKeys() { + doTestDescriptions(true); + } + + public void testDescriptionsWithoutKeys() { + doTestDescriptions(false); } } diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java index 8793f6f9c63e5..636d78bbb2695 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderTests.java @@ -45,6 +45,7 @@ import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Setting; import 
org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; import org.elasticsearch.repositories.IndexId; @@ -83,22 +84,41 @@ DiskThresholdDecider makeDecider(Settings settings) { return new DiskThresholdDecider(settings, new ClusterSettings(settings, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS)); } - public void testDiskThreshold() { + private void doTestDiskThreshold(boolean testMaxHeadroom) { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(150).toString() : "-1") + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(100).toString() : "-1") + ) .build(); Map usages = new HashMap<>(); - usages.put("node1", new DiskUsage("node1", "node1", "/dev/null", 100, 10)); // 90% used - usages.put("node2", new DiskUsage("node2", "node2", "/dev/null", 100, 35)); // 65% used - usages.put("node3", new DiskUsage("node3", "node3", "/dev/null", 100, 60)); // 40% used - usages.put("node4", new DiskUsage("node4", "node4", "/dev/null", 100, 80)); // 20% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + final long exactFreeSpaceForHighWatermark = testMaxHeadroom ? ByteSizeValue.ofGb(100).getBytes() : 10; + usages.put("node1", new DiskUsage("node1", "node1", "/dev/null", totalBytes, exactFreeSpaceForHighWatermark)); + usages.put( + "node2", + new DiskUsage("node2", "node2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(350).getBytes() : 35) + ); + usages.put( + "node3", + new DiskUsage("node3", "node3", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(600).getBytes() : 60) + ); + usages.put( + "node4", + new DiskUsage("node4", "node4", "/dev/null", totalBytes, testMaxHeadroom ? 
ByteSizeValue.ofGb(800).getBytes() : 80) + ); Map shardSizes = new HashMap<>(); - shardSizes.put("[test][0][p]", 10L); // 10 bytes - shardSizes.put("[test][0][r]", 10L); + shardSizes.put("[test][0][p]", exactFreeSpaceForHighWatermark); + shardSizes.put("[test][0][r]", exactFreeSpaceForHighWatermark); final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes); ClusterSettings clusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); @@ -177,14 +197,25 @@ public void testDiskThreshold() { logger.info("--> changing decider settings"); - // Set the low threshold to 60 instead of 70 - // Set the high threshold to 70 instead of 80 - // node2 now should not have new shards allocated to it, but shards can remain - diskSettings = Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.7) - .build(); + if (testMaxHeadroom) { + // Set the low max headroom to 300GB + // Set the high max headroom to 200GB + // node2 (with 250GB free space) now should not have new shards allocated to it, and shards cannot remain + diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(300)) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(200)) + .build(); + } else { + // Set the low threshold to 60 instead of 70 + // Set the high threshold to 70 instead of 80 + // node2 (with 75% used space) now should not have new shards allocated to it, but shards can remain + diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%") + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.7) + .build(); + } deciders = new AllocationDeciders( new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))) @@ -209,14 +240,27 @@ public void testDiskThreshold() { logger.info("--> changing settings again"); - // Set the low threshold to 50 instead of 60 - // Set the high threshold to 60 instead of 70 - // node2 now should not have new shards allocated to it, and shards cannot remain - diskSettings = Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.5) - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.6) - .build(); + if (testMaxHeadroom) { + // Set the low max headroom to 500GB + // Set the high max headroom to 400GB + // node2 (with 250GB free space) now should not have new shards allocated to it, and shards cannot remain + // Note that node3 (with 500GB free space) should not receive the shard so it does not get over the high threshold + diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + 
.put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(500)) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), ByteSizeValue.ofGb(400)) + .build(); + } else { + // Set the low threshold to 50 instead of 60 + // Set the high threshold to 60 instead of 70 + // node2 (with 75 used) now should not have new shards allocated to it, and shards cannot remain + // Note that node3 (with 50% used space) should not receive the shard so it does not get over the high threshold + diskSettings = Settings.builder() + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.5) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.6) + .build(); + } deciders = new AllocationDeciders( new HashSet<>(Arrays.asList(new SameShardAllocationDecider(Settings.EMPTY, clusterSettings), makeDecider(diskSettings))) @@ -261,6 +305,14 @@ public void testDiskThreshold() { assertThat(clusterState.getRoutingNodes().node("node4").size(), equalTo(1)); } + public void testDiskThresholdWithPercentages() { + doTestDiskThreshold(false); + } + + public void testDiskThresholdWithMaxHeadroom() { + doTestDiskThreshold(true); + } + public void testDiskThresholdWithAbsoluteSizes() { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) @@ -505,18 +557,36 @@ public void testDiskThresholdWithAbsoluteSizes() { assertThat(clusterState.getRoutingNodes().node("node5").size(), equalTo(1)); } - public void testDiskThresholdWithShardSizes() { + private void doTestDiskThresholdWithShardSizes(boolean testMaxHeadroom) { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(150).toString() : "-1") + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "71%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(149).toString() : "-1") + ) .build(); Map usages = new HashMap<>(); - usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 31)); // 69% used - usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 1)); // 99% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + // below but close to low watermark + usages.put( + "node1", + new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(151).getBytes() : 31) + ); + // almost fully used + usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1)); - final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, Map.of("[test][0][p]", 10L)); + final ClusterInfo clusterInfo = new DevNullClusterInfo( + usages, + usages, + Map.of("[test][0][p]", testMaxHeadroom ? 
ByteSizeValue.ofGb(10).getBytes() : 10L) + ); AllocationDeciders deciders = new AllocationDeciders( new HashSet<>( @@ -554,11 +624,9 @@ public void testDiskThresholdWithShardSizes() { .routingTable(routingTable) .build(); logger.info("--> adding node1"); + // node2 is added because DiskThresholdDecider automatically ignore single-node clusters clusterState = ClusterState.builder(clusterState) - .nodes( - DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider - // automatically ignore single-node clusters - ) + .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2"))) .build(); routingTable = strategy.reroute(clusterState, "reroute").routingTable(); clusterState = ClusterState.builder(clusterState).routingTable(routingTable).build(); @@ -573,6 +641,14 @@ public void testDiskThresholdWithShardSizes() { assertThat(shardsWithState(clusterState.getRoutingNodes(), STARTED).size(), equalTo(0)); } + public void testDiskThresholdWithShardSizesWithPercentages() { + doTestDiskThresholdWithShardSizes(false); + } + + public void testDiskThresholdWithShardSizesWithMaxHeadroom() { + doTestDiskThresholdWithShardSizes(true); + } + public void testUnknownDiskUsage() { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) @@ -663,37 +739,41 @@ public void testAverageUsage() { assertThat(node1Usage.getFreeBytes(), equalTo(25L)); } - public void testFreeDiskPercentageAfterShardAssigned() { - DiskThresholdDecider decider = makeDecider(Settings.EMPTY); - - Map usages = new HashMap<>(); - usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 50)); // 50% used - usages.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 0)); // 100% used - - Double after = DiskThresholdDecider.freeDiskPercentageAfterShardAssigned( - new DiskThresholdDecider.DiskUsageWithRelocations(new DiskUsage("node2", "n2", "/dev/null", 100, 30), 0L), - 11L - ); - assertThat(after, equalTo(19.0)); - } - - public void testShardRelocationsTakenIntoAccount() { + private void doTestShardRelocationsTakenIntoAccount(boolean testMaxHeadroom) { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), 0.7) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(100).toString() : "-1") + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), 0.8) + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(50).toString() : "-1") + ) .build(); Map usages = new HashMap<>(); - usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 40)); // 60% used - usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 40)); // 60% used - usages.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 40)); // 60% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + usages.put( + "node1", + new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(110).getBytes() : 40) + ); + usages.put( + "node2", + new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? 
ByteSizeValue.ofGb(110).getBytes() : 40) + ); + usages.put( + "node3", + new DiskUsage("node3", "n3", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(110).getBytes() : 40) + ); Map shardSizes = new HashMap<>(); - shardSizes.put("[test][0][p]", 14L); // 14 bytes - shardSizes.put("[test][0][r]", 14L); - shardSizes.put("[test2][0][p]", 1L); // 1 bytes - shardSizes.put("[test2][0][r]", 1L); + shardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L); + shardSizes.put("[test][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L); + shardSizes.put("[test2][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1L); + shardSizes.put("[test2][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(1).getBytes() : 1L); final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes); DiskThresholdDecider decider = makeDecider(diskSettings); @@ -766,15 +846,21 @@ public void testShardRelocationsTakenIntoAccount() { } Map overfullUsages = new HashMap<>(); - overfullUsages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 40)); // 60% used - overfullUsages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 40)); // 60% used - overfullUsages.put("node3", new DiskUsage("node3", "n3", "/dev/null", 100, 0)); // 100% used + overfullUsages.put( + "node1", + new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(110).getBytes() : 40) + ); + overfullUsages.put( + "node2", + new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(110).getBytes() : 40) + ); + overfullUsages.put("node3", new DiskUsage("node3", "n3", "/dev/null", totalBytes, 0)); // 100% used Map largerShardSizes = new HashMap<>(); - largerShardSizes.put("[test][0][p]", 14L); - largerShardSizes.put("[test][0][r]", 14L); - largerShardSizes.put("[test2][0][p]", 2L); - largerShardSizes.put("[test2][0][r]", 2L); + largerShardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L); + largerShardSizes.put("[test][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(14).getBytes() : 14L); + largerShardSizes.put("[test2][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(2).getBytes() : 2L); + largerShardSizes.put("[test2][0][r]", testMaxHeadroom ? ByteSizeValue.ofGb(2).getBytes() : 2L); final ClusterInfo overfullClusterInfo = new DevNullClusterInfo(overfullUsages, overfullUsages, largerShardSizes); @@ -788,9 +874,12 @@ public void testShardRelocationsTakenIntoAccount() { expectThrows(IllegalArgumentException.class, () -> strategy.reroute(clusterStateThatRejectsCommands, cmds, false, false)) .getMessage(), containsString( - "the node is above the low watermark cluster setting " - + "[cluster.routing.allocation.disk.watermark.low=0.7], using more disk space than the maximum " - + "allowed [70.0%], actual free: [26.0%]" + testMaxHeadroom + ? 
"the node is above the low watermark cluster setting " + + "[cluster.routing.allocation.disk.watermark.low.max_headroom=100gb], " + + "having less than the minimum required [100gb] free space, actual free: [96gb], actual used: [99%]" + : "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=70%], " + + "having less than the minimum required [30b] free space, actual free: [26b], actual used: [74%]" ) ); @@ -836,7 +925,10 @@ public void testShardRelocationsTakenIntoAccount() { shardSizes, Map.of( new ClusterInfo.NodeAndPath("node1", "/dev/null"), - new ClusterInfo.ReservedSpace.Builder().add(new ShardId("", "", 0), between(51, 200)).build() + new ClusterInfo.ReservedSpace.Builder().add( + new ShardId("", "", 0), + testMaxHeadroom ? ByteSizeValue.ofGb(between(150, 200)).getBytes() : between(51, 200) + ).build() ) ) ); @@ -850,22 +942,44 @@ public void testShardRelocationsTakenIntoAccount() { } } - public void testCanRemainWithShardRelocatingAway() { + public void testShardRelocationsTakenIntoAccountWithPercentages() { + doTestShardRelocationsTakenIntoAccount(false); + } + + public void testShardRelocationsTakenIntoAccountWithMaxHeadroom() { + doTestShardRelocationsTakenIntoAccount(true); + } + + private void doTestCanRemainWithShardRelocatingAway(boolean testMaxHeadroom) { Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(100).toString() : "-1") + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(50).toString() : "-1") + ) .build(); - // We have an index with 2 primary shards each taking 40 bytes. Each node has 100 bytes available Map usages = new HashMap<>(); - usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 20)); // 80% used - usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 100)); // 0% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + usages.put( + "node1", + new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(40).getBytes() : 20) + ); + usages.put( + "node2", + new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100) + ); Map shardSizes = new HashMap<>(); - shardSizes.put("[test][0][p]", 40L); - shardSizes.put("[test][1][p]", 40L); - shardSizes.put("[foo][0][p]", 10L); + shardSizes.put("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(4980).getBytes() : 40L); + shardSizes.put("[test][1][p]", testMaxHeadroom ? ByteSizeValue.ofGb(4980).getBytes() : 40L); + shardSizes.put("[foo][0][p]", testMaxHeadroom ? 
ByteSizeValue.ofGb(10).getBytes() : 10L); final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes); @@ -929,9 +1043,13 @@ public void testCanRemainWithShardRelocatingAway() { assertThat( decision.getExplanation(), containsString( - "the shard cannot remain on this node because it is above the high watermark cluster setting " - + "[cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30.0%] free disk on node, " - + "actual free: [20.0%]" + testMaxHeadroom + ? "the shard cannot remain on this node because it is above the high watermark cluster setting " + + "[cluster.routing.allocation.disk.watermark.high.max_headroom=50gb] and there is less than the required [50gb] " + + "free space on node, actual free: [40gb], actual used: [99.6%]" + : "the shard cannot remain on this node because it is above the high watermark cluster setting " + + "[cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space " + + "on node, actual free: [20b], actual used: [80%]" ) ); @@ -964,7 +1082,9 @@ public void testCanRemainWithShardRelocatingAway() { ); assertThat(decision.type(), equalTo(Decision.Type.YES)); assertEquals( - "there is enough disk on this node for the shard to remain, free: [60b]", + testMaxHeadroom + ? "there is enough disk on this node for the shard to remain, free: [4.9tb]" + : "there is enough disk on this node for the shard to remain, free: [60b]", ((Decision.Single) decision).getExplanation() ); decision = diskThresholdDecider.canAllocate(fooRouting, firstRoutingNode, routingAllocation); @@ -973,16 +1093,25 @@ public void testCanRemainWithShardRelocatingAway() { assertThat( ((Decision.Single) decision).getExplanation(), containsString( - "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=70%], using " - + "more disk space than the maximum allowed [70.0%], actual free: [20.0%]" + testMaxHeadroom + ? "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark" + + ".high.max_headroom=50gb], having less than the minimum required [50gb] free space, actual free: " + + "[40gb], actual used: [99.6%]" + : "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=70%], " + + "having less than the minimum required [30b] free space, actual free: [20b], actual used: [80%]" ) ); } else { assertThat( ((Decision.Single) decision).getExplanation(), containsString( - "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=60%], using more " - + "disk space than the maximum allowed [60.0%], actual free: [20.0%]" + testMaxHeadroom + ? "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low" + + ".max_headroom=100gb], having less than the minimum required [100gb] free space, actual free: [40gb], actual " + + "used: [99.6%]" + : "the node is above the low watermark cluster setting [cluster.routing.allocation.disk.watermark.low=60%], " + + "having less than the minimum required [40b] free space, actual free: [20b], actual used: [80%]" + ) ); } @@ -1011,7 +1140,7 @@ public void testCanRemainWithShardRelocatingAway() { EmptySnapshotsInfoService.INSTANCE ); // Ensure that the reroute call doesn't alter the routing table, since the first primary is relocating away - // and therefor we will have sufficient disk space on node1. + // and therefore we will have sufficient disk space on node1. 
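// Illustrative note (not part of the patch itself): the "actual used" figures asserted in this test
// follow directly from the disk usages configured above. In the max-headroom variant node1 has 40gb
// free out of a 10000gb disk, so used = (10000 - 40) / 10000 = 99.6%; in the percentage variant it
// has 20b free out of 100b, so used = 80%.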
ClusterState result = strategy.reroute(clusterState, "reroute"); assertThat(result, equalTo(clusterState)); assertThat(result.routingTable().index("test").shard(0).primaryShard().state(), equalTo(STARTED)); @@ -1022,20 +1151,40 @@ public void testCanRemainWithShardRelocatingAway() { assertThat(result.routingTable().index("test").shard(1).primaryShard().relocatingNodeId(), equalTo("node2")); } - public void testWatermarksEnabledForSingleDataNode() { + public void testCanRemainWithShardRelocatingAwayWithPercentages() { + doTestCanRemainWithShardRelocatingAway(false); + } + + public void testCanRemainWithShardRelocatingAwayWithMaxHeadroom() { + doTestCanRemainWithShardRelocatingAway(true); + } + + private void doTestWatermarksEnabledForSingleDataNode(boolean testMaxHeadroom) { Settings.Builder builder = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "60%") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%"); + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(100).toString() : "-1") + ) + .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "70%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(50).toString() : "-1") + ); if (randomBoolean()) { builder.put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), true); } Settings diskSettings = builder.build(); - Map usages = Map.of("data", new DiskUsage("data", "data", "/dev/null", 100, 20)); // 80% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + Map usages = Map.of( + "data", + new DiskUsage("data", "data", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(40).getBytes() : 20) + ); - // We have an index with 1 primary shard, taking 40 bytes. The single data node has only 20 bytes free. - Map shardSizes = Map.of("[test][0][p]", 40L); + // We have an index with 1 primary shard, taking more bytes than the free space of the single data node. + Map shardSizes = Map.of("[test][0][p]", testMaxHeadroom ? ByteSizeValue.ofGb(60).getBytes() : 40L); final ClusterInfo clusterInfo = new DevNullClusterInfo(usages, usages, shardSizes); DiskThresholdDecider diskThresholdDecider = makeDecider(diskSettings); @@ -1129,9 +1278,13 @@ public void testWatermarksEnabledForSingleDataNode() { assertThat( decision.getExplanation(), containsString( - "the shard cannot remain on this node because it is above the high watermark cluster setting" - + " [cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30.0%] free disk on node," - + " actual free: [20.0%]" + testMaxHeadroom + ? 
"the shard cannot remain on this node because it is above the high watermark cluster setting [cluster" + + ".routing.allocation.disk.watermark.high.max_headroom=50gb] and there is less than the required [50gb] free " + + "space on node, actual free: [40gb], actual used: [99.6%]" + : "the shard cannot remain on this node because it is above the high watermark cluster setting" + + " [cluster.routing.allocation.disk.watermark.high=70%] and there is less than the required [30b] free space " + + "on node, actual free: [20b], actual used: [80%]" ) ); @@ -1140,6 +1293,14 @@ public void testWatermarksEnabledForSingleDataNode() { } } + public void testWatermarksEnabledForSingleDataNodeWithPercentages() { + doTestWatermarksEnabledForSingleDataNode(false); + } + + public void testWatermarksEnabledForSingleDataNodeWithMaxHeadroom() { + doTestWatermarksEnabledForSingleDataNode(true); + } + public void testSingleDataNodeDeprecationWarning() { Settings settings = Settings.builder().put(DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE.getKey(), false).build(); @@ -1159,19 +1320,33 @@ public void testSingleDataNodeDeprecationWarning() { assertSettingDeprecationsAndWarnings(new Setting[] { DiskThresholdDecider.ENABLE_FOR_SINGLE_DATA_NODE }); } - public void testDiskThresholdWithSnapshotShardSizes() { - final long shardSizeInBytes = randomBoolean() ? 10L : 50L; + private void doTestDiskThresholdWithSnapshotShardSizes(boolean testMaxHeadroom) { + final long shardSizeInBytes = randomBoolean() + ? (testMaxHeadroom ? ByteSizeValue.ofGb(49).getBytes() : 10L) // fits free space of node1 + : (testMaxHeadroom ? ByteSizeValue.ofGb(300).getBytes() : 50L); // does not fit free space of node1 logger.info("--> using shard size [{}]", shardSizeInBytes); final Settings diskSettings = Settings.builder() .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_THRESHOLD_ENABLED_SETTING.getKey(), true) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), "90%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(100).toString() : "-1") + ) .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), "95%") + .put( + DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_MAX_HEADROOM_SETTING.getKey(), + (testMaxHeadroom ? ByteSizeValue.ofGb(20).toString() : "-1") + ) .build(); Map usages = new HashMap<>(); - usages.put("node1", new DiskUsage("node1", "n1", "/dev/null", 100, 21)); // 79% used - usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", 100, 1)); // 99% used + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + usages.put( + "node1", + new DiskUsage("node1", "n1", "/dev/null", totalBytes, testMaxHeadroom ? ByteSizeValue.ofGb(150).getBytes() : 21) + ); + usages.put("node2", new DiskUsage("node2", "n2", "/dev/null", totalBytes, testMaxHeadroom ? 
ByteSizeValue.ofGb(1).getBytes() : 1)); final ClusterInfoService clusterInfoService = () -> new DevNullClusterInfo(usages, usages, Map.of()); final AllocationDeciders deciders = new AllocationDeciders( @@ -1219,9 +1394,8 @@ public void testDiskThresholdWithSnapshotShardSizes() { .metadata(metadata) .routingTable(routingTable) .putCustom(RestoreInProgress.TYPE, restores.build()) - .nodes( - DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider - // automatically ignore single-node clusters + .nodes(DiscoveryNodes.builder().add(newNode("node1")).add(newNode("node2")) // node2 is added because DiskThresholdDecider + // automatically ignore single-node clusters ) .build(); @@ -1286,6 +1460,14 @@ public void testDiskThresholdWithSnapshotShardSizes() { assertThat(shardsWithState(clusterState.getRoutingNodes(), "test", INITIALIZING, STARTED).size(), equalTo(shouldAllocate ? 1 : 0)); } + public void testDiskThresholdWithSnapshotShardSizesWithPercentages() { + doTestDiskThresholdWithSnapshotShardSizes(false); + } + + public void testDiskThresholdWithSnapshotShardSizesWithMaxHeadroom() { + doTestDiskThresholdWithSnapshotShardSizes(true); + } + public void logShardStates(ClusterState state) { RoutingNodes rn = state.getRoutingNodes(); logger.info( diff --git a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java index dd380fb98e725..ab8cd7f8c1619 100644 --- a/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java +++ b/server/src/test/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderUnitTests.java @@ -32,8 +32,10 @@ import org.elasticsearch.cluster.routing.allocation.AllocationService; import org.elasticsearch.cluster.routing.allocation.RoutingAllocation; import org.elasticsearch.cluster.routing.allocation.decider.DiskThresholdDeciderTests.DevNullClusterInfo; +import org.elasticsearch.common.Strings; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.index.Index; import org.elasticsearch.index.shard.ShardId; @@ -52,6 +54,17 @@ */ public class DiskThresholdDeciderUnitTests extends ESAllocationTestCase { + private static long getExpectedShardSize(ShardRouting shardRouting, long defaultSize, RoutingAllocation allocation) { + return DiskThresholdDecider.getExpectedShardSize( + shardRouting, + defaultSize, + allocation.clusterInfo(), + allocation.snapshotShardSizeInfo(), + allocation.metadata(), + allocation.routingTable() + ); + } + public void testCanAllocateUsesMaxAvailableSpace() { ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss); @@ -127,13 +140,13 @@ public void testCanAllocateUsesMaxAvailableSpace() { assertThat( ((Decision.Single) decision).getExplanation(), containsString( - "the node is above the high watermark cluster " - + "setting [cluster.routing.allocation.disk.watermark.high=90%], using more disk space than the maximum allowed [90.0%]" + "the node is above the high watermark cluster setting [cluster.routing.allocation.disk.watermark.high=90%], " + + "having less than the minimum required" ) ); } - public void 
testCannotAllocateDueToLackOfDiskResources() { + private void doTestCannotAllocateDueToLackOfDiskResources(boolean testMaxHeadroom) { ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss); @@ -175,12 +188,17 @@ public void testCannotAllocateDueToLackOfDiskResources() { // actual test -- after all that bloat :) - Map leastAvailableUsages = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", 100, 0)); // all full - final int freeBytes = randomIntBetween(20, 100); - Map mostAvailableUsage = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", 100, freeBytes)); + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + + Map leastAvailableUsages = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", totalBytes, 0)); // all + // full + final long freeBytes = testMaxHeadroom + ? ByteSizeValue.ofGb(randomIntBetween(100, 10000)).getBytes() + : randomLongBetween(20, totalBytes); + Map mostAvailableUsage = Map.of("node_0", new DiskUsage("node_0", "node_0", "_na_", totalBytes, freeBytes)); // way bigger than available space - final long shardSize = randomIntBetween(110, 1000); + final long shardSize = randomLongBetween(totalBytes + 10, totalBytes * 10); ClusterInfo clusterInfo = new ClusterInfo( leastAvailableUsages, mostAvailableUsage, @@ -200,22 +218,45 @@ public void testCannotAllocateDueToLackOfDiskResources() { Decision decision = decider.canAllocate(test_0, RoutingNodesHelper.routingNode("node_0", node_0), allocation); assertEquals(Decision.Type.NO, decision.type()); + double usedPercentage = 100.0 * (totalBytes - freeBytes) / totalBytes; + assertThat( decision.getExplanation(), containsString( - "allocating the shard to this node will bring the node above the high watermark cluster setting " - + "[cluster.routing.allocation.disk.watermark.high=90%] " - + "and cause it to have less than the minimum required [0b] of free space " - + "(free: [" - + freeBytes - + "b], estimated shard size: [" - + shardSize - + "b])" + testMaxHeadroom + ? 
"allocating the shard to this node will bring the node above the high watermark cluster setting " + + "[cluster.routing.allocation.disk.watermark.high.max_headroom=100gb] " + + "and cause it to have less than the minimum required [100gb] of free space " + + "(free: [" + + ByteSizeValue.ofBytes(freeBytes) + + "], used: [" + + Strings.format1Decimals(usedPercentage, "%") + + "], estimated shard size: [" + + ByteSizeValue.ofBytes(shardSize) + + "])" + : "allocating the shard to this node will bring the node above the high watermark cluster setting " + + "[cluster.routing.allocation.disk.watermark.high=90%] " + + "and cause it to have less than the minimum required [10b] of free space " + + "(free: [" + + freeBytes + + "b], used: [" + + Strings.format1Decimals(usedPercentage, "%") + + "], estimated shard size: [" + + shardSize + + "b])" ) ); } - public void testCanRemainUsesLeastAvailableSpace() { + public void testCannotAllocateDueToLackOfDiskResourcesWithPercentages() { + doTestCannotAllocateDueToLackOfDiskResources(false); + } + + public void testCannotAllocateDueToLackOfDiskResourcesWithMaxHeadroom() { + doTestCannotAllocateDueToLackOfDiskResources(true); + } + + private void doTestCanRemainUsesLeastAvailableSpace(boolean testMaxHeadroom) { ClusterSettings nss = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS); DiskThresholdDecider decider = new DiskThresholdDecider(Settings.EMPTY, nss); Map shardRoutingMap = new HashMap<>(); @@ -290,18 +331,28 @@ public void testCanRemainUsesLeastAvailableSpace() { clusterState = ClusterState.builder(clusterState).nodes(DiscoveryNodes.builder().add(node_0).add(node_1)).build(); // actual test -- after all that bloat :) + + final long totalBytes = testMaxHeadroom ? ByteSizeValue.ofGb(10000).getBytes() : 100; + final long exactFreeSpaceForHighWatermark = testMaxHeadroom ? 
ByteSizeValue.ofGb(100).getBytes() : 10; + final long exactFreeSpaceForBelowHighWatermark = exactFreeSpaceForHighWatermark - 1; + final double exactUsedSpaceForBelowHighWatermark = 100.0 * (totalBytes - exactFreeSpaceForBelowHighWatermark) / totalBytes; + final long ninetyPercentFreeSpace = (long) (totalBytes * 0.9); + Map leastAvailableUsages = new HashMap<>(); - leastAvailableUsages.put("node_0", new DiskUsage("node_0", "node_0", "/node0/least", 100, 10)); // 90% used - leastAvailableUsages.put("node_1", new DiskUsage("node_1", "node_1", "/node1/least", 100, 9)); // 91% used + leastAvailableUsages.put("node_0", new DiskUsage("node_0", "node_0", "/node0/least", totalBytes, exactFreeSpaceForHighWatermark)); + leastAvailableUsages.put( + "node_1", + new DiskUsage("node_1", "node_1", "/node1/least", totalBytes, exactFreeSpaceForBelowHighWatermark) + ); Map mostAvailableUsage = new HashMap<>(); - mostAvailableUsage.put("node_0", new DiskUsage("node_0", "node_0", "/node0/most", 100, 90)); // 10% used - mostAvailableUsage.put("node_1", new DiskUsage("node_1", "node_1", "/node1/most", 100, 90)); // 10% used + mostAvailableUsage.put("node_0", new DiskUsage("node_0", "node_0", "/node0/most", totalBytes, ninetyPercentFreeSpace)); + mostAvailableUsage.put("node_1", new DiskUsage("node_1", "node_1", "/node1/most", totalBytes, ninetyPercentFreeSpace)); Map shardSizes = new HashMap<>(); - shardSizes.put("[test][0][p]", 10L); // 10 bytes - shardSizes.put("[test][1][p]", 10L); - shardSizes.put("[test][2][p]", 10L); + shardSizes.put("[test][0][p]", exactFreeSpaceForHighWatermark); + shardSizes.put("[test][1][p]", exactFreeSpaceForHighWatermark); + shardSizes.put("[test][2][p]", exactFreeSpaceForHighWatermark); final ClusterInfo clusterInfo = new ClusterInfo( leastAvailableUsages, @@ -323,7 +374,11 @@ public void testCanRemainUsesLeastAvailableSpace() { assertEquals(Decision.Type.YES, decision.type()); assertThat( ((Decision.Single) decision).getExplanation(), - containsString("there is enough disk on this node for the shard to remain, free: [10b]") + containsString( + "there is enough disk on this node for the shard to remain, free: [" + + ByteSizeValue.ofBytes(exactFreeSpaceForHighWatermark) + + "]" + ) ); decision = decider.canRemain(indexMetadata, test_1, RoutingNodesHelper.routingNode("node_1", node_1), allocation); assertEquals(Decision.Type.NO, decision.type()); @@ -331,8 +386,16 @@ public void testCanRemainUsesLeastAvailableSpace() { ((Decision.Single) decision).getExplanation(), containsString( "the shard cannot remain on this node because it is above the high watermark cluster setting " - + "[cluster.routing.allocation.disk.watermark.high=90%] and there is less than the required [10.0%] " - + "free disk on node, actual free: [9.0%]" + + "[cluster.routing.allocation.disk.watermark.high" + + (testMaxHeadroom ? 
".max_headroom=100gb" : "=90%") + + "] and there is less than the required [" + + ByteSizeValue.ofBytes(exactFreeSpaceForHighWatermark) + + "] free space on " + + "node, actual free: [" + + ByteSizeValue.ofBytes(exactFreeSpaceForBelowHighWatermark) + + "], actual used: [" + + Strings.format1Decimals(exactUsedSpaceForBelowHighWatermark, "%") + + "]" ) ); try { @@ -363,6 +426,14 @@ public void testCanRemainUsesLeastAvailableSpace() { ); } + public void testCanRemainUsesLeastAvailableSpaceWithPercentages() { + doTestCanRemainUsesLeastAvailableSpace(false); + } + + public void testCanRemainUsesLeastAvailableSpaceWithMaxHeadroom() { + doTestCanRemainUsesLeastAvailableSpace(true); + } + public void testShardSizeAndRelocatingSize() { Map shardSizes = new HashMap<>(); shardSizes.put("[test][0][r]", 10L); @@ -617,17 +688,6 @@ public void testSizeShrinkIndex() { assertEquals(42L, getExpectedShardSize(target2, 42L, allocationWithMissingSourceIndex)); } - private static long getExpectedShardSize(ShardRouting shardRouting, long defaultSize, RoutingAllocation allocation) { - return DiskThresholdDecider.getExpectedShardSize( - shardRouting, - defaultSize, - allocation.clusterInfo(), - allocation.snapshotShardSizeInfo(), - allocation.metadata(), - allocation.routingTable() - ); - } - public void testDiskUsageWithRelocations() { assertThat( new DiskThresholdDecider.DiskUsageWithRelocations(new DiskUsage("n", "n", "/dev/null", 1000L, 1000L), 0).getFreeBytes(), diff --git a/server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java b/server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java index 8ae9c836bdf9f..2d6d0c4a06d21 100644 --- a/server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java +++ b/server/src/test/java/org/elasticsearch/health/metadata/HealthMetadataSerializationTests.java @@ -65,7 +65,9 @@ private static HealthMetadata randomHealthMetadata() { private static HealthMetadata.Disk randomDiskMetadata() { return new HealthMetadata.Disk( randomRelativeByteSizeValue(), + ByteSizeValue.ofGb(randomIntBetween(10, 999)), randomRelativeByteSizeValue(), + ByteSizeValue.ofGb(randomIntBetween(10, 999)), randomRelativeByteSizeValue(), ByteSizeValue.ofGb(randomIntBetween(10, 999)) ); @@ -81,16 +83,27 @@ private static RelativeByteSizeValue randomRelativeByteSizeValue() { static HealthMetadata.Disk mutateDiskMetadata(HealthMetadata.Disk base) { RelativeByteSizeValue highWatermark = base.highWatermark(); + ByteSizeValue highWatermarkMaxHeadRoom = base.highMaxHeadroom(); RelativeByteSizeValue floodStageWatermark = base.floodStageWatermark(); + ByteSizeValue floodStageWatermarkMaxHeadRoom = base.floodStageMaxHeadroom(); RelativeByteSizeValue floodStageWatermarkFrozen = base.frozenFloodStageWatermark(); ByteSizeValue floodStageWatermarkFrozenMaxHeadRoom = base.frozenFloodStageMaxHeadroom(); - switch (randomInt(3)) { + switch (randomInt(5)) { case 0 -> highWatermark = randomRelativeByteSizeValue(); - case 1 -> floodStageWatermark = randomRelativeByteSizeValue(); - case 2 -> floodStageWatermarkFrozen = randomRelativeByteSizeValue(); - case 3 -> ByteSizeValue.ofGb(randomIntBetween(10, 999)); + case 1 -> highWatermarkMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999)); + case 2 -> floodStageWatermark = randomRelativeByteSizeValue(); + case 3 -> floodStageWatermarkMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999)); + case 4 -> floodStageWatermarkFrozen = randomRelativeByteSizeValue(); + case 5 -> 
floodStageWatermarkFrozenMaxHeadRoom = ByteSizeValue.ofGb(randomIntBetween(10, 999));
        }
-        return new HealthMetadata.Disk(highWatermark, floodStageWatermark, floodStageWatermarkFrozen, floodStageWatermarkFrozenMaxHeadRoom);
+        return new HealthMetadata.Disk(
+            highWatermark,
+            highWatermarkMaxHeadRoom,
+            floodStageWatermark,
+            floodStageWatermarkMaxHeadRoom,
+            floodStageWatermarkFrozen,
+            floodStageWatermarkFrozenMaxHeadRoom
+        );
    }

    private HealthMetadata mutate(HealthMetadata base) {
diff --git a/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java b/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java
index 476c6b21df034..c9e24032bf37f 100644
--- a/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java
+++ b/x-pack/plugin/autoscaling/src/main/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderService.java
@@ -218,18 +218,8 @@ static Optional singleNoDecision(Decision decision, Predicate
        }
    }

-    static long nodeSizeForDataBelowLowWatermark(long bytes, DiskThresholdSettings thresholdSettings) {
-        ByteSizeValue bytesThreshold = thresholdSettings.getFreeBytesThresholdLow();
-        if (bytesThreshold.getBytes() != 0) {
-            return bytesThreshold.getBytes() + bytes;
-        } else {
-            double percentThreshold = thresholdSettings.getFreeDiskThresholdLow();
-            if (percentThreshold >= 0.0 && percentThreshold < 100.0) {
-                return (long) (100 * bytes / (100 - percentThreshold));
-            } else {
-                return bytes;
-            }
-        }
+    static long nodeSizeForDataBelowLowWatermark(long neededBytes, DiskThresholdSettings thresholdSettings) {
+        return thresholdSettings.getMinimumTotalSizeForBelowLowWatermark(ByteSizeValue.ofBytes(neededBytes)).getBytes();
    }

    // todo: move this to top level class.
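// Illustrative sketch (not part of the patch itself): the hunk above and the hunk below replace
// hand-rolled percentage/byte threshold math with DiskThresholdSettings helpers that are given the
// node's total disk size, so the new max headroom settings can be taken into account. The class and
// method below are hypothetical and only illustrate the assumed semantics: a ratio watermark implies
// a required amount of free space, a non-negative max headroom caps that requirement on large disks,
// and a headroom of -1 leaves the percentage in charge.
final class WatermarkMathSketch {
    static long requiredFreeBytes(long totalBytes, double watermarkRatio, long maxHeadroomBytes) {
        // Free space the ratio watermark alone would demand, e.g. a 0.9 watermark on 10000gb => 1000gb.
        long fromWatermark = (long) Math.ceil(totalBytes * (1.0 - watermarkRatio));
        // A max headroom (if set) caps that demand, e.g. 100gb instead of 1000gb on a large disk.
        return maxHeadroomBytes >= 0 ? Math.min(fromWatermark, maxHeadroomBytes) : fromWatermark;
    }
}
// With a 0.9 watermark and a 100gb headroom this yields 10 bytes for a 100-byte disk but 100gb for a
// 10000gb disk, which is the pattern the tests in this patch assert for both small and huge totals.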
@@ -547,22 +537,12 @@ long unmovableSize(String nodeId, Collection shards) {
            return 0;
        }

-        long threshold = Math.max(
-            diskThresholdSettings.getFreeBytesThresholdHigh().getBytes(),
-            thresholdFromPercentage(diskThresholdSettings.getFreeDiskThresholdHigh(), diskUsage)
-        );
+        long threshold = diskThresholdSettings.getFreeBytesThresholdHighStage(ByteSizeValue.ofBytes(diskUsage.getTotalBytes()))
+            .getBytes();
        long missing = threshold - diskUsage.getFreeBytes();
        return Math.max(missing, shards.stream().mapToLong(this::sizeOf).min().orElseThrow());
    }

-    private long thresholdFromPercentage(Double percentage, DiskUsage diskUsage) {
-        if (percentage == null) {
-            return 0L;
-        }
-
-        return (long) Math.ceil(diskUsage.getTotalBytes() * percentage / 100);
-    }
-
    Stream nodesInTier(RoutingNodes routingNodes) {
        return nodeIds.stream().map(routingNodes::node);
    }
diff --git a/x-pack/plugin/autoscaling/src/test/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderServiceTests.java b/x-pack/plugin/autoscaling/src/test/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderServiceTests.java
index 353fa8d789c84..ccfaa5131fb0d 100644
--- a/x-pack/plugin/autoscaling/src/test/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderServiceTests.java
+++ b/x-pack/plugin/autoscaling/src/test/java/org/elasticsearch/xpack/autoscaling/storage/ReactiveStorageDeciderServiceTests.java
@@ -355,39 +355,6 @@ public void testNodeLockSplitClone() {
        assertThat(createAllocationState(shardSizes, clusterState).maxNodeLockedSize(), equalTo(sourceSize * 2));
    }

-    public void testNodeSizeForDataBelowLowWatermark() {
-        final ClusterSettings emptyClusterSettings = new ClusterSettings(Settings.EMPTY, ClusterSettings.BUILT_IN_CLUSTER_SETTINGS);
-        final DiskThresholdSettings defaultSettings = new DiskThresholdSettings(Settings.EMPTY, emptyClusterSettings);
-        final long factor = between(1, 1000);
-        assertThat(ReactiveStorageDeciderService.nodeSizeForDataBelowLowWatermark(85 * factor, defaultSettings), equalTo(100L * factor));
-
-        // to make it easy, stay below high watermark.
- final long percentage = between(1, 89); - final DiskThresholdSettings relativeSettings = new DiskThresholdSettings( - Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), percentage + "%") - .build(), - emptyClusterSettings - ); - assertThat( - ReactiveStorageDeciderService.nodeSizeForDataBelowLowWatermark(percentage * factor, relativeSettings), - equalTo(100L * factor) - ); - - final long absolute = between(1, 1000); - final DiskThresholdSettings absoluteSettings = new DiskThresholdSettings( - Settings.builder() - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_LOW_DISK_WATERMARK_SETTING.getKey(), absolute + "b") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_HIGH_DISK_WATERMARK_SETTING.getKey(), absolute + "b") - .put(DiskThresholdSettings.CLUSTER_ROUTING_ALLOCATION_DISK_FLOOD_STAGE_WATERMARK_SETTING.getKey(), absolute + "b") - .build(), - emptyClusterSettings - ); - - long needed = between(0, 1000); - assertThat(ReactiveStorageDeciderService.nodeSizeForDataBelowLowWatermark(needed, absoluteSettings), equalTo(needed + absolute)); - } - private Settings.Builder addRandomNodeLockUsingAttributes(Settings.Builder settings) { String setting = randomFrom( IndexMetadata.INDEX_ROUTING_REQUIRE_GROUP_SETTING, diff --git a/x-pack/plugin/deprecation/src/main/java/org/elasticsearch/xpack/deprecation/TransportNodeDeprecationCheckAction.java b/x-pack/plugin/deprecation/src/main/java/org/elasticsearch/xpack/deprecation/TransportNodeDeprecationCheckAction.java index 22cb856cccec3..32e936346a589 100644 --- a/x-pack/plugin/deprecation/src/main/java/org/elasticsearch/xpack/deprecation/TransportNodeDeprecationCheckAction.java +++ b/x-pack/plugin/deprecation/src/main/java/org/elasticsearch/xpack/deprecation/TransportNodeDeprecationCheckAction.java @@ -24,6 +24,7 @@ import org.elasticsearch.common.regex.Regex; import org.elasticsearch.common.settings.ClusterSettings; import org.elasticsearch.common.settings.Settings; +import org.elasticsearch.common.unit.ByteSizeValue; import org.elasticsearch.license.XPackLicenseState; import org.elasticsearch.plugins.PluginsService; import org.elasticsearch.tasks.Task; @@ -158,9 +159,9 @@ static DeprecationIssue checkDiskLowWatermark( DiskUsage usage = clusterInfo.getNodeMostAvailableDiskUsages().get(nodeId); if (usage != null) { long freeBytes = usage.getFreeBytes(); - double freeDiskPercentage = usage.getFreeDiskAsPercentage(); - if (exceedsLowWatermark(nodeSettings, clusterSettings, freeBytes, freeDiskPercentage) - || exceedsLowWatermark(dynamicSettings, clusterSettings, freeBytes, freeDiskPercentage)) { + long totalBytes = usage.getTotalBytes(); + if (exceedsLowWatermark(nodeSettings, clusterSettings, freeBytes, totalBytes) + || exceedsLowWatermark(dynamicSettings, clusterSettings, freeBytes, totalBytes)) { return new DeprecationIssue( DeprecationIssue.Level.CRITICAL, "Disk usage exceeds low watermark", @@ -179,15 +180,9 @@ static DeprecationIssue checkDiskLowWatermark( return null; } - private static boolean exceedsLowWatermark( - Settings settingsToCheck, - ClusterSettings clusterSettings, - long freeBytes, - double freeDiskPercentage - ) { + private static boolean exceedsLowWatermark(Settings settingsToCheck, ClusterSettings clusterSettings, long freeBytes, long totalBytes) { DiskThresholdSettings diskThresholdSettings = new DiskThresholdSettings(settingsToCheck, clusterSettings); - if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLow().getBytes() - || freeDiskPercentage < 
diskThresholdSettings.getFreeDiskThresholdLow()) { + if (freeBytes < diskThresholdSettings.getFreeBytesThresholdLowStage(ByteSizeValue.ofBytes(totalBytes)).getBytes()) { return true; } return false; diff --git a/x-pack/plugin/searchable-snapshots/src/test/java/org/elasticsearch/xpack/searchablesnapshots/cache/shared/FrozenCacheServiceTests.java b/x-pack/plugin/searchable-snapshots/src/test/java/org/elasticsearch/xpack/searchablesnapshots/cache/shared/FrozenCacheServiceTests.java index 56ba6093551c4..34b09ec0b1b05 100644 --- a/x-pack/plugin/searchable-snapshots/src/test/java/org/elasticsearch/xpack/searchablesnapshots/cache/shared/FrozenCacheServiceTests.java +++ b/x-pack/plugin/searchable-snapshots/src/test/java/org/elasticsearch/xpack/searchablesnapshots/cache/shared/FrozenCacheServiceTests.java @@ -210,7 +210,9 @@ public void testDecay() throws IOException { } public void testCacheSizeRejectedOnNonFrozenNodes() { - String cacheSize = randomBoolean() ? new ByteSizeValue(size(500)).getStringRep() : new RatioValue(between(1, 100)).toString(); + String cacheSize = randomBoolean() + ? new ByteSizeValue(size(500)).getStringRep() + : RatioValue.formatNoTrailingZerosPercent(new RatioValue(between(1, 100)).getAsPercent()).toString(); final Settings settings = Settings.builder() .put(FrozenCacheService.SHARED_CACHE_SIZE_SETTING.getKey(), cacheSize) .put(FrozenCacheService.SHARED_CACHE_REGION_SIZE_SETTING.getKey(), new ByteSizeValue(size(100)).getStringRep())