From 581490d83c6b572834aff6056f5e17a191110a31 Mon Sep 17 00:00:00 2001 From: Tanguy Leroux Date: Wed, 7 Oct 2020 11:27:25 +0200 Subject: [PATCH] Fix DiskThresholdDeciderIT.testHighWatermarkNotExceeded (#63112) (#63385) The first refreshDiskUsage() triggers a ClusterInfo update, which in turn calls listeners like DiskThresholdMonitor. The monitor triggers a reroute as expected and sets an internal checkInProgress flag before submitting a cluster state update to relocate shards (the internal flag is reset once the cluster state update is processed). In the test, I suspect that the second refreshDiskUsage() may complete before DiskThresholdMonitor's internal flag is set back to its initial state, resulting in the second ClusterInfo update being ignored and a message like "[node_t0] skipping monitor as a check is already in progress" being logged. Adding another wait for languid events to be processed before executing the second refreshDiskUsage() should help here. Closes #62326 --- .../decider/DiskThresholdDeciderIT.java | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java index 146eec6155161..4f7efbdfe87a5 100644 --- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java +++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java @@ -136,11 +136,11 @@ protected Collection> nodePlugins() { return Collections.singletonList(InternalSettingsPlugin.class); } - @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62326") public void testHighWatermarkNotExceeded() throws Exception { internalCluster().startMasterOnlyNode(); internalCluster().startDataOnlyNode(); final 
String dataNodeName = internalCluster().startDataOnlyNode(); + ensureStableCluster(3); final InternalClusterInfoService clusterInfoService = (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class); @@ -276,6 +276,13 @@ private long createReasonableSizedShards(final String indexName) throws Interrup } private void refreshDiskUsage() { + assertFalse(client().admin().cluster().prepareHealth() + .setWaitForEvents(Priority.LANGUID) + .setWaitForNoRelocatingShards(true) + .setWaitForNoInitializingShards(true) + .get() + .isTimedOut()); + final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class); ((InternalClusterInfoService) clusterInfoService).refresh(); // if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute @@ -284,9 +291,13 @@ private void refreshDiskUsage() { .allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) { assertAcked(client().admin().cluster().prepareReroute()); } - assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID) - .setWaitForNoRelocatingShards(true) - .setWaitForNoInitializingShards(true).get().isTimedOut()); + + assertFalse(client().admin().cluster().prepareHealth() + .setWaitForEvents(Priority.LANGUID) + .setWaitForNoRelocatingShards(true) + .setWaitForNoInitializingShards(true) + .get() + .isTimedOut()); } private static class TestFileStore extends FilterFileStore {