From 581490d83c6b572834aff6056f5e17a191110a31 Mon Sep 17 00:00:00 2001
From: Tanguy Leroux <tlrx.dev@gmail.com>
Date: Wed, 7 Oct 2020 11:27:25 +0200
Subject: [PATCH] Fix DiskThresholdDeciderIT.testHighWatermarkNotExceeded
 (#63112) (#63385)

The first refreshDiskUsage() triggers a ClusterInfo update, which in turn
calls listeners like DiskThresholdMonitor. That listener triggers a
reroute as expected and sets an internal checkInProgress flag before
submitting a cluster state update to relocate shards (the internal flag
is reset once the cluster state update is processed).

In the test I suspect that the second refreshDiskUsage() may complete
before DiskThresholdMonitor's internal flag is set back to its initial
state, resulting in the second ClusterInfo update being ignored and a
message like "[node_t0] skipping monitor as a check is already in progress"
being logged. Adding another wait for languid events to be processed
before executing the second refreshDiskUsage() should help here.

Closes #62326
---
 .../decider/DiskThresholdDeciderIT.java       | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java
index 146eec6155161..4f7efbdfe87a5 100644
--- a/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java
+++ b/server/src/internalClusterTest/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java
@@ -136,11 +136,11 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Collections.singletonList(InternalSettingsPlugin.class);
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62326")
     public void testHighWatermarkNotExceeded() throws Exception {
         internalCluster().startMasterOnlyNode();
         internalCluster().startDataOnlyNode();
         final String dataNodeName = internalCluster().startDataOnlyNode();
+        ensureStableCluster(3);
 
         final InternalClusterInfoService clusterInfoService
                 = (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class);
@@ -276,6 +276,13 @@ private long createReasonableSizedShards(final String indexName) throws Interrup
     }
 
     private void refreshDiskUsage() {
+        assertFalse(client().admin().cluster().prepareHealth()
+            .setWaitForEvents(Priority.LANGUID)
+            .setWaitForNoRelocatingShards(true)
+            .setWaitForNoInitializingShards(true)
+            .get()
+            .isTimedOut());
+
         final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class);
         ((InternalClusterInfoService) clusterInfoService).refresh();
         // if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute
@@ -284,9 +291,13 @@ private void refreshDiskUsage() {
             .allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) {
             assertAcked(client().admin().cluster().prepareReroute());
         }
-        assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
-                .setWaitForNoRelocatingShards(true)
-                .setWaitForNoInitializingShards(true).get().isTimedOut());
+
+        assertFalse(client().admin().cluster().prepareHealth()
+            .setWaitForEvents(Priority.LANGUID)
+            .setWaitForNoRelocatingShards(true)
+            .setWaitForNoInitializingShards(true)
+            .get()
+            .isTimedOut());
     }
 
     private static class TestFileStore extends FilterFileStore {