Fix DiskThresholdDeciderIT.testHighWatermarkNotExceeded (#63112) (#63385)

The first refreshDiskUsage() triggers a ClusterInfo update, which in turn calls listeners like DiskThresholdMonitor. This listener triggers a reroute as expected and sets an internal checkInProgress flag before submitting a cluster state update to relocate shards (the internal flag is reset once the cluster state update is processed). In the test I suspect that the second refreshDiskUsage() may complete before DiskThresholdMonitor's internal flag is set back to its initial state, resulting in the second ClusterInfo update being ignored and a message like "[node_t0] skipping monitor as a check is already in progress" being logged. Adding another wait for languid events to be processed before executing the second refreshDiskUsage() should help here. Closes #62326
elastic · Oct 7, 2020 · 581490d · 581490d
1 parent d45f7de
commit 581490d
Showing 1 changed file with 15 additions and 4 deletions.
diff --git a/...est/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java b/...est/java/org/elasticsearch/cluster/routing/allocation/decider/DiskThresholdDeciderIT.java
@@ -136,11 +136,11 @@ protected Collection<Class<? extends Plugin>> nodePlugins() {
         return Collections.singletonList(InternalSettingsPlugin.class);
     }
 
-    @AwaitsFix(bugUrl = "https://github.com/elastic/elasticsearch/issues/62326")
     public void testHighWatermarkNotExceeded() throws Exception {
         internalCluster().startMasterOnlyNode();
         internalCluster().startDataOnlyNode();
         final String dataNodeName = internalCluster().startDataOnlyNode();
+        ensureStableCluster(3);
 
         final InternalClusterInfoService clusterInfoService
                 = (InternalClusterInfoService) internalCluster().getMasterNodeInstance(ClusterInfoService.class);
@@ -276,6 +276,13 @@ private long createReasonableSizedShards(final String indexName) throws Interrup
     }
 
     private void refreshDiskUsage() {
+        assertFalse(client().admin().cluster().prepareHealth()
+            .setWaitForEvents(Priority.LANGUID)
+            .setWaitForNoRelocatingShards(true)
+            .setWaitForNoInitializingShards(true)
+            .get()
+            .isTimedOut());
+
         final ClusterInfoService clusterInfoService = internalCluster().getMasterNodeInstance(ClusterInfoService.class);
         ((InternalClusterInfoService) clusterInfoService).refresh();
         // if the nodes were all under the low watermark already (but unbalanced) then a change in the disk usage doesn't trigger a reroute
@@ -284,9 +291,13 @@ private void refreshDiskUsage() {
             .allMatch(cur -> cur.value.getFreeBytes() > WATERMARK_BYTES)) {
             assertAcked(client().admin().cluster().prepareReroute());
         }
-        assertFalse(client().admin().cluster().prepareHealth().setWaitForEvents(Priority.LANGUID)
-                .setWaitForNoRelocatingShards(true)
-                .setWaitForNoInitializingShards(true).get().isTimedOut());
+
+        assertFalse(client().admin().cluster().prepareHealth()
+            .setWaitForEvents(Priority.LANGUID)
+            .setWaitForNoRelocatingShards(true)
+            .setWaitForNoInitializingShards(true)
+            .get()
+            .isTimedOut());
     }
 
     private static class TestFileStore extends FilterFileStore {