From 750027044f3ff55f2c7d4b0e18a446d2cf985190 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Mon, 27 Mar 2023 14:12:39 -0700 Subject: [PATCH] [Segment Replication] Fix testAllocationWithDisruption flakyness (#6838) * [Segment Replication] Fix testAllocationWithDisruption flakyness Signed-off-by: Suraj Singh * Use more number of nodes with lesser primary shards to decrease chances of primary accumulation on one node Signed-off-by: Suraj Singh * Update comment Signed-off-by: Suraj Singh --------- Signed-off-by: Suraj Singh --- .../SegmentReplicationAllocationIT.java | 32 ++++++++++++------- 1 file changed, 21 insertions(+), 11 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index ae3381f5116d2..04f0a2c4bfb13 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -166,15 +166,16 @@ public void testSingleIndexShardAllocation() throws Exception { } /** - * Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple node adding in and getting - * removed. The test asserts post each such event that primary shard distribution is balanced across single index. + * Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple nodes adding in and getting + * removed. The test asserts post each such event that primary shard distribution is balanced for each index. */ - @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/6565") public void testAllocationWithDisruption() throws Exception { internalCluster().startClusterManagerOnlyNode(); final int maxReplicaCount = 2; - final int maxShardCount = 5; - final int nodeCount = randomIntBetween(maxReplicaCount + 1, 10); + final int maxShardCount = 2; + // Create higher number of nodes than number of shards to reduce chances of SameShardAllocationDecider kicking-in + // and preventing primary relocations + final int nodeCount = randomIntBetween(5, 10); final int numberOfIndices = randomIntBetween(1, 10); logger.info("--> Creating {} nodes", nodeCount); @@ -184,13 +185,11 @@ public void testAllocationWithDisruption() throws Exception { } enablePreferPrimaryBalance(); - int shardCount, replicaCount, totalShardCount = 0, totalReplicaCount = 0; + int shardCount, replicaCount; ClusterState state; for (int i = 0; i < numberOfIndices; i++) { shardCount = randomIntBetween(1, maxShardCount); - totalShardCount += shardCount; replicaCount = randomIntBetween(1, maxReplicaCount); - totalReplicaCount += replicaCount; logger.info("--> Creating index test{} with primary {} and replica {}", i, shardCount, replicaCount); createIndex("test" + i, shardCount, replicaCount, i % 2 == 0); ensureGreen(TimeValue.timeValueSeconds(60)); @@ -212,13 +211,15 @@ public void testAllocationWithDisruption() throws Exception { logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); - logger.info("--> Stop one third nodes"); - for (int i = 0; i < nodeCount; i += 3) { - internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(i))); + int nodeCountToStop = additionalNodeCount; + while (nodeCountToStop > 0) { + internalCluster().stopRandomDataNode(); // give replica a chance to promote as primary before terminating node containing the replica ensureGreen(TimeValue.timeValueSeconds(60)); + nodeCountToStop--; } state = client().admin().cluster().prepareState().execute().actionGet().getState(); + logger.info("--> Cluster state post nodes stop {}", state); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); } @@ -240,6 +241,15 @@ private void verifyPerIndexPrimaryBalance() throws Exception { .filter(ShardRouting::primary) .collect(Collectors.toList()) .size(); + if (primaryCount > avgPrimaryShardsPerNode) { + logger.info( + "--> Primary shard balance assertion failure for index {} on node {} {} <= {}", + index.key, + node.node().getName(), + primaryCount, + avgPrimaryShardsPerNode + ); + } assertTrue(primaryCount <= avgPrimaryShardsPerNode); } }