From eb8405abecf57dacc4237faa34b297c400f53459 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Sun, 26 Mar 2023 10:00:38 -0700 Subject: [PATCH] [Segment Replication] Fix testAllocationWithDisruption flakyness Signed-off-by: Suraj Singh --- .../SegmentReplicationAllocationIT.java | 31 +++++++++++++------ 1 file changed, 21 insertions(+), 10 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java index ae3381f5116d2..a70617c353af4 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationAllocationIT.java @@ -166,15 +166,17 @@ public void testSingleIndexShardAllocation() throws Exception { } /** - * Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple node adding in and getting - * removed. The test asserts post each such event that primary shard distribution is balanced across single index. + * Similar to testSingleIndexShardAllocation test but creates multiple indices, multiple nodes adding in and getting + * removed. The test asserts post each such event that primary shard distribution is balanced for each index. */ - @AwaitsFix(bugUrl = "https://github.com/opensearch-project/OpenSearch/issues/6565") public void testAllocationWithDisruption() throws Exception { internalCluster().startClusterManagerOnlyNode(); final int maxReplicaCount = 2; final int maxShardCount = 5; - final int nodeCount = randomIntBetween(maxReplicaCount + 1, 10); + // Create maxReplicaCount+2 number of nodes. maxReplicaCount for replica shards & 1 node for primary shard + // allocation. One extra node to ensure post failover, primary shards do not remain stuck on one node due to + // SameShardAllocationDecider preventing re-balancing + final int nodeCount = randomIntBetween(maxReplicaCount + 2, 10); final int numberOfIndices = randomIntBetween(1, 10); logger.info("--> Creating {} nodes", nodeCount); @@ -184,13 +186,11 @@ public void testAllocationWithDisruption() throws Exception { } enablePreferPrimaryBalance(); - int shardCount, replicaCount, totalShardCount = 0, totalReplicaCount = 0; + int shardCount, replicaCount; ClusterState state; for (int i = 0; i < numberOfIndices; i++) { shardCount = randomIntBetween(1, maxShardCount); - totalShardCount += shardCount; replicaCount = randomIntBetween(1, maxReplicaCount); - totalReplicaCount += replicaCount; logger.info("--> Creating index test{} with primary {} and replica {}", i, shardCount, replicaCount); createIndex("test" + i, shardCount, replicaCount, i % 2 == 0); ensureGreen(TimeValue.timeValueSeconds(60)); @@ -212,13 +212,15 @@ public void testAllocationWithDisruption() throws Exception { logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); - logger.info("--> Stop one third nodes"); - for (int i = 0; i < nodeCount; i += 3) { - internalCluster().stopRandomNode(InternalTestCluster.nameFilter(nodeNames.get(i))); + int nodeCountToStop = additionalNodeCount; + while (nodeCountToStop > 0) { + internalCluster().stopRandomDataNode(); // give replica a chance to promote as primary before terminating node containing the replica ensureGreen(TimeValue.timeValueSeconds(60)); + nodeCountToStop--; } state = client().admin().cluster().prepareState().execute().actionGet().getState(); + logger.info("--> Cluster state post nodes stop {}", state); logger.info(ShardAllocations.printShardDistribution(state)); verifyPerIndexPrimaryBalance(); } @@ -240,6 +242,15 @@ private void verifyPerIndexPrimaryBalance() throws Exception { .filter(ShardRouting::primary) .collect(Collectors.toList()) .size(); + if (primaryCount > avgPrimaryShardsPerNode) { + logger.info( + "--> Primary shard balance assertion failure for index {} on node {} {} <= {}", + index.key, + node.node().getName(), + primaryCount, + avgPrimaryShardsPerNode + ); + } assertTrue(primaryCount <= avgPrimaryShardsPerNode); } }