From e17af2a19a6fb0258dca5a075cbe273b4367056b Mon Sep 17 00:00:00 2001 From: "opensearch-trigger-bot[bot]" <98922864+opensearch-trigger-bot[bot]@users.noreply.github.com> Date: Tue, 16 Aug 2022 18:06:42 -0400 Subject: [PATCH 1/7] Added bwc version 2.2.1 (#4194) Co-authored-by: opensearch-ci-bot --- .ci/bwcVersions | 1 + server/src/main/java/org/opensearch/Version.java | 1 + 2 files changed, 2 insertions(+) diff --git a/.ci/bwcVersions b/.ci/bwcVersions index 67976d58188e2..1ba3ee562317a 100644 --- a/.ci/bwcVersions +++ b/.ci/bwcVersions @@ -48,4 +48,5 @@ BWC_VERSION: - "2.1.0" - "2.1.1" - "2.2.0" + - "2.2.1" - "2.3.0" diff --git a/server/src/main/java/org/opensearch/Version.java b/server/src/main/java/org/opensearch/Version.java index 27a39db1c50cb..ba512d3fbcdd9 100644 --- a/server/src/main/java/org/opensearch/Version.java +++ b/server/src/main/java/org/opensearch/Version.java @@ -95,6 +95,7 @@ public class Version implements Comparable, ToXContentFragment { public static final Version V_2_1_0 = new Version(2010099, org.apache.lucene.util.Version.LUCENE_9_2_0); public static final Version V_2_1_1 = new Version(2010199, org.apache.lucene.util.Version.LUCENE_9_2_0); public static final Version V_2_2_0 = new Version(2020099, org.apache.lucene.util.Version.LUCENE_9_3_0); + public static final Version V_2_2_1 = new Version(2020199, org.apache.lucene.util.Version.LUCENE_9_3_0); public static final Version V_2_3_0 = new Version(2030099, org.apache.lucene.util.Version.LUCENE_9_3_0); public static final Version V_3_0_0 = new Version(3000099, org.apache.lucene.util.Version.LUCENE_9_4_0); public static final Version CURRENT = V_3_0_0; From 237f1a5344fb96e888852e089be349867b3c2340 Mon Sep 17 00:00:00 2001 From: Suraj Singh Date: Tue, 16 Aug 2022 17:28:25 -0700 Subject: [PATCH 2/7] [Segment Replication] Wait for documents to replicate to replica shards (#4236) * [Segment Replication] Add thread sleep to account for replica lag in delete operations test Signed-off-by: Suraj Singh * Address review comments, assertBusy on doc count rather than sleep Signed-off-by: Suraj Singh Signed-off-by: Suraj Singh --- .../replication/SegmentReplicationIT.java | 20 ++++++++++++++++--- 1 file changed, 17 insertions(+), 3 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 96ac703b9837e..9518d5e802a8e 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -38,6 +38,7 @@ import java.util.Map; import java.util.Optional; import java.util.Set; +import java.util.concurrent.TimeUnit; import java.util.function.Function; import java.util.stream.Collectors; @@ -293,9 +294,22 @@ public void testDeleteOperations() throws Exception { refresh(INDEX_NAME); waitForReplicaUpdate(); - - assertHitCount(client(nodeA).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), expectedHitCount - 1); - assertHitCount(client(nodeB).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), expectedHitCount - 1); + assertBusy(() -> { + final long nodeA_Count = client(nodeA).prepareSearch(INDEX_NAME) + .setSize(0) + .setPreference("_only_local") + .get() + .getHits() + .getTotalHits().value; + assertEquals(expectedHitCount - 1, nodeA_Count); + final long nodeB_Count = 
client(nodeB).prepareSearch(INDEX_NAME) + .setSize(0) + .setPreference("_only_local") + .get() + .getHits() + .getTotalHits().value; + assertEquals(expectedHitCount - 1, nodeB_Count); + }, 5, TimeUnit.SECONDS); } } From 3a97d4cad65fa19126340aabf7e8bd2b1ce2b822 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Tue, 16 Aug 2022 18:39:51 -0700 Subject: [PATCH 3/7] Segment Replication - Add additional unit tests for update & delete ops. (#4237) * Segment Replication - Add additional unit tests for update & delete operations. Signed-off-by: Marc Handalian * Fix spotless. Signed-off-by: Marc Handalian Signed-off-by: Marc Handalian --- .../SegmentReplicationIndexShardTests.java | 54 +++++++++++++++++++ ...enSearchIndexLevelReplicationTestCase.java | 7 ++- 2 files changed, 60 insertions(+), 1 deletion(-) diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index d10f8ced963b7..4f6d86c13e12c 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -8,17 +8,23 @@ package org.opensearch.index.shard; +import org.opensearch.action.delete.DeleteRequest; +import org.opensearch.action.index.IndexRequest; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.settings.Settings; import org.opensearch.common.unit.TimeValue; +import org.opensearch.common.xcontent.XContentType; import org.opensearch.index.IndexSettings; +import org.opensearch.index.engine.DocIdSeqNoAndSource; import org.opensearch.index.engine.NRTReplicationEngineFactory; +import org.opensearch.index.mapper.MapperService; import org.opensearch.index.replication.OpenSearchIndexLevelReplicationTestCase; import org.opensearch.indices.replication.checkpoint.SegmentReplicationCheckpointPublisher; import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint; import org.opensearch.indices.replication.common.ReplicationType; import java.io.IOException; +import java.util.List; import static org.mockito.ArgumentMatchers.any; import static org.mockito.Mockito.verify; @@ -52,6 +58,54 @@ public void testReplicationCheckpointNotNullForSegReb() throws IOException { closeShards(indexShard); } + public void testSegmentReplication_Index_Update_Delete() throws Exception { + String mappings = "{ \"" + MapperService.SINGLE_MAPPING_NAME + "\": { \"properties\": { \"foo\": { \"type\": \"keyword\"} }}}"; + try (ReplicationGroup shards = createGroup(2, settings, mappings, new NRTReplicationEngineFactory())) { + shards.startAll(); + final IndexShard primaryShard = shards.getPrimary(); + + final int numDocs = randomIntBetween(100, 200); + for (int i = 0; i < numDocs; i++) { + shards.index(new IndexRequest(index.getName()).id(String.valueOf(i)).source("{\"foo\": \"bar\"}", XContentType.JSON)); + } + + primaryShard.refresh("Test"); + replicateSegments(primaryShard, shards.getReplicas()); + + shards.assertAllEqual(numDocs); + + for (int i = 0; i < numDocs; i++) { + // randomly update docs. 
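+                    // an update reuses the doc id, so the searchable doc count should still equal numDocs
+                    // once the updated segments are replicated.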
+ if (randomBoolean()) { + shards.index( + new IndexRequest(index.getName()).id(String.valueOf(i)).source("{ \"foo\" : \"baz\" }", XContentType.JSON) + ); + } + } + + primaryShard.refresh("Test"); + replicateSegments(primaryShard, shards.getReplicas()); + shards.assertAllEqual(numDocs); + + final List docs = getDocIdAndSeqNos(primaryShard); + for (IndexShard shard : shards.getReplicas()) { + assertEquals(getDocIdAndSeqNos(shard), docs); + } + for (int i = 0; i < numDocs; i++) { + // randomly delete. + if (randomBoolean()) { + shards.delete(new DeleteRequest(index.getName()).id(String.valueOf(i))); + } + } + primaryShard.refresh("Test"); + replicateSegments(primaryShard, shards.getReplicas()); + final List docsAfterDelete = getDocIdAndSeqNos(primaryShard); + for (IndexShard shard : shards.getReplicas()) { + assertEquals(getDocIdAndSeqNos(shard), docsAfterDelete); + } + } + } + public void testIgnoreShardIdle() throws Exception { try (ReplicationGroup shards = createGroup(1, settings, new NRTReplicationEngineFactory())) { shards.startAll(); diff --git a/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java b/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java index 249ffcfd0bf6e..b3f062aef4fbe 100644 --- a/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java +++ b/test/framework/src/main/java/org/opensearch/index/replication/OpenSearchIndexLevelReplicationTestCase.java @@ -141,7 +141,12 @@ protected ReplicationGroup createGroup(int replicas, Settings settings) throws I } protected ReplicationGroup createGroup(int replicas, Settings settings, EngineFactory engineFactory) throws IOException { - IndexMetadata metadata = buildIndexMetadata(replicas, settings, indexMapping); + return createGroup(replicas, settings, indexMapping, engineFactory); + } + + protected ReplicationGroup createGroup(int replicas, Settings settings, String mappings, EngineFactory engineFactory) + throws IOException { + IndexMetadata metadata = buildIndexMetadata(replicas, settings, mappings); return new ReplicationGroup(metadata) { @Override protected EngineFactory getEngineFactory(ShardRouting routing) { From f65e02d1b910bd0a1990868bfa5d12ba829bbbd5 Mon Sep 17 00:00:00 2001 From: Marc Handalian Date: Wed, 17 Aug 2022 10:32:11 -0700 Subject: [PATCH 4/7] Support shard promotion with Segment Replication. (#4135) * Support shard promotion with Segment Replication. This change adds basic failover support with segment replication. Once selected, a replica will commit and reopen a writeable engine. Signed-off-by: Marc Handalian * Add check to ensure a closed shard does not publish checkpoints. Signed-off-by: Marc Handalian * Clean up in SegmentReplicationIT. Signed-off-by: Marc Handalian * PR feedback. Signed-off-by: Marc Handalian * Fix merge conflict. 
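The promotion sequence on the replica is, in sketch (simplified from
promoteNRTReplicaToPrimary in the IndexShard changes below):

    engine.commitSegmentInfos();      // persist the latest in-memory SegmentInfos as a commit point
    resetEngineToGlobalCheckpoint();  // reopen as a writeable InternalEngine and replay xlog ops

This keeps the replica's searchable docs without reindexing and restores any
ack'd writes that had not yet been copied over.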
Signed-off-by: Marc Handalian Signed-off-by: Marc Handalian --- .../replication/SegmentReplicationIT.java | 148 +++++++++++++++++- .../org/opensearch/index/IndexService.java | 2 +- .../index/engine/NRTReplicationEngine.java | 17 ++ .../shard/CheckpointRefreshListener.java | 2 +- .../opensearch/index/shard/IndexShard.java | 40 +++-- .../org/opensearch/index/store/Store.java | 42 +++++ .../engine/NRTReplicationEngineTests.java | 52 ++++++ .../SegmentReplicationIndexShardTests.java | 80 +++++++++- 8 files changed, 363 insertions(+), 20 deletions(-) diff --git a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java index 9518d5e802a8e..8566cc5556861 100644 --- a/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java +++ b/server/src/internalClusterTest/java/org/opensearch/indices/replication/SegmentReplicationIT.java @@ -19,7 +19,10 @@ import org.opensearch.cluster.ClusterState; import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.cluster.node.DiscoveryNode; +import org.opensearch.cluster.routing.IndexShardRoutingTable; import org.opensearch.cluster.routing.ShardRouting; +import org.opensearch.cluster.routing.allocation.command.CancelAllocationCommand; +import org.opensearch.common.Nullable; import org.opensearch.common.settings.Settings; import org.opensearch.common.util.FeatureFlags; import org.opensearch.index.Index; @@ -30,6 +33,7 @@ import org.opensearch.indices.IndicesService; import org.opensearch.indices.replication.common.ReplicationType; import org.opensearch.test.BackgroundIndexer; +import org.opensearch.test.InternalTestCluster; import org.opensearch.test.OpenSearchIntegTestCase; import java.io.IOException; @@ -73,6 +77,109 @@ protected boolean addMockInternalEngine() { return false; } + public void testPrimaryStopped_ReplicaPromoted() throws Exception { + final String primary = internalCluster().startNode(); + createIndex(INDEX_NAME); + ensureYellowAndNoInitializingShards(INDEX_NAME); + final String replica = internalCluster().startNode(); + ensureGreen(INDEX_NAME); + + client().prepareIndex(INDEX_NAME).setId("1").setSource("foo", "bar").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + refresh(INDEX_NAME); + + waitForReplicaUpdate(); + assertHitCount(client(primary).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 1); + assertHitCount(client(replica).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 1); + + // index another doc but don't refresh, we will ensure this is searchable once replica is promoted. + client().prepareIndex(INDEX_NAME).setId("2").setSource("bar", "baz").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + + // stop the primary node - we only have one shard on here. + internalCluster().stopRandomNode(InternalTestCluster.nameFilter(primary)); + ensureYellowAndNoInitializingShards(INDEX_NAME); + + final ShardRouting replicaShardRouting = getShardRoutingForNodeName(replica); + assertNotNull(replicaShardRouting); + assertTrue(replicaShardRouting + " should be promoted as a primary", replicaShardRouting.primary()); + assertHitCount(client(replica).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 2); + + // assert we can index into the new primary. 
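+        // the promoted shard is now backed by a writeable InternalEngine, and the IMMEDIATE
+        // refresh policy makes the write searchable without waiting on segment replication.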
+ client().prepareIndex(INDEX_NAME).setId("3").setSource("bar", "baz").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + assertHitCount(client(replica).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 3); + + // start another node, index another doc and replicate. + String nodeC = internalCluster().startNode(); + ensureGreen(INDEX_NAME); + client().prepareIndex(INDEX_NAME).setId("4").setSource("baz", "baz").get(); + refresh(INDEX_NAME); + waitForReplicaUpdate(); + assertHitCount(client(nodeC).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 4); + assertHitCount(client(replica).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 4); + assertSegmentStats(REPLICA_COUNT); + } + + public void testRestartPrimary() throws Exception { + final String primary = internalCluster().startNode(); + createIndex(INDEX_NAME); + ensureYellowAndNoInitializingShards(INDEX_NAME); + final String replica = internalCluster().startNode(); + ensureGreen(INDEX_NAME); + + assertEquals(getNodeContainingPrimaryShard().getName(), primary); + + final int initialDocCount = 1; + client().prepareIndex(INDEX_NAME).setId("1").setSource("foo", "bar").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + refresh(INDEX_NAME); + + waitForReplicaUpdate(); + assertDocCounts(initialDocCount, replica, primary); + + internalCluster().restartNode(primary); + ensureGreen(INDEX_NAME); + + assertEquals(getNodeContainingPrimaryShard().getName(), replica); + + flushAndRefresh(INDEX_NAME); + waitForReplicaUpdate(); + + assertDocCounts(initialDocCount, replica, primary); + assertSegmentStats(REPLICA_COUNT); + } + + public void testCancelPrimaryAllocation() throws Exception { + // this test cancels allocation on the primary - promoting the new replica and recreating the former primary as a replica. 
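+        // the final argument to CancelAllocationCommand below (allowPrimary = true) permits
+        // cancelling an active primary, which forces the replica's promotion.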
+ final String primary = internalCluster().startNode(); + createIndex(INDEX_NAME); + ensureYellowAndNoInitializingShards(INDEX_NAME); + final String replica = internalCluster().startNode(); + ensureGreen(INDEX_NAME); + + final int initialDocCount = 1; + + client().prepareIndex(INDEX_NAME).setId("1").setSource("foo", "bar").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE).get(); + refresh(INDEX_NAME); + + waitForReplicaUpdate(); + assertDocCounts(initialDocCount, replica, primary); + + final IndexShard indexShard = getIndexShard(primary); + client().admin() + .cluster() + .prepareReroute() + .add(new CancelAllocationCommand(INDEX_NAME, indexShard.shardId().id(), primary, true)) + .execute() + .actionGet(); + ensureGreen(INDEX_NAME); + + assertEquals(getNodeContainingPrimaryShard().getName(), replica); + + flushAndRefresh(INDEX_NAME); + waitForReplicaUpdate(); + + assertDocCounts(initialDocCount, replica, primary); + assertSegmentStats(REPLICA_COUNT); + } + public void testReplicationAfterPrimaryRefreshAndFlush() throws Exception { final String nodeA = internalCluster().startNode(); final String nodeB = internalCluster().startNode(); @@ -240,9 +347,8 @@ public void testStartReplicaAfterPrimaryIndexesDocs() throws Exception { assertHitCount(client(primaryNode).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 3); assertHitCount(client(replicaNode).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), 3); - final Index index = resolveIndex(INDEX_NAME); - IndexShard primaryShard = getIndexShard(index, primaryNode); - IndexShard replicaShard = getIndexShard(index, replicaNode); + IndexShard primaryShard = getIndexShard(primaryNode); + IndexShard replicaShard = getIndexShard(replicaNode); assertEquals( primaryShard.translogStats().estimatedNumberOfOperations(), replicaShard.translogStats().estimatedNumberOfOperations() @@ -351,8 +457,7 @@ private void assertSegmentStats(int numberOfReplicas) throws IOException { final ShardRouting replicaShardRouting = shardSegment.getShardRouting(); ClusterState state = client(internalCluster().getMasterName()).admin().cluster().prepareState().get().getState(); final DiscoveryNode replicaNode = state.nodes().resolveNode(replicaShardRouting.currentNodeId()); - final Index index = resolveIndex(INDEX_NAME); - IndexShard indexShard = getIndexShard(index, replicaNode.getName()); + IndexShard indexShard = getIndexShard(replicaNode.getName()); final String lastCommitSegmentsFileName = SegmentInfos.getLastCommitSegmentsFileName(indexShard.store().directory()); // calls to readCommit will fail if a valid commit point and all its segments are not in the store. 
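        // readCommit throws in that case (e.g. a CorruptIndexException or a missing-file IOException),
        // so completing the next line implicitly asserts the commit point is intact.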
SegmentInfos.readCommit(indexShard.store().directory(), lastCommitSegmentsFileName); @@ -392,7 +497,8 @@ private void waitForReplicaUpdate() throws Exception { }); } - private IndexShard getIndexShard(Index index, String node) { + private IndexShard getIndexShard(String node) { + final Index index = resolveIndex(INDEX_NAME); IndicesService indicesService = internalCluster().getInstance(IndicesService.class, node); IndexService indexService = indicesService.indexServiceSafe(index); final Optional shardId = indexService.shardIds().stream().findFirst(); @@ -409,7 +515,8 @@ private List getShardSegments(IndicesSegmentResponse indicesSeg } private Map getLatestSegments(ShardSegments segments) { - final Long latestPrimaryGen = segments.getSegments().stream().map(Segment::getGeneration).max(Long::compare).get(); + final Optional generation = segments.getSegments().stream().map(Segment::getGeneration).max(Long::compare); + final Long latestPrimaryGen = generation.get(); return segments.getSegments() .stream() .filter(s -> s.getGeneration() == latestPrimaryGen) @@ -419,4 +526,31 @@ private Map getLatestSegments(ShardSegments segments) { private Map> segmentsByShardType(ShardSegments[] replicationGroupSegments) { return Arrays.stream(replicationGroupSegments).collect(Collectors.groupingBy(s -> s.getShardRouting().primary())); } + + @Nullable + private ShardRouting getShardRoutingForNodeName(String nodeName) { + final ClusterState state = client(internalCluster().getClusterManagerName()).admin().cluster().prepareState().get().getState(); + for (IndexShardRoutingTable shardRoutingTable : state.routingTable().index(INDEX_NAME)) { + for (ShardRouting shardRouting : shardRoutingTable.activeShards()) { + final String nodeId = shardRouting.currentNodeId(); + final DiscoveryNode discoveryNode = state.nodes().resolveNode(nodeId); + if (discoveryNode.getName().equals(nodeName)) { + return shardRouting; + } + } + } + return null; + } + + private void assertDocCounts(int expectedDocCount, String... nodeNames) { + for (String node : nodeNames) { + assertHitCount(client(node).prepareSearch(INDEX_NAME).setSize(0).setPreference("_only_local").get(), expectedDocCount); + } + } + + private DiscoveryNode getNodeContainingPrimaryShard() { + final ClusterState state = client(internalCluster().getClusterManagerName()).admin().cluster().prepareState().get().getState(); + final ShardRouting primaryShard = state.routingTable().index(INDEX_NAME).shard(0).primaryShard(); + return state.nodes().resolveNode(primaryShard.currentNodeId()); + } } diff --git a/server/src/main/java/org/opensearch/index/IndexService.java b/server/src/main/java/org/opensearch/index/IndexService.java index 38065bf5aed20..e1427df1c34ab 100644 --- a/server/src/main/java/org/opensearch/index/IndexService.java +++ b/server/src/main/java/org/opensearch/index/IndexService.java @@ -550,7 +550,7 @@ public synchronized IndexShard createShard( circuitBreakerService, // TODO Replace with remote translog factory in the follow up PR this.indexSettings.isRemoteTranslogStoreEnabled() ? null : new InternalTranslogFactory(), - this.indexSettings.isSegRepEnabled() && routing.primary() ? checkpointPublisher : null, + this.indexSettings.isSegRepEnabled() ? 
checkpointPublisher : null, remoteStore ); eventListener.indexShardStateChanged(indexShard, null, indexShard.state(), "shard created"); diff --git a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java index a481203ba3dea..6f5b7030ed65f 100644 --- a/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java +++ b/server/src/main/java/org/opensearch/index/engine/NRTReplicationEngine.java @@ -129,6 +129,23 @@ public synchronized void updateSegments(final SegmentInfos infos, long seqNo) th localCheckpointTracker.fastForwardProcessedSeqNo(seqNo); } + /** + * Persist the latest live SegmentInfos. + * + * This method creates a commit point from the latest SegmentInfos. It is intended to be used when this shard is about to be promoted as the new primary. + * + * TODO: If this method is invoked while the engine is currently updating segments on its reader, wait for that update to complete so the updated segments are used. + * + * + * @throws IOException - When there is an IO error committing the SegmentInfos. + */ + public void commitSegmentInfos() throws IOException { + // TODO: This method should wait for replication events to finalize. + final SegmentInfos latestSegmentInfos = getLatestSegmentInfos(); + store.commitSegmentInfos(latestSegmentInfos, localCheckpointTracker.getMaxSeqNo(), localCheckpointTracker.getProcessedCheckpoint()); + translogManager.syncTranslog(); + } + @Override public String getHistoryUUID() { return loadHistoryUUID(lastCommittedSegmentInfos.userData); diff --git a/server/src/main/java/org/opensearch/index/shard/CheckpointRefreshListener.java b/server/src/main/java/org/opensearch/index/shard/CheckpointRefreshListener.java index 96d74bea85920..fb046e2310d93 100644 --- a/server/src/main/java/org/opensearch/index/shard/CheckpointRefreshListener.java +++ b/server/src/main/java/org/opensearch/index/shard/CheckpointRefreshListener.java @@ -40,7 +40,7 @@ public void beforeRefresh() throws IOException { @Override public void afterRefresh(boolean didRefresh) throws IOException { - if (didRefresh && shard.getReplicationTracker().isPrimaryMode()) { + if (didRefresh && shard.state() != IndexShardState.CLOSED && shard.getReplicationTracker().isPrimaryMode()) { publisher.publish(shard); } } diff --git a/server/src/main/java/org/opensearch/index/shard/IndexShard.java b/server/src/main/java/org/opensearch/index/shard/IndexShard.java index 8970e0734f184..67a8e691fda0d 100644 --- a/server/src/main/java/org/opensearch/index/shard/IndexShard.java +++ b/server/src/main/java/org/opensearch/index/shard/IndexShard.java @@ -242,6 +242,7 @@ public class IndexShard extends AbstractIndexShardComponent implements IndicesCl private final GlobalCheckpointListeners globalCheckpointListeners; private final PendingReplicationActions pendingReplicationActions; private final ReplicationTracker replicationTracker; + private final SegmentReplicationCheckpointPublisher checkpointPublisher; protected volatile ShardRouting shardRouting; protected volatile IndexShardState state; @@ -306,8 +307,6 @@ Runnable getGlobalCheckpointSyncer() { private final AtomicReference pendingRefreshLocation = new AtomicReference<>(); private final RefreshPendingLocationListener refreshPendingLocationListener; private volatile boolean useRetentionLeasesInPeerRecovery; - private final ReferenceManager.RefreshListener checkpointRefreshListener; - private final Store remoteStore; private final TranslogFactory 
translogFactory; @@ -417,11 +416,7 @@ public boolean shouldCache(Query query) { persistMetadata(path, indexSettings, shardRouting, null, logger); this.useRetentionLeasesInPeerRecovery = replicationTracker.hasAllPeerRecoveryRetentionLeases(); this.refreshPendingLocationListener = new RefreshPendingLocationListener(); - if (checkpointPublisher != null) { - this.checkpointRefreshListener = new CheckpointRefreshListener(this, checkpointPublisher); - } else { - this.checkpointRefreshListener = null; - } + this.checkpointPublisher = checkpointPublisher; this.remoteStore = remoteStore; this.translogFactory = translogFactory; } @@ -627,6 +622,11 @@ public void updateShardState( + newRouting; assert getOperationPrimaryTerm() == newPrimaryTerm; try { + if (indexSettings.isSegRepEnabled()) { + // this Shard's engine was read only, we need to update its engine before restoring local history from xlog. + assert newRouting.primary() && currentRouting.primary() == false; + promoteNRTReplicaToPrimary(); + } replicationTracker.activatePrimaryMode(getLocalCheckpoint()); ensurePeerRecoveryRetentionLeasesExist(); /* @@ -3231,8 +3231,8 @@ private EngineConfig newEngineConfig(LongSupplier globalCheckpointSupplier) thro Directory remoteDirectory = ((FilterDirectory) ((FilterDirectory) remoteStore.directory()).getDelegate()).getDelegate(); internalRefreshListener.add(new RemoteStoreRefreshListener(store.directory(), remoteDirectory)); } - if (this.checkpointRefreshListener != null) { - internalRefreshListener.add(checkpointRefreshListener); + if (this.checkpointPublisher != null && indexSettings.isSegRepEnabled() && shardRouting.primary()) { + internalRefreshListener.add(new CheckpointRefreshListener(this, this.checkpointPublisher)); } return this.engineConfigFactory.newEngineConfig( @@ -4123,4 +4123,26 @@ RetentionLeaseSyncer getRetentionLeaseSyncer() { public GatedCloseable getSegmentInfosSnapshot() { return getEngine().getSegmentInfosSnapshot(); } + + /** + * With segment replication enabled - prepare the shard's engine to be promoted as the new primary. + * + * If this shard is currently using a replication engine, this method: + * 1. Invokes {@link NRTReplicationEngine#commitSegmentInfos()} to ensure the engine can be reopened as writeable from the latest refresh point. + * InternalEngine opens its IndexWriter from an on-disk commit point, but this replica may have recently synced from a primary's refresh point, meaning it has documents searchable in its in-memory SegmentInfos + * that are not part of a commit point. This ensures that those documents are made part of a commit and do not need to be reindexed after promotion. + * 2. Invokes resetEngineToGlobalCheckpoint - This call performs the engine swap, opening up as a writeable engine and replays any operations in the xlog. The operations indexed from xlog here will be + * any ack'd writes that were not copied to this replica before promotion. 
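+     * Note the ordering: the commit in step 1 must complete before the reset in step 2, because the
+     * InternalEngine opened by the reset reads its starting point from the last on-disk commit.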
+ */ + private void promoteNRTReplicaToPrimary() { + assert shardRouting.primary() && indexSettings.isSegRepEnabled(); + getReplicationEngine().ifPresentOrElse(engine -> { + try { + engine.commitSegmentInfos(); + resetEngineToGlobalCheckpoint(); + } catch (IOException e) { + throw new EngineException(shardId, "Unable to update replica to writeable engine, failing shard", e); + } + }, () -> { throw new EngineException(shardId, "Expected replica engine to be of type NRTReplicationEngine"); }); + } } diff --git a/server/src/main/java/org/opensearch/index/store/Store.java b/server/src/main/java/org/opensearch/index/store/Store.java index 163717ad94c2c..58598ab2d08f4 100644 --- a/server/src/main/java/org/opensearch/index/store/Store.java +++ b/server/src/main/java/org/opensearch/index/store/Store.java @@ -121,6 +121,7 @@ import static java.util.Collections.emptyMap; import static java.util.Collections.unmodifiableMap; +import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY; /** * A Store provides plain access to files written by an opensearch index shard. Each shard @@ -799,6 +800,47 @@ public void beforeClose() { shardLock.setDetails("closing shard"); } + /** + * This method should only be used with Segment Replication. + * Perform a commit from a live {@link SegmentInfos}. Replica engines with segrep do not have an IndexWriter and Lucene does not currently + * have the ability to create a writer directly from a SegmentInfos object. To promote the replica as a primary and avoid reindexing, we must first commit + * on the replica so that it can be opened with a writeable engine. Further, InternalEngine currently invokes `trimUnsafeCommits` which reverts the engine to a previous safeCommit where the max seqNo is less than or equal + * to the current global checkpoint. It is likely that the replica has a maxSeqNo that is higher than the global cp and a new commit will be wiped. + * + * To get around these limitations, this method first creates an IndexCommit directly from SegmentInfos, it then + * uses an appending IW to create an IndexCommit from the commit created on SegmentInfos. + * This ensures that 1. All files in the new commit are fsynced and 2. Deletes older commit points so the only commit to start from is our new commit. + * + * @param latestSegmentInfos {@link SegmentInfos} The latest active infos + * @param maxSeqNo The engine's current maxSeqNo + * @param processedCheckpoint The engine's current processed checkpoint. + * @throws IOException when there is an IO error committing. + */ + public void commitSegmentInfos(SegmentInfos latestSegmentInfos, long maxSeqNo, long processedCheckpoint) throws IOException { + assert indexSettings.isSegRepEnabled(); + metadataLock.writeLock().lock(); + try { + final Map userData = new HashMap<>(latestSegmentInfos.getUserData()); + userData.put(LOCAL_CHECKPOINT_KEY, String.valueOf(processedCheckpoint)); + userData.put(SequenceNumbers.MAX_SEQ_NO, Long.toString(maxSeqNo)); + latestSegmentInfos.setUserData(userData, true); + latestSegmentInfos.commit(directory()); + + // similar to TrimUnsafeCommits, create a commit with an appending IW, this will delete old commits and ensure all files + // associated with the SegmentInfos.commit are fsynced. 
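+            // the last commit listed below is expected to be the one just written from latestSegmentInfos,
+            // which the assert on its segments file name verifies.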
+ final List existingCommits = DirectoryReader.listCommits(directory); + assert existingCommits.isEmpty() == false : "Expected at least one commit but none found"; + final IndexCommit lastIndexCommit = existingCommits.get(existingCommits.size() - 1); + assert latestSegmentInfos.getSegmentsFileName().equals(lastIndexCommit.getSegmentsFileName()); + try (IndexWriter writer = newAppendingIndexWriter(directory, lastIndexCommit)) { + writer.setLiveCommitData(lastIndexCommit.getUserData().entrySet()); + writer.commit(); + } + } finally { + metadataLock.writeLock().unlock(); + } + } + /** * A store directory * diff --git a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java index 675ff860c3334..1fe1a37dedae0 100644 --- a/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java +++ b/server/src/test/java/org/opensearch/index/engine/NRTReplicationEngineTests.java @@ -12,18 +12,25 @@ import org.apache.lucene.index.NoMergePolicy; import org.apache.lucene.index.SegmentInfos; import org.hamcrest.MatcherAssert; +import org.opensearch.cluster.metadata.IndexMetadata; import org.opensearch.common.concurrent.GatedCloseable; import org.opensearch.common.lucene.Lucene; import org.opensearch.common.lucene.search.Queries; +import org.opensearch.common.settings.Settings; +import org.opensearch.index.IndexSettings; import org.opensearch.index.mapper.ParsedDocument; +import org.opensearch.index.seqno.LocalCheckpointTracker; import org.opensearch.index.seqno.SequenceNumbers; import org.opensearch.index.store.Store; import org.opensearch.index.translog.TestTranslog; import org.opensearch.index.translog.Translog; +import org.opensearch.indices.replication.common.ReplicationType; +import org.opensearch.test.IndexSettingsModule; import java.io.IOException; import java.nio.file.Path; import java.util.List; +import java.util.Map; import java.util.Set; import java.util.concurrent.atomic.AtomicLong; import java.util.stream.Collectors; @@ -31,6 +38,8 @@ import static org.hamcrest.Matchers.equalTo; import static org.hamcrest.Matchers.notNullValue; import static org.opensearch.index.seqno.SequenceNumbers.NO_OPS_PERFORMED; +import static org.opensearch.index.seqno.SequenceNumbers.LOCAL_CHECKPOINT_KEY; +import static org.opensearch.index.seqno.SequenceNumbers.MAX_SEQ_NO; public class NRTReplicationEngineTests extends EngineTestCase { @@ -210,6 +219,49 @@ public void testTrimTranslogOps() throws Exception { } } + public void testCommitSegmentInfos() throws Exception { + // This test asserts that NRTReplication#commitSegmentInfos creates a new commit point with the latest checkpoints + // stored in user data. 
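+        // It applies indexing ops to a replica engine, commits, and then verifies MAX_SEQ_NO and
+        // LOCAL_CHECKPOINT_KEY both on the in-memory infos and on the commit read back from the store.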
+ final AtomicLong globalCheckpoint = new AtomicLong(SequenceNumbers.NO_OPS_PERFORMED); + final IndexSettings indexSettings = IndexSettingsModule.newIndexSettings( + "index", + Settings.builder().put(IndexMetadata.SETTING_REPLICATION_TYPE, ReplicationType.SEGMENT).build() + ); + try ( + final Store nrtEngineStore = createStore(indexSettings, newDirectory()); + final NRTReplicationEngine nrtEngine = buildNrtReplicaEngine(globalCheckpoint, nrtEngineStore) + ) { + List operations = generateHistoryOnReplica(between(1, 500), randomBoolean(), randomBoolean(), randomBoolean()) + .stream() + .filter(op -> op.operationType().equals(Engine.Operation.TYPE.INDEX)) + .collect(Collectors.toList()); + for (Engine.Operation op : operations) { + applyOperation(nrtEngine, op); + } + + final SegmentInfos previousInfos = nrtEngine.getLatestSegmentInfos(); + LocalCheckpointTracker localCheckpointTracker = nrtEngine.getLocalCheckpointTracker(); + final long maxSeqNo = localCheckpointTracker.getMaxSeqNo(); + final long processedCheckpoint = localCheckpointTracker.getProcessedCheckpoint(); + nrtEngine.commitSegmentInfos(); + + // ensure getLatestSegmentInfos returns an updated infos ref with correct userdata. + final SegmentInfos latestSegmentInfos = nrtEngine.getLatestSegmentInfos(); + assertEquals(previousInfos.getGeneration(), latestSegmentInfos.getLastGeneration()); + Map userData = latestSegmentInfos.getUserData(); + assertEquals(processedCheckpoint, localCheckpointTracker.getProcessedCheckpoint()); + assertEquals(maxSeqNo, Long.parseLong(userData.get(MAX_SEQ_NO))); + assertEquals(processedCheckpoint, Long.parseLong(userData.get(LOCAL_CHECKPOINT_KEY))); + + // read infos from store and assert userdata + final String lastCommitSegmentsFileName = SegmentInfos.getLastCommitSegmentsFileName(nrtEngineStore.directory()); + final SegmentInfos committedInfos = SegmentInfos.readCommit(nrtEngineStore.directory(), lastCommitSegmentsFileName); + userData = committedInfos.getUserData(); + assertEquals(processedCheckpoint, Long.parseLong(userData.get(LOCAL_CHECKPOINT_KEY))); + assertEquals(maxSeqNo, Long.parseLong(userData.get(MAX_SEQ_NO))); + } + } + private void assertMatchingSegmentsAndCheckpoints(NRTReplicationEngine nrtEngine, SegmentInfos expectedSegmentInfos) throws IOException { assertEquals(engine.getPersistedLocalCheckpoint(), nrtEngine.getPersistedLocalCheckpoint()); diff --git a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java index 4f6d86c13e12c..23371a39871c7 100644 --- a/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java +++ b/server/src/test/java/org/opensearch/index/shard/SegmentReplicationIndexShardTests.java @@ -16,6 +16,8 @@ import org.opensearch.common.xcontent.XContentType; import org.opensearch.index.IndexSettings; import org.opensearch.index.engine.DocIdSeqNoAndSource; +import org.opensearch.index.engine.InternalEngine; +import org.opensearch.index.engine.NRTReplicationEngine; import org.opensearch.index.engine.NRTReplicationEngineFactory; import org.opensearch.index.mapper.MapperService; import org.opensearch.index.replication.OpenSearchIndexLevelReplicationTestCase; @@ -26,10 +28,12 @@ import java.io.IOException; import java.util.List; -import static org.mockito.ArgumentMatchers.any; -import static org.mockito.Mockito.verify; +import static java.util.Arrays.asList; +import static org.hamcrest.Matchers.equalTo; +import static 
org.mockito.Mockito.any; import static org.mockito.Mockito.mock; import static org.mockito.Mockito.times; +import static org.mockito.Mockito.verify; public class SegmentReplicationIndexShardTests extends OpenSearchIndexLevelReplicationTestCase { @@ -177,4 +181,76 @@ public void testPublishCheckpointAfterRelocationHandOff() throws IOException { closeShards(shard); } + public void testNRTReplicaPromotedAsPrimary() throws Exception { + try (ReplicationGroup shards = createGroup(2, settings, new NRTReplicationEngineFactory())) { + shards.startAll(); + IndexShard oldPrimary = shards.getPrimary(); + final IndexShard nextPrimary = shards.getReplicas().get(0); + final IndexShard replica = shards.getReplicas().get(1); + + // 1. Create ops that are in the index and xlog of both shards but not yet part of a commit point. + final int numDocs = shards.indexDocs(randomInt(10)); + + // refresh and copy the segments over. + oldPrimary.refresh("Test"); + replicateSegments(oldPrimary, shards.getReplicas()); + + // at this point both shards should have numDocs persisted and searchable. + assertDocCounts(oldPrimary, numDocs, numDocs); + for (IndexShard shard : shards.getReplicas()) { + assertDocCounts(shard, numDocs, numDocs); + } + + // 2. Create ops that are in the replica's xlog, not in the index. + // index some more into both but don't replicate. replica will have only numDocs searchable, but should have totalDocs + // persisted. + final int totalDocs = numDocs + shards.indexDocs(randomInt(10)); + + assertDocCounts(oldPrimary, totalDocs, totalDocs); + for (IndexShard shard : shards.getReplicas()) { + assertDocCounts(shard, totalDocs, numDocs); + } + + // promote the replica + shards.syncGlobalCheckpoint(); + assertEquals(totalDocs, nextPrimary.translogStats().estimatedNumberOfOperations()); + shards.promoteReplicaToPrimary(nextPrimary); + + // close and start the oldPrimary as a replica. + oldPrimary.close("demoted", false); + oldPrimary.store().close(); + oldPrimary = shards.addReplicaWithExistingPath(oldPrimary.shardPath(), oldPrimary.routingEntry().currentNodeId()); + shards.recoverReplica(oldPrimary); + + assertEquals(NRTReplicationEngine.class, oldPrimary.getEngine().getClass()); + assertEquals(InternalEngine.class, nextPrimary.getEngine().getClass()); + assertDocCounts(nextPrimary, totalDocs, totalDocs); + assertEquals(0, nextPrimary.translogStats().estimatedNumberOfOperations()); + + // refresh and push segments to our other replica. + nextPrimary.refresh("test"); + replicateSegments(nextPrimary, asList(replica)); + + for (IndexShard shard : shards) { + assertConsistentHistoryBetweenTranslogAndLucene(shard); + } + final List docsAfterRecovery = getDocIdAndSeqNos(shards.getPrimary()); + for (IndexShard shard : shards.getReplicas()) { + assertThat(shard.routingEntry().toString(), getDocIdAndSeqNos(shard), equalTo(docsAfterRecovery)); + } + } + } + + /** + * Assert persisted and searchable doc counts. This method should not be used while docs are concurrently indexed because + * it asserts point in time seqNos are relative to the doc counts. + */ + private void assertDocCounts(IndexShard indexShard, int expectedPersistedDocCount, int expectedSearchableDocCount) throws IOException { + assertDocCount(indexShard, expectedSearchableDocCount); + // assigned seqNos start at 0, so assert max & local seqNos are 1 less than our persisted doc count. 
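+        // e.g. 10 persisted docs are assigned seqNos 0-9, so max seqNo and local checkpoint are both 9.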
+ assertEquals(expectedPersistedDocCount - 1, indexShard.seqNoStats().getMaxSeqNo()); + assertEquals(expectedPersistedDocCount - 1, indexShard.seqNoStats().getLocalCheckpoint()); + // processed cp should be 1 less than our searchable doc count. + assertEquals(expectedSearchableDocCount - 1, indexShard.getProcessedLocalCheckpoint()); + } } From a2ba3a8c6662b0bac5fc3d73c5029fe323f1192b Mon Sep 17 00:00:00 2001 From: Kartik Ganesh Date: Wed, 17 Aug 2022 14:27:07 -0700 Subject: [PATCH 5/7] Added timing data and more granular stages to SegmentReplicationState (#4222) * Added timing data and more granular stages to SegmentReplicationState This change introduces instrumentation logging that measures the latency of the various stages of segment replication as seen by each replica. Logs have also been added to the source node for checkpoint publishing and checkpoint metadata responses. All logging is currently at the TRACE level. Signed-off-by: Kartik Ganesh * Fixing SegmentReplicationTarget tests Signed-off-by: Kartik Ganesh * Incorporated PR feedback Signed-off-by: Kartik Ganesh * Fixing SegmentReplicationTargetService tests Signed-off-by: Kartik Ganesh Signed-off-by: Kartik Ganesh --- .../SegmentReplicationSourceHandler.java | 18 ++++- .../SegmentReplicationSourceService.java | 14 ++++ .../replication/SegmentReplicationState.java | 71 +++++++++++++++---- .../replication/SegmentReplicationTarget.java | 19 +++-- .../SegmentReplicationTargetService.java | 22 +++++- .../checkpoint/PublishCheckpointAction.java | 24 ++++++- .../SegmentReplicationTargetServiceTests.java | 21 ++++-- .../SegmentReplicationTargetTests.java | 20 ++++-- 8 files changed, 175 insertions(+), 34 deletions(-) diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceHandler.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceHandler.java index ce764900e433f..2d21653c1924c 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceHandler.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceHandler.java @@ -27,6 +27,7 @@ import org.opensearch.indices.recovery.FileChunkWriter; import org.opensearch.indices.recovery.MultiChunkTransfer; import org.opensearch.indices.replication.common.CopyState; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.Transports; @@ -104,16 +105,24 @@ class SegmentReplicationSourceHandler { * @param listener {@link ActionListener} that completes with the list of files sent. 
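     * @throws OpenSearchException if a replication for this shard is already running (see the isReplicating guard below).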
*/ public synchronized void sendFiles(GetSegmentFilesRequest request, ActionListener listener) { + final ReplicationTimer timer = new ReplicationTimer(); if (isReplicating.compareAndSet(false, true) == false) { throw new OpenSearchException("Replication to {} is already running.", shard.shardId()); } future.addListener(listener, OpenSearchExecutors.newDirectExecutorService()); final Closeable releaseResources = () -> IOUtils.close(resources); try { - + timer.start(); final Consumer onFailure = e -> { assert Transports.assertNotTransportThread(SegmentReplicationSourceHandler.this + "[onFailure]"); IOUtils.closeWhileHandlingException(releaseResources, () -> future.onFailure(e)); + timer.stop(); + logger.trace( + "[replication id {}] Source node failed to send files to target node [{}], timing: {}", + request.getReplicationId(), + request.getTargetNode().getId(), + timer.time() + ); }; RunUnderPrimaryPermit.run(() -> { @@ -151,6 +160,13 @@ public synchronized void sendFiles(GetSegmentFilesRequest request, ActionListene future.onResponse(new GetSegmentFilesResponse(List.of(storeFileMetadata))); } finally { IOUtils.close(resources); + timer.stop(); + logger.trace( + "[replication id {}] Source node completed sending files to target node [{}], timing: {}", + request.getReplicationId(), + request.getTargetNode().getId(), + timer.time() + ); } }, onFailure); } catch (Exception e) { diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceService.java index d428459884f97..0cee731fde2cb 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationSourceService.java @@ -10,6 +10,7 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import org.apache.logging.log4j.message.ParameterizedMessage; import org.opensearch.action.support.ChannelActionListener; import org.opensearch.cluster.ClusterChangedEvent; import org.opensearch.cluster.ClusterStateListener; @@ -25,6 +26,7 @@ import org.opensearch.indices.recovery.RecoverySettings; import org.opensearch.indices.recovery.RetryableTransportClient; import org.opensearch.indices.replication.common.CopyState; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.tasks.Task; import org.opensearch.threadpool.ThreadPool; import org.opensearch.transport.TransportChannel; @@ -86,6 +88,8 @@ public SegmentReplicationSourceService( private class CheckpointInfoRequestHandler implements TransportRequestHandler { @Override public void messageReceived(CheckpointInfoRequest request, TransportChannel channel, Task task) throws Exception { + final ReplicationTimer timer = new ReplicationTimer(); + timer.start(); final RemoteSegmentFileChunkWriter segmentSegmentFileChunkWriter = new RemoteSegmentFileChunkWriter( request.getReplicationId(), recoverySettings, @@ -109,6 +113,16 @@ public void messageReceived(CheckpointInfoRequest request, TransportChannel chan copyState.getPendingDeleteFiles() ) ); + timer.stop(); + logger.trace( + new ParameterizedMessage( + "[replication id {}] Source node sent checkpoint info [{}] to target node [{}], timing: {}", + request.getReplicationId(), + copyState.getCheckpoint(), + request.getTargetNode().getId(), + timer.time() + ) + ); } } diff --git 
a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java index 838c06a4785ef..f865ba1332186 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationState.java @@ -8,10 +8,14 @@ package org.opensearch.indices.replication; +import org.opensearch.common.collect.Tuple; import org.opensearch.indices.replication.common.ReplicationLuceneIndex; import org.opensearch.indices.replication.common.ReplicationState; import org.opensearch.indices.replication.common.ReplicationTimer; +import java.util.ArrayList; +import java.util.List; + /** * ReplicationState implementation to track Segment Replication events. * @@ -26,10 +30,12 @@ public class SegmentReplicationState implements ReplicationState { */ public enum Stage { DONE((byte) 0), - INIT((byte) 1), - - REPLICATING((byte) 2); + REPLICATING((byte) 2), + GET_CHECKPOINT_INFO((byte) 3), + FILE_DIFF((byte) 4), + GET_FILES((byte) 5), + FINALIZE_REPLICATION((byte) 6); private static final Stage[] STAGES = new Stage[Stage.values().length]; @@ -60,13 +66,27 @@ public static Stage fromId(byte id) { private Stage stage; private final ReplicationLuceneIndex index; - private final ReplicationTimer timer; + private final ReplicationTimer overallTimer; + private final ReplicationTimer stageTimer; + private final List> timingData; + private long replicationId; public SegmentReplicationState(ReplicationLuceneIndex index) { stage = Stage.INIT; this.index = index; - timer = new ReplicationTimer(); - timer.start(); + // Timing data will have as many entries as stages, plus one + // additional entry for the overall timer + timingData = new ArrayList<>(Stage.values().length + 1); + overallTimer = new ReplicationTimer(); + stageTimer = new ReplicationTimer(); + stageTimer.start(); + // set an invalid value by default + this.replicationId = -1L; + } + + public SegmentReplicationState(ReplicationLuceneIndex index, long replicationId) { + this(index); + this.replicationId = replicationId; } @Override @@ -74,9 +94,17 @@ public ReplicationLuceneIndex getIndex() { return index; } + public long getReplicationId() { + return replicationId; + } + @Override public ReplicationTimer getTimer() { - return timer; + return overallTimer; + } + + public List> getTimingData() { + return timingData; } public Stage getStage() { @@ -90,6 +118,12 @@ protected void validateAndSetStage(Stage expected, Stage next) { "can't move replication to stage [" + next + "]. 
current stage: [" + stage + "] (expected [" + expected + "])" ); } + // save the timing data for the current step + stageTimer.stop(); + timingData.add(new Tuple<>(stage.name(), stageTimer.time())); + // restart the step timer + stageTimer.reset(); + stageTimer.start(); stage = next; } @@ -97,16 +131,29 @@ public void setStage(Stage stage) { switch (stage) { case INIT: this.stage = Stage.INIT; - getIndex().reset(); break; case REPLICATING: validateAndSetStage(Stage.INIT, stage); - getIndex().start(); + // only start the overall timer once we've started replication + overallTimer.start(); break; - case DONE: + case GET_CHECKPOINT_INFO: validateAndSetStage(Stage.REPLICATING, stage); - getIndex().stop(); - getTimer().stop(); + break; + case FILE_DIFF: + validateAndSetStage(Stage.GET_CHECKPOINT_INFO, stage); + break; + case GET_FILES: + validateAndSetStage(Stage.FILE_DIFF, stage); + break; + case FINALIZE_REPLICATION: + validateAndSetStage(Stage.GET_FILES, stage); + break; + case DONE: + validateAndSetStage(Stage.FINALIZE_REPLICATION, stage); + // add the overall timing data + overallTimer.stop(); + timingData.add(new Tuple<>("OVERALL", overallTimer.time())); break; default: throw new IllegalArgumentException("unknown SegmentReplicationState.Stage [" + stage + "]"); diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java index 73d9a2f805d75..a658ffc09d590 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTarget.java @@ -64,7 +64,7 @@ public SegmentReplicationTarget( super("replication_target", indexShard, new ReplicationLuceneIndex(), listener); this.checkpoint = checkpoint; this.source = source; - this.state = new SegmentReplicationState(stateIndex); + this.state = new SegmentReplicationState(stateIndex, getId()); this.multiFileWriter = new MultiFileWriter(indexShard.store(), stateIndex, getPrefix(), logger, this::ensureRefCount); } @@ -139,7 +139,9 @@ public void startReplication(ActionListener listener) { final StepListener getFilesListener = new StepListener<>(); final StepListener finalizeListener = new StepListener<>(); + logger.trace("[shardId {}] Replica starting replication [id {}]", shardId().getId(), getId()); // Get list of files to copy from this checkpoint. + state.setStage(SegmentReplicationState.Stage.GET_CHECKPOINT_INFO); source.getCheckpointMetadata(getId(), checkpoint, checkpointInfoListener); checkpointInfoListener.whenComplete(checkpointInfo -> getFiles(checkpointInfo, getFilesListener), listener::onFailure); @@ -152,14 +154,16 @@ public void startReplication(ActionListener listener) { private void getFiles(CheckpointInfoResponse checkpointInfo, StepListener getFilesListener) throws IOException { + state.setStage(SegmentReplicationState.Stage.FILE_DIFF); final Store.MetadataSnapshot snapshot = checkpointInfo.getSnapshot(); Store.MetadataSnapshot localMetadata = getMetadataSnapshot(); final Store.RecoveryDiff diff = snapshot.segmentReplicationDiff(localMetadata); - logger.debug("Replication diff {}", diff); - // Segments are immutable. 
So if the replica has any segments with the same name that differ from the one in the incoming snapshot - // from - // source that means the local copy of the segment has been corrupted/changed in some way and we throw an IllegalStateException to - // fail the shard + logger.trace("Replication diff {}", diff); + /* + * Segments are immutable. So if the replica has any segments with the same name that differ from the one in the incoming + * snapshot from source that means the local copy of the segment has been corrupted/changed in some way and we throw an + * IllegalStateException to fail the shard + */ if (diff.different.isEmpty() == false) { getFilesListener.onFailure( new IllegalStateException( @@ -177,15 +181,18 @@ private void getFiles(CheckpointInfoResponse checkpointInfo, StepListener listener) { + state.setStage(SegmentReplicationState.Stage.FINALIZE_REPLICATION); ActionListener.completeWith(listener, () -> { multiFileWriter.renameAllTempFiles(); final Store store = store(); diff --git a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java index f699f0edba842..a79ce195ad83b 100644 --- a/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java +++ b/server/src/main/java/org/opensearch/indices/replication/SegmentReplicationTargetService.java @@ -116,7 +116,7 @@ public void beforeIndexShardClosed(ShardId shardId, @Nullable IndexShard indexSh * @param replicaShard replica shard on which checkpoint is received */ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedCheckpoint, final IndexShard replicaShard) { - + logger.trace(() -> new ParameterizedMessage("Replica received new replication checkpoint from primary [{}]", receivedCheckpoint)); // Checks if received checkpoint is already present and ahead then it replaces old received checkpoint if (latestReceivedCheckpoint.get(replicaShard.shardId()) != null) { if (receivedCheckpoint.isAheadOf(latestReceivedCheckpoint.get(replicaShard.shardId()))) { @@ -139,6 +139,14 @@ public synchronized void onNewCheckpoint(final ReplicationCheckpoint receivedChe startReplication(receivedCheckpoint, replicaShard, new SegmentReplicationListener() { @Override public void onReplicationDone(SegmentReplicationState state) { + logger.trace( + () -> new ParameterizedMessage( + "[shardId {}] [replication id {}] Replication complete, timing data: {}", + replicaShard.shardId().getId(), + state.getReplicationId(), + state.getTimingData() + ) + ); // if we received a checkpoint during the copy event that is ahead of this // try and process it. 
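                    // this re-check ensures a checkpoint that arrived while this round was copying files is not silently dropped.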
if (latestReceivedCheckpoint.get(replicaShard.shardId()).isAheadOf(replicaShard.getLatestReplicationCheckpoint())) { @@ -154,6 +162,14 @@ public void onReplicationDone(SegmentReplicationState state) { @Override public void onReplicationFailure(SegmentReplicationState state, OpenSearchException e, boolean sendShardFailure) { + logger.trace( + () -> new ParameterizedMessage( + "[shardId {}] [replication id {}] Replication failed, timing data: {}", + replicaShard.shardId().getId(), + state.getReplicationId(), + state.getTimingData() + ) + ); if (sendShardFailure == true) { logger.error("replication failure", e); replicaShard.failShard("replication failure", e); @@ -172,9 +188,9 @@ public void startReplication( startReplication(new SegmentReplicationTarget(checkpoint, indexShard, sourceFactory.get(indexShard), listener)); } - public void startReplication(final SegmentReplicationTarget target) { + // pkg-private for integration tests + void startReplication(final SegmentReplicationTarget target) { final long replicationId = onGoingReplications.start(target, recoverySettings.activityTimeout()); - logger.trace(() -> new ParameterizedMessage("Starting replication {}", replicationId)); threadPool.generic().execute(new ReplicationRunner(replicationId)); } diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java index 8093b6aee88f9..cc51082639cdb 100644 --- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java +++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java @@ -29,6 +29,7 @@ import org.opensearch.index.shard.IndexShardClosedException; import org.opensearch.indices.IndicesService; import org.opensearch.indices.replication.SegmentReplicationTargetService; +import org.opensearch.indices.replication.common.ReplicationTimer; import org.opensearch.node.NodeClosedException; import org.opensearch.tasks.Task; import org.opensearch.threadpool.ThreadPool; @@ -103,7 +104,10 @@ final void publish(IndexShard indexShard) { // we have to execute under the system context so that if security is enabled the sync is authorized threadContext.markAsSystemContext(); PublishCheckpointRequest request = new PublishCheckpointRequest(indexShard.getLatestReplicationCheckpoint()); + final ReplicationCheckpoint checkpoint = request.getCheckpoint(); final ReplicationTask task = (ReplicationTask) taskManager.register("transport", "segrep_publish_checkpoint", request); + final ReplicationTimer timer = new ReplicationTimer(); + timer.start(); transportService.sendChildRequest( clusterService.localNode(), transportPrimaryAction, @@ -123,12 +127,23 @@ public String executor() { @Override public void handleResponse(ReplicationResponse response) { + timer.stop(); + logger.trace( + () -> new ParameterizedMessage( + "[shardId {}] Completed publishing checkpoint [{}], timing: {}", + indexShard.shardId().getId(), + checkpoint, + timer.time() + ) + ); task.setPhase("finished"); taskManager.unregister(task); } @Override public void handleException(TransportException e) { + timer.stop(); + logger.trace("[shardId {}] Failed to publish checkpoint, timing: {}", indexShard.shardId().getId(), timer.time()); task.setPhase("finished"); taskManager.unregister(task); if (ExceptionsHelper.unwrap(e, NodeClosedException.class) != null) { @@ -151,6 +166,13 @@ public void handleException(TransportException e) { } } 
diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java
index 8093b6aee88f9..cc51082639cdb 100644
--- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java
+++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/PublishCheckpointAction.java
@@ -29,6 +29,7 @@
 import org.opensearch.index.shard.IndexShardClosedException;
 import org.opensearch.indices.IndicesService;
 import org.opensearch.indices.replication.SegmentReplicationTargetService;
+import org.opensearch.indices.replication.common.ReplicationTimer;
 import org.opensearch.node.NodeClosedException;
 import org.opensearch.tasks.Task;
 import org.opensearch.threadpool.ThreadPool;
@@ -103,7 +104,10 @@ final void publish(IndexShard indexShard) {
            // we have to execute under the system context so that if security is enabled the sync is authorized
            threadContext.markAsSystemContext();
            PublishCheckpointRequest request = new PublishCheckpointRequest(indexShard.getLatestReplicationCheckpoint());
+            final ReplicationCheckpoint checkpoint = request.getCheckpoint();
            final ReplicationTask task = (ReplicationTask) taskManager.register("transport", "segrep_publish_checkpoint", request);
+            final ReplicationTimer timer = new ReplicationTimer();
+            timer.start();
            transportService.sendChildRequest(
                clusterService.localNode(),
                transportPrimaryAction,
@@ -123,12 +127,23 @@ public String executor() {

                    @Override
                    public void handleResponse(ReplicationResponse response) {
+                        timer.stop();
+                        logger.trace(
+                            () -> new ParameterizedMessage(
+                                "[shardId {}] Completed publishing checkpoint [{}], timing: {}",
+                                indexShard.shardId().getId(),
+                                checkpoint,
+                                timer.time()
+                            )
+                        );
                        task.setPhase("finished");
                        taskManager.unregister(task);
                    }

                    @Override
                    public void handleException(TransportException e) {
+                        timer.stop();
+                        logger.trace("[shardId {}] Failed to publish checkpoint, timing: {}", indexShard.shardId().getId(), timer.time());
                        task.setPhase("finished");
                        taskManager.unregister(task);
                        if (ExceptionsHelper.unwrap(e, NodeClosedException.class) != null) {
@@ -151,6 +166,13 @@ public void handleException(TransportException e) {
                    }
                }
            );
+            logger.trace(
+                () -> new ParameterizedMessage(
+                    "[shardId {}] Publishing replication checkpoint [{}]",
+                    checkpoint.getShardId().getId(),
+                    checkpoint
+                )
+            );
        }
    }

@@ -168,7 +190,7 @@ protected void shardOperationOnReplica(PublishCheckpointRequest request, IndexSh
        Objects.requireNonNull(request);
        Objects.requireNonNull(replica);
        ActionListener.completeWith(listener, () -> {
-            logger.trace("Checkpoint received on replica {}", request);
+            logger.trace(() -> new ParameterizedMessage("Checkpoint {} received on replica {}", request, replica.shardId()));
            if (request.getCheckpoint().getShardId().equals(replica.shardId())) {
                replicationService.onNewCheckpoint(request.getCheckpoint(), replica);
            }
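The timer added above brackets the whole publish round-trip: started before sendChildRequest and stopped in both handleResponse and handleException. A standalone sketch of that bracketing pattern (Timer is a stand-in for ReplicationTimer; only the start/stop/time surface mirrors the real class, and the handler wiring is illustrative):

    // Minimal sketch: start before the async send, stop in both terminal callbacks.
    public final class TimedPublishDemo {
        static final class Timer {
            private long startNanos, elapsedNanos;
            void start() { startNanos = System.nanoTime(); }
            void stop()  { elapsedNanos = System.nanoTime() - startNanos; }
            long time()  { return elapsedNanos / 1_000_000; } // millis
        }

        interface Handler { void handleResponse(); void handleException(Exception e); }

        public static void main(String[] args) {
            Timer timer = new Timer();
            timer.start();
            Handler handler = new Handler() {
                @Override public void handleResponse() {
                    timer.stop();
                    System.out.println("Completed publishing checkpoint, timing: " + timer.time());
                }
                @Override public void handleException(Exception e) {
                    timer.stop();
                    System.out.println("Failed to publish checkpoint, timing: " + timer.time());
                }
            };
            handler.handleResponse(); // demo only: the transport layer would invoke one of these
        }
    }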
diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java
index 004fa7f614ef1..d3a6d1a97dacc 100644
--- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java
+++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetServiceTests.java
@@ -100,8 +100,8 @@ public void onReplicationFailure(SegmentReplicationState state, OpenSearchExcept
        );
        final SegmentReplicationTarget spy = Mockito.spy(target);
        doAnswer(invocation -> {
-            // setting stage to REPLICATING so transition in markAsDone succeeds on listener completion
-            target.state().setStage(SegmentReplicationState.Stage.REPLICATING);
+            // set up stage correctly so the transition in markAsDone succeeds on listener completion
+            moveTargetToFinalStage(target);
            final ActionListener<Void> listener = invocation.getArgument(0);
            listener.onResponse(null);
            return null;
@@ -123,7 +123,7 @@ public void onReplicationDone(SegmentReplicationState state) {

            @Override
            public void onReplicationFailure(SegmentReplicationState state, OpenSearchException e, boolean sendShardFailure) {
-                assertEquals(SegmentReplicationState.Stage.REPLICATING, state.getStage());
+                assertEquals(SegmentReplicationState.Stage.INIT, state.getStage());
                assertEquals(expectedError, e.getCause());
                assertTrue(sendShardFailure);
            }
@@ -131,8 +131,6 @@ public void onReplicationFailure(SegmentReplicationState state, OpenSearchExcept
        );
        final SegmentReplicationTarget spy = Mockito.spy(target);
        doAnswer(invocation -> {
-            // setting stage to REPLICATING so transition in markAsDone succeeds on listener completion
-            target.state().setStage(SegmentReplicationState.Stage.REPLICATING);
            final ActionListener<Void> listener = invocation.getArgument(0);
            listener.onFailure(expectedError);
            return null;
@@ -271,4 +269,17 @@ public void testBeforeIndexShardClosed_CancelsOngoingReplications() {
        sut.beforeIndexShardClosed(indexShard.shardId(), indexShard, Settings.EMPTY);
        verify(spy, times(1)).cancel(any());
    }
+
+    /**
+     * Move the {@link SegmentReplicationTarget} object through its {@link SegmentReplicationState.Stage} values in order
+     * until it reaches the last enum value, i.e. the final stage before the terminal DONE stage.
+     */
+    private void moveTargetToFinalStage(SegmentReplicationTarget target) {
+        SegmentReplicationState.Stage[] stageValues = SegmentReplicationState.Stage.values();
+        assertEquals(SegmentReplicationState.Stage.INIT, target.state().getStage());
+        // Skip the first two stage values (DONE and INIT) and iterate until the last value
+        for (int i = 2; i < stageValues.length; i++) {
+            target.state().setStage(stageValues[i]);
+        }
+    }
 }
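moveTargetToFinalStage steps the target through every stage rather than jumping straight to the end because each setStage call validates the transition. A self-contained sketch of the same test idiom; the DONE-first enum ordering matches the loop's "skip the first two values" comment, while the stage names past REPLICATING and the single-step validation rule are assumptions about the real class:

    // Simplified stage enum: the terminal DONE value comes first, so values()[2]
    // is the first working stage.
    enum Stage {
        DONE, INIT, REPLICATING, GET_CHECKPOINT_INFO, FILE_DIFF, GET_FILES, FINALIZE_REPLICATION
    }

    final class StageWalker {
        private Stage current = Stage.INIT;

        /** Illustrative rule: only forward single-step transitions (or the final wrap to DONE) are legal. */
        void setStage(Stage next) {
            boolean singleStep = next.ordinal() == current.ordinal() + 1;
            boolean finish = current == Stage.FINALIZE_REPLICATION && next == Stage.DONE;
            if (!singleStep && !finish) {
                throw new IllegalStateException("can't move from " + current + " to " + next);
            }
            current = next;
        }

        /** Mirrors moveTargetToFinalStage: skip DONE and INIT, then step through the rest in order. */
        void walkToFinalStage() {
            Stage[] values = Stage.values();
            for (int i = 2; i < values.length; i++) {
                setStage(values[i]);
            }
        }

        public static void main(String[] args) {
            StageWalker w = new StageWalker();
            w.walkToFinalStage();
            w.setStage(Stage.DONE); // now legal: a markAsDone-style transition succeeds
            System.out.println("finished");
        }
    }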
diff --git a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java
index 1157c463785ac..11217a46b3c69 100644
--- a/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java
+++ b/server/src/test/java/org/opensearch/indices/replication/SegmentReplicationTargetTests.java
@@ -28,6 +28,7 @@
 import org.junit.Assert;
 import org.mockito.Mockito;
 import org.opensearch.ExceptionsHelper;
+import org.opensearch.OpenSearchException;
 import org.opensearch.action.ActionListener;
 import org.opensearch.cluster.metadata.IndexMetadata;
 import org.opensearch.common.settings.Settings;
@@ -96,7 +97,7 @@ public class SegmentReplicationTargetTests extends IndexShardTestCase {
        Settings.builder().put(IndexMetadata.SETTING_VERSION_CREATED, org.opensearch.Version.CURRENT).build()
    );

-    SegmentInfos testSegmentInfos;
+    private SegmentInfos testSegmentInfos;

    @Override
    public void setUp() throws Exception {
@@ -162,6 +163,7 @@ public void getSegmentFiles(
            public void onResponse(Void replicationResponse) {
                try {
                    verify(spyIndexShard, times(1)).finalizeReplication(any(), anyLong());
+                    segrepTarget.markAsDone();
                } catch (IOException ex) {
                    Assert.fail();
                }
@@ -169,7 +171,7 @@ public void onResponse(Void replicationResponse) {

            @Override
            public void onFailure(Exception e) {
-                logger.error("Unexpected test error", e);
+                logger.error("Unexpected onFailure", e);
                Assert.fail();
            }
        });
@@ -213,6 +215,7 @@ public void onResponse(Void replicationResponse) {
            @Override
            public void onFailure(Exception e) {
                assertEquals(exception, e.getCause().getCause());
+                segrepTarget.fail(new OpenSearchException(e), false);
            }
        });
    }
@@ -255,6 +258,7 @@ public void onResponse(Void replicationResponse) {
            @Override
            public void onFailure(Exception e) {
                assertEquals(exception, e.getCause().getCause());
+                segrepTarget.fail(new OpenSearchException(e), false);
            }
        });
    }
@@ -299,6 +303,7 @@ public void onResponse(Void replicationResponse) {
            @Override
            public void onFailure(Exception e) {
                assertEquals(exception, e.getCause());
+                segrepTarget.fail(new OpenSearchException(e), false);
            }
        });
    }
@@ -343,6 +348,7 @@ public void onResponse(Void replicationResponse) {
            @Override
            public void onFailure(Exception e) {
                assertEquals(exception, e.getCause());
+                segrepTarget.fail(new OpenSearchException(e), false);
            }
        });
    }
@@ -384,6 +390,7 @@ public void onResponse(Void replicationResponse) {
            @Override
            public void onFailure(Exception e) {
                assert (e instanceof IllegalStateException);
+                segrepTarget.fail(new OpenSearchException(e), false);
            }
        });
    }
@@ -432,11 +439,13 @@ public void getSegmentFiles(
            @Override
            public void onResponse(Void replicationResponse) {
                logger.info("No error processing checkpoint info");
+                segrepTarget.markAsDone();
            }

            @Override
            public void onFailure(Exception e) {
-                assert (e instanceof IllegalStateException);
+                logger.error("Unexpected onFailure", e);
+                Assert.fail();
            }
        });
    }
@@ -448,7 +457,7 @@ public void onFailure(Exception e) {
     * @return two Store.MetadataSnapshot instances, the second taken after deletes
     * @throws IOException
     */
-    List<Store.MetadataSnapshot> generateStoreMetadataSnapshot(int docCount) throws IOException {
+    private List<Store.MetadataSnapshot> generateStoreMetadataSnapshot(int docCount) throws IOException {
        List<Document> docList = new ArrayList<>();
        for (int i = 0; i < docCount; i++) {
            Document document = new Document();
@@ -480,7 +489,7 @@ List<Store.MetadataSnapshot> generateStoreMetadataSnapshot(int docCount) throws
        return Arrays.asList(storeMetadata, storeMetadataWithDeletes);
    }

-    public static void deleteContent(Directory directory) throws IOException {
+    private static void deleteContent(Directory directory) throws IOException {
        final String[] files = directory.listAll();
        final List<IOException> exceptions = new ArrayList<>();
        for (String file : files) {
@@ -498,7 +507,6 @@ public static void deleteContent(Directory directory) throws IOException {
    @Override
    public void tearDown() throws Exception {
        super.tearDown();
-        segrepTarget.markAsDone();
        closeShards(spyIndexShard, indexShard);
    }
 }

From 36f1d77ad7983d9a58bfd755423ce806941fd930 Mon Sep 17 00:00:00 2001
From: Owais Kazi
Date: Wed, 17 Aug 2022 17:14:09 -0700
Subject: [PATCH 6/7] Handles the status code for `.` properties (#4246)

* Return 400 status code for array out of bound
Signed-off-by: Owais Kazi
* Spotless apply
Signed-off-by: Owais Kazi
* PR comments
Signed-off-by: Owais Kazi
Signed-off-by: Owais Kazi
---
 .../opensearch/index/mapper/ObjectMapper.java |  4 ++++
 .../index/mapper/ObjectMapperTests.java       | 20 +++++++++++++++++++
 2 files changed, 24 insertions(+)

diff --git a/server/src/main/java/org/opensearch/index/mapper/ObjectMapper.java b/server/src/main/java/org/opensearch/index/mapper/ObjectMapper.java
index fcbca6049ec0b..1702c7700cf60 100644
--- a/server/src/main/java/org/opensearch/index/mapper/ObjectMapper.java
+++ b/server/src/main/java/org/opensearch/index/mapper/ObjectMapper.java
@@ -380,6 +380,10 @@ protected static void parseProperties(ObjectMapper.Builder objBuilder, Map<Stri

                String[] fieldNameParts = fieldName.split("\\.");
+                // field name is just ".", which is invalid
+                if (fieldNameParts.length < 1) {
+                    throw new MapperParsingException("Invalid field name " + fieldName);
+                }
                String realFieldName = fieldNameParts[fieldNameParts.length - 1];
                Mapper.Builder<?> fieldBuilder = typeParser.parse(realFieldName, propNode, parserContext);
                for (int i = fieldNameParts.length - 2; i >= 0; --i) {
diff --git a/server/src/test/java/org/opensearch/index/mapper/ObjectMapperTests.java b/server/src/test/java/org/opensearch/index/mapper/ObjectMapperTests.java
index 079475d9f3554..d6c89342c9df2 100644
--- a/server/src/test/java/org/opensearch/index/mapper/ObjectMapperTests.java
+++ b/server/src/test/java/org/opensearch/index/mapper/ObjectMapperTests.java
@@ -178,6 +178,26 @@ public void testFieldsWithFilledArrayShouldThrowException() throws Exception {
        }
    }

+    public void testDotAsFieldName() throws Exception {
+        String mapping = Strings.toString(
+            XContentFactory.jsonBuilder()
+                .startObject()
+                .startObject("properties")
+                .startObject(".")
+                .field("type", "text")
+                .endObject()
+                .endObject()
+                .endObject()
+        );
+
+        try {
+            createIndex("test").mapperService().documentMapperParser().parse("tweet", new CompressedXContent(mapping));
+            fail("Expected MapperParsingException");
+        } catch (MapperParsingException e) {
+            assertThat(e.getMessage(), containsString("Invalid field name"));
+        }
+    }
+
    public void testFieldPropertiesArray() throws Exception {
        String mapping = Strings.toString(
            XContentFactory.jsonBuilder()
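The length check above works because String.split drops trailing empty strings: a field name made only of dots yields an empty array, which is exactly the case the new MapperParsingException covers. A quick demonstration (class name is mine):

    public final class DotFieldNameDemo {
        public static void main(String[] args) {
            // split removes trailing empty strings, so "." and ".." produce no parts at all
            System.out.println(".".split("\\.").length);   // 0 -> caught by fieldNameParts.length < 1
            System.out.println("..".split("\\.").length);  // 0
            System.out.println("a.b".split("\\.").length); // 2 -> "a", "b"
            System.out.println(".b".split("\\.").length);  // 2 -> "", "b" (a leading empty part survives)
        }
    }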
From d308a2925818ad89e0a85161bc6ee43c057eb6d8 Mon Sep 17 00:00:00 2001
From: Suraj Singh
Date: Wed, 17 Aug 2022 17:25:19 -0700
Subject: [PATCH 7/7] [Segment Replication] Update PrimaryShardAllocator to prefer replicas with higher replication checkpoint (#4041)

* [Segment Replication] Update PrimaryShardAllocator to prefer replicas having higher replication checkpoint
Signed-off-by: Suraj Singh
* Use empty replication checkpoint to avoid NPE
Signed-off-by: Suraj Singh
* Update NodeGatewayStartedShards to optionally wire in/out ReplicationCheckpoint field
Signed-off-by: Suraj Singh
* Use default replication checkpoint causing EOF errors on empty checkpoint
* Add indexSettings to GatewayAllocator to allow ReplicationCheckpoint comparator only for segrep enabled indices
* Add unit tests for primary term first replica promotion & comparator fix
* Fix NPE on empty IndexMetadata
* Remove settings from AllocationService and directly inject in GatewayAllocator
* Add more unit tests and minor code clean up
Signed-off-by: Suraj Singh
* Address review comments & integration test
Signed-off-by: Suraj Singh
* Fix comparator on null ReplicationCheckpoint
Signed-off-by: Suraj Singh
Signed-off-by: Suraj Singh
---
 .../gateway/PrimaryShardAllocator.java        |  17 +-
 ...ransportNodesListGatewayStartedShards.java |  58 ++++-
 .../checkpoint/ReplicationCheckpoint.java     |   7 +-
 .../gateway/PrimaryShardAllocatorTests.java   | 229 +++++++++++++++++-
 .../test/gateway/TestGatewayAllocator.java    |  17 +-
 5 files changed, 314 insertions(+), 14 deletions(-)

diff --git a/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java b/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java
index b9cfebaa98521..4dc9396751fc9 100644
--- a/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java
+++ b/server/src/main/java/org/opensearch/gateway/PrimaryShardAllocator.java
@@ -313,6 +313,11 @@ private static ShardStoreInfo shardStoreInfo(NodeGatewayStartedShards nodeShardS
        NodeGatewayStartedShards::primary
    ).reversed();

+    private static final Comparator<NodeGatewayStartedShards> HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR = Comparator.comparing(
+        NodeGatewayStartedShards::replicationCheckpoint,
+        Comparator.nullsLast(Comparator.naturalOrder())
+    );
+
    /**
     * Builds a list of nodes. If matchAnyShard is set to false, only nodes that have an allocation id matching
     * inSyncAllocationIds are added to the list. Otherwise, any node that has a shard is added to the list, but
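Comparator.nullsLast wraps the checkpoint's natural ordering so copies that reported no checkpoint always rank after copies that did — which is also why document-replication indices, whose shard states carry no checkpoint, pass through this comparator unaffected. A runnable sketch of that behavior with a simplified stand-in record (the names and the Long key are illustrative):

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public final class NullsLastDemo {
        // Stand-in for NodeGatewayStartedShards: a node name plus a comparable checkpoint.
        record ShardState(String node, Long checkpoint) {}

        public static void main(String[] args) {
            List<ShardState> states = new ArrayList<>(List.of(
                new ShardState("node1", 101L),
                new ShardState("node2", null),  // e.g. shard not started locally, or a doc-rep index
                new ShardState("node3", 120L)
            ));
            // Highest checkpoint first; null checkpoints always last.
            states.sort(Comparator.comparing(ShardState::checkpoint, Comparator.nullsLast(Comparator.reverseOrder())));
            states.forEach(s -> System.out.println(s.node())); // node3, node1, node2
        }
    }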
@@ -381,6 +386,12 @@ protected static NodeShardsResult buildNodeShardsResult(
            }
        }

+        /**
+         * Orders the active shard copies based on the comparators below:
+         * 1. No store exception, i.e. the shard copy is readable
+         * 2. Prefer the previous primary shard
+         * 3. Prefer the shard copy with the highest replication checkpoint. This is a no-op for doc-rep enabled indices.
+         */
        final Comparator<NodeGatewayStartedShards> comparator; // allocation preference
        if (matchAnyShard) {
            // prefer shards with matching allocation ids
@@ -388,9 +399,11 @@ protected static NodeShardsResult buildNodeShardsResult(
                (NodeGatewayStartedShards state) -> inSyncAllocationIds.contains(state.allocationId())
            ).reversed();
            comparator = matchingAllocationsFirst.thenComparing(NO_STORE_EXCEPTION_FIRST_COMPARATOR)
-                .thenComparing(PRIMARY_FIRST_COMPARATOR);
+                .thenComparing(PRIMARY_FIRST_COMPARATOR)
+                .thenComparing(HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR);
        } else {
-            comparator = NO_STORE_EXCEPTION_FIRST_COMPARATOR.thenComparing(PRIMARY_FIRST_COMPARATOR);
+            comparator = NO_STORE_EXCEPTION_FIRST_COMPARATOR.thenComparing(PRIMARY_FIRST_COMPARATOR)
+                .thenComparing(HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR);
        }

        nodeShardStates.sort(comparator);
diff --git a/server/src/main/java/org/opensearch/gateway/TransportNodesListGatewayStartedShards.java b/server/src/main/java/org/opensearch/gateway/TransportNodesListGatewayStartedShards.java
index 78b4fa287ef59..953b4def9d653 100644
--- a/server/src/main/java/org/opensearch/gateway/TransportNodesListGatewayStartedShards.java
+++ b/server/src/main/java/org/opensearch/gateway/TransportNodesListGatewayStartedShards.java
@@ -35,6 +35,7 @@
 import org.apache.logging.log4j.message.ParameterizedMessage;
 import org.opensearch.LegacyESVersion;
 import org.opensearch.OpenSearchException;
+import org.opensearch.Version;
 import org.opensearch.action.ActionListener;
 import org.opensearch.action.ActionType;
 import org.opensearch.action.FailedNodeException;
@@ -56,11 +57,13 @@
 import org.opensearch.common.xcontent.NamedXContentRegistry;
 import org.opensearch.env.NodeEnvironment;
 import org.opensearch.index.IndexSettings;
+import org.opensearch.index.shard.IndexShard;
 import org.opensearch.index.shard.ShardId;
 import org.opensearch.index.shard.ShardPath;
 import org.opensearch.index.shard.ShardStateMetadata;
 import org.opensearch.index.store.Store;
 import org.opensearch.indices.IndicesService;
+import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
 import org.opensearch.threadpool.ThreadPool;
 import org.opensearch.transport.TransportService;

@@ -195,6 +198,7 @@ protected NodeGatewayStartedShards nodeOperation(NodeRequest request) {
                        clusterService.localNode(),
                        allocationId,
                        shardStateMetadata.primary,
+                        null,
                        exception
                    );
                }
@@ -202,10 +206,16 @@ protected NodeGatewayStartedShards nodeOperation(NodeRequest request) {

                logger.debug("{} shard state info found: [{}]", shardId, shardStateMetadata);
                String allocationId = shardStateMetadata.allocationId != null ? shardStateMetadata.allocationId.getId() : null;
-                return new NodeGatewayStartedShards(clusterService.localNode(), allocationId, shardStateMetadata.primary);
+                final IndexShard shard = indicesService.getShardOrNull(shardId);
+                return new NodeGatewayStartedShards(
+                    clusterService.localNode(),
+                    allocationId,
+                    shardStateMetadata.primary,
+                    shard != null ? shard.getLatestReplicationCheckpoint() : null
+                );
            }
            logger.trace("{} no local shard info found", shardId);
-            return new NodeGatewayStartedShards(clusterService.localNode(), null, false);
+            return new NodeGatewayStartedShards(clusterService.localNode(), null, false, null);
        } catch (Exception e) {
            throw new OpenSearchException("failed to load started shards", e);
        }
@@ -349,10 +359,10 @@ public String getCustomDataPath() {
     * @opensearch.internal
     */
    public static class NodeGatewayStartedShards extends BaseNodeResponse {
-
        private final String allocationId;
        private final boolean primary;
        private final Exception storeException;
+        private final ReplicationCheckpoint replicationCheckpoint;

        public NodeGatewayStartedShards(StreamInput in) throws IOException {
            super(in);
            allocationId = in.readOptionalString();
            primary = in.readBoolean();
            if (in.readBoolean()) {
                storeException = in.readException();
            } else {
                storeException = null;
            }
+            if (in.getVersion().onOrAfter(Version.V_3_0_0) && in.readBoolean()) {
+                replicationCheckpoint = new ReplicationCheckpoint(in);
+            } else {
+                replicationCheckpoint = null;
+            }
        }

-        public NodeGatewayStartedShards(DiscoveryNode node, String allocationId, boolean primary) {
-            this(node, allocationId, primary, null);
+        public NodeGatewayStartedShards(
+            DiscoveryNode node,
+            String allocationId,
+            boolean primary,
+            ReplicationCheckpoint replicationCheckpoint
+        ) {
+            this(node, allocationId, primary, replicationCheckpoint, null);
        }

-        public NodeGatewayStartedShards(DiscoveryNode node, String allocationId, boolean primary, Exception storeException) {
+        public NodeGatewayStartedShards(
+            DiscoveryNode node,
+            String allocationId,
+            boolean primary,
+            ReplicationCheckpoint replicationCheckpoint,
+            Exception storeException
+        ) {
            super(node);
            this.allocationId = allocationId;
            this.primary = primary;
+            this.replicationCheckpoint = replicationCheckpoint;
            this.storeException = storeException;
        }
@@ -384,6 +411,10 @@ public boolean primary() {
            return this.primary;
        }

+        public ReplicationCheckpoint replicationCheckpoint() {
+            return this.replicationCheckpoint;
+        }
+
        public Exception storeException() {
            return this.storeException;
        }
@@ -399,6 +430,14 @@ public void writeTo(StreamOutput out) throws IOException {
            } else {
                out.writeBoolean(false);
            }
+            if (out.getVersion().onOrAfter(Version.V_3_0_0)) {
+                if (replicationCheckpoint != null) {
+                    out.writeBoolean(true);
+                    replicationCheckpoint.writeTo(out);
+                } else {
+                    out.writeBoolean(false);
+                }
+            }
        }
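The wire format above uses two guards: a version gate so pre-3.0.0 peers never see the new bytes, and a presence flag so a null checkpoint costs one boolean. A self-contained sketch of the pattern with plain data streams standing in for StreamInput/StreamOutput (the int version constant and names are illustrative):

    import java.io.*;

    // Sketch: optional field behind a presence flag, gated on the peer's version,
    // with symmetric read and write paths.
    public final class OptionalFieldWire {
        static final int V_3_0_0 = 3_000_099; // illustrative numeric version id

        static void write(DataOutputStream out, int peerVersion, Long checkpoint) throws IOException {
            if (peerVersion >= V_3_0_0) {             // never write the field to an old peer
                out.writeBoolean(checkpoint != null); // presence flag
                if (checkpoint != null) {
                    out.writeLong(checkpoint);
                }
            }
        }

        static Long read(DataInputStream in, int peerVersion) throws IOException {
            if (peerVersion >= V_3_0_0 && in.readBoolean()) {
                return in.readLong();
            }
            return null; // absent on old peers, or not sent
        }

        public static void main(String[] args) throws IOException {
            ByteArrayOutputStream bytes = new ByteArrayOutputStream();
            write(new DataOutputStream(bytes), V_3_0_0, 42L);
            System.out.println(read(new DataInputStream(new ByteArrayInputStream(bytes.toByteArray())), V_3_0_0)); // 42
            // An old peer is simply skipped on both sides:
            System.out.println(read(new DataInputStream(new ByteArrayInputStream(new byte[0])), V_3_0_0 - 1)); // null
        }
    }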
@@ -414,7 +453,8 @@ public boolean equals(Object o) {

            return primary == that.primary
                && Objects.equals(allocationId, that.allocationId)
-                && Objects.equals(storeException, that.storeException);
+                && Objects.equals(storeException, that.storeException)
+                && Objects.equals(replicationCheckpoint, that.replicationCheckpoint);
        }

        @Override
@@ -422,6 +462,7 @@ public int hashCode() {
            int result = (allocationId != null ? allocationId.hashCode() : 0);
            result = 31 * result + (primary ? 1 : 0);
            result = 31 * result + (storeException != null ? storeException.hashCode() : 0);
+            result = 31 * result + (replicationCheckpoint != null ? replicationCheckpoint.hashCode() : 0);
            return result;
        }

@@ -432,6 +473,9 @@ public String toString() {
            if (storeException != null) {
                buf.append(",storeException=").append(storeException);
            }
+            if (replicationCheckpoint != null) {
+                buf.append(",ReplicationCheckpoint=").append(replicationCheckpoint.toString());
+            }
            buf.append("]");
            return buf.toString();
        }
diff --git a/server/src/main/java/org/opensearch/indices/replication/checkpoint/ReplicationCheckpoint.java b/server/src/main/java/org/opensearch/indices/replication/checkpoint/ReplicationCheckpoint.java
index 8afb5bd055636..6a4e5e449f178 100644
--- a/server/src/main/java/org/opensearch/indices/replication/checkpoint/ReplicationCheckpoint.java
+++ b/server/src/main/java/org/opensearch/indices/replication/checkpoint/ReplicationCheckpoint.java
@@ -23,7 +23,7 @@
 *
 * @opensearch.internal
 */
-public class ReplicationCheckpoint implements Writeable {
+public class ReplicationCheckpoint implements Writeable, Comparable<ReplicationCheckpoint> {

    private final ShardId shardId;
    private final long primaryTerm;
@@ -107,6 +107,11 @@ public void writeTo(StreamOutput out) throws IOException {
        out.writeLong(segmentInfosVersion);
    }

+    @Override
+    public int compareTo(ReplicationCheckpoint other) {
+        return this.isAheadOf(other) ? -1 : 1;
+    }
+
    @Override
    public boolean equals(Object o) {
        if (this == o) return true;
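This new natural ordering is what HIGHEST_REPLICATION_CHECKPOINT_FIRST_COMPARATOR sorts by: compareTo returns -1 when this checkpoint isAheadOf the other, so "ahead" checkpoints sort first. Note it never returns 0, so the ordering is deliberately not consistent with equals. A standalone sketch of the same scheme — the term-then-version isAheadOf rule matches what the allocator tests below exercise, but treat it as an assumption about the real implementation:

    import java.util.Arrays;

    // Simplified checkpoint with the patch's compareTo scheme.
    final class Cp implements Comparable<Cp> {
        final long primaryTerm;
        final long segmentInfosVersion;

        Cp(long primaryTerm, long segmentInfosVersion) {
            this.primaryTerm = primaryTerm;
            this.segmentInfosVersion = segmentInfosVersion;
        }

        // Assumed semantics: newer primary term wins; ties broken by segment infos version.
        boolean isAheadOf(Cp other) {
            return other == null
                || primaryTerm > other.primaryTerm
                || (primaryTerm == other.primaryTerm && segmentInfosVersion > other.segmentInfosVersion);
        }

        @Override
        public int compareTo(Cp other) {
            return isAheadOf(other) ? -1 : 1; // never 0: equal checkpoints still order as "after"
        }

        @Override
        public String toString() {
            return "term=" + primaryTerm + "/v=" + segmentInfosVersion;
        }

        public static void main(String[] args) {
            Cp[] cps = { new Cp(20, 1), new Cp(22, 2), new Cp(20, 2) };
            Arrays.sort(cps);
            System.out.println(Arrays.toString(cps)); // [term=22/v=2, term=20/v=2, term=20/v=1]
        }
    }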
diff --git a/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java b/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java
index 4a1ecb9661687..3c39ec9f03b2a 100644
--- a/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java
+++ b/server/src/test/java/org/opensearch/gateway/PrimaryShardAllocatorTests.java
@@ -62,6 +62,7 @@
 import org.opensearch.common.util.set.Sets;
 import org.opensearch.env.ShardLockObtainFailedException;
 import org.opensearch.index.shard.ShardId;
+import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
 import org.opensearch.repositories.IndexId;
 import org.opensearch.snapshots.Snapshot;
 import org.opensearch.snapshots.SnapshotId;
@@ -205,6 +206,203 @@ public void testShardLockObtainFailedException() {
        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
    }

+    /**
+     * Tests that the replica with the highest primary term is selected as the allocation target
+     */
+    public void testPreferReplicaWithHighestPrimaryTerm() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId2,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, false, new ReplicationCheckpoint(shardId, 20, 10, 101, 1));
+        testAllocator.addData(node2, allocId2, false, new ReplicationCheckpoint(shardId, 22, 10, 120, 2));
+        testAllocator.addData(node3, allocId3, false, new ReplicationCheckpoint(shardId, 20, 10, 120, 2));
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node2.getId())
+        );
+        // Assert node2's allocation id is used
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId2)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
+
+    /**
+     * Tests that a replica that reported no replication checkpoint is ranked last, and the replica
+     * with the highest replication checkpoint is selected as the allocation target
+     */
+    public void testPreferReplicaWithNullReplicationCheckpoint() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId2,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, false, new ReplicationCheckpoint(shardId, 20, 10, 101, 1));
+        testAllocator.addData(node2, allocId2, false);
+        testAllocator.addData(node3, allocId3, false, new ReplicationCheckpoint(shardId, 40, 10, 120, 2));
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node3.getId())
+        );
+        // Assert node3's allocation id should be used as it has the highest replication checkpoint
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId3)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
+
+    /**
+     * Tests that null ReplicationCheckpoints are ignored and the previous primary is selected
+     */
+    public void testPreferReplicaWithAllNullReplicationCheckpoint() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId2,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, false, null, null);
+        testAllocator.addData(node2, allocId2, false, null, null);
+        testAllocator.addData(node3, allocId3, true, null, null);
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node3.getId())
+        );
+        // Assert node3's allocation id should be used as it was the previous primary
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId3)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
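The five arguments in these ReplicationCheckpoint constructor calls are easy to misread. A hypothetical annotated stand-in decoding the fixture literals — shardId, primaryTerm, and segmentInfosVersion are names used elsewhere in this patch, while segmentsGen and seqNo for the middle positions are my inference, not confirmed by this diff:

    // Illustrative decode of the test fixtures above; CheckpointFixture stands in
    // for ReplicationCheckpoint, with inferred field names.
    public final class CheckpointFixtureDemo {
        record CheckpointFixture(String shard, long primaryTerm, long segmentsGen, long seqNo, long segmentInfosVersion) {}

        public static void main(String[] args) {
            // testPreferReplicaWithHighestPrimaryTerm's three copies:
            CheckpointFixture node1 = new CheckpointFixture("[test][0]", 20, 10, 101, 1);
            CheckpointFixture node2 = new CheckpointFixture("[test][0]", 22, 10, 120, 2); // highest term -> expected winner
            CheckpointFixture node3 = new CheckpointFixture("[test][0]", 20, 10, 120, 2);
            System.out.println(node2);
        }
    }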
+
+    /**
+     * Tests that the replica with the highest segment infos version is selected as the allocation
+     * target when primary terms are equal
+     */
+    public void testPreferReplicaWithHighestSegmentInfoVersion() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId2,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, false, new ReplicationCheckpoint(shardId, 10, 10, 101, 1));
+        testAllocator.addData(node2, allocId2, false, new ReplicationCheckpoint(shardId, 20, 10, 120, 3));
+        testAllocator.addData(node3, allocId3, false, new ReplicationCheckpoint(shardId, 20, 10, 120, 2));
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node2.getId())
+        );
+        // Assert node2's allocation id is used
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId2)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
+
+    /**
+     * Tests that allocation prefers an in-sync shard copy at a lower replication checkpoint over an
+     * out-of-sync copy with a higher checkpoint
+     */
+    public void testOutOfSyncHighestRepCheckpointIsIgnored() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, false, new ReplicationCheckpoint(shardId, 10, 10, 101, 1));
+        testAllocator.addData(node2, allocId2, false, new ReplicationCheckpoint(shardId, 20, 10, 120, 2));
+        testAllocator.addData(node3, allocId3, false, new ReplicationCheckpoint(shardId, 15, 10, 120, 2));
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node3.getId())
+        );
+        // Assert node3's allocation id is used
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId3)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
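Taken together, these tests pin down a precedence: readable store first, then previous primary, then highest checkpoint — which is why the next test expects the old primary to win despite a lower checkpoint. A runnable sketch of that chain with boolean and long stand-ins (the types and sample data are illustrative):

    import java.util.ArrayList;
    import java.util.Comparator;
    import java.util.List;

    public final class AllocationPreferenceDemo {
        record Copy(String node, boolean storeReadable, boolean wasPrimary, long checkpoint) {}

        public static void main(String[] args) {
            // Mirrors the production chain: NO_STORE_EXCEPTION_FIRST, then PRIMARY_FIRST,
            // then HIGHEST_REPLICATION_CHECKPOINT_FIRST.
            Comparator<Copy> preference = Comparator
                .comparing(Copy::storeReadable, Comparator.reverseOrder())   // readable copies first
                .thenComparing(Copy::wasPrimary, Comparator.reverseOrder())  // previous primary next
                .thenComparing(Copy::checkpoint, Comparator.reverseOrder()); // highest checkpoint as tie-breaker

            List<Copy> copies = new ArrayList<>(List.of(
                new Copy("node1", true, true, 101),  // previous primary, lower checkpoint
                new Copy("node2", true, false, 120), // higher checkpoint, never primary
                new Copy("node3", true, false, 120)
            ));
            copies.sort(preference);
            // node1 wins despite the lower checkpoint, matching
            // testPreferAllocatingPreviousPrimaryWithLowerRepCheckpoint below.
            System.out.println(copies.get(0).node()); // node1
        }
    }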
+
+    /**
+     * Tests that allocation prefers the previous primary over a replica with a higher replication checkpoint
+     */
+    public void testPreferAllocatingPreviousPrimaryWithLowerRepCheckpoint() {
+        String allocId1 = randomAlphaOfLength(10);
+        String allocId2 = randomAlphaOfLength(10);
+        String allocId3 = randomAlphaOfLength(10);
+        final RoutingAllocation allocation = routingAllocationWithOnePrimaryNoReplicas(
+            yesAllocationDeciders(),
+            CLUSTER_RECOVERED,
+            allocId1,
+            allocId2,
+            allocId3
+        );
+        testAllocator.addData(node1, allocId1, true, new ReplicationCheckpoint(shardId, 10, 10, 101, 1));
+        testAllocator.addData(node2, allocId2, false, new ReplicationCheckpoint(shardId, 20, 10, 120, 2));
+        testAllocator.addData(node3, allocId3, false, new ReplicationCheckpoint(shardId, 15, 10, 120, 2));
+        allocateAllUnassigned(allocation);
+        assertThat(allocation.routingNodesChanged(), equalTo(true));
+        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
+        assertThat(allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).size(), equalTo(1));
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).currentNodeId(),
+            equalTo(node1.getId())
+        );
+        // Assert node1's allocation id is used: it was the previous primary, despite its lower replication checkpoint
+        assertThat(
+            allocation.routingNodes().shardsWithState(ShardRoutingState.INITIALIZING).get(0).allocationId().getId(),
+            equalTo(allocId1)
+        );
+        assertClusterHealthStatus(allocation, ClusterHealthStatus.YELLOW);
+    }
+
    /**
     * Tests that when one node returns a ShardLockObtainFailedException and another properly loads the store, it will
     * select the second node as target
@@ -219,7 +417,7 @@ public void testShardLockObtainFailedExceptionPreferOtherValidCopies() {
            allocId2
        );
        testAllocator.addData(node1, allocId1, randomBoolean(), new ShardLockObtainFailedException(shardId, "test"));
-        testAllocator.addData(node2, allocId2, randomBoolean(), null);
+        testAllocator.addData(node2, allocId2, randomBoolean());
        allocateAllUnassigned(allocation);
        assertThat(allocation.routingNodesChanged(), equalTo(true));
        assertThat(allocation.routingNodes().unassigned().ignored().isEmpty(), equalTo(true));
@@ -601,17 +799,42 @@ public TestAllocator clear() {
            return this;
        }

+        public TestAllocator addData(
+            DiscoveryNode node,
+            String allocationId,
+            boolean primary,
+            ReplicationCheckpoint replicationCheckpoint
+        ) {
+            return addData(node, allocationId, primary, replicationCheckpoint, null);
+        }
+
        public TestAllocator addData(DiscoveryNode node, String allocationId, boolean primary) {
-            return addData(node, allocationId, primary, null);
+            return addData(node, allocationId, primary, ReplicationCheckpoint.empty(shardId), null);
        }

        public TestAllocator addData(DiscoveryNode node, String allocationId, boolean primary, @Nullable Exception storeException) {
+            return addData(node, allocationId, primary, ReplicationCheckpoint.empty(shardId), storeException);
+        }
+
+        public TestAllocator addData(
+            DiscoveryNode node,
+            String allocationId,
+            boolean primary,
+            ReplicationCheckpoint replicationCheckpoint,
+            @Nullable Exception storeException
+        ) {
            if (data == null) {
                data = new HashMap<>();
            }
            data.put(
                node,
-                new TransportNodesListGatewayStartedShards.NodeGatewayStartedShards(node, allocationId, primary, storeException)
+                new TransportNodesListGatewayStartedShards.NodeGatewayStartedShards(
+                    node,
+                    allocationId,
+                    primary,
+                    replicationCheckpoint,
+                    storeException
+                )
            );
            return this;
        }
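Since Java has no default parameters, the test fixture grows telescoping addData overloads that all funnel into the widest signature, defaulting the checkpoint to ReplicationCheckpoint.empty(shardId) rather than null so comparator code paths see a real value. A minimal sketch of the idiom (Fixture and its members are illustrative):

    // Telescoping-overload idiom: every overload delegates to the widest one,
    // supplying defaults along the way.
    final class Fixture {
        static final long EMPTY_CHECKPOINT = 0L; // stand-in for ReplicationCheckpoint.empty(shardId)

        void add(String node) {
            add(node, EMPTY_CHECKPOINT, null);
        }

        void add(String node, long checkpoint) {
            add(node, checkpoint, null);
        }

        void add(String node, Exception storeException) {
            add(node, EMPTY_CHECKPOINT, storeException);
        }

        void add(String node, long checkpoint, Exception storeException) {
            // single sink: all state recorded here
            System.out.println(node + " cp=" + checkpoint + " ex=" + storeException);
        }

        public static void main(String[] args) {
            Fixture f = new Fixture();
            f.add("node1");                          // default checkpoint, no exception
            f.add("node2", 120L);                    // explicit checkpoint
            f.add("node3", new Exception("locked")); // store exception, default checkpoint
        }
    }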
diff --git a/test/framework/src/main/java/org/opensearch/test/gateway/TestGatewayAllocator.java b/test/framework/src/main/java/org/opensearch/test/gateway/TestGatewayAllocator.java
index 54c92f4d519aa..a36dc26685eb4 100644
--- a/test/framework/src/main/java/org/opensearch/test/gateway/TestGatewayAllocator.java
+++ b/test/framework/src/main/java/org/opensearch/test/gateway/TestGatewayAllocator.java
@@ -43,6 +43,7 @@
 import org.opensearch.gateway.ReplicaShardAllocator;
 import org.opensearch.gateway.TransportNodesListGatewayStartedShards.NodeGatewayStartedShards;
 import org.opensearch.index.shard.ShardId;
+import org.opensearch.indices.replication.checkpoint.ReplicationCheckpoint;
 import org.opensearch.indices.store.TransportNodesListShardStoreMetadata.NodeStoreFilesMetadata;

 import java.util.Collections;
@@ -71,6 +72,7 @@ public class TestGatewayAllocator extends GatewayAllocator {

    Map<String, Map<ShardId, ShardRouting>> knownAllocations = new HashMap<>();
    DiscoveryNodes currentNodes = DiscoveryNodes.EMPTY_NODES;
+    Map<String, ReplicationCheckpoint> shardIdNodeToReplicationCheckPointMap = new HashMap<>();

    PrimaryShardAllocator primaryShardAllocator = new PrimaryShardAllocator() {
        @Override
@@ -90,7 +92,8 @@ protected AsyncShardFetch.FetchResult<NodeGatewayStartedShards> fetchData(ShardR
                    routing -> new NodeGatewayStartedShards(
                        currentNodes.get(routing.currentNodeId()),
                        routing.allocationId().getId(),
-                        routing.primary()
+                        routing.primary(),
+                        getReplicationCheckpoint(shardId, routing.currentNodeId())
                    )
                )
            );
@@ -99,6 +102,10 @@ protected AsyncShardFetch.FetchResult<NodeGatewayStartedShards> fetchData(ShardR
        }
    };

+    private ReplicationCheckpoint getReplicationCheckpoint(ShardId shardId, String nodeName) {
+        return shardIdNodeToReplicationCheckPointMap.getOrDefault(getReplicationCheckPointKey(shardId, nodeName), null);
+    }
+
    ReplicaShardAllocator replicaShardAllocator = new ReplicaShardAllocator() {
        @Override
        protected AsyncShardFetch.FetchResult<NodeStoreFilesMetadata> fetchData(ShardRouting shard, RoutingAllocation allocation) {
@@ -156,4 +163,12 @@ public void allocateUnassigned(
    public void addKnownAllocation(ShardRouting shard) {
        knownAllocations.computeIfAbsent(shard.currentNodeId(), id -> new HashMap<>()).put(shard.shardId(), shard);
    }
+
+    public String getReplicationCheckPointKey(ShardId shardId, String nodeName) {
+        return shardId.toString() + "_" + nodeName;
+    }
+
+    public void addReplicationCheckpoint(ShardId shardId, String nodeName, ReplicationCheckpoint replicationCheckpoint) {
+        shardIdNodeToReplicationCheckPointMap.putIfAbsent(getReplicationCheckPointKey(shardId, nodeName), replicationCheckpoint);
+    }
 }
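For tests driving these allocators, checkpoints are registered under a composite shardId_node key, and putIfAbsent means the first registration for a key wins. A small usage sketch of the same map discipline (the shard-id literal and node names are made up):

    import java.util.HashMap;
    import java.util.Map;

    public final class CheckpointFixtureMapDemo {
        static final Map<String, Long> checkpoints = new HashMap<>();

        static String key(String shardId, String node) {
            return shardId + "_" + node; // mirrors getReplicationCheckPointKey
        }

        public static void main(String[] args) {
            checkpoints.putIfAbsent(key("[test][0]", "node1"), 101L);
            checkpoints.putIfAbsent(key("[test][0]", "node1"), 999L); // ignored: first write wins
            System.out.println(checkpoints.getOrDefault(key("[test][0]", "node1"), null)); // 101
            System.out.println(checkpoints.getOrDefault(key("[test][0]", "node2"), null)); // null
        }
    }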