Adding subshard work items on lease expiry #1160

Merged: 21 commits, Dec 5, 2024

Commits
55a83ca
Checkpoint - code in place for subshard work items, need to test
chelma Nov 22, 2024
3429a5a
Improved cursor plumbing for RFS SubShard work items
chelma Nov 25, 2024
2aae632
Additional changes per PR comments
chelma Nov 25, 2024
2b33a84
Merge remote-tracking branch 'upstream/main' into MIGRATIONS-2128
AndreKurait Nov 25, 2024
2c2a708
Modify LuceneDocumentsReader to read docs/segments sequentially
AndreKurait Nov 25, 2024
56839cd
Refactor of partial shard work items - added sequential doc reading, …
AndreKurait Dec 2, 2024
cf6ed86
Fix spotless issues
AndreKurait Dec 2, 2024
920be77
Working subshard
AndreKurait Dec 3, 2024
d8c4372
Rename numAttempts to leaseAcquisitionExponent and add max exponent b…
AndreKurait Dec 4, 2024
40eca92
Add worker cancellation on lease expiration
AndreKurait Dec 4, 2024
6211c33
Fix lucene starting doc id
AndreKurait Dec 4, 2024
e403228
Add lease duration decrease if shard setup is < 2.5% of lease time
AndreKurait Dec 4, 2024
e9ce08e
Fix WorkCoordinatorTest.java
AndreKurait Dec 4, 2024
5d82fbe
Add LeaseExpirationTest
AndreKurait Dec 5, 2024
e4be465
Fix scheduler dispose
AndreKurait Dec 5, 2024
b5640f5
Merge branch 'main' into MIGRATIONS-2128
AndreKurait Dec 5, 2024
8494eec
Address spotless
AndreKurait Dec 5, 2024
9820fa1
Address comments for LeaseExpirationTest
AndreKurait Dec 5, 2024
2d3ed9c
Update messaging on deletedDocs
AndreKurait Dec 5, 2024
c4dcbc4
Update RFS Design doc with successor work items
AndreKurait Dec 5, 2024
178fe55
Fix WorkCoordinatorTest
AndreKurait Dec 5, 2024
RFS/src/main/java/org/opensearch/migrations/bulkload/common/LuceneDocumentsReader.java

@@ -128,7 +128,7 @@
}
// Otherwise, shift the SegmentReaders to the front
else if (leafReader1 instanceof SegmentReader && !(leafReader2 instanceof SegmentReader)) {
log.info("Found non-SegmentReader of type {} in the DirectoryReader", leafReader2.getClass().getName());

Check failure on line 131 in RFS/src/main/java/org/opensearch/migrations/bulkload/common/LuceneDocumentsReader.java (GitHub Actions / Run SonarQube Analysis), java:S1192: Define a constant instead of duplicating this literal "Found non-SegmentReader of type {} in the DirectoryReader" 4 times.
return -1;
} else if (!(leafReader1 instanceof SegmentReader) && leafReader2 instanceof SegmentReader) {
log.info("Found non-SegmentReader of type {} in the DirectoryReader", leafReader1.getClass().getName());
@@ -177,20 +177,20 @@
.doOnTerminate(sharedSegmentReaderScheduler::dispose);
}

Flux<RfsLuceneDocument> readDocsFromSegment(LeafReaderContext leafReaderContext, int docCommitId, Scheduler scheduler,
Flux<RfsLuceneDocument> readDocsFromSegment(LeafReaderContext leafReaderContext, int docStartingId, Scheduler scheduler,
int concurrency) {
var segmentReader = leafReaderContext.reader();
var liveDocs = segmentReader.getLiveDocs();

int segmentDocBase = leafReaderContext.docBase;

log.atInfo().setMessage("For segment: {}, working on docCommitId: {}")
log.atInfo().setMessage("For segment: {}, working on docStartingId: {}")
.addArgument(leafReaderContext)
.addArgument(docCommitId)
.addArgument(docStartingId)
.log();

return Flux.range(0, segmentReader.maxDoc())
.skipWhile(id -> id + segmentDocBase <= docCommitId && docCommitId != 0)
.skipWhile(docNum -> segmentDocBase + docNum < docStartingId)
Collaborator comment:
So between this & the concatMap above that calls this, we'll race through every the #. of docs for every prior segment? If I have thousands or millions of segments, even if I'm just skip counting, this seems strange. I'm less worried about performance and more thinking of debugging, why not just snap to the starting spot rather than having a flux do the counting?

.flatMapSequentialDelayError(docIdx -> Mono.defer(() -> {
try {
if (liveDocs == null || liveDocs.get(docIdx)) {
@@ -218,7 +218,7 @@
protected RfsLuceneDocument getDocument(IndexReader reader, int luceneDocId, boolean isLive, int segmentDocBase) {
Document document;
try {
document = reader.document(luceneDocId);

Check failure on line 221 in RFS/src/main/java/org/opensearch/migrations/bulkload/common/LuceneDocumentsReader.java (GitHub Actions / Run SonarQube Analysis), java:S1874: Remove this use of "document"; it is deprecated.
} catch (IOException e) {
log.atError().setCause(e).setMessage("Failed to read document at Lucene index location {}")
.addArgument(luceneDocId).log();
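A minimal sketch of the non-deprecated lookup that the java:S1874 finding above points toward, assuming a Lucene version that exposes IndexReader.storedFields() (roughly 9.5 and later); this is not part of the PR:

    import java.io.IOException;
    import org.apache.lucene.document.Document;
    import org.apache.lucene.index.IndexReader;
    import org.apache.lucene.index.StoredFields;

    // Hypothetical replacement for the deprecated reader.document(luceneDocId) call.
    static Document readStoredDocument(IndexReader reader, int luceneDocId) throws IOException {
        StoredFields storedFields = reader.storedFields(); // non-deprecated accessor in newer Lucene
        return storedFields.document(luceneDocId);
    }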
RfsLuceneDocument.java

@@ -1,12 +1,14 @@
package org.opensearch.migrations.bulkload.common;

import lombok.Getter;
import lombok.RequiredArgsConstructor;

/**
* This class represents a document at the Lucene level within RFS. It tracks where the document was within the Lucene
* index, as well as the document's embedded Elasticsearch/OpenSearch properties
*/
@RequiredArgsConstructor
@Getter
public class RfsLuceneDocument {
// The Lucene document number of the document
public final int luceneDocNumber;
@@ -82,7 +82,7 @@ static Stream<Arguments> provideSnapshots() {

@ParameterizedTest
@MethodSource("provideSnapshots")
public void ReadDocuments_AsExpected(TestResources.Snapshot snapshot, Version version) throws Exception {
public void ReadDocuments_AsExpected(TestResources.Snapshot snapshot, Version version) {
final var repo = new FileSystemRepo(snapshot.dir);
var sourceResourceProvider = ClusterProviderRegistry.getSnapshotReader(version, repo);
DefaultSourceRepoAccessor repoAccessor = new DefaultSourceRepoAccessor(repo);
@@ -100,8 +100,7 @@ public void ReadDocuments_AsExpected(TestResources.Snapshot snapshot, Version ve
// Use the LuceneDocumentsReader to get the documents
var reader = LuceneDocumentsReader.getFactory(sourceResourceProvider).apply(luceneDir);

Flux<RfsLuceneDocument> documents = reader.readDocuments()
.sort(Comparator.comparing(doc -> doc.id)); // Sort for consistent order given LuceneDocumentsReader may interleave
Flux<RfsLuceneDocument> documents = reader.readDocuments();

// Verify that the results are as expected
StepVerifier.create(documents).expectNextMatches(doc -> {
@@ -116,25 +115,25 @@ public void ReadDocuments_AsExpected(TestResources.Snapshot snapshot, Version ve
assertDocsEqual(expectedId, actualId, expectedType, actualType, expectedSource, actualSource);
return true;
}).expectNextMatches(doc -> {
String expectedId = "unchangeddoc";
String expectedId = "updateddoc";
String actualId = doc.id;

String expectedType = null;
String actualType = doc.type;

String expectedSource = "{\"title\":\"This doc will not be changed\\nIt has multiple lines of text\\nIts source doc has extra newlines.\",\"content\":\"bluh bluh\"}";
String expectedSource = "{\"title\":\"This is doc that will be updated\",\"content\":\"Updated!\"}";
String actualSource = doc.source;
assertDocsEqual(expectedId, actualId, expectedType, actualType,
expectedSource, actualSource);
return true;
}).expectNextMatches(doc -> {
String expectedId = "updateddoc";
String expectedId = "unchangeddoc";
String actualId = doc.id;

String expectedType = null;
String actualType = doc.type;

String expectedSource = "{\"title\":\"This is doc that will be updated\",\"content\":\"Updated!\"}";
String expectedSource = "{\"title\":\"This doc will not be changed\\nIt has multiple lines of text\\nIts source doc has extra newlines.\",\"content\":\"bluh bluh\"}";
String actualSource = doc.source;
assertDocsEqual(expectedId, actualId, expectedType, actualType,
expectedSource, actualSource);
@@ -143,7 +142,7 @@ public void ReadDocuments_AsExpected(TestResources.Snapshot snapshot, Version ve
}

@Test
public void ReadDocuments_ES5_Origin_AsExpected() throws Exception {
public void ReadDocuments_ES5_Origin_AsExpected() {
TestResources.Snapshot snapshot = TestResources.SNAPSHOT_ES_6_8_MERGED;
Version version = Version.fromString("ES 6.8");

@@ -164,41 +163,40 @@ public void ReadDocuments_ES5_Origin_AsExpected() {
// Use the LuceneDocumentsReader to get the documents
var reader = LuceneDocumentsReader.getFactory(sourceResourceProvider).apply(luceneDir);

Flux<RfsLuceneDocument> documents = reader.readDocuments()
.sort(Comparator.comparing(doc -> doc.id)); // Sort for consistent order given LuceneDocumentsReader may interleave
Flux<RfsLuceneDocument> documents = reader.readDocuments();

// Verify that the results are as expected
StepVerifier.create(documents).expectNextMatches(doc -> {
String expectedId = "complexdoc";
String expectedId = "unchangeddoc";
String actualId = doc.id;

String expectedType = "type1";
String actualType = doc.type;
String expectedType = "type2";
String actualType = doc.type;

String expectedSource = "{\"title\":\"This is a doc with complex history. Updated!\"}";
String expectedSource = "{\"content\":\"This doc will not be changed\nIt has multiple lines of text\nIts source doc has extra newlines.\"}";
String actualSource = doc.source;
assertDocsEqual(expectedId, actualId, expectedType, actualType,
expectedSource, actualSource);
assertDocsEqual(expectedId, actualId, expectedType, actualType, expectedSource, actualSource);
return true;
}).expectNextMatches(doc -> {
String expectedId = "unchangeddoc";
String expectedId = "updateddoc";
String actualId = doc.id;

String expectedType = "type2";
String actualType = doc.type;
String expectedType = "type2";
String actualType = doc.type;

String expectedSource = "{\"content\":\"This doc will not be changed\nIt has multiple lines of text\nIts source doc has extra newlines.\"}";
String expectedSource = "{\"content\":\"Updated!\"}";
String actualSource = doc.source;
assertDocsEqual(expectedId, actualId, expectedType, actualType, expectedSource, actualSource);
assertDocsEqual(expectedId, actualId, expectedType, actualType,
expectedSource, actualSource);
return true;
}).expectNextMatches(doc -> {
String expectedId = "updateddoc";
String expectedId = "complexdoc";
String actualId = doc.id;

String expectedType = "type2";
String expectedType = "type1";
String actualType = doc.type;

String expectedSource = "{\"content\":\"Updated!\"}";
String expectedSource = "{\"title\":\"This is a doc with complex history. Updated!\"}";
String actualSource = doc.source;
assertDocsEqual(expectedId, actualId, expectedType, actualType,
Collaborator comment on lines +170 to 201:
just verifying that these orders changed because we are now sorting segments & docs?

@AndreKurait (Member), Dec 5, 2024:
Correct, we're not necessarily sorting the docs, but keeping a consistent ordering of them
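A minimal sketch, pieced together from the comparator fragments earlier in this diff, of an ordering that is consistent without sorting by doc id: SegmentReaders are shifted ahead of other leaf readers and everything else keeps its relative order. The comparator name is hypothetical and not from the PR:

    import java.util.Comparator;
    import org.apache.lucene.index.LeafReader;
    import org.apache.lucene.index.SegmentReader;

    // Hypothetical simplification of the ordering visible in the diff above:
    // SegmentReaders come first; ties return 0 so existing relative order is preserved.
    static final Comparator<LeafReader> SEGMENT_READERS_FIRST = (leafReader1, leafReader2) -> {
        boolean firstIsSegment = leafReader1 instanceof SegmentReader;
        boolean secondIsSegment = leafReader2 instanceof SegmentReader;
        if (firstIsSegment == secondIsSegment) {
            return 0;
        }
        return firstIsSegment ? -1 : 1;
    };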

expectedSource, actualSource);
@@ -278,15 +276,15 @@ protected DirectoryReader getReader() {
}

@Test
public void ReadDocumentsStartingFromCheckpointForOneSegments_AsExpected() throws Exception {
public void ReadDocumentsStartingFromCheckpointForOneSegments_AsExpected() {
// This snapshot has 6 documents in 1 segment. There are updates and deletes involved, so
// there are only 3 final documents, which affects which document id the reader should
// start at.
var snapshot = TestResources.SNAPSHOT_ES_7_10_W_SOFT;
var version = Version.fromString("ES 7.10");
List<List<String>> documentIds = List.of(
List.of("complexdoc", "unchangeddoc", "updateddoc"),
List.of("unchangeddoc", "updateddoc"),
List.of("complexdoc", "updateddoc", "unchangeddoc"),
List.of("updateddoc", "unchangeddoc"),
List.of("unchangeddoc"));
List<Integer> documentStartingIndices = List.of(0, 2, 5);

@@ -309,8 +307,7 @@ public void ReadDocumentsStartingFromCheckpointForOneSegments_AsExpected() throw


for (int i = 0; i < documentStartingIndices.size(); i++) {
Flux<RfsLuceneDocument> documents = reader.readDocuments(documentStartingIndices.get(i))
.sort(Comparator.comparing(doc -> doc.id)); // Sort for consistent order given LuceneDocumentsReader may interleave
Flux<RfsLuceneDocument> documents = reader.readDocuments(documentStartingIndices.get(i));

var actualDocIds = documents.collectList().block().stream().map(doc -> doc.id).collect(Collectors.joining(","));
var expectedDocIds = String.join(",", documentIds.get(i));
@@ -324,8 +321,8 @@ public void ReadDocumentsStartingFromCheckpointForManySegments_AsExpected() thro
var snapshot = TestResources.SNAPSHOT_ES_6_8;
var version = Version.fromString("ES 6.8");
List<List<String>> documentIds = List.of(
List.of("complexdoc", "unchangeddoc", "updateddoc"),
List.of("unchangeddoc", "updateddoc"),
List.of("complexdoc", "updateddoc", "unchangeddoc"),
List.of("updateddoc", "unchangeddoc"),
List.of("unchangeddoc"));

final var repo = new FileSystemRepo(snapshot.dir);
@@ -346,11 +343,11 @@ public void ReadDocumentsStartingFromCheckpointForManySegments_AsExpected() thro
var reader = LuceneDocumentsReader.getFactory(sourceResourceProvider).apply(luceneDir);


for (int i = 0; i < documentIds.size(); i++) {
Flux<RfsLuceneDocument> documents = reader.readDocuments(0);
for (int startingDocIndex = 0; startingDocIndex < documentIds.size(); startingDocIndex++) {
Flux<RfsLuceneDocument> documents = reader.readDocuments(startingDocIndex);

var actualDocIds = documents.collectList().block().stream().map(doc -> doc.id).collect(Collectors.joining(","));
var expectedDocIds = String.join(",", documentIds.get(i));
var expectedDocIds = String.join(",", documentIds.get(startingDocIndex));
Assertions.assertEquals(expectedDocIds, actualDocIds);
}
}