Merge branch 'feature/rrf-score-normalization-v2' into rrf_tests

Signed-off-by: Ryan Bogan <[email protected]>
ryanbogan · Nov 26, 2024 · 5033a31 · 5033a31
2 parents f539741 + 29fafd6
commit 5033a31
Show file tree

Hide file tree

Showing 8 changed files with 488 additions and 20 deletions.
diff --git a/.github/workflows/test_aggregations.yml b/.github/workflows/test_aggregations.yml
@@ -10,9 +10,6 @@ on:
     branches:
       - "*"
       - "feature/**"
-env:
-  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
-
 jobs:
   Get-CI-Image-Tag:
     uses: opensearch-project/opensearch-build/.github/workflows/get-ci-image-tag.yml@main
@@ -33,16 +30,20 @@ jobs:
       # this image tag is subject to change as more dependencies and updates will arrive over time
       image: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-version-linux }}
       # need to switch to root so that github actions can install runner binary on container without permission issues.
-      options: --user root
+      options: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-options }}
 
 
     steps:
+      - name: Run start commands
+        run: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-command }}
+
       - name: Checkout neural-search
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
 
       - name: Setup Java ${{ matrix.java }}
-        uses: actions/setup-java@v1
+        uses: actions/setup-java@v4
         with:
+          distribution: 'temurin'
           java-version: ${{ matrix.java }}
 
       - name: Run tests
@@ -61,11 +62,12 @@ jobs:
 
     steps:
       - name: Checkout neural-search
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
 
       - name: Setup Java ${{ matrix.java }}
-        uses: actions/setup-java@v1
+        uses: actions/setup-java@v4
         with:
+          distribution: 'temurin'
           java-version: ${{ matrix.java }}
 
       - name: Run tests

diff --git a/.github/workflows/test_security.yml b/.github/workflows/test_security.yml
@@ -10,8 +10,6 @@ on:
     branches:
       - "*"
       - "feature/**"
-env:
-  ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true
 
 jobs:
   Get-CI-Image-Tag:
@@ -32,17 +30,21 @@ jobs:
       # this image tag is subject to change as more dependencies and updates will arrive over time
       image: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-version-linux }}
       # need to switch to root so that github actions can install runner binary on container without permission issues.
-      options: --user root
+      options: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-options }}
 
     steps:
+      - name: Run start commands
+        run: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-command }}
+
       - name: Checkout neural-search
-        uses: actions/checkout@v1
+        uses: actions/checkout@v4
         with:
           submodules: true
 
       - name: Setup Java ${{ matrix.java }}
-        uses: actions/setup-java@v1
+        uses: actions/setup-java@v4
         with:
+          distribution: 'temurin'
           java-version: ${{ matrix.java }}
 
       - name: Run tests

diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 - Implement Reciprocal Rank Fusion score normalization/combination technique in hybrid query ([#874](https://github.com/opensearch-project/neural-search/pull/874))
 ### Enhancements
 ### Bug Fixes
+- Address inconsistent scoring in hybrid query results ([#998](https://github.com/opensearch-project/neural-search/pull/998))
 ### Infrastructure
 ### Documentation
 ### Maintenance

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/RRFProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/RRFProcessor.java
@@ -123,7 +123,6 @@ boolean isHybridQuery(final SearchPhaseResult searchPhaseResult) {
             && isHybridQueryStartStopElement(searchPhaseResult.queryResult().topDocs().topDocs.scoreDocs[0]);
     }
 
-    @VisibleForTesting
     <Result extends SearchPhaseResult> List<QuerySearchResult> getQueryPhaseSearchResults(final SearchPhaseResults<Result> results) {
         return results.getAtomicArray()
             .asList()

diff --git a/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java b/src/main/java/org/opensearch/neuralsearch/query/HybridQueryScorer.java
@@ -4,6 +4,7 @@
  */
 package org.opensearch.neuralsearch.query;
 
+import com.google.common.annotations.VisibleForTesting;
 import lombok.Getter;
 import lombok.extern.log4j.Log4j2;
 import org.apache.lucene.search.DisiPriorityQueue;
@@ -30,7 +31,7 @@
  * corresponds to order of sub-queries in an input Hybrid query.
  */
 @Log4j2
-public final class HybridQueryScorer extends Scorer {
+public class HybridQueryScorer extends Scorer {
 
     // score for each of sub-query in this hybrid query
     @Getter
@@ -100,7 +101,8 @@ public float score() throws IOException {
         return score(getSubMatches());
     }
 
-    private float score(DisiWrapper topList) throws IOException {
+    @VisibleForTesting
+    float score(DisiWrapper topList) throws IOException {
         float totalScore = 0.0f;
         for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) {
             // check if this doc has match in the subQuery. If not, add score as 0.0 and continue
@@ -187,7 +189,12 @@ public int docID() {
      */
     public float[] hybridScores() throws IOException {
         float[] scores = new float[numSubqueries];
-        DisiWrapper topList = subScorersPQ.topList();
+        // Retrieve sub-matches via getSubMatches() (the same accessor score() uses) so that scoring follows
+        // DisjunctionDisiScorer's two-phase iteration process. While the two-phase iterator can efficiently
+        // skip blocks of document IDs during matching, the DisiWrapper (obtained from subScorersPQ.topList())
+        // ensures sequential document ID iteration, which is necessary for maintaining correct scoring order.
+        DisiWrapper topList = getSubMatches();
+
         for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
             (HybridDisiWrapper) disiWrapper.next) {
             // check if this doc has match in the subQuery. If not, add score as 0.0 and continue

diff --git a/src/main/java/org/opensearch/neuralsearch/search/collector/HybridTopScoreDocCollector.java b/src/main/java/org/opensearch/neuralsearch/search/collector/HybridTopScoreDocCollector.java
@@ -108,12 +108,21 @@ public void collect(int doc) throws IOException {
                 }
                 // Increment total hit count which represents unique doc found on the shard
                 totalHits++;
+                hitsThresholdChecker.incrementHitCount();
                 for (int i = 0; i < subScoresByQuery.length; i++) {
                     float score = subScoresByQuery[i];
                     // if score is 0.0 there is no hits for that sub-query
                     if (score == 0) {
                         continue;
                     }
+                    if (hitsThresholdChecker.isThresholdReached() && totalHitsRelation == TotalHits.Relation.EQUAL_TO) {
+                        log.info(
+                            "hit count threshold reached: total hits={}, threshold={}, action=updating_results",
+                            totalHits,
+                            hitsThresholdChecker.getTotalHitsThreshold()
+                        );
+                        totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
+                    }
                     collectedHitsPerSubQuery[i]++;
                     PriorityQueue<ScoreDoc> pq = compoundScores[i];
                     ScoreDoc currentDoc = new ScoreDoc(doc + docBase, score);

diff --git a/src/test/java/org/opensearch/neuralsearch/query/HybridQueryBuilderTests.java b/src/test/java/org/opensearch/neuralsearch/query/HybridQueryBuilderTests.java
@@ -50,6 +50,7 @@
 import org.opensearch.core.xcontent.ToXContent;
 import org.opensearch.core.xcontent.XContentBuilder;
 import org.opensearch.core.xcontent.XContentParser;
+import org.opensearch.index.mapper.MappedFieldType;
 import org.opensearch.index.mapper.TextFieldMapper;
 import org.opensearch.index.query.MatchAllQueryBuilder;
 import org.opensearch.index.query.QueryBuilder;
@@ -756,6 +757,69 @@ public void testBoost_whenDefaultBoostSet_thenBuildSuccessfully() {
         assertNotNull(hybridQueryBuilder);
     }
 
+    @SneakyThrows
+    public void testBuild_whenValidParameters_thenCreateQuery() {
+        String queryText = "test query";
+        String modelId = "test_model";
+        String fieldName = "rank_features";
+
+        // Mock a shard context whose field mapper for fieldName reports the type name "rank_features"
+        QueryShardContext context = mock(QueryShardContext.class);
+        MappedFieldType fieldType = mock(MappedFieldType.class);
+        when(context.fieldMapper(fieldName)).thenReturn(fieldType);
+        when(fieldType.typeName()).thenReturn("rank_features");
+
+        // Create the sparse sub-query with fixed tokens, then wrap it in a HybridQueryBuilder (no spy since it's final)
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder = new NeuralSparseQueryBuilder();
+        neuralSparseQueryBuilder.fieldName(fieldName)
+            .queryText(queryText)
+            .modelId(modelId)
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f, "token2", 0.5f));
+        HybridQueryBuilder builder = new HybridQueryBuilder().add(neuralSparseQueryBuilder);
+
+        // Build the query against the mocked context
+        Query query = builder.toQuery(context);
+
+        // Verify that a non-null HybridQuery was produced
+        assertNotNull("Query should not be null", query);
+        assertTrue("Should be HybridQuery", query instanceof HybridQuery);
+    }
+
+    @SneakyThrows
+    public void testDoEquals_whenSameParameters_thenEqual() {
+        // Create two neural queries with identical query text and model id
+        NeuralQueryBuilder neuralQueryBuilder1 = new NeuralQueryBuilder().queryText("test").modelId("test_model");
+
+        NeuralQueryBuilder neuralQueryBuilder2 = new NeuralQueryBuilder().queryText("test").modelId("test_model");
+
+        // Create two neural sparse queries with identical field, text, model id and supplied tokens
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder1 = new NeuralSparseQueryBuilder().fieldName("test_field")
+            .queryText("test")
+            .modelId("test_model")
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f));
+
+        NeuralSparseQueryBuilder neuralSparseQueryBuilder2 = new NeuralSparseQueryBuilder().fieldName("test_field")
+            .queryText("test")
+            .modelId("test_model")
+            .queryTokensSupplier(() -> Map.of("token1", 1.0f));
+
+        // Combine each neural/sparse pair into its own HybridQueryBuilder, in the same order
+        HybridQueryBuilder builder1 = new HybridQueryBuilder().add(neuralQueryBuilder1).add(neuralSparseQueryBuilder1);
+
+        HybridQueryBuilder builder2 = new HybridQueryBuilder().add(neuralQueryBuilder2).add(neuralSparseQueryBuilder2);
+
+        // Verify that equals() and hashCode() agree for the two independently built instances
+        assertTrue("Builders should be equal", builder1.equals(builder2));
+        assertEquals("Hash codes should match", builder1.hashCode(), builder2.hashCode());
+    }
+
+    public void testValidate_whenInvalidParameters_thenThrowException() {
+        // Adding a null sub-query must be rejected with an IllegalArgumentException carrying the expected message
+        HybridQueryBuilder builderWithNull = new HybridQueryBuilder();
+        IllegalArgumentException nullException = assertThrows(IllegalArgumentException.class, () -> builderWithNull.add(null));
+        assertEquals("inner hybrid query clause cannot be null", nullException.getMessage());
+    }
+
     public void testVisit() {
         HybridQueryBuilder hybridQueryBuilder = new HybridQueryBuilder().add(new NeuralQueryBuilder()).add(new NeuralSparseQueryBuilder());
         List<QueryBuilder> visitedQueries = new ArrayList<>();