Skip to content

Commit

Permalink
Merge branch 'feature/rrf-score-normalization-v2' into rrf_tests
Browse files Browse the repository at this point in the history
Signed-off-by: Ryan Bogan <[email protected]>
  • Loading branch information
ryanbogan authored Nov 26, 2024
2 parents f539741 + 29fafd6 commit 5033a31
Show file tree
Hide file tree
Showing 8 changed files with 488 additions and 20 deletions.
18 changes: 10 additions & 8 deletions .github/workflows/test_aggregations.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,9 +10,6 @@ on:
branches:
- "*"
- "feature/**"
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
Get-CI-Image-Tag:
uses: opensearch-project/opensearch-build/.github/workflows/get-ci-image-tag.yml@main
Expand All @@ -33,16 +30,20 @@ jobs:
# this image tag is subject to change as more dependencies and updates will arrive over time
image: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-version-linux }}
# Need to switch to root so that GitHub Actions can install the runner binary in the container without permission issues.
options: --user root
options: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-options }}


steps:
- name: Run start commands
run: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-command }}

- name: Checkout neural-search
uses: actions/checkout@v1
uses: actions/checkout@v4

- name: Setup Java ${{ matrix.java }}
uses: actions/setup-java@v1
uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: ${{ matrix.java }}

- name: Run tests
Expand All @@ -61,11 +62,12 @@ jobs:

steps:
- name: Checkout neural-search
uses: actions/checkout@v1
uses: actions/checkout@v4

- name: Setup Java ${{ matrix.java }}
uses: actions/setup-java@v1
uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: ${{ matrix.java }}

- name: Run tests
Expand Down
12 changes: 7 additions & 5 deletions .github/workflows/test_security.yml
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@ on:
branches:
- "*"
- "feature/**"
env:
ACTIONS_ALLOW_USE_UNSECURE_NODE_VERSION: true

jobs:
Get-CI-Image-Tag:
Expand All @@ -32,17 +30,21 @@ jobs:
# this image tag is subject to change as more dependencies and updates will arrive over time
image: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-version-linux }}
# Need to switch to root so that GitHub Actions can install the runner binary in the container without permission issues.
options: --user root
options: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-options }}

steps:
- name: Run start commands
run: ${{ needs.Get-CI-Image-Tag.outputs.ci-image-start-command }}

- name: Checkout neural-search
uses: actions/checkout@v1
uses: actions/checkout@v4
with:
submodules: true

- name: Setup Java ${{ matrix.java }}
uses: actions/setup-java@v1
uses: actions/setup-java@v4
with:
distribution: 'temurin'
java-version: ${{ matrix.java }}

- name: Run tests
Expand Down
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
- Implement Reciprocal Rank Fusion score normalization/combination technique in hybrid query ([#874](https://github.com/opensearch-project/neural-search/pull/874))
### Enhancements
### Bug Fixes
- Address inconsistent scoring in hybrid query results ([#998](https://github.com/opensearch-project/neural-search/pull/998))
### Infrastructure
### Documentation
### Maintenance
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -123,7 +123,6 @@ boolean isHybridQuery(final SearchPhaseResult searchPhaseResult) {
&& isHybridQueryStartStopElement(searchPhaseResult.queryResult().topDocs().topDocs.scoreDocs[0]);
}

@VisibleForTesting
<Result extends SearchPhaseResult> List<QuerySearchResult> getQueryPhaseSearchResults(final SearchPhaseResults<Result> results) {
return results.getAtomicArray()
.asList()
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
*/
package org.opensearch.neuralsearch.query;

import com.google.common.annotations.VisibleForTesting;
import lombok.Getter;
import lombok.extern.log4j.Log4j2;
import org.apache.lucene.search.DisiPriorityQueue;
Expand All @@ -30,7 +31,7 @@
* corresponds to order of sub-queries in an input Hybrid query.
*/
@Log4j2
public final class HybridQueryScorer extends Scorer {
public class HybridQueryScorer extends Scorer {

// score for each of sub-query in this hybrid query
@Getter
Expand Down Expand Up @@ -100,7 +101,8 @@ public float score() throws IOException {
return score(getSubMatches());
}

private float score(DisiWrapper topList) throws IOException {
@VisibleForTesting
float score(DisiWrapper topList) throws IOException {
float totalScore = 0.0f;
for (DisiWrapper disiWrapper = topList; disiWrapper != null; disiWrapper = disiWrapper.next) {
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue
Expand Down Expand Up @@ -187,7 +189,12 @@ public int docID() {
*/
public float[] hybridScores() throws IOException {
float[] scores = new float[numSubqueries];
DisiWrapper topList = subScorersPQ.topList();
// retrieves sub-matches using DisjunctionDisiScorer's two-phase iteration process.
// while the two-phase iterator can efficiently skip blocks of document IDs during matching,
// the DisiWrapper (obtained from subScorersPQ.topList()) ensures sequential document ID iteration.
// this is necessary for maintaining correct scoring order.
DisiWrapper topList = getSubMatches();

for (HybridDisiWrapper disiWrapper = (HybridDisiWrapper) topList; disiWrapper != null; disiWrapper =
(HybridDisiWrapper) disiWrapper.next) {
// check if this doc has match in the subQuery. If not, add score as 0.0 and continue
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -108,12 +108,21 @@ public void collect(int doc) throws IOException {
}
// Increment total hit count which represents unique doc found on the shard
totalHits++;
hitsThresholdChecker.incrementHitCount();
for (int i = 0; i < subScoresByQuery.length; i++) {
float score = subScoresByQuery[i];
// if score is 0.0 there is no hits for that sub-query
if (score == 0) {
continue;
}
if (hitsThresholdChecker.isThresholdReached() && totalHitsRelation == TotalHits.Relation.EQUAL_TO) {
log.info(
"hit count threshold reached: total hits={}, threshold={}, action=updating_results",
totalHits,
hitsThresholdChecker.getTotalHitsThreshold()
);
totalHitsRelation = TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO;
}
collectedHitsPerSubQuery[i]++;
PriorityQueue<ScoreDoc> pq = compoundScores[i];
ScoreDoc currentDoc = new ScoreDoc(doc + docBase, score);
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,7 @@
import org.opensearch.core.xcontent.ToXContent;
import org.opensearch.core.xcontent.XContentBuilder;
import org.opensearch.core.xcontent.XContentParser;
import org.opensearch.index.mapper.MappedFieldType;
import org.opensearch.index.mapper.TextFieldMapper;
import org.opensearch.index.query.MatchAllQueryBuilder;
import org.opensearch.index.query.QueryBuilder;
Expand Down Expand Up @@ -756,6 +757,69 @@ public void testBoost_whenDefaultBoostSet_thenBuildSuccessfully() {
assertNotNull(hybridQueryBuilder);
}

/**
 * Verifies that a hybrid query builder holding a single neural sparse sub-query
 * yields a non-null {@link HybridQuery} when converted against a mocked shard context.
 */
@SneakyThrows
public void testBuild_whenValidParameters_thenCreateQuery() {
    final String fieldName = "rank_features";
    final String modelId = "test_model";
    final String queryText = "test query";

    // Mocked field type that reports "rank_features" as its type name
    MappedFieldType mockedFieldType = mock(MappedFieldType.class);
    when(mockedFieldType.typeName()).thenReturn("rank_features");

    // Mocked shard context that resolves the field under test to the mocked field type
    QueryShardContext mockedContext = mock(QueryShardContext.class);
    when(mockedContext.fieldMapper(fieldName)).thenReturn(mockedFieldType);

    // Sparse sub-query with pre-computed tokens supplied directly
    NeuralSparseQueryBuilder sparseSubQuery = new NeuralSparseQueryBuilder().fieldName(fieldName)
        .queryText(queryText)
        .modelId(modelId)
        .queryTokensSupplier(() -> Map.of("token1", 1.0f, "token2", 0.5f));

    // Hybrid builder under test, wrapping the single sparse sub-query
    HybridQueryBuilder hybridBuilder = new HybridQueryBuilder().add(sparseSubQuery);

    Query builtQuery = hybridBuilder.toQuery(mockedContext);

    assertNotNull("Query should not be null", builtQuery);
    assertTrue("Should be HybridQuery", builtQuery instanceof HybridQuery);
}

/**
 * Verifies that two hybrid query builders assembled from identically configured
 * sub-queries compare as equal and produce the same hash code.
 */
@SneakyThrows
public void testDoEquals_whenSameParameters_thenEqual() {
    // First hybrid builder: one neural sub-query followed by one neural sparse sub-query
    NeuralQueryBuilder firstNeural = new NeuralQueryBuilder().queryText("test").modelId("test_model");
    NeuralSparseQueryBuilder firstSparse = new NeuralSparseQueryBuilder().fieldName("test_field")
        .queryText("test")
        .modelId("test_model")
        .queryTokensSupplier(() -> Map.of("token1", 1.0f));
    HybridQueryBuilder firstHybrid = new HybridQueryBuilder().add(firstNeural).add(firstSparse);

    // Second hybrid builder: separately constructed instances with the same settings
    NeuralQueryBuilder secondNeural = new NeuralQueryBuilder().queryText("test").modelId("test_model");
    NeuralSparseQueryBuilder secondSparse = new NeuralSparseQueryBuilder().fieldName("test_field")
        .queryText("test")
        .modelId("test_model")
        .queryTokensSupplier(() -> Map.of("token1", 1.0f));
    HybridQueryBuilder secondHybrid = new HybridQueryBuilder().add(secondNeural).add(secondSparse);

    // Equality and hash code must agree for equivalent builders
    assertTrue("Builders should be equal", firstHybrid.equals(secondHybrid));
    assertEquals("Hash codes should match", firstHybrid.hashCode(), secondHybrid.hashCode());
}

/**
 * Verifies that adding a null sub-query to a hybrid query builder is rejected
 * with an {@link IllegalArgumentException} carrying the expected message.
 */
public void testValidate_whenInvalidParameters_thenThrowException() {
    HybridQueryBuilder emptyHybridBuilder = new HybridQueryBuilder();

    // Adding a null clause must fail fast
    IllegalArgumentException thrown = assertThrows(IllegalArgumentException.class, () -> emptyHybridBuilder.add(null));

    assertEquals("inner hybrid query clause cannot be null", thrown.getMessage());
}

public void testVisit() {
HybridQueryBuilder hybridQueryBuilder = new HybridQueryBuilder().add(new NeuralQueryBuilder()).add(new NeuralSparseQueryBuilder());
List<QueryBuilder> visitedQueries = new ArrayList<>();
Expand Down
Loading

0 comments on commit 5033a31

Please sign in to comment.