diff --git a/docs/changelog/100134.yaml b/docs/changelog/100134.yaml new file mode 100644 index 000000000000..3836ec279305 --- /dev/null +++ b/docs/changelog/100134.yaml @@ -0,0 +1,5 @@ +pr: 100134 +summary: Implement matches() on `SourceConfirmedTextQuery` +area: Highlighting +type: enhancement +issues: [] diff --git a/modules/mapper-extras/src/internalClusterTest/java/org/elasticsearch/index/mapper/MatchOnlyTextMapperIT.java b/modules/mapper-extras/src/internalClusterTest/java/org/elasticsearch/index/mapper/MatchOnlyTextMapperIT.java new file mode 100644 index 000000000000..3a6d7f5a52e8 --- /dev/null +++ b/modules/mapper-extras/src/internalClusterTest/java/org/elasticsearch/index/mapper/MatchOnlyTextMapperIT.java @@ -0,0 +1,131 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License + * 2.0 and the Server Side Public License, v 1; you may not use this file except + * in compliance with, at your election, the Elastic License 2.0 or the Server + * Side Public License, v 1. + */ + +package org.elasticsearch.index.mapper; + +import org.elasticsearch.action.bulk.BulkRequestBuilder; +import org.elasticsearch.action.bulk.BulkResponse; +import org.elasticsearch.action.search.SearchResponse; +import org.elasticsearch.action.support.WriteRequest; +import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin; +import org.elasticsearch.index.query.QueryBuilders; +import org.elasticsearch.plugins.Plugin; +import org.elasticsearch.search.SearchHit; +import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder; +import org.elasticsearch.test.ESIntegTestCase; +import org.elasticsearch.xcontent.XContentBuilder; +import org.elasticsearch.xcontent.XContentFactory; + +import java.io.IOException; +import java.util.Arrays; +import java.util.Collection; + +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked; +import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures; +import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder; +import static org.hamcrest.Matchers.containsString; + +public class MatchOnlyTextMapperIT extends ESIntegTestCase { + + protected Collection> nodePlugins() { + return Arrays.asList(MapperExtrasPlugin.class); + } + + public void testHighlightingWithMatchOnlyTextFieldMatchPhrase() throws IOException { + + // We index and retrieve a large number of documents to ensure that we go over multiple + // segments, to ensure that the highlighter is using the correct segment lookups to + // load the source. + + XContentBuilder mappings = jsonBuilder(); + mappings.startObject().startObject("properties").startObject("message").field("type", "match_only_text").endObject().endObject(); + mappings.endObject(); + assertAcked(prepareCreate("test").setMapping(mappings)); + BulkRequestBuilder bulk = client().prepareBulk("test").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + for (int i = 0; i < 2000; i++) { + bulk.add( + client().prepareIndex() + .setSource( + XContentFactory.jsonBuilder() + .startObject() + .field( + "message", + "[.ds-.slm-history-5-2023.09.20-" + + randomInt() + + "][0] marking and sending shard failed due to [failed recovery]" + ) + .endObject() + ) + ); + } + BulkResponse bulkItemResponses = bulk.get(); + assertNoFailures(bulkItemResponses); + + SearchResponse searchResponse = client().prepareSearch("test") + .setQuery(QueryBuilders.matchPhraseQuery("message", "marking and sending shard")) + .setSize(500) + .highlighter(new HighlightBuilder().field("message")) + .get(); + assertNoFailures(searchResponse); + for (SearchHit searchHit : searchResponse.getHits()) { + assertThat( + searchHit.getHighlightFields().get("message").fragments()[0].string(), + containsString("marking and sending shard") + ); + } + } + + public void testHighlightingWithMatchOnlyTextFieldSyntheticSource() throws IOException { + + // We index and retrieve a large number of documents to ensure that we go over multiple + // segments, to ensure that the highlighter is using the correct segment lookups to + // load the source. + + String mappings = """ + { "_source" : { "mode" : "synthetic" }, + "properties" : { + "message" : { "type" : "match_only_text" } + } + } + """; + assertAcked(prepareCreate("test").setMapping(mappings)); + BulkRequestBuilder bulk = client().prepareBulk("test").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE); + for (int i = 0; i < 2000; i++) { + bulk.add( + client().prepareIndex() + .setSource( + XContentFactory.jsonBuilder() + .startObject() + .field( + "message", + "[.ds-.slm-history-5-2023.09.20-" + + randomInt() + + "][0] marking and sending shard failed due to [failed recovery]" + ) + .endObject() + ) + ); + } + BulkResponse bulkItemResponses = bulk.get(); + assertNoFailures(bulkItemResponses); + + SearchResponse searchResponse = client().prepareSearch("test") + .setQuery(QueryBuilders.matchPhraseQuery("message", "marking and sending shard")) + .setSize(500) + .highlighter(new HighlightBuilder().field("message")) + .get(); + assertNoFailures(searchResponse); + for (SearchHit searchHit : searchResponse.getHits()) { + assertThat( + searchHit.getHighlightFields().get("message").fragments()[0].string(), + containsString("marking and sending shard") + ); + } + } + +} diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java index d89c5db66f37..df86bbb11687 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/MatchOnlyTextFieldMapper.java @@ -211,9 +211,9 @@ private IOFunction, IOExcepti }; }; } - ValueFetcher valueFetcher = valueFetcher(searchExecutionContext, null); - SourceProvider sourceProvider = searchExecutionContext.lookup(); return context -> { + ValueFetcher valueFetcher = valueFetcher(searchExecutionContext, null); + SourceProvider sourceProvider = searchExecutionContext.lookup(); valueFetcher.setNextReader(context); return docID -> { try { diff --git a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java index f351cca7f73a..dba3d8e0925b 100644 --- a/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java +++ b/modules/mapper-extras/src/main/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQuery.java @@ -9,7 +9,9 @@ package org.elasticsearch.index.mapper.extras; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.index.FieldInfo; import org.apache.lucene.index.FieldInvertState; +import org.apache.lucene.index.IndexOptions; import org.apache.lucene.index.LeafReaderContext; import org.apache.lucene.index.Term; import org.apache.lucene.index.TermStates; @@ -25,6 +27,7 @@ import org.apache.lucene.search.LeafSimScorer; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Matches; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; @@ -288,6 +291,25 @@ public RuntimePhraseScorer scorer(LeafReaderContext context) throws IOException return new RuntimePhraseScorer(this, approximation, leafSimScorer, valueFetcher, field, in); } + @Override + public Matches matches(LeafReaderContext context, int doc) throws IOException { + FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field); + if (fi == null) { + return null; + } + // Some highlighters will already have reindexed the source with positions and offsets, + // so rather than doing it again we check to see if this data is available on the + // current context and if so delegate directly to the inner query + if (fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0) { + Weight innerWeight = in.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1); + return innerWeight.matches(context, doc); + } + RuntimePhraseScorer scorer = scorer(context); + if (scorer == null || scorer.iterator().advance(doc) != doc) { + return null; + } + return scorer.matches(); + } }; } @@ -380,6 +402,20 @@ private float computeFreq() throws IOException { } return frequency; } + + private Matches matches() throws IOException { + MemoryIndex index = new MemoryIndex(true, false); + List values = valueFetcher.apply(docID()); + for (Object value : values) { + if (value == null) { + continue; + } + index.addField(field, value.toString(), indexAnalyzer); + } + IndexSearcher searcher = index.createSearcher(); + Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1); + return w.matches(searcher.getLeafContexts().get(0), 0); + } } } diff --git a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java index 4a643db99481..f9782b14b872 100644 --- a/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java +++ b/modules/mapper-extras/src/test/java/org/elasticsearch/index/mapper/extras/SourceConfirmedTextQueryTests.java @@ -10,11 +10,13 @@ import org.apache.lucene.document.Document; import org.apache.lucene.document.Field.Store; +import org.apache.lucene.document.KeywordField; import org.apache.lucene.document.TextField; import org.apache.lucene.index.DirectoryReader; import org.apache.lucene.index.IndexReader; import org.apache.lucene.index.IndexWriter; import org.apache.lucene.index.LeafReaderContext; +import org.apache.lucene.index.ReaderUtil; import org.apache.lucene.index.Term; import org.apache.lucene.queries.spans.SpanNearQuery; import org.apache.lucene.queries.spans.SpanQuery; @@ -23,12 +25,19 @@ import org.apache.lucene.search.BooleanQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchNoDocsQuery; +import org.apache.lucene.search.Matches; +import org.apache.lucene.search.MatchesIterator; import org.apache.lucene.search.MultiPhraseQuery; import org.apache.lucene.search.PhraseQuery; import org.apache.lucene.search.PrefixQuery; import org.apache.lucene.search.Query; import org.apache.lucene.search.ScoreDoc; +import org.apache.lucene.search.ScoreMode; +import org.apache.lucene.search.Sort; +import org.apache.lucene.search.SortedSetSelector; import org.apache.lucene.search.TermQuery; +import org.apache.lucene.search.TopDocs; +import org.apache.lucene.search.Weight; import org.apache.lucene.store.Directory; import org.apache.lucene.tests.search.CheckHits; import org.apache.lucene.util.IOFunction; @@ -41,6 +50,8 @@ import java.util.Collections; import java.util.List; +import static org.hamcrest.Matchers.greaterThan; + public class SourceConfirmedTextQueryTests extends ESTestCase { private static final IOFunction, IOException>> SOURCE_FETCHER_PROVIDER = @@ -428,4 +439,65 @@ public void testEmptyIndex() throws Exception { } } + public void testMatches() throws Exception { + checkMatches(new TermQuery(new Term("body", "d")), "a b c d e", new int[] { 3, 3 }); + checkMatches(new PhraseQuery("body", "b", "c"), "a b c d c b c a", new int[] { 1, 2, 5, 6 }); + } + + private static void checkMatches(Query query, String inputDoc, int[] expectedMatches) throws IOException { + try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(Lucene.STANDARD_ANALYZER))) { + Document doc = new Document(); + doc.add(new TextField("body", "xxxxxnomatchxxxx", Store.YES)); + doc.add(new KeywordField("sort", "0", Store.NO)); + w.addDocument(doc); + + doc = new Document(); + doc.add(new TextField("body", inputDoc, Store.YES)); + doc.add(new KeywordField("sort", "1", Store.NO)); + w.addDocument(doc); + + doc = new Document(); + doc.add(new TextField("body", "xxxx " + inputDoc, Store.YES)); + doc.add(new KeywordField("sort", "2", Store.NO)); + w.addDocument(doc); + + Query sourceConfirmedQuery = new SourceConfirmedTextQuery(query, SOURCE_FETCHER_PROVIDER, Lucene.STANDARD_ANALYZER); + + try (IndexReader ir = DirectoryReader.open(w)) { + + IndexSearcher searcher = new IndexSearcher(ir); + TopDocs td = searcher.search( + sourceConfirmedQuery, + 3, + new Sort(KeywordField.newSortField("sort", false, SortedSetSelector.Type.MAX)) + ); + + Weight weight = searcher.createWeight(searcher.rewrite(sourceConfirmedQuery), ScoreMode.COMPLETE_NO_SCORES, 1); + + int firstDoc = td.scoreDocs[0].doc; + LeafReaderContext firstCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(firstDoc, searcher.getLeafContexts())); + checkMatches(weight, firstCtx, firstDoc - firstCtx.docBase, expectedMatches, 0); + + int secondDoc = td.scoreDocs[1].doc; + LeafReaderContext secondCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(secondDoc, searcher.getLeafContexts())); + checkMatches(weight, secondCtx, secondDoc - secondCtx.docBase, expectedMatches, 1); + + } + } + } + + private static void checkMatches(Weight w, LeafReaderContext ctx, int doc, int[] expectedMatches, int offset) throws IOException { + Matches matches = w.matches(ctx, doc); + assertNotNull(matches); + MatchesIterator mi = matches.getMatches("body"); + int i = 0; + while (mi.next()) { + assertThat(expectedMatches.length, greaterThan(i + 1)); + assertEquals(mi.startPosition(), expectedMatches[i] + offset); + assertEquals(mi.endPosition(), expectedMatches[i + 1] + offset); + i += 2; + } + assertEquals(expectedMatches.length, i); + } + }