Implement matches() on SourceConfirmedTextQuery (elastic#100134)
`match_only_text` does not currently support highlighting via the `matches`
option of the default highlighter. This commit implements `matches()` on the
backing query for this field, and also fixes a bug where the field type's
value fetcher could hold on to the wrong reference for a source lookup,
causing threading errors.
romseygeek committed Oct 4, 2023
1 parent 68974a2 commit 5d0c8a3
Showing 5 changed files with 246 additions and 2 deletions.
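
For orientation before the diffs: the `matches` option of the default highlighter is built on Lucene's Matches API, where a query's Weight reports the exact positions it matched in a given document. A minimal consumer-side sketch of that API — the class and method names here are hypothetical, and the `IndexSearcher`, `Query`, and top-level doc id are assumed inputs, not part of this commit:

import java.io.IOException;
import java.util.List;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

// Hypothetical helper showing how a highlighter consumes Matches:
// resolve the segment for a top-level doc id, ask the Weight for Matches,
// then walk the per-field position iterator.
public final class MatchesDemo {
    static void printMatchPositions(IndexSearcher searcher, Query query, int docId, String field) throws IOException {
        Weight weight = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
        List<LeafReaderContext> leaves = searcher.getIndexReader().leaves();
        LeafReaderContext ctx = leaves.get(ReaderUtil.subIndex(docId, leaves));
        Matches matches = weight.matches(ctx, docId - ctx.docBase);
        if (matches == null) {
            return; // query does not match this document
        }
        MatchesIterator it = matches.getMatches(field);
        if (it == null) {
            return; // document matched, but not on this field
        }
        while (it.next()) {
            System.out.println(it.startPosition() + ".." + it.endPosition());
        }
    }
}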
5 changes: 5 additions & 0 deletions docs/changelog/100134.yaml
@@ -0,0 +1,5 @@
pr: 100134
summary: Implement matches() on `SourceConfirmedTextQuery`
area: Highlighting
type: enhancement
issues: []
131 changes: 131 additions & 0 deletions MatchOnlyTextMapperIT.java
@@ -0,0 +1,131 @@
/*
 * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one
 * or more contributor license agreements. Licensed under the Elastic License
 * 2.0 and the Server Side Public License, v 1; you may not use this file except
 * in compliance with, at your election, the Elastic License 2.0 or the Server
 * Side Public License, v 1.
 */

package org.elasticsearch.index.mapper;

import org.elasticsearch.action.bulk.BulkRequestBuilder;
import org.elasticsearch.action.bulk.BulkResponse;
import org.elasticsearch.action.search.SearchResponse;
import org.elasticsearch.action.support.WriteRequest;
import org.elasticsearch.index.mapper.extras.MapperExtrasPlugin;
import org.elasticsearch.index.query.QueryBuilders;
import org.elasticsearch.plugins.Plugin;
import org.elasticsearch.search.SearchHit;
import org.elasticsearch.search.fetch.subphase.highlight.HighlightBuilder;
import org.elasticsearch.test.ESIntegTestCase;
import org.elasticsearch.xcontent.XContentBuilder;
import org.elasticsearch.xcontent.XContentFactory;

import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;

import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertAcked;
import static org.elasticsearch.test.hamcrest.ElasticsearchAssertions.assertNoFailures;
import static org.elasticsearch.xcontent.XContentFactory.jsonBuilder;
import static org.hamcrest.Matchers.containsString;

public class MatchOnlyTextMapperIT extends ESIntegTestCase {

    protected Collection<Class<? extends Plugin>> nodePlugins() {
        return Arrays.asList(MapperExtrasPlugin.class);
    }

    public void testHighlightingWithMatchOnlyTextFieldMatchPhrase() throws IOException {

        // We index and retrieve a large number of documents to ensure that we go over multiple
        // segments, to ensure that the highlighter is using the correct segment lookups to
        // load the source.

        XContentBuilder mappings = jsonBuilder();
        mappings.startObject().startObject("properties").startObject("message").field("type", "match_only_text").endObject().endObject();
        mappings.endObject();
        assertAcked(prepareCreate("test").setMapping(mappings));
        BulkRequestBuilder bulk = client().prepareBulk("test").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
        for (int i = 0; i < 2000; i++) {
            bulk.add(
                client().prepareIndex()
                    .setSource(
                        XContentFactory.jsonBuilder()
                            .startObject()
                            .field(
                                "message",
                                "[.ds-.slm-history-5-2023.09.20-"
                                    + randomInt()
                                    + "][0] marking and sending shard failed due to [failed recovery]"
                            )
                            .endObject()
                    )
            );
        }
        BulkResponse bulkItemResponses = bulk.get();
        assertNoFailures(bulkItemResponses);

        SearchResponse searchResponse = client().prepareSearch("test")
            .setQuery(QueryBuilders.matchPhraseQuery("message", "marking and sending shard"))
            .setSize(500)
            .highlighter(new HighlightBuilder().field("message"))
            .get();
        assertNoFailures(searchResponse);
        for (SearchHit searchHit : searchResponse.getHits()) {
            assertThat(
                searchHit.getHighlightFields().get("message").fragments()[0].string(),
                containsString("<em>marking and sending shard</em>")
            );
        }
    }

    public void testHighlightingWithMatchOnlyTextFieldSyntheticSource() throws IOException {

        // We index and retrieve a large number of documents to ensure that we go over multiple
        // segments, to ensure that the highlighter is using the correct segment lookups to
        // load the source.

        String mappings = """
            { "_source" : { "mode" : "synthetic" },
              "properties" : {
                "message" : { "type" : "match_only_text" }
              }
            }
            """;
        assertAcked(prepareCreate("test").setMapping(mappings));
        BulkRequestBuilder bulk = client().prepareBulk("test").setRefreshPolicy(WriteRequest.RefreshPolicy.IMMEDIATE);
        for (int i = 0; i < 2000; i++) {
            bulk.add(
                client().prepareIndex()
                    .setSource(
                        XContentFactory.jsonBuilder()
                            .startObject()
                            .field(
                                "message",
                                "[.ds-.slm-history-5-2023.09.20-"
                                    + randomInt()
                                    + "][0] marking and sending shard failed due to [failed recovery]"
                            )
                            .endObject()
                    )
            );
        }
        BulkResponse bulkItemResponses = bulk.get();
        assertNoFailures(bulkItemResponses);

        SearchResponse searchResponse = client().prepareSearch("test")
            .setQuery(QueryBuilders.matchPhraseQuery("message", "marking and sending shard"))
            .setSize(500)
            .highlighter(new HighlightBuilder().field("message"))
            .get();
        assertNoFailures(searchResponse);
        for (SearchHit searchHit : searchResponse.getHits()) {
            assertThat(
                searchHit.getHighlightFields().get("message").fragments()[0].string(),
                containsString("<em>marking and sending shard</em>")
            );
        }
    }

}
4 changes: 2 additions & 2 deletions MatchOnlyTextFieldMapper.java
@@ -211,9 +211,9 @@ private IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>>
                 };
             };
         }
-        ValueFetcher valueFetcher = valueFetcher(searchExecutionContext, null);
-        SourceProvider sourceProvider = searchExecutionContext.lookup();
         return context -> {
+            ValueFetcher valueFetcher = valueFetcher(searchExecutionContext, null);
+            SourceProvider sourceProvider = searchExecutionContext.lookup();
             valueFetcher.setNextReader(context);
             return docID -> {
                 try {
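
The two relocated lines above are the threading fix from the commit message: the `ValueFetcher` and `SourceProvider` used to be created once and captured by the per-segment lambda, so concurrent searches over different segments shared — and repositioned — the same instances. A minimal, hypothetical reduction of that pattern with generic stand-ins (none of these names are the real Elasticsearch types):

import java.util.function.Function;
import java.util.function.IntFunction;

public class SharedStateBugDemo {
    static class StatefulFetcher {
        Object currentSegment; // mutable per-segment state
        void setNextReader(Object segment) { currentSegment = segment; }
        Object fetch(int doc) { return currentSegment + "#" + doc; }
    }

    // Buggy shape: one fetcher is captured by every per-segment function,
    // so two threads working on different segments race on currentSegment.
    static Function<Object, IntFunction<Object>> buggyProvider() {
        StatefulFetcher fetcher = new StatefulFetcher();
        return segment -> {
            fetcher.setNextReader(segment);
            return fetcher::fetch;
        };
    }

    // Fixed shape, mirroring the diff: construct the stateful objects inside
    // the per-segment function so each segment gets its own instances.
    static Function<Object, IntFunction<Object>> fixedProvider() {
        return segment -> {
            StatefulFetcher fetcher = new StatefulFetcher();
            fetcher.setNextReader(segment);
            return fetcher::fetch;
        };
    }
}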
36 changes: 36 additions & 0 deletions SourceConfirmedTextQuery.java
@@ -9,7 +9,9 @@
 package org.elasticsearch.index.mapper.extras;
 
 import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInvertState;
+import org.apache.lucene.index.IndexOptions;
 import org.apache.lucene.index.LeafReaderContext;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.TermStates;
@@ -25,6 +27,7 @@
 import org.apache.lucene.search.LeafSimScorer;
 import org.apache.lucene.search.MatchAllDocsQuery;
 import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Matches;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.PrefixQuery;
@@ -288,6 +291,25 @@ public RuntimePhraseScorer scorer(LeafReaderContext context) throws IOException
                 return new RuntimePhraseScorer(this, approximation, leafSimScorer, valueFetcher, field, in);
             }
 
+            @Override
+            public Matches matches(LeafReaderContext context, int doc) throws IOException {
+                FieldInfo fi = context.reader().getFieldInfos().fieldInfo(field);
+                if (fi == null) {
+                    return null;
+                }
+                // Some highlighters will already have reindexed the source with positions and offsets,
+                // so rather than doing it again we check to see if this data is available on the
+                // current context and if so delegate directly to the inner query
+                if (fi.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0) {
+                    Weight innerWeight = in.createWeight(searcher, ScoreMode.COMPLETE_NO_SCORES, 1);
+                    return innerWeight.matches(context, doc);
+                }
+                RuntimePhraseScorer scorer = scorer(context);
+                if (scorer == null || scorer.iterator().advance(doc) != doc) {
+                    return null;
+                }
+                return scorer.matches();
+            }
         };
     }
 
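The `IndexOptions` guard in `matches()` works because the enum is declared in order of increasing indexed detail (NONE, DOCS, DOCS_AND_FREQS, DOCS_AND_FREQS_AND_POSITIONS, DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS), so `compareTo(DOCS_AND_FREQS_AND_POSITIONS) > 0` is true only when offsets were indexed too — i.e. a highlighter has already reindexed the source. A small illustrative sketch (the demo class is hypothetical; the constants are real Lucene ones):

import org.apache.lucene.index.IndexOptions;

public class IndexOptionsOrderDemo {
    public static void main(String[] args) {
        IndexOptions positionsOnly = IndexOptions.DOCS_AND_FREQS_AND_POSITIONS;
        // false: positions alone are not enough, so matches() falls back to the scorer
        System.out.println(positionsOnly.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0);
        // true: offsets are present as well, so matches() delegates to the inner query
        System.out.println(
            IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS) > 0
        );
    }
}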
@@ -380,6 +402,20 @@ private float computeFreq() throws IOException {
             }
             return frequency;
         }
+
+        private Matches matches() throws IOException {
+            MemoryIndex index = new MemoryIndex(true, false);
+            List<Object> values = valueFetcher.apply(docID());
+            for (Object value : values) {
+                if (value == null) {
+                    continue;
+                }
+                index.addField(field, value.toString(), indexAnalyzer);
+            }
+            IndexSearcher searcher = index.createSearcher();
+            Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1);
+            return w.matches(searcher.getLeafContexts().get(0), 0);
+        }
     }
 
 }
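
`RuntimePhraseScorer#matches()` above re-analyzes the document's fetched source values into a single-document `MemoryIndex` and then asks that tiny index's own Weight for matches at doc 0. The same technique in isolation — a self-contained sketch assuming only Lucene core and the memory module on the classpath:

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.index.memory.MemoryIndex;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Matches;
import org.apache.lucene.search.MatchesIterator;
import org.apache.lucene.search.PhraseQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.Weight;

public class MemoryIndexMatchesDemo {
    public static void main(String[] args) throws Exception {
        // Re-analyze one document's "source" into an in-memory index,
        // storing offsets (true) but not payloads (false).
        MemoryIndex index = new MemoryIndex(true, false);
        index.addField("body", "a b c d", new StandardAnalyzer());
        IndexSearcher searcher = index.createSearcher();

        Query query = new PhraseQuery("body", "b", "c");
        Weight w = searcher.createWeight(searcher.rewrite(query), ScoreMode.COMPLETE_NO_SCORES, 1f);
        // The MemoryIndex holds exactly one segment and one document, id 0
        Matches m = w.matches(searcher.getIndexReader().leaves().get(0), 0);
        MatchesIterator it = m.getMatches("body");
        while (it.next()) {
            System.out.println(it.startPosition() + ".." + it.endPosition()); // prints 1..2
        }
    }
}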
72 changes: 72 additions & 0 deletions SourceConfirmedTextQueryTests.java
@@ -10,11 +10,13 @@
 
 import org.apache.lucene.document.Document;
 import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.document.KeywordField;
 import org.apache.lucene.document.TextField;
 import org.apache.lucene.index.DirectoryReader;
 import org.apache.lucene.index.IndexReader;
 import org.apache.lucene.index.IndexWriter;
 import org.apache.lucene.index.LeafReaderContext;
+import org.apache.lucene.index.ReaderUtil;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.queries.spans.SpanNearQuery;
 import org.apache.lucene.queries.spans.SpanQuery;
@@ -23,12 +25,19 @@
 import org.apache.lucene.search.BooleanQuery;
 import org.apache.lucene.search.IndexSearcher;
 import org.apache.lucene.search.MatchNoDocsQuery;
+import org.apache.lucene.search.Matches;
+import org.apache.lucene.search.MatchesIterator;
 import org.apache.lucene.search.MultiPhraseQuery;
 import org.apache.lucene.search.PhraseQuery;
 import org.apache.lucene.search.PrefixQuery;
 import org.apache.lucene.search.Query;
 import org.apache.lucene.search.ScoreDoc;
+import org.apache.lucene.search.ScoreMode;
+import org.apache.lucene.search.Sort;
+import org.apache.lucene.search.SortedSetSelector;
 import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.Weight;
 import org.apache.lucene.store.Directory;
 import org.apache.lucene.tests.search.CheckHits;
 import org.apache.lucene.util.IOFunction;
@@ … @@
 import java.util.Collections;
 import java.util.List;
 
+import static org.hamcrest.Matchers.greaterThan;
+
 public class SourceConfirmedTextQueryTests extends ESTestCase {
 
     private static final IOFunction<LeafReaderContext, CheckedIntFunction<List<Object>, IOException>> SOURCE_FETCHER_PROVIDER =
@@ -428,4 +439,65 @@ public void testEmptyIndex() throws Exception {
         }
     }
 
+    public void testMatches() throws Exception {
+        checkMatches(new TermQuery(new Term("body", "d")), "a b c d e", new int[] { 3, 3 });
+        checkMatches(new PhraseQuery("body", "b", "c"), "a b c d c b c a", new int[] { 1, 2, 5, 6 });
+    }
+
+    private static void checkMatches(Query query, String inputDoc, int[] expectedMatches) throws IOException {
+        try (Directory dir = newDirectory(); IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(Lucene.STANDARD_ANALYZER))) {
+            Document doc = new Document();
+            doc.add(new TextField("body", "xxxxxnomatchxxxx", Store.YES));
+            doc.add(new KeywordField("sort", "0", Store.NO));
+            w.addDocument(doc);
+
+            doc = new Document();
+            doc.add(new TextField("body", inputDoc, Store.YES));
+            doc.add(new KeywordField("sort", "1", Store.NO));
+            w.addDocument(doc);
+
+            doc = new Document();
+            doc.add(new TextField("body", "xxxx " + inputDoc, Store.YES));
+            doc.add(new KeywordField("sort", "2", Store.NO));
+            w.addDocument(doc);
+
+            Query sourceConfirmedQuery = new SourceConfirmedTextQuery(query, SOURCE_FETCHER_PROVIDER, Lucene.STANDARD_ANALYZER);
+
+            try (IndexReader ir = DirectoryReader.open(w)) {
+
+                IndexSearcher searcher = new IndexSearcher(ir);
+                TopDocs td = searcher.search(
+                    sourceConfirmedQuery,
+                    3,
+                    new Sort(KeywordField.newSortField("sort", false, SortedSetSelector.Type.MAX))
+                );
+
+                Weight weight = searcher.createWeight(searcher.rewrite(sourceConfirmedQuery), ScoreMode.COMPLETE_NO_SCORES, 1);
+
+                int firstDoc = td.scoreDocs[0].doc;
+                LeafReaderContext firstCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(firstDoc, searcher.getLeafContexts()));
+                checkMatches(weight, firstCtx, firstDoc - firstCtx.docBase, expectedMatches, 0);
+
+                int secondDoc = td.scoreDocs[1].doc;
+                LeafReaderContext secondCtx = searcher.getLeafContexts().get(ReaderUtil.subIndex(secondDoc, searcher.getLeafContexts()));
+                checkMatches(weight, secondCtx, secondDoc - secondCtx.docBase, expectedMatches, 1);
+
+            }
+        }
+    }
+
+    private static void checkMatches(Weight w, LeafReaderContext ctx, int doc, int[] expectedMatches, int offset) throws IOException {
+        Matches matches = w.matches(ctx, doc);
+        assertNotNull(matches);
+        MatchesIterator mi = matches.getMatches("body");
+        int i = 0;
+        while (mi.next()) {
+            assertThat(expectedMatches.length, greaterThan(i + 1));
+            assertEquals(mi.startPosition(), expectedMatches[i] + offset);
+            assertEquals(mi.endPosition(), expectedMatches[i + 1] + offset);
+            i += 2;
+        }
+        assertEquals(expectedMatches.length, i);
+    }
 
 }
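
One reading note on `testMatches` above: each `int[]` packs (startPosition, endPosition) pairs for the expected phrase hits, and the `offset` argument shifts every position by one for the third document, whose body gains a leading token from the `"xxxx "` prefix. A tiny decoding sketch of that convention (illustration only, not part of the commit):

public class ExpectedMatchesDemo {
    public static void main(String[] args) {
        int[] expectedMatches = { 1, 2, 5, 6 }; // phrase "b c" in "a b c d c b c a"
        int offset = 1;                         // third doc: one extra leading token from "xxxx "
        for (int i = 0; i < expectedMatches.length; i += 2) {
            int start = expectedMatches[i] + offset;   // position of the first phrase term
            int end = expectedMatches[i + 1] + offset; // position of the last phrase term
            System.out.println(start + ".." + end);    // prints 2..3 then 6..7
        }
    }
}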
