diff --git a/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml b/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml index b09bc8418c98a..15778393959e5 100644 --- a/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml +++ b/modules/mapper-extras/src/test/resources/rest-api-spec/test/search-as-you-type/20_highlighting.yml @@ -165,7 +165,7 @@ setup: - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0.highlight.a_field: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._4gram: null } @@ -197,6 +197,6 @@ setup: - match: { hits.hits.0._source.a_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0._source.text_field: "quick brown fox jump lazy dog" } - match: { hits.hits.0.highlight.a_field: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } - - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._2gram: ["quick brown fox jump lazy dog"] } + - match: { hits.hits.0.highlight.a_field\._3gram: ["quick brown fox jump lazy dog"] } - match: { hits.hits.0.highlight.a_field\._4gram: ["quick brown fox jump lazy dog"] } diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java index 52eee559c6888..723c30f10dc61 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomPassageFormatter.java @@ -49,17 +49,23 @@ public Snippet[] format(Passage[] passages, String content) { pos = passage.getStartOffset(); for (int i = 0; i < passage.getNumMatches(); i++) { int start = passage.getMatchStarts()[i]; + assert start >= pos && start < passage.getEndOffset(); + // append content before this start + append(sb, content, pos, start); + int end = passage.getMatchEnds()[i]; - // its possible to have overlapping terms - if (start > pos) { - append(sb, content, pos, start); - } - if (end > pos) { - sb.append(preTag); - append(sb, content, Math.max(pos, start), end); - sb.append(postTag); - pos = end; + assert end > start; + // Look ahead to expand 'end' past all overlapping: + while (i + 1 < passage.getNumMatches() && passage.getMatchStarts()[i + 1] < end) { + end = passage.getMatchEnds()[++i]; } + end = Math.min(end, passage.getEndOffset()); // in case match straddles past passage + + sb.append(preTag); + append(sb, content, start, end); + sb.append(postTag); + + pos = end; } // its possible a "term" from the analyzer could span a sentence boundary. append(sb, content, pos, Math.max(pos, passage.getEndOffset())); diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 3c24dc2d42b82..4504cfbee64c7 100644 --- a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -20,6 +20,8 @@ package org.apache.lucene.search.uhighlight; import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.custom.CustomAnalyzer; +import org.apache.lucene.analysis.ngram.EdgeNGramTokenizerFactory; import org.apache.lucene.analysis.standard.StandardAnalyzer; import org.apache.lucene.document.Document; import org.apache.lucene.document.Field; @@ -32,6 +34,7 @@ import org.apache.lucene.index.Term; import org.apache.lucene.search.BooleanClause; import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.FuzzyQuery; import org.apache.lucene.search.IndexSearcher; import org.apache.lucene.search.MatchAllDocsQuery; import org.apache.lucene.search.PhraseQuery; @@ -224,4 +227,33 @@ public void testGroupSentences() throws Exception { BoundedBreakIteratorScanner.getSentence(Locale.ROOT, 20), 0, outputs); } + public void testOverlappingTerms() throws Exception { + final String[] inputs = { + "bro", + "brown", + "brownie", + "browser" + }; + final String[] outputs = { + "bro", + "brown", + "brownie", + "browser" + }; + BooleanQuery query = new BooleanQuery.Builder() + .add(new FuzzyQuery(new Term("text", "brow")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "b")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "br")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "bro")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "brown")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "browni")), BooleanClause.Occur.SHOULD) + .add(new TermQuery(new Term("text", "browser")), BooleanClause.Occur.SHOULD) + .build(); + Analyzer analyzer = CustomAnalyzer.builder() + .withTokenizer(EdgeNGramTokenizerFactory.class, "minGramSize", "1", "maxGramSize", "7") + .build(); + assertHighlightOneDoc("text", inputs, + analyzer, query, Locale.ROOT, BreakIterator.getSentenceInstance(Locale.ROOT), 0, outputs); + } + }