From 11e61309765c865fb335cd22e5252b9dd8872dc2 Mon Sep 17 00:00:00 2001 From: Mayya Sharipova Date: Fri, 23 Feb 2018 16:18:38 -0800 Subject: [PATCH] Limit analyzed text for highlighting (improvements) Increase the default limit of `index.highlight.max_analyzed_offset` to 1M instead of the previous 10K. Enhance the error message thrown when this limit is exceeded so that it includes the field name, index name and doc_id. Relates to https://github.com/elastic/kibana/issues/16764 --- docs/reference/index-modules.asciidoc | 2 +- .../migration/migrate_7_0/analysis.asciidoc | 2 +- .../search/request/highlighting.asciidoc | 2 +- .../uhighlight/CustomUnifiedHighlighter.java | 14 +------------- .../org/elasticsearch/index/IndexSettings.java | 4 ++-- .../subphase/highlight/PlainHighlighter.java | 11 ++++++----- .../subphase/highlight/UnifiedHighlighter.java | 17 +++++++++++------ .../CustomUnifiedHighlighterTests.java | 2 +- 8 files changed, 24 insertions(+), 30 deletions(-) diff --git a/docs/reference/index-modules.asciidoc b/docs/reference/index-modules.asciidoc index 2dc821fd1e12a..0ab742108b92f 100644 --- a/docs/reference/index-modules.asciidoc +++ b/docs/reference/index-modules.asciidoc @@ -202,7 +202,7 @@ specific index module: The maximum number of characters that will be analyzed for a highlight request. This setting is only applicable when highlighting is requested on a text that was indexed without offsets or term vectors. - Defaults to `10000`. + Defaults to `1000000`. `index.max_terms_count`:: diff --git a/docs/reference/migration/migrate_7_0/analysis.asciidoc b/docs/reference/migration/migrate_7_0/analysis.asciidoc index 2a81dd7456dd7..560cc68818ad4 100644 --- a/docs/reference/migration/migrate_7_0/analysis.asciidoc +++ b/docs/reference/migration/migrate_7_0/analysis.asciidoc @@ -21,5 +21,5 @@ Highlighting a text that was indexed without offsets or term vectors, requires analysis of this text in memory real time during the search request. 
For large texts this analysis may take substantial amount of time and memory. To protect against this, the maximum number of characters that will be analyzed has been -limited to 10000. This default limit can be changed +limited to 1000000. This default limit can be changed for a particular index with the index setting `index.highlight.max_analyzed_offset`. \ No newline at end of file diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc index 81d9c4c369075..a6d7bcf1415d6 100644 --- a/docs/reference/search/request/highlighting.asciidoc +++ b/docs/reference/search/request/highlighting.asciidoc @@ -104,7 +104,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting. [WARNING] Plain highlighting for large texts may require substantial amount of time and memory. To protect against this, the maximum number of text characters that will be analyzed has been -limited to 10000. This default limit can be changed +limited to 1000000. This default limit can be changed for a particular index with the index setting `index.highlight.max_analyzed_offset`. 
[[highlighting-settings]] diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java index f3723fc17bbc6..2c8169c3ac41f 100644 --- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java +++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java @@ -37,7 +37,6 @@ import org.elasticsearch.common.Nullable; import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery; import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery; -import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.search.ESToParentBlockJoinQuery; import java.io.IOException; @@ -68,7 +67,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { private final BreakIterator breakIterator; private final Locale breakIteratorLocale; private final int noMatchSize; - private final int maxAnalyzedOffset; /** * Creates a new instance of {@link CustomUnifiedHighlighter} @@ -83,7 +81,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter { * If null {@link BreakIterator#getSentenceInstance(Locale)} is used. * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR. * @param noMatchSize The size of the text that should be returned when no highlighting can be performed. - * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting. 
*/ public CustomUnifiedHighlighter(IndexSearcher searcher, Analyzer analyzer, @@ -92,8 +89,7 @@ public CustomUnifiedHighlighter(IndexSearcher searcher, @Nullable Locale breakIteratorLocale, @Nullable BreakIterator breakIterator, String fieldValue, - int noMatchSize, - int maxAnalyzedOffset) { + int noMatchSize) { super(searcher, analyzer); this.offsetSource = offsetSource; this.breakIterator = breakIterator; @@ -101,7 +97,6 @@ public CustomUnifiedHighlighter(IndexSearcher searcher, this.passageFormatter = passageFormatter; this.fieldValue = fieldValue; this.noMatchSize = noMatchSize; - this.maxAnalyzedOffset = maxAnalyzedOffset; } /** @@ -125,13 +120,6 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas @Override protected List loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException { - if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) { - throw new IllegalArgumentException( - "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" + - maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" + - IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " + - "For large texts, indexing with offsets or term vectors is recommended!"); - } // we only highlight one field, one document at a time return Collections.singletonList(new String[]{fieldValue}); } diff --git a/server/src/main/java/org/elasticsearch/index/IndexSettings.java b/server/src/main/java/org/elasticsearch/index/IndexSettings.java index 3f09b8e015408..6c6f05623c355 100644 --- a/server/src/main/java/org/elasticsearch/index/IndexSettings.java +++ b/server/src/main/java/org/elasticsearch/index/IndexSettings.java @@ -123,11 +123,11 @@ public final class IndexSettings { * A setting describing the maximum number of characters that will be analyzed for a highlight request. 
* This setting is only applicable when highlighting is requested on a text that was indexed without * offsets or term vectors. - * The default maximum of 10000 characters is defensive as for highlighting larger texts, + * The default maximum of 1M characters is defensive as for highlighting larger texts, * indexing with offsets or term vectors is recommended. */ public static final Setting MAX_ANALYZED_OFFSET_SETTING = - Setting.intSetting("index.highlight.max_analyzed_offset", 10000, 1, Property.Dynamic, Property.IndexScope); + Setting.intSetting("index.highlight.max_analyzed_offset", 1000000, 1, Property.Dynamic, Property.IndexScope); /** diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java index f1fe32931d2b8..92fd2359aa941 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java @@ -113,11 +113,12 @@ public HighlightField highlight(HighlighterContext highlighterContext) { String text = convertFieldValue(mapper.fieldType(), textToHighlight); if (text.length() > maxAnalyzedOffset) { throw new IllegalArgumentException( - "The length of the text to be analyzed for highlighting has exceeded the allowed maximum of [" + - maxAnalyzedOffset + "]. " + "This maximum can be set by changing the [" + - IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " + - "For large texts, indexing with offsets or term vectors, and highlighting with unified or " + - "fvh highlighter is recommended!"); + "The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() + + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " + + "has exceeded [" + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. 
" + + "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + + "] index level setting. " + "For large texts, indexing with offsets or term vectors, and highlighting " + + "with unified or fvh highlighter is recommended!"); } try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) { diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java index 9cfc057e03885..c7ada10fcf6fc 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java @@ -32,11 +32,11 @@ import org.apache.lucene.util.CollectionUtil; import org.elasticsearch.common.Strings; import org.elasticsearch.common.text.Text; +import org.elasticsearch.index.IndexSettings; import org.elasticsearch.index.mapper.DocumentMapper; import org.elasticsearch.index.mapper.FieldMapper; import org.elasticsearch.index.mapper.KeywordFieldMapper; import org.elasticsearch.index.mapper.MappedFieldType; -import org.elasticsearch.index.mapper.MapperService; import org.elasticsearch.search.fetch.FetchPhaseExecutionException; import org.elasticsearch.search.fetch.FetchSubPhase; import org.elasticsearch.search.internal.SearchContext; @@ -44,10 +44,8 @@ import java.io.IOException; import java.text.BreakIterator; import java.util.ArrayList; -import java.util.HashMap; import java.util.List; import java.util.Locale; -import java.util.Map; import java.util.stream.Collectors; import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR; @@ -83,21 +81,28 @@ public HighlightField highlight(HighlighterContext highlighterContext) { final CustomUnifiedHighlighter highlighter; final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR); 
final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType()); + if ((offsetSource == OffsetSource.ANALYSIS) && (fieldValue.length() > maxAnalyzedOffset)) { + throw new IllegalArgumentException( + "The length of [" + highlighterContext.fieldName + "] field of [" + hitContext.hit().getId() + + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " + "has exceeded [" + + maxAnalyzedOffset + "] - maximum allowed to be analyzed for highlighting. " + + "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + + "] index level setting. " + "For large texts, indexing with offsets or term vectors is recommended!"); + } if (field.fieldOptions().numberOfFragments() == 0) { // we use a control char to separate values, which is the only char that the custom break iterator // breaks the text on, so we don't lose the distinction between the different values of a field and we // get back a snippet per value CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, - field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(), - maxAnalyzedOffset); + field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting BreakIterator bi = getBreakIterator(field); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, field.fieldOptions().boundaryScannerLocale(), bi, - fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset); + fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = 
field.fieldOptions().numberOfFragments(); } diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index a577b5f7aff85..796553034fb38 100644 --- a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -78,7 +78,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null, new CustomPassageFormatter("", "", new DefaultEncoder()), locale, - breakIterator, rawValue, noMatchSize, 10000); + breakIterator, rawValue, noMatchSize); highlighter.setFieldMatcher((name) -> "text".equals(name)); final Snippet[] snippets = highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);