From e58a332b22802b8315731e74d818fd1702e44d1d Mon Sep 17 00:00:00 2001
From: Mayya Sharipova
Date: Mon, 5 Mar 2018 10:26:12 -0800
Subject: [PATCH] Limit analyzed text for highlighting (improvements)

Increase the default limit of index.highlight.max_analyzed_offset to 1M
instead of the previous 10K.

Enhance the error message produced when the limit is exceeded so that it
includes the field name, index name and doc_id.

Relates to #27934, elastic/kibana#16764
---
 .../migration/migrate_6_0/analysis.asciidoc   |  4 +-
 .../search/request/highlighting.asciidoc      |  2 +-
 .../uhighlight/CustomUnifiedHighlighter.java  | 27 +------
 .../subphase/highlight/PlainHighlighter.java  | 24 +++++++-----
 .../highlight/UnifiedHighlighter.java         | 38 ++++++++++++++-----
 .../CustomUnifiedHighlighterTests.java        |  2 +-
 6 files changed, 48 insertions(+), 49 deletions(-)

diff --git a/docs/reference/migration/migrate_6_0/analysis.asciidoc b/docs/reference/migration/migrate_6_0/analysis.asciidoc
index fba71827bedd1..4316a342eba86 100644
--- a/docs/reference/migration/migrate_6_0/analysis.asciidoc
+++ b/docs/reference/migration/migrate_6_0/analysis.asciidoc
@@ -17,7 +17,7 @@ Highlighting a text that was indexed without offsets or term vectors,
 requires analysis of this text in memory, in real time, during the search request.
 For large texts this analysis may take a substantial amount of time and memory.
 To protect against this, the maximum number of characters to be analyzed will be
-limited to 10000 in the next major Elastic version. For this version, by default the limit
-is not set. A deprecation warning will be issued when an analyzed text exceeds 10000.
+limited to 1000000 in the next major Elastic version. For this version, by default the limit
+is not set. A deprecation warning will be issued when an analyzed text exceeds 1000000.
 The limit can be set for a particular index with the index setting
 `index.highlight.max_analyzed_offset`.
\ No newline at end of file
diff --git a/docs/reference/search/request/highlighting.asciidoc b/docs/reference/search/request/highlighting.asciidoc
index 974c141b03faa..e8e54dcf4b36c 100644
--- a/docs/reference/search/request/highlighting.asciidoc
+++ b/docs/reference/search/request/highlighting.asciidoc
@@ -107,7 +107,7 @@ needs highlighting. The `plain` highlighter always uses plain highlighting.
 [WARNING]
 Plain highlighting for large texts may require a substantial amount of time and memory.
 To protect against this, the maximum number of text characters to be analyzed will be
-limited to 10000 in the next major Elastic version. The default limit is not set for this version,
+limited to 1000000 in the next major Elastic version. The default limit is not set for this version,
 but can be set for a particular index with the index setting `index.highlight.max_analyzed_offset`.

 [[highlighting-settings]]
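As a rough sketch of how the `index.highlight.max_analyzed_offset` setting documented above could be applied per index through the Java API (the index name `my_index` and the surrounding class are made up for illustration; the value simply mirrors the new 1M default):

[source,java]
----
import org.elasticsearch.action.admin.indices.create.CreateIndexRequest;
import org.elasticsearch.common.settings.Settings;

public class HighlightOffsetLimitExample {
    public static void main(String[] args) {
        // Per-index cap on the number of characters analyzed for highlighting;
        // the key matches IndexSettings.MAX_ANALYZED_OFFSET_SETTING used in this patch.
        Settings settings = Settings.builder()
            .put("index.highlight.max_analyzed_offset", 1000000)
            .build();

        CreateIndexRequest request = new CreateIndexRequest("my_index").settings(settings);
        // The request would then be executed with a client, e.g.
        // client.admin().indices().create(request).actionGet();
    }
}
----

When the unified highlighter can read offsets from postings or term vectors, the check guarded by `offsetSource == OffsetSource.ANALYSIS` below does not apply, because the text is not re-analyzed at search time.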
diff --git a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
index dc32e42c57bd3..1e61037b2e0bb 100644
--- a/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
+++ b/server/src/main/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighter.java
@@ -35,12 +35,9 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.automaton.CharacterRunAutomaton;
 import org.elasticsearch.common.Nullable;
-import org.elasticsearch.common.logging.DeprecationLogger;
-import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.lucene.all.AllTermQuery;
 import org.elasticsearch.common.lucene.search.MultiPhrasePrefixQuery;
 import org.elasticsearch.common.lucene.search.function.FunctionScoreQuery;
-import org.elasticsearch.index.IndexSettings;
 import org.elasticsearch.index.search.ESToParentBlockJoinQuery;

 import java.io.IOException;
@@ -71,7 +68,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
     private final BreakIterator breakIterator;
     private final Locale breakIteratorLocale;
     private final int noMatchSize;
-    private final int maxAnalyzedOffset;

     /**
      * Creates a new instance of {@link CustomUnifiedHighlighter}
@@ -86,7 +82,6 @@ public class CustomUnifiedHighlighter extends UnifiedHighlighter {
      *                If null {@link BreakIterator#getSentenceInstance(Locale)} is used.
      * @param fieldValue the original field values delimited by MULTIVAL_SEP_CHAR.
      * @param noMatchSize The size of the text that should be returned when no highlighting can be performed.
-     * @param maxAnalyzedOffset The maximum number of characters that will be analyzed for highlighting.
      */
     public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     Analyzer analyzer,
@@ -95,8 +90,7 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
                                     @Nullable Locale breakIteratorLocale,
                                     @Nullable BreakIterator breakIterator,
                                     String fieldValue,
-                                    int noMatchSize,
-                                    int maxAnalyzedOffset) {
+                                    int noMatchSize) {
         super(searcher, analyzer);
         this.offsetSource = offsetSource;
         this.breakIterator = breakIterator;
@@ -104,7 +98,6 @@ public CustomUnifiedHighlighter(IndexSearcher searcher,
         this.passageFormatter = passageFormatter;
         this.fieldValue = fieldValue;
         this.noMatchSize = noMatchSize;
-        this.maxAnalyzedOffset = maxAnalyzedOffset;
     }

     /**
@@ -128,24 +121,6 @@ public Snippet[] highlightField(String field, Query query, int docId, int maxPas

     @Override
     protected List<CharSequence[]> loadFieldValues(String[] fields, DocIdSetIterator docIter, int cacheCharsThreshold) throws IOException {
-        // Issue deprecation warning if maxAnalyzedOffset is not set, and field length > default setting for 7.0
-        final int defaultMaxAnalyzedOffset7 = 10000;
-        if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset == -1) && (fieldValue.length() > defaultMaxAnalyzedOffset7)) {
-            DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(CustomUnifiedHighlighter.class));
-            deprecationLogger.deprecated(
-                "The length of text to be analyzed for highlighting [" + fieldValue.length() +
-                    "] exceeded the allowed maximum of [" + defaultMaxAnalyzedOffset7 + "] set for the next major Elastic version. " +
" + - "For large texts, indexing with offsets or term vectors is recommended!"); - } - // Throw an error if maxAnalyzedOffset is explicitly set by the user, and field length > maxAnalyzedOffset - if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset > 0) && (fieldValue.length() > maxAnalyzedOffset)) { - // maxAnalyzedOffset is not set by user - throw new IllegalArgumentException( - "The length of text to be analyzed for highlighting [" + fieldValue.length() + - "] exceeded the allowed maximum of [" + maxAnalyzedOffset + "]. This maximum can be set by changing the [" + - IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " + - "For large texts, indexing with offsets or term vectors is recommended!"); - } // we only highlight one field, one document at a time return Collections.singletonList(new String[]{fieldValue}); } diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java index 5c3aa876e5a42..53c734b25492c 100644 --- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java +++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/PlainHighlighter.java @@ -55,6 +55,7 @@ public class PlainHighlighter implements Highlighter { private static final String CACHE_KEY = "highlight-plain"; + private static final DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class)); @Override public HighlightField highlight(HighlighterContext highlighterContext) { @@ -110,26 +111,29 @@ public HighlightField highlight(HighlighterContext highlighterContext) { try { textsToHighlight = HighlightUtils.loadFieldValues(field, mapper, context, hitContext); - final int defaultMaxAnalyzedOffset7 = 10000; + final int maxAnalyzedOffset7 = 1000000; for (Object textToHighlight : textsToHighlight) { String text = convertFieldValue(mapper.fieldType(), textToHighlight); // Issue deprecation warning if maxAnalyzedOffset is not set, and text length > default setting for 7.0 - if ((maxAnalyzedOffset == -1) && (text.length() > defaultMaxAnalyzedOffset7)) { - DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(PlainHighlighter.class)); + if ((maxAnalyzedOffset == -1) && (text.length() > maxAnalyzedOffset7)) { deprecationLogger.deprecated( - "The length of text to be analyzed for highlighting [" + text.length() + "] exceeded the allowed maximum of [" + - defaultMaxAnalyzedOffset7 + "] set for the next major Elastic version. " + - "For large texts, indexing with offsets or term vectors is recommended!"); + "The length [" + text.length()+ "] of [" + highlighterContext.fieldName + "] field of [" + + hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index has " + + "exceeded the allowed maximum of ["+ maxAnalyzedOffset7 + "] set for the next major Elastic version. " + + "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + + "] index level setting. 
                 }

                 // Throw an error if maxAnalyzedOffset is explicitly set by the user, and text length > maxAnalyzedOffset
                 if ((maxAnalyzedOffset > 0) && (text.length() > maxAnalyzedOffset)) {
                     // maxAnalyzedOffset is not set by user
                     throw new IllegalArgumentException(
-                        "The length of text to be analyzed for highlighting [" + text.length() +
-                            "] exceeded the allowed maximum of [" + maxAnalyzedOffset + "]. This maximum can be set by changing the [" +
-                            IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() + "] index level setting. " +
-                            "For large texts, indexing with offsets or term vectors is recommended!");
+                        "The length [" + text.length() + "] of [" + highlighterContext.fieldName + "] field of [" +
+                            hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
+                            "has exceeded [" + maxAnalyzedOffset + "], the maximum allowed to be analyzed for highlighting. " +
+                            "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                            "] index level setting. For large texts, indexing with offsets or term vectors, and highlighting " +
+                            "with the unified or fvh highlighter is recommended!");
                 }

                 try (TokenStream tokenStream = analyzer.tokenStream(mapper.fieldType().name(), text)) {
diff --git a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
index 9cfc057e03885..7b72f608e764d 100644
--- a/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
+++ b/server/src/main/java/org/elasticsearch/search/fetch/subphase/highlight/UnifiedHighlighter.java
@@ -31,28 +31,30 @@
 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.CollectionUtil;
 import org.elasticsearch.common.Strings;
+import org.elasticsearch.common.logging.DeprecationLogger;
+import org.elasticsearch.common.logging.Loggers;
 import org.elasticsearch.common.text.Text;
 import org.elasticsearch.index.mapper.DocumentMapper;
 import org.elasticsearch.index.mapper.FieldMapper;
 import org.elasticsearch.index.mapper.KeywordFieldMapper;
 import org.elasticsearch.index.mapper.MappedFieldType;
-import org.elasticsearch.index.mapper.MapperService;
 import org.elasticsearch.search.fetch.FetchPhaseExecutionException;
 import org.elasticsearch.search.fetch.FetchSubPhase;
 import org.elasticsearch.search.internal.SearchContext;
+import org.elasticsearch.index.IndexSettings;

 import java.io.IOException;
 import java.text.BreakIterator;
 import java.util.ArrayList;
-import java.util.HashMap;
 import java.util.List;
 import java.util.Locale;
-import java.util.Map;
 import java.util.stream.Collectors;

 import static org.apache.lucene.search.uhighlight.CustomUnifiedHighlighter.MULTIVAL_SEP_CHAR;

 public class UnifiedHighlighter implements Highlighter {
+    private static final DeprecationLogger deprecationLogger = new DeprecationLogger(Loggers.getLogger(UnifiedHighlighter.class));
+
     @Override
     public boolean canHighlight(FieldMapper fieldMapper) {
         return true;
@@ -67,8 +69,6 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
         Encoder encoder = field.fieldOptions().encoder().equals("html") ?
             HighlightUtils.Encoders.HTML : HighlightUtils.Encoders.DEFAULT;
         CustomPassageFormatter passageFormatter = new CustomPassageFormatter(field.fieldOptions().preTags()[0], field.fieldOptions().postTags()[0], encoder);
-        final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
-
         List<Snippet> snippets = new ArrayList<>();
         int numberOfFragments;
         try {
@@ -83,21 +83,41 @@ public HighlightField highlight(HighlighterContext highlighterContext) {
             final CustomUnifiedHighlighter highlighter;
             final String fieldValue = mergeFieldValues(fieldValues, MULTIVAL_SEP_CHAR);
             final OffsetSource offsetSource = getOffsetSource(fieldMapper.fieldType());
+
+            final int maxAnalyzedOffset = context.indexShard().indexSettings().getHighlightMaxAnalyzedOffset();
+            // Issue a deprecation warning if maxAnalyzedOffset is not set, and field length > default setting for 7.0
+            final int maxAnalyzedOffset7 = 1000000;
+            if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset == -1) && (fieldValue.length() > maxAnalyzedOffset7)) {
+                deprecationLogger.deprecated(
+                    "The length [" + fieldValue.length() + "] of [" + highlighterContext.fieldName + "] field of [" +
+                        hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index has " +
+                        "exceeded the allowed maximum of [" + maxAnalyzedOffset7 + "] set for the next major Elastic version. " +
+                        "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
+                        "] index level setting. For large texts, indexing with offsets or term vectors is recommended!");
+            }
+            // Throw an error if maxAnalyzedOffset is explicitly set by the user, and field length > maxAnalyzedOffset
+            if ((offsetSource == OffsetSource.ANALYSIS) && (maxAnalyzedOffset > 0) && (fieldValue.length() > maxAnalyzedOffset)) {
+                throw new IllegalArgumentException(
+                    "The length [" + fieldValue.length() + "] of [" + highlighterContext.fieldName + "] field of [" +
+                        hitContext.hit().getId() + "] doc of [" + context.indexShard().shardId().getIndexName() + "] index " +
+                        "has exceeded [" + maxAnalyzedOffset + "], the maximum allowed to be analyzed for highlighting. " +
+                        "This maximum can be set by changing the [" + IndexSettings.MAX_ANALYZED_OFFSET_SETTING.getKey() +
" + "For large texts, indexing with offsets or term vectors is recommended!"); + } + if (field.fieldOptions().numberOfFragments() == 0) { // we use a control char to separate values, which is the only char that the custom break iterator // breaks the text on, so we don't lose the distinction between the different values of a field and we // get back a snippet per value CustomSeparatorBreakIterator breakIterator = new CustomSeparatorBreakIterator(MULTIVAL_SEP_CHAR); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, - field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize(), - maxAnalyzedOffset); + field.fieldOptions().boundaryScannerLocale(), breakIterator, fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = fieldValues.size(); // we are highlighting the whole content, one snippet per value } else { //using paragraph separator we make sure that each field value holds a discrete passage for highlighting BreakIterator bi = getBreakIterator(field); highlighter = new CustomUnifiedHighlighter(searcher, analyzer, offsetSource, passageFormatter, - field.fieldOptions().boundaryScannerLocale(), bi, - fieldValue, field.fieldOptions().noMatchSize(), maxAnalyzedOffset); + field.fieldOptions().boundaryScannerLocale(), bi, fieldValue, field.fieldOptions().noMatchSize()); numberOfFragments = field.fieldOptions().numberOfFragments(); } diff --git a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java index 9353600bd0422..04ac7ec03c20b 100644 --- a/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java +++ b/server/src/test/java/org/apache/lucene/search/uhighlight/CustomUnifiedHighlighterTests.java @@ -79,7 +79,7 @@ private void assertHighlightOneDoc(String fieldName, String[] inputs, Analyzer a String rawValue = Strings.arrayToDelimitedString(inputs, String.valueOf(MULTIVAL_SEP_CHAR)); CustomUnifiedHighlighter highlighter = new CustomUnifiedHighlighter(searcher, analyzer, null, new CustomPassageFormatter("", "", new DefaultEncoder()), locale, - breakIterator, rawValue, noMatchSize, -1); + breakIterator, rawValue, noMatchSize); highlighter.setFieldMatcher((name) -> "text".equals(name)); final Snippet[] snippets = highlighter.highlightField("text", query, topDocs.scoreDocs[0].doc, expectedPassages.length);