[ML] Add a timeout option to file structure finder (#34117)

This can be used to restrict the amount of CPU a single structure finder request can use. The timeout is not implemented precisely, so requests may run for slightly longer than the timeout before aborting. The default is 25 seconds, which is a little below Kibana's default timeout of 30 seconds for calls to Elasticsearch APIs.
elastic · Oct 30, 2018 · b5cd33a · b5cd33a
1 parent adb1642
commit b5cd33a
Show file tree

Hide file tree

Showing 28 changed files with 900 additions and 170 deletions.
diff --git a/docs/reference/ml/apis/find-file-structure.asciidoc b/docs/reference/ml/apis/find-file-structure.asciidoc
diff --git a/...in/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java b/...in/core/src/main/java/org/elasticsearch/xpack/core/ml/action/FindFileStructureAction.java
@@ -16,6 +16,7 @@
 import org.elasticsearch.common.io.stream.StreamInput;
 import org.elasticsearch.common.io.stream.StreamOutput;
 import org.elasticsearch.common.io.stream.Writeable;
+import org.elasticsearch.common.unit.TimeValue;
 import org.elasticsearch.common.xcontent.StatusToXContentObject;
 import org.elasticsearch.common.xcontent.XContentBuilder;
 import org.elasticsearch.rest.RestStatus;
@@ -112,6 +113,7 @@ public boolean equals(Object other) {
     public static class Request extends ActionRequest {
 
         public static final ParseField LINES_TO_SAMPLE = new ParseField("lines_to_sample");
+        public static final ParseField TIMEOUT = new ParseField("timeout");
         public static final ParseField CHARSET = FileStructure.CHARSET;
         public static final ParseField FORMAT = FileStructure.FORMAT;
         public static final ParseField COLUMN_NAMES = FileStructure.COLUMN_NAMES;
@@ -128,6 +130,7 @@ public static class Request extends ActionRequest {
             "[%s] may only be specified if [" + FORMAT.getPreferredName() + "] is [%s]";
 
         private Integer linesToSample;
+        private TimeValue timeout;
         private String charset;
         private FileStructure.Format format;
         private List<String> columnNames;
@@ -151,6 +154,14 @@ public void setLinesToSample(Integer linesToSample) {
             this.linesToSample = linesToSample;
         }
 
+        public TimeValue getTimeout() {
+            return timeout;
+        }
+
+        public void setTimeout(TimeValue timeout) {
+            this.timeout = timeout;
+        }
+
         public String getCharset() {
             return charset;
         }
@@ -313,6 +324,7 @@ public ActionRequestValidationException validate() {
         public void readFrom(StreamInput in) throws IOException {
             super.readFrom(in);
             linesToSample = in.readOptionalVInt();
+            timeout = in.readOptionalTimeValue();
             charset = in.readOptionalString();
             format = in.readBoolean() ? in.readEnum(FileStructure.Format.class) : null;
             columnNames = in.readBoolean() ? in.readList(StreamInput::readString) : null;
@@ -330,6 +342,7 @@ public void readFrom(StreamInput in) throws IOException {
         public void writeTo(StreamOutput out) throws IOException {
             super.writeTo(out);
             out.writeOptionalVInt(linesToSample);
+            out.writeOptionalTimeValue(timeout);
             out.writeOptionalString(charset);
             if (format == null) {
                 out.writeBoolean(false);
@@ -365,7 +378,7 @@ public void writeTo(StreamOutput out) throws IOException {
 
         @Override
         public int hashCode() {
-            return Objects.hash(linesToSample, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
+            return Objects.hash(linesToSample, timeout, charset, format, columnNames, hasHeaderRow, delimiter, grokPattern, timestampFormat,
                 timestampField, sample);
         }
 
@@ -382,6 +395,7 @@ public boolean equals(Object other) {
 
             Request that = (Request) other;
             return Objects.equals(this.linesToSample, that.linesToSample) &&
+                Objects.equals(this.timeout, that.timeout) &&
                 Objects.equals(this.charset, that.charset) &&
                 Objects.equals(this.format, that.format) &&
                 Objects.equals(this.columnNames, that.columnNames) &&

diff --git a/.../ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java b/.../ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportFindFileStructureAction.java
@@ -48,10 +48,10 @@ protected void doExecute(Task task, FindFileStructureAction.Request request,
 
     private FindFileStructureAction.Response buildFileStructureResponse(FindFileStructureAction.Request request) throws Exception {
 
-        FileStructureFinderManager structureFinderManager = new FileStructureFinderManager();
+        FileStructureFinderManager structureFinderManager = new FileStructureFinderManager(threadPool.scheduler());
 
         FileStructureFinder fileStructureFinder = structureFinderManager.findFileStructure(request.getLinesToSample(),
-            request.getSample().streamInput(), new FileStructureOverrides(request));
+            request.getSample().streamInput(), new FileStructureOverrides(request), request.getTimeout());
 
         return new FindFileStructureAction.Response(fileStructureFinder.getStructure());
     }

diff --git a/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java b/...ain/java/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinder.java
@@ -41,10 +41,11 @@ public class DelimitedFileStructureFinder implements FileStructureFinder {
 
     static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String> explanation, String sample, String charsetName,
                                                                          Boolean hasByteOrderMarker, CsvPreference csvPreference,
-                                                                         boolean trimFields, FileStructureOverrides overrides)
+                                                                         boolean trimFields, FileStructureOverrides overrides,
+                                                                         TimeoutChecker timeoutChecker)
         throws IOException {
 
-        Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference);
+        Tuple<List<List<String>>, List<Integer>> parsed = readRows(sample, csvPreference, timeoutChecker);
         List<List<String>> rows = parsed.v1();
         List<Integer> lineNumbers = parsed.v2();
 
@@ -106,7 +107,8 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
             structureBuilder.setShouldTrimFields(true);
         }
 
-        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides);
+        Tuple<String, TimestampMatch> timeField = FileStructureUtils.guessTimestampField(explanation, sampleRecords, overrides,
+            timeoutChecker);
         if (timeField != null) {
             String timeLineRegex = null;
             StringBuilder builder = new StringBuilder("^");
@@ -148,7 +150,7 @@ static DelimitedFileStructureFinder makeDelimitedFileStructureFinder(List<String
         }
 
         Tuple<SortedMap<String, Object>, SortedMap<String, FieldStats>> mappingsAndFieldStats =
-            FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords);
+            FileStructureUtils.guessMappingsAndCalculateFieldStats(explanation, sampleRecords, timeoutChecker);
 
         SortedMap<String, Object> mappings = mappingsAndFieldStats.v1();
         if (timeField != null) {
@@ -183,7 +185,8 @@ public FileStructure getStructure() {
         return structure;
     }
 
-    static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference) throws IOException {
+    static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPreference csvPreference, TimeoutChecker timeoutChecker)
+        throws IOException {
 
         int fieldsInFirstRow = -1;
 
@@ -204,6 +207,7 @@ static Tuple<List<List<String>>, List<Integer>> readRows(String sample, CsvPrefe
                         }
                     }
                     rows.add(row);
+                    timeoutChecker.check("delimited record parsing");
                     lineNumbers.add(csvReader.getLineNumber());
                 }
             } catch (SuperCsvException e) {

diff --git a/...a/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java b/...a/org/elasticsearch/xpack/ml/filestructurefinder/DelimitedFileStructureFinderFactory.java
@@ -62,8 +62,8 @@ public boolean canCreateFromSample(List<String> explanation, String sample) {
 
     @Override
     public FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
-                                                FileStructureOverrides overrides) throws IOException {
+                                                FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws IOException {
         return DelimitedFileStructureFinder.makeDelimitedFileStructureFinder(explanation, sample, charsetName, hasByteOrderMarker,
-            csvPreference, trimFields, overrides);
+            csvPreference, trimFields, overrides, timeoutChecker);
     }
 }
diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderFactory.java
@@ -39,9 +39,10 @@ public interface FileStructureFinderFactory {
      * @param hasByteOrderMarker Did the sample have a byte order marker?  <code>null</code> means "not relevant".
      * @param overrides Stores structure decisions that have been made by the end user, and should
      *                  take precedence over anything the {@link FileStructureFinder} may decide.
+     * @param timeoutChecker Will abort the operation if its timeout is exceeded.
      * @return A {@link FileStructureFinder} object suitable for determining the structure of the supplied sample.
      * @throws Exception if something goes wrong during creation.
      */
     FileStructureFinder createFromSample(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
-                                         FileStructureOverrides overrides) throws Exception;
+                                         FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception;
 }
diff --git a/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java b/.../main/java/org/elasticsearch/xpack/ml/filestructurefinder/FileStructureFinderManager.java
@@ -7,7 +7,9 @@
 
 import com.ibm.icu.text.CharsetDetector;
 import com.ibm.icu.text.CharsetMatch;
+import org.elasticsearch.ElasticsearchTimeoutException;
 import org.elasticsearch.common.collect.Tuple;
+import org.elasticsearch.common.unit.TimeValue;
 
 import java.io.BufferedInputStream;
 import java.io.BufferedReader;
@@ -23,15 +25,17 @@
 import java.util.HashSet;
 import java.util.List;
 import java.util.Locale;
+import java.util.Objects;
 import java.util.Optional;
 import java.util.Set;
+import java.util.concurrent.ScheduledExecutorService;
 import java.util.stream.Collectors;
 
 /**
  * Runs the high-level steps needed to create ingest configs for the specified file.  In order:
  * 1. Determine the most likely character set (UTF-8, UTF-16LE, ISO-8859-2, etc.)
  * 2. Load a sample of the file, consisting of the first 1000 lines of the file
- * 3. Determine the most likely file structure - one of ND-JSON, XML, CSV, TSV or semi-structured text
+ * 3. Determine the most likely file structure - one of ND-JSON, XML, delimited or semi-structured text
  * 4. Create an appropriate structure object and delegate writing configs to it
  */
 public final class FileStructureFinderManager {
@@ -81,8 +85,18 @@ public final class FileStructureFinderManager {
 
     private static final int BUFFER_SIZE = 8192;
 
+    private final ScheduledExecutorService scheduler;
+
+    /**
+     * Create the file structure manager.
+     * @param scheduler Used for checking timeouts.
+     */
+    public FileStructureFinderManager(ScheduledExecutorService scheduler) {
+        this.scheduler = Objects.requireNonNull(scheduler);
+    }
+
     public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile) throws Exception {
-        return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES);
+        return findFileStructure(idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null);
     }
 
     /**
@@ -95,42 +109,49 @@ public FileStructureFinder findFileStructure(Integer idealSampleLineCount, Input
      * @param overrides Aspects of the file structure that are known in advance.  These take precedence over
      *                  values determined by structure analysis.  An exception will be thrown if the file structure
      *                  is incompatible with an overridden value.
+     * @param timeout The maximum time the analysis is permitted to take.  If it takes longer than this an
+     *                {@link ElasticsearchTimeoutException} may be thrown (although not necessarily immediately
+     *                the timeout is exceeded).
      * @return A {@link FileStructureFinder} object from which the structure and messages can be queried.
      * @throws Exception A variety of problems could occur at various stages of the structure finding process.
      */
-    public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides)
+    public FileStructureFinder findFileStructure(Integer idealSampleLineCount, InputStream fromFile, FileStructureOverrides overrides,
+                                                 TimeValue timeout)
         throws Exception {
         return findFileStructure(new ArrayList<>(), (idealSampleLineCount == null) ? DEFAULT_IDEAL_SAMPLE_LINE_COUNT : idealSampleLineCount,
-            fromFile, overrides);
+            fromFile, overrides, timeout);
     }
 
     public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile)
         throws Exception {
-        return findFileStructure(new ArrayList<>(), idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES);
+        return findFileStructure(explanation, idealSampleLineCount, fromFile, FileStructureOverrides.EMPTY_OVERRIDES, null);
     }
 
     public FileStructureFinder findFileStructure(List<String> explanation, int idealSampleLineCount, InputStream fromFile,
-                                                 FileStructureOverrides overrides) throws Exception {
-
-        String charsetName = overrides.getCharset();
-        Reader sampleReader;
-        if (charsetName != null) {
-            // Creating the reader will throw if the specified character set does not exist
-            sampleReader = new InputStreamReader(fromFile, charsetName);
-            explanation.add("Using specified character encoding [" + charsetName + "]");
-        } else {
-            CharsetMatch charsetMatch = findCharset(explanation, fromFile);
-            charsetName = charsetMatch.getName();
-            sampleReader = charsetMatch.getReader();
-        }
+                                                 FileStructureOverrides overrides, TimeValue timeout) throws Exception {
+
+        try (TimeoutChecker timeoutChecker = new TimeoutChecker("structure analysis", timeout, scheduler)) {
+
+            String charsetName = overrides.getCharset();
+            Reader sampleReader;
+            if (charsetName != null) {
+                // Creating the reader will throw if the specified character set does not exist
+                sampleReader = new InputStreamReader(fromFile, charsetName);
+                explanation.add("Using specified character encoding [" + charsetName + "]");
+            } else {
+                CharsetMatch charsetMatch = findCharset(explanation, fromFile, timeoutChecker);
+                charsetName = charsetMatch.getName();
+                sampleReader = charsetMatch.getReader();
+            }
 
-        Tuple<String, Boolean> sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT,
-            Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount));
+            Tuple<String, Boolean> sampleInfo = sampleFile(sampleReader, charsetName, MIN_SAMPLE_LINE_COUNT,
+                Math.max(MIN_SAMPLE_LINE_COUNT, idealSampleLineCount), timeoutChecker);
 
-        return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides);
+            return makeBestStructureFinder(explanation, sampleInfo.v1(), charsetName, sampleInfo.v2(), overrides, timeoutChecker);
+        }
     }
 
-    CharsetMatch findCharset(List<String> explanation, InputStream inputStream) throws Exception {
+    CharsetMatch findCharset(List<String> explanation, InputStream inputStream, TimeoutChecker timeoutChecker) throws Exception {
 
         // We need an input stream that supports mark and reset, so wrap the argument
         // in a BufferedInputStream if it doesn't already support this feature
@@ -141,6 +162,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
         // This is from ICU4J
         CharsetDetector charsetDetector = new CharsetDetector().setText(inputStream);
         CharsetMatch[] charsetMatches = charsetDetector.detectAll();
+        timeoutChecker.check("character set detection");
 
         // Determine some extra characteristics of the input to compensate for some deficiencies of ICU4J
         boolean pureAscii = true;
@@ -164,6 +186,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
             remainingLength -= bytesRead;
         } while (containsZeroBytes == false && remainingLength > 0);
         inputStream.reset();
+        timeoutChecker.check("character set detection");
 
         if (pureAscii) {
             // If the input is pure ASCII then many single byte character sets will match.  We want to favour
@@ -220,7 +243,7 @@ CharsetMatch findCharset(List<String> explanation, InputStream inputStream) thro
     }
 
     FileStructureFinder makeBestStructureFinder(List<String> explanation, String sample, String charsetName, Boolean hasByteOrderMarker,
-                                                FileStructureOverrides overrides) throws Exception {
+                                                FileStructureOverrides overrides, TimeoutChecker timeoutChecker) throws Exception {
 
         Character delimiter = overrides.getDelimiter();
         Character quote = overrides.getQuote();
@@ -250,16 +273,18 @@ FileStructureFinder makeBestStructureFinder(List<String> explanation, String sam
         }
 
         for (FileStructureFinderFactory factory : factories) {
+            timeoutChecker.check("high level format detection");
             if (factory.canCreateFromSample(explanation, sample)) {
-                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides);
+                return factory.createFromSample(explanation, sample, charsetName, hasByteOrderMarker, overrides, timeoutChecker);
             }
         }
 
         throw new IllegalArgumentException("Input did not match " +
             ((overrides.getFormat() == null) ? "any known formats" : "the specified format [" + overrides.getFormat() + "]"));
     }
 
-    private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines) throws IOException {
+    private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int minLines, int maxLines, TimeoutChecker timeoutChecker)
+        throws IOException {
 
         int lineCount = 0;
         BufferedReader bufferedReader = new BufferedReader(reader);
@@ -283,6 +308,7 @@ private Tuple<String, Boolean> sampleFile(Reader reader, String charsetName, int
         String line;
         while ((line = bufferedReader.readLine()) != null && ++lineCount <= maxLines) {
             sample.append(line).append('\n');
+            timeoutChecker.check("sample line splitting");
         }
 
         if (lineCount < minLines) {