[8.1] [ML] Text structure finder caps exclude lines pattern at 1000 characters

Because of the way Filebeat parses CSV files, the text structure finder needs to generate a regular expression that will ignore the header row of the CSV file. It does this by concatenating the column names, separated by the delimiter, with optional quoting. However, if there are hundreds of columns this can lead to a very long regular expression, potentially one that cannot be evaluated by some programming languages. This change limits the length of the regular expression to 1000 characters by only including elements for the first few columns when there are many. Matching 1000 characters of header should be sufficient to reliably identify the header row even when it is much longer. It is extremely unlikely that there would be a data row where the first 1000 characters exactly matched the header but then subsequent fields diverged. Backport of elastic#84236
droberts195 · Feb 22, 2022 · 07920cd · 07920cd
1 parent e39413d
commit 07920cd
Show file tree

Hide file tree

Showing 3 changed files with 66 additions and 12 deletions.
diff --git a/docs/changelog/84236.yaml b/docs/changelog/84236.yaml
@@ -0,0 +1,6 @@
+pr: 84236
+summary: Text structure finder caps exclude lines pattern at 1000 characters
+area: Machine Learning
+type: bug
+issues:
+ - 83434
diff --git a/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java b/...a/org/elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinder.java
@@ -33,7 +33,8 @@
 
 public class DelimitedTextStructureFinder implements TextStructureFinder {
 
-    private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
+    static final int MAX_EXCLUDE_LINES_PATTERN_LENGTH = 1000;
+    static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
     private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
     private static final int LONG_FIELD_THRESHOLD = 100;
     private final List<String> sampleMessages;
@@ -137,20 +138,11 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
             .setColumnNames(columnNamesList);
 
         String quote = String.valueOf(quoteChar);
-        String twoQuotes = quote + quote;
         String quotePattern = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
         String optQuotePattern = quotePattern + "?";
         String delimiterPattern = (delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
         if (isHeaderInText) {
-            structureBuilder.setExcludeLinesPattern(
-                "^"
-                    + Arrays.stream(header)
-                        .map(
-                            column -> optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1")
-                                + optQuotePattern
-                        )
-                        .collect(Collectors.joining(delimiterPattern))
-            );
+            structureBuilder.setExcludeLinesPattern(makeExcludeLinesPattern(header, quote, optQuotePattern, delimiterPattern));
         }
 
         if (trimFields) {
@@ -413,7 +405,7 @@ private static boolean isFirstRowUnusual(List<String> explanation, List<List<Str
         for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
             for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS
                 && j < otherRowStrs.size(); j += innerIncrement) {
-                otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j), shortFieldMask));
+                otherRowStats.accept(levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j), shortFieldMask));
                 ++numComparisons;
             }
         }
@@ -813,4 +805,37 @@ static String makeMultilineStartPattern(
         explanation.add("Failed to create a suitable multi-line start pattern");
         return null;
     }
+
+    /**
+     * Make a regular expression that Filebeat can use to ignore the header line of the delimited file.
+     * (Such lines may be observed multiple times if multiple delimited files are concatenated.)
+     *
+     * This pattern consists of a pattern that matches the literal column names, optionally quoted and
+     * separated by the delimiter.
+     *
+     * In the event that the column names are long and/or numerous only the first few are included.
+     * These ought to be enough to reliably distinguish the header line from data lines.
+     *
+     * @param header           The column names of the header row, in order.
+     * @param quote            The quote character as a string. Occurrences inside a column name are doubled
+     *                         before the name is escaped.
+     * @param optQuotePattern  Regex fragment placed immediately before and after each column name's pattern.
+     * @param delimiterPattern Regex fragment placed between the patterns of consecutive columns.
+     * @return A pattern starting with "^". When not every column fits it ends with ".*" in place of the
+     *         remaining columns. The result stays within {@link #MAX_EXCLUDE_LINES_PATTERN_LENGTH} characters,
+     *         except that the first column is always included and so an over-long first column can exceed it.
+     */
+    static String makeExcludeLinesPattern(String[] header, String quote, String optQuotePattern, String delimiterPattern) {
+        // Used below to double any quote character that appears inside a column name
+        String twoQuotes = quote + quote;
+        StringBuilder excludeLinesPattern = new StringBuilder("^");
+        boolean isFirst = true;
+        // Reserve room for one more delimiter and the trailing wildcard, so that the cap still holds
+        // whichever of the two branches below ends up being the last append
+        int maxLengthOfFields = MAX_EXCLUDE_LINES_PATTERN_LENGTH - delimiterPattern.length() - 2; // 2 is length of ".*"
+        for (String column : header) {
+            // Double embedded quotes, then backslash-escape every character matched by REGEX_NEEDS_ESCAPE_PATTERN
+            String columnPattern = optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1")
+                + optQuotePattern;
+            if (isFirst) {
+                // Always append the pattern for the first column, even if it exceeds the limit
+                excludeLinesPattern.append(columnPattern);
+                isFirst = false;
+            } else {
+                // The builder length used here includes the leading "^"
+                if (excludeLinesPattern.length() + columnPattern.length() > maxLengthOfFields) {
+                    // No room for this column: match the rest of the line loosely and stop
+                    excludeLinesPattern.append(".*");
+                    break;
+                }
+                excludeLinesPattern.append(delimiterPattern).append(columnPattern);
+            }
+        }
+        return excludeLinesPattern.toString();
+    }
 }
diff --git a/.../elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinderTests.java b/.../elasticsearch/xpack/textstructure/structurefinder/DelimitedTextStructureFinderTests.java
@@ -25,9 +25,12 @@
 import static org.elasticsearch.xpack.textstructure.structurefinder.TimestampFormatFinder.stringToNumberPosBitSet;
 import static org.hamcrest.Matchers.arrayContaining;
 import static org.hamcrest.Matchers.contains;
+import static org.hamcrest.Matchers.endsWith;
 import static org.hamcrest.Matchers.equalTo;
 import static org.hamcrest.Matchers.hasKey;
+import static org.hamcrest.Matchers.lessThanOrEqualTo;
 import static org.hamcrest.Matchers.not;
+import static org.hamcrest.Matchers.startsWith;
 
 public class DelimitedTextStructureFinderTests extends TextStructureTestCase {
 
@@ -1122,6 +1125,26 @@ public void testMultilineStartPatternDeterminationTooHard() {
         assertThat(explanation, contains("Failed to create a suitable multi-line start pattern"));
     }
 
+    /**
+     * Checks the shape of the pattern returned by {@code makeExcludeLinesPattern} for a random header:
+     * it must start with "^", must not be longer than {@code MAX_EXCLUDE_LINES_PATTERN_LENGTH}, and must
+     * end with ".*" whenever the last column name does not appear in it.
+     */
+    public void testMakeExcludeLinesPattern() {
+
+        // NOTE(review): generateRandomStringArray is defined outside this view; presumably it yields up to
+        // 1000 non-null column names, each of the random length chosen here - confirm
+        String[] header = generateRandomStringArray(1000, randomIntBetween(5, 50), false, false);
+        String quote = randomFrom("\"", "'");
+        // Build the quote and delimiter fragments with the same escaping expression the production code uses
+        String quotePattern = quote.replaceAll(DelimitedTextStructureFinder.REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
+        String optQuotePattern = quotePattern + "?";
+        char delimiter = randomFrom(',', ';', '\t', '|');
+        String delimiterPattern = (delimiter == '\t')
+            ? "\\t"
+            : String.valueOf(delimiter).replaceAll(DelimitedTextStructureFinder.REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
+
+        String excludeLinesPattern = DelimitedTextStructureFinder.makeExcludeLinesPattern(header, quote, optQuotePattern, delimiterPattern);
+
+        assertThat(excludeLinesPattern, startsWith("^"));
+        assertThat(excludeLinesPattern.length(), lessThanOrEqualTo(DelimitedTextStructureFinder.MAX_EXCLUDE_LINES_PATTERN_LENGTH));
+        // Absence of the last column name is treated as the sign that the pattern was truncated.
+        // NOTE(review): this is a literal substring check on the unescaped name, so it presumably relies on
+        // the generated names containing no quote or regex-special characters - confirm
+        if (excludeLinesPattern.contains(header[header.length - 1]) == false) {
+            assertThat(excludeLinesPattern, endsWith(".*"));
+        }
+    }
+
     static Map<String, Object> randomCsvProcessorSettings() {
         String field = randomAlphaOfLength(10);
         return DelimitedTextStructureFinder.makeCsvProcessorSettings(