Skip to content

Commit

Permalink
[8.1] [ML] Text structure finder caps exclude lines pattern at 1000 c…
Browse files Browse the repository at this point in the history
…haracters

Because of the way Filebeat parses CSV files the text structure finder
needs to generate a regular expression that will ignore the header row
of the CSV file.

It does this by concatenating the column names separated by the delimiter
with optional quoting. However, if there are hundreds of columns this can
lead to a very long regular expression, potentially one that cannot be
evaluated by some programming languages.

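This change limits the length of the regular expression to 1000 characters
by only including elements for the first few columns when there are many,
and ending the truncated pattern with ".*". (The first column is always
included, so a single extremely long column name can still exceed the limit.)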
Matching 1000 characters of header should be sufficient to reliably
identify the header row even when it is much longer. It is extremely
unlikely that there would be a data row where the first 1000 characters
exactly matched the header but then subsequent fields diverged.

Backport of elastic#84236
  • Loading branch information
droberts195 committed Feb 22, 2022
1 parent e39413d commit 07920cd
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 12 deletions.
6 changes: 6 additions & 0 deletions docs/changelog/84236.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
pr: 84236
summary: Text structure finder caps exclude lines pattern at 1000 characters
area: Machine Learning
type: bug
issues:
- 83434
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,8 @@

public class DelimitedTextStructureFinder implements TextStructureFinder {

private static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
static final int MAX_EXCLUDE_LINES_PATTERN_LENGTH = 1000;
static final String REGEX_NEEDS_ESCAPE_PATTERN = "([\\\\|()\\[\\]{}^$.+*?])";
private static final int MAX_LEVENSHTEIN_COMPARISONS = 100;
private static final int LONG_FIELD_THRESHOLD = 100;
private final List<String> sampleMessages;
Expand Down Expand Up @@ -137,20 +138,11 @@ static DelimitedTextStructureFinder makeDelimitedTextStructureFinder(
.setColumnNames(columnNamesList);

String quote = String.valueOf(quoteChar);
String twoQuotes = quote + quote;
String quotePattern = quote.replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
String optQuotePattern = quotePattern + "?";
String delimiterPattern = (delimiter == '\t') ? "\\t" : String.valueOf(delimiter).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
if (isHeaderInText) {
structureBuilder.setExcludeLinesPattern(
"^"
+ Arrays.stream(header)
.map(
column -> optQuotePattern + column.replace(quote, twoQuotes).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1")
+ optQuotePattern
)
.collect(Collectors.joining(delimiterPattern))
);
structureBuilder.setExcludeLinesPattern(makeExcludeLinesPattern(header, quote, optQuotePattern, delimiterPattern));
}

if (trimFields) {
Expand Down Expand Up @@ -413,7 +405,7 @@ private static boolean isFirstRowUnusual(List<String> explanation, List<List<Str
for (int i = 0; numComparisons < MAX_LEVENSHTEIN_COMPARISONS && i < otherRowStrs.size(); ++i) {
for (int j = i + 1 + random.nextInt(innerIncrement); numComparisons < MAX_LEVENSHTEIN_COMPARISONS
&& j < otherRowStrs.size(); j += innerIncrement) {
otherRowStats.accept((double) levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j), shortFieldMask));
otherRowStats.accept(levenshteinFieldwiseCompareRows(otherRows.get(i), otherRows.get(j), shortFieldMask));
++numComparisons;
}
}
Expand Down Expand Up @@ -813,4 +805,37 @@ static String makeMultilineStartPattern(
explanation.add("Failed to create a suitable multi-line start pattern");
return null;
}

/**
 * Build the regular expression that Filebeat can use to skip the header line of a delimited file.
 * (The header may appear more than once when several delimited files have been concatenated.)
 *
 * The result starts with <code>^</code> and then matches the literal column names in order. Each
 * name has embedded quote characters doubled and regex metacharacters escaped, is wrapped in
 * <code>optQuotePattern</code> on both sides, and is separated from the next by <code>delimiterPattern</code>.
 *
 * To keep the expression short, columns after the first are only added while there is still room
 * for the column, a delimiter and a trailing <code>.*</code> within
 * {@link #MAX_EXCLUDE_LINES_PATTERN_LENGTH}. As soon as a column does not fit, <code>.*</code> is appended
 * and the remaining columns are dropped. The first column is always included, so the result can exceed
 * the limit when that column alone is very long.
 *
 * @param header           The column names, in file order.
 * @param quote            The quote character, as a string.
 * @param optQuotePattern  Regex fragment that matches an optional quote character.
 * @param delimiterPattern Regex fragment that matches the field delimiter.
 * @return The pattern matching the header line, possibly truncated and terminated with <code>.*</code>.
 */
static String makeExcludeLinesPattern(String[] header, String quote, String optQuotePattern, String delimiterPattern) {
    final String truncationMarker = ".*";
    final String doubledQuote = quote + quote;
    // Largest value (pattern so far + next column) may reach while still leaving room for one
    // delimiter and the truncation marker inside the overall limit
    final int fieldBudget = MAX_EXCLUDE_LINES_PATTERN_LENGTH - delimiterPattern.length() - truncationMarker.length();
    final StringBuilder pattern = new StringBuilder("^");

    for (int index = 0; index < header.length; ++index) {
        String escapedName = header[index].replace(quote, doubledQuote).replaceAll(REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
        String fieldPattern = optQuotePattern + escapedName + optQuotePattern;
        // The first column is exempt from the length check: it is always part of the pattern
        if (index > 0) {
            if (pattern.length() + fieldPattern.length() > fieldBudget) {
                // Out of room: match the rest of the line loosely and stop
                return pattern.append(truncationMarker).toString();
            }
            pattern.append(delimiterPattern);
        }
        pattern.append(fieldPattern);
    }
    return pattern.toString();
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -25,9 +25,12 @@
import static org.elasticsearch.xpack.textstructure.structurefinder.TimestampFormatFinder.stringToNumberPosBitSet;
import static org.hamcrest.Matchers.arrayContaining;
import static org.hamcrest.Matchers.contains;
import static org.hamcrest.Matchers.endsWith;
import static org.hamcrest.Matchers.equalTo;
import static org.hamcrest.Matchers.hasKey;
import static org.hamcrest.Matchers.lessThanOrEqualTo;
import static org.hamcrest.Matchers.not;
import static org.hamcrest.Matchers.startsWith;

public class DelimitedTextStructureFinderTests extends TextStructureTestCase {

Expand Down Expand Up @@ -1122,6 +1125,26 @@ public void testMultilineStartPatternDeterminationTooHard() {
assertThat(explanation, contains("Failed to create a suitable multi-line start pattern"));
}

/**
 * Checks the invariants of the exclude lines pattern for a random header: it is anchored at the
 * start of the line, stays within the maximum length, and ends with ".*" whenever the last
 * column did not make it into the pattern.
 */
public void testMakeExcludeLinesPattern() {

    // Random draws are made in a fixed order (name length, header, quote, delimiter)
    // so that a given test seed always produces the same scenario
    int columnNameLength = randomIntBetween(5, 50);
    String[] header = generateRandomStringArray(1000, columnNameLength, false, false);

    String quote = randomFrom("\"", "'");
    String optQuotePattern = quote.replaceAll(DelimitedTextStructureFinder.REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1") + "?";

    char delimiter = randomFrom(',', ';', '\t', '|');
    String delimiterPattern;
    if (delimiter == '\t') {
        delimiterPattern = "\\t";
    } else {
        delimiterPattern = String.valueOf(delimiter).replaceAll(DelimitedTextStructureFinder.REGEX_NEEDS_ESCAPE_PATTERN, "\\\\$1");
    }

    String pattern = DelimitedTextStructureFinder.makeExcludeLinesPattern(header, quote, optQuotePattern, delimiterPattern);

    assertThat(pattern, startsWith("^"));
    assertThat(pattern.length(), lessThanOrEqualTo(DelimitedTextStructureFinder.MAX_EXCLUDE_LINES_PATTERN_LENGTH));

    // If the final column name is absent the pattern must have been cut short
    String lastColumn = header[header.length - 1];
    boolean lastColumnMissing = pattern.contains(lastColumn) == false;
    if (lastColumnMissing) {
        assertThat(pattern, endsWith(".*"));
    }
}

static Map<String, Object> randomCsvProcessorSettings() {
String field = randomAlphaOfLength(10);
return DelimitedTextStructureFinder.makeCsvProcessorSettings(
Expand Down

0 comments on commit 07920cd

Please sign in to comment.