From 9e2a2dff52c95e46f41f6fc1f88ef65bbc95fb6d Mon Sep 17 00:00:00 2001 From: David Roberts Date: Wed, 25 Apr 2018 11:45:52 +0100 Subject: [PATCH 1/6] [ML] Reverse engineer Grok patterns from categorization results It has been noted that the regexes we produce in our categorization results are not that far away from Grok patterns that could be used in Logstash to categorize messages at ingest time and do better field extraction for log formats that do not have out-of-the-box patterns. This change adds a `grok_pattern` field to our GET categories API output. It's calculated from the regex and examples in the categorization result, by applying a list of candidate Grok patterns to the bits in between the tokens that are considered to define the category. This can currently be considered a prototype, as there is an outstanding question on how the functionality should work: * Is calculating the Grok patterns on the fly the best thing to do? It might be better to calculate them when categorization results are created/updated, and store the patterns in a new type of ML results document. Then we could let users manually improve the patterns and remember their edits. But the decision here needs to tie in with the end-to-end story for this functionality. If the intended flow is `ML -> User edits in the Grok debugger -> Logstash config` then maybe there's no need for ML to remember the user edits. 
--- .../ml/job/results/CategoryDefinition.java | 26 +- x-pack/plugin/ml/build.gradle | 3 +- .../action/TransportGetCategoriesAction.java | 2 +- .../categorization/GrokPatternCreator.java | 244 ++++++++++++++++++ .../xpack/ml/job/persistence/JobProvider.java | 18 +- .../AutodetectResultProcessorIT.java | 2 +- .../GrokPatternCreatorTests.java | 232 +++++++++++++++++ .../ml/job/persistence/JobProviderTests.java | 44 ++-- .../job/results/CategoryDefinitionTests.java | 3 + 9 files changed, 540 insertions(+), 34 deletions(-) create mode 100644 x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java create mode 100644 x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java diff --git a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java index 98c38241856b6..90d01f66f632b 100644 --- a/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java +++ b/x-pack/plugin/core/src/main/java/org/elasticsearch/xpack/core/ml/job/results/CategoryDefinition.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.core.ml.job.results; +import org.elasticsearch.Version; import org.elasticsearch.common.ParseField; import org.elasticsearch.common.io.stream.StreamInput; import org.elasticsearch.common.io.stream.StreamOutput; @@ -34,6 +35,7 @@ public class CategoryDefinition implements ToXContentObject, Writeable { public static final ParseField REGEX = new ParseField("regex"); public static final ParseField MAX_MATCHING_LENGTH = new ParseField("max_matching_length"); public static final ParseField EXAMPLES = new ParseField("examples"); + public static final ParseField GROK_PATTERN = new ParseField("grok_pattern"); // Used for QueryPage public static final ParseField RESULTS_FIELD = new 
ParseField("categories"); @@ -51,6 +53,7 @@ private static ConstructingObjectParser createParser(b parser.declareString(CategoryDefinition::setRegex, REGEX); parser.declareLong(CategoryDefinition::setMaxMatchingLength, MAX_MATCHING_LENGTH); parser.declareStringArray(CategoryDefinition::setExamples, EXAMPLES); + parser.declareString(CategoryDefinition::setGrokPattern, GROK_PATTERN); return parser; } @@ -61,6 +64,7 @@ private static ConstructingObjectParser createParser(b private String regex = ""; private long maxMatchingLength = 0L; private final Set examples; + private String grokPattern; public CategoryDefinition(String jobId) { this.jobId = jobId; @@ -74,6 +78,9 @@ public CategoryDefinition(StreamInput in) throws IOException { regex = in.readString(); maxMatchingLength = in.readLong(); examples = new TreeSet<>(in.readList(StreamInput::readString)); + if (in.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { + grokPattern = in.readOptionalString(); + } } @Override @@ -84,6 +91,9 @@ public void writeTo(StreamOutput out) throws IOException { out.writeString(regex); out.writeLong(maxMatchingLength); out.writeStringList(new ArrayList<>(examples)); + if (out.getVersion().onOrAfter(Version.V_7_0_0_alpha1)) { + out.writeOptionalString(grokPattern); + } } public String getJobId() { @@ -139,6 +149,14 @@ public void addExample(String example) { examples.add(example); } + public String getGrokPattern() { + return grokPattern; + } + + public void setGrokPattern(String grokPattern) { + this.grokPattern = grokPattern; + } + @Override public XContentBuilder toXContent(XContentBuilder builder, Params params) throws IOException { builder.startObject(); @@ -148,6 +166,9 @@ public XContentBuilder toXContent(XContentBuilder builder, Params params) throws builder.field(REGEX.getPreferredName(), regex); builder.field(MAX_MATCHING_LENGTH.getPreferredName(), maxMatchingLength); builder.field(EXAMPLES.getPreferredName(), examples); + if (grokPattern != null) { + 
builder.field(GROK_PATTERN.getPreferredName(), grokPattern); + } builder.endObject(); return builder; } @@ -166,11 +187,12 @@ public boolean equals(Object other) { && Objects.equals(this.terms, that.terms) && Objects.equals(this.regex, that.regex) && Objects.equals(this.maxMatchingLength, that.maxMatchingLength) - && Objects.equals(this.examples, that.examples); + && Objects.equals(this.examples, that.examples) + && Objects.equals(this.grokPattern, that.grokPattern); } @Override public int hashCode() { - return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples); + return Objects.hash(jobId, categoryId, terms, regex, maxMatchingLength, examples, grokPattern); } } diff --git a/x-pack/plugin/ml/build.gradle b/x-pack/plugin/ml/build.gradle index af2122d43d9a7..c80a751c237a6 100644 --- a/x-pack/plugin/ml/build.gradle +++ b/x-pack/plugin/ml/build.gradle @@ -46,6 +46,7 @@ dependencies { testCompile project(path: xpackModule('security'), configuration: 'testArtifacts') // ml deps + compile project(':libs:grok') compile 'net.sf.supercsv:super-csv:2.4.0' nativeBundle "org.elasticsearch.ml:ml-cpp:${project.version}@zip" testCompile 'org.ini4j:ini4j:0.5.2' @@ -85,7 +86,7 @@ task internalClusterTest(type: RandomizedTestingTask, include '**/*IT.class' systemProperty 'es.set.netty.runtime.available.processors', 'false' } -check.dependsOn internalClusterTest +check.dependsOn internalClusterTest internalClusterTest.mustRunAfter test // also add an "alias" task to make typing on the command line easier diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java index 25d0cc0cdf821..abf3a33052995 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/action/TransportGetCategoriesAction.java 
@@ -41,7 +41,7 @@ protected void doExecute(GetCategoriesAction.Request request, ActionListener listener.onResponse(new GetCategoriesAction.Response(r)), listener::onFailure, client); } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java new file mode 100644 index 0000000000000..8ab521e048204 --- /dev/null +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -0,0 +1,244 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. + */ +package org.elasticsearch.xpack.ml.job.categorization; + +import org.elasticsearch.grok.Grok; + +import java.io.BufferedReader; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.UncheckedIOException; +import java.nio.charset.StandardCharsets; +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.regex.Matcher; +import java.util.regex.Pattern; + + +/** + * Creates Grok patterns that will match all the examples in a given category_definition. + * + * The choice of field names is quite primitive. The intention is that a human will edit these. + */ +public final class GrokPatternCreator { + + private static String PREFACE = "preface"; + private static String EPILOGUE = "epilogue"; + + /** + * The first match in this list will be chosen, so it needs to be ordered + * such that more generic patterns come after more specific patterns. 
+ */ + private static final List ORDERED_CANDIDATE_GROK_PATTERNS = Arrays.asList( + new GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp"), + new GrokPatternCandidate("DATESTAMP_RFC822", "timestamp"), + new GrokPatternCandidate("DATESTAMP_RFC2822", "timestamp"), + new GrokPatternCandidate("DATESTAMP_OTHER", "timestamp"), + new GrokPatternCandidate("DATESTAMP_EVENTLOG", "timestamp"), + new GrokPatternCandidate("SYSLOGTIMESTAMP", "timestamp"), + new GrokPatternCandidate("HTTPDATE", "timestamp"), + new GrokPatternCandidate("CATALINA_DATESTAMP", "timestamp"), + new GrokPatternCandidate("TOMCAT_DATESTAMP", "timestamp"), + new GrokPatternCandidate("CISCOTIMESTAMP", "timestamp"), + new GrokPatternCandidate("DATE", "date"), + new GrokPatternCandidate("TIME", "time"), + new GrokPatternCandidate("LOGLEVEL", "loglevel"), + new GrokPatternCandidate("URI", "uri"), + new GrokPatternCandidate("UUID", "uuid"), + new GrokPatternCandidate("MAC", "macaddress"), + // Can't use \b as the breaks, because slashes are not "word" characters + new GrokPatternCandidate("PATH", "path", "(? examples) { + + // The first string in this array will end up being the empty string, and it doesn't correspond + // to an "in between" bit. Although it could be removed for "neatness", it actually makes the + // loops below slightly neater if it's left in. 
+ // + // E.g., ".*?cat.+?sat.+?mat.*" -> [ "", "cat", "sat", "mat" ] + String[] fixedRegexBits = regex.split("\\.[*+]\\??"); + + // Create a pattern that will capture the bits in between the fixed parts of the regex + // + // E.g., ".*?cat.+?sat.+?mat.*" -> Pattern (.*?)cat(.+?)sat(.+?)mat(.*) + Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL); + + List> inBetweenBits = new ArrayList<>(fixedRegexBits.length); + for (String example : examples) { + Matcher matcher = exampleProcessor.matcher(example); + if (matcher.matches()) { + assert matcher.groupCount() == fixedRegexBits.length; + // E.g., if the input regex was ".*?cat.+?sat.+?mat.*" then the example + // "the cat sat on the mat" will result in "the ", " ", " on the ", and "" + // being added to the 4 "in between" collections in that order + for (int groupNum = 1; groupNum <= matcher.groupCount(); ++groupNum) { + if (inBetweenBits.size() < groupNum) { + inBetweenBits.add(new ArrayList<>(examples.size())); + } + inBetweenBits.get(groupNum - 1).add(matcher.group(groupNum)); + } + } else { + // We should never get here. If we do it implies a bug in the original categorization, + // as it's produced a regex that doesn't match the examples. 
+ assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example; + } + } + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + // Finally, for each collection of "in between" bits we look for the best Grok pattern and incorporate + // it into the overall Grok pattern that will match each example in its entirety + for (int inBetweenBitNum = 0; inBetweenBitNum < inBetweenBits.size(); ++inBetweenBitNum) { + // Remember (from the first comment in this method) that the first element in this array is + // always the empty string + overallGrokPatternBuilder.append(fixedRegexBits[inBetweenBitNum]); + appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, inBetweenBitNum == 0, + inBetweenBitNum == fixedRegexBits.length - 1, inBetweenBits.get(inBetweenBitNum)); + } + return overallGrokPatternBuilder.toString(); + } + + /** + * Given a collection of strings, work out which (if any) of the grok patterns we're allowed + * to use matches it best. Then append the appropriate grok language to represent that finding + * onto the supplied string builder. 
+ */ + static void appendBestGrokMatchForStrings(Map fieldNameCountStore, StringBuilder overallGrokPatternBuilder, + boolean isFirst, boolean isLast, Collection mustMatchStrings) { + + GrokPatternCandidate bestCandidate = null; + for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) { + if (mustMatchStrings.stream().allMatch(candidate.grok::match)) { + bestCandidate = candidate; + break; + } + } + + if (bestCandidate == null) { + if (isLast) { + overallGrokPatternBuilder.append(".*"); + } else if (isFirst || mustMatchStrings.stream().anyMatch(String::isEmpty)) { + overallGrokPatternBuilder.append(".*?"); + } else { + overallGrokPatternBuilder.append(".+?"); + } + } else { + Collection prefaces = new ArrayList<>(); + Collection epilogues = new ArrayList<>(); + populatePrefacesAndEpilogues(mustMatchStrings, bestCandidate.grok, prefaces, epilogues); + appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, isFirst, false, prefaces); + overallGrokPatternBuilder.append("%{").append(bestCandidate.grokPatternName).append(':') + .append(buildFieldName(fieldNameCountStore, bestCandidate.fieldName)).append('}'); + appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, isLast, epilogues); + } + } + + /** + * Given a collection of strings, and a grok pattern that matches some part of them all, + * return collections of the bits that come before (prefaces) and after (epilogues) the + * bit that matches. + */ + static void populatePrefacesAndEpilogues(Collection matchingStrings, Grok grok, Collection prefaces, + Collection epilogues) { + for (String s : matchingStrings) { + Map captures = grok.captures(s); + // If the pattern doesn't match then captures will be null. But we expect this + // method to only be called after validating that the pattern does match. 
+ assert captures != null; + prefaces.add(captures.getOrDefault(PREFACE, "").toString()); + epilogues.add(captures.getOrDefault(EPILOGUE, "").toString()); + } + } + + /** + * The first time a particular field name is passed, simply return it. + * The second time return it with "2" appended. + * The third time return it with "3" appended. + * Etc. + */ + static String buildFieldName(Map fieldNameCountStore, String fieldName) { + Integer numberSeen = fieldNameCountStore.compute(fieldName, (k, v) -> 1 + ((v == null) ? 0 : v)); + if (numberSeen > 1) { + return fieldName + numberSeen; + } else { + return fieldName; + } + } + + static class GrokPatternCandidate { + + final String grokPatternName; + final String fieldName; + final Grok grok; + + /** + * Pre/post breaks default to \b, but this may not be appropriate for Grok patterns that start or + * end with a non "word" character (i.e. letter, number or underscore). For such patterns use one + * of the other constructors. + * + * In cases where the Grok pattern defined by Logstash already includes conditions on what must + * come before and after the match, use one of the other constructors and specify an empty string + * for the pre and/or post breaks. + * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. + * @param fieldName Name of the field to extract from the match. + */ + GrokPatternCandidate(String grokPatternName, String fieldName) { + this(grokPatternName, fieldName, "\\b", "\\b"); + } + + GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak) { + this(grokPatternName, fieldName, preBreak, "\\b"); + } + + /** + * @param grokPatternName Name of the Grok pattern to try to match - must match one defined in Logstash. + * @param fieldName Name of the field to extract from the match. + * @param preBreak Only consider the match if it's broken from the previous text by this. 
+ * @param postBreak Only consider the match if it's broken from the following text by this. + */ + GrokPatternCandidate(String grokPatternName, String fieldName, String preBreak, String postBreak) { + this.grokPatternName = grokPatternName; + this.fieldName = fieldName; + this.grok = new Grok(Grok.getBuiltinPatterns(), "%{DATA:" + PREFACE + "}" + preBreak + "%{" + grokPatternName + ":this}" + + postBreak + "%{GREEDYDATA:" + EPILOGUE + "}"); + } + } +} diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java index 4b15ef36e6ac7..e75a30ea264a6 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java @@ -98,6 +98,7 @@ import org.elasticsearch.xpack.core.ml.utils.ExceptionsHelper; import org.elasticsearch.xpack.core.ml.utils.MlIndicesUtils; import org.elasticsearch.xpack.core.security.support.Exceptions; +import org.elasticsearch.xpack.ml.job.categorization.GrokPatternCreator; import org.elasticsearch.xpack.ml.job.persistence.InfluencersQueryBuilder.InfluencersQuery; import org.elasticsearch.xpack.ml.job.process.autodetect.params.AutodetectParams; @@ -486,7 +487,7 @@ private T parseSearchHit(SearchHit hit, BiFunction } } - private T parseGetHit(GetResponse getResponse, BiFunction objectParser, + private T parseGetHit(GetResponse getResponse, BiFunction objectParser, Consumer errorHandler) { BytesReference source = getResponse.getSourceAsBytesRef(); @@ -629,7 +630,7 @@ public void bucketRecords(String jobId, Bucket bucket, int from, int size, boole * @param from Skip the first N categories. 
This parameter is for paging * @param size Take only this number of categories */ - public void categoryDefinitions(String jobId, Long categoryId, Integer from, Integer size, + public void categoryDefinitions(String jobId, Long categoryId, boolean augment, Integer from, Integer size, Consumer> handler, Consumer errorHandler, Client client) { if (categoryId != null && (from != null || size != null)) { @@ -663,6 +664,9 @@ public void categoryDefinitions(String jobId, Long categoryId, Integer from, Int XContentParser parser = XContentFactory.xContent(XContentHelper.xContentType(source)) .createParser(NamedXContentRegistry.EMPTY, LoggingDeprecationHandler.INSTANCE, stream)) { CategoryDefinition categoryDefinition = CategoryDefinition.LENIENT_PARSER.apply(parser, null); + if (augment) { + augmentWithGrokPattern(categoryDefinition); + } results.add(categoryDefinition); } catch (IOException e) { throw new ElasticsearchParseException("failed to parse category definition", e); @@ -674,6 +678,16 @@ public void categoryDefinitions(String jobId, Long categoryId, Integer from, Int }, e -> errorHandler.accept(mapAuthFailure(e, jobId, GetCategoriesAction.NAME))), client::search); } + void augmentWithGrokPattern(CategoryDefinition categoryDefinition) { + List examples = categoryDefinition.getExamples(); + String regex = categoryDefinition.getRegex(); + if (examples.isEmpty() || regex.isEmpty()) { + categoryDefinition.setGrokPattern(""); + } else { + categoryDefinition.setGrokPattern(GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + } + } + /** * Search for anomaly records with the parameters in the * {@link RecordsQueryBuilder} diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/AutodetectResultProcessorIT.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/AutodetectResultProcessorIT.java index 484d1648fbbb2..09bb3f7591677 100644 --- 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/AutodetectResultProcessorIT.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/integration/AutodetectResultProcessorIT.java @@ -461,7 +461,7 @@ private QueryPage getCategoryDefinition(long categoryId) thr AtomicReference errorHolder = new AtomicReference<>(); AtomicReference> resultHolder = new AtomicReference<>(); CountDownLatch latch = new CountDownLatch(1); - jobProvider.categoryDefinitions(JOB_ID, categoryId, null, null, r -> { + jobProvider.categoryDefinitions(JOB_ID, categoryId, false, null, null, r -> { resultHolder.set(r); latch.countDown(); }, e -> { diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java new file mode 100644 index 0000000000000..d5d8b517c62ea --- /dev/null +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java @@ -0,0 +1,232 @@ +/* + * Copyright Elasticsearch B.V. and/or licensed to Elasticsearch B.V. under one + * or more contributor license agreements. Licensed under the Elastic License; + * you may not use this file except in compliance with the Elastic License. 
+ */ +package org.elasticsearch.xpack.ml.job.categorization; + +import org.elasticsearch.grok.Grok; +import org.elasticsearch.test.ESTestCase; + +import java.util.ArrayList; +import java.util.Arrays; +import java.util.Collection; +import java.util.HashMap; +import java.util.Map; + +import static org.hamcrest.Matchers.containsInAnyOrder; + +public class GrokPatternCreatorTests extends ESTestCase { + + public void testBuildFieldName() { + Map fieldNameCountStore = new HashMap<>(); + assertEquals("field", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field")); + assertEquals("field2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field")); + assertEquals("field3", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field")); + assertEquals("timestamp", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp")); + assertEquals("field4", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field")); + assertEquals("uri", GrokPatternCreator.buildFieldName(fieldNameCountStore, "uri")); + assertEquals("timestamp2", GrokPatternCreator.buildFieldName(fieldNameCountStore, "timestamp")); + assertEquals("field5", GrokPatternCreator.buildFieldName(fieldNameCountStore, "field")); + } + + public void testPopulatePrefacesAndEpiloguesGivenTimestamp() { + + Collection matchingStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ", + "[2018-01-24T12:33:23] ERROR ", + "junk [2018-01-22T07:33:23] INFO ", + "[2018-01-21T03:33:23] DEBUG "); + Grok grok = new GrokPatternCreator.GrokPatternCandidate("TIMESTAMP_ISO8601", "timestamp").grok; + Collection prefaces = new ArrayList<>(); + Collection epilogues = new ArrayList<>(); + + GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues); + + assertThat(prefaces, containsInAnyOrder("[", "[", "junk [", "[")); + assertThat(epilogues, containsInAnyOrder("] DEBUG ", "] ERROR ", "] INFO ", "] DEBUG ")); + } + + public void testPopulatePrefacesAndEpiloguesGivenEmailAddress() { + 
+ Collection matchingStrings = Arrays.asList("before alice@acme.com after", + "abc bob@acme.com xyz", + "carol@acme.com"); + Grok grok = new GrokPatternCreator.GrokPatternCandidate("EMAILADDRESS", "email").grok; + Collection prefaces = new ArrayList<>(); + Collection epilogues = new ArrayList<>(); + + GrokPatternCreator.populatePrefacesAndEpilogues(matchingStrings, grok, prefaces, epilogues); + + assertThat(prefaces, containsInAnyOrder("before ", "abc ", "")); + assertThat(epilogues, containsInAnyOrder(" after", " xyz", "")); + } + + public void testAppendBestGrokMatchForStringsGivenTimestampsAndLogLevels() { + + Collection mustMatchStrings = Arrays.asList("[2018-01-25T15:33:23] DEBUG ", + "[2018-01-24T12:33:23] ERROR ", + "junk [2018-01-22T07:33:23] INFO ", + "[2018-01-21T03:33:23] DEBUG "); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".+?%{TIMESTAMP_ISO8601:timestamp}.+?%{LOGLEVEL:loglevel}.+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenNumbersInBrackets() { + + Collection mustMatchStrings = Arrays.asList("(-2)", + " (-3)", + " (4)", + " (-5) "); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".+?%{NUMBER:field}.+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenNegativeNumbersWithoutBreak() { + + Collection mustMatchStrings = Arrays.asList("before-2 ", + "prior to-3", + "-4"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + 
GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + // It seems sensible that we don't detect these suffixes as either base 10 or base 16 numbers + assertEquals(".+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenHexNumbers() { + + Collection mustMatchStrings = Arrays.asList(" abc", + " 123", + " -123", + "1f is hex"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".*?%{BASE16NUM:field}.*?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenHostnamesWithNumbers() { + + Collection mustMatchStrings = Arrays.asList(" fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + // We don't want the .1. 
in the middle to get detected as a hex number + assertEquals(".+?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenEmailAddresses() { + + Collection mustMatchStrings = Arrays.asList("before alice@acme.com after", + "abc bob@acme.com xyz", + "carol@acme.com"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".*?%{EMAILADDRESS:email}.*?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenUris() { + + Collection mustMatchStrings = Arrays.asList("main site https://www.elastic.co/ with trailing slash", + "https://www.elastic.co/guide/en/x-pack/current/ml-configuring-categories.html#ml-configuring-categories is a section", + "download today from https://www.elastic.co/downloads"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".*?%{URI:uri}.*?", overallGrokPatternBuilder.toString()); + } + + public void testAppendBestGrokMatchForStringsGivenPaths() { + + Collection mustMatchStrings = Arrays.asList("on Mac /Users/dave", + "on Windows C:\\Users\\dave", + "on Linux /home/dave"); + + Map fieldNameCountStore = new HashMap<>(); + StringBuilder overallGrokPatternBuilder = new StringBuilder(); + + GrokPatternCreator.appendBestGrokMatchForStrings(fieldNameCountStore, overallGrokPatternBuilder, false, false, mustMatchStrings); + + assertEquals(".+?%{PATH:path}.*?", overallGrokPatternBuilder.toString()); + } + + public void testFindBestGrokMatchFromExamplesGivenNamedLogs() { + + String regex = 
".*?linux.+?named.+?error.+?unexpected.+?RCODE.+?REFUSED.+?resolving.*"; + Collection examples = Arrays.asList( + "Sep 8 11:55:06 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'elastic.slack.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:08 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'slack-imgs.com/A/IN': 95.110.64.205#53", + "Sep 8 11:55:35 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'www.elastic.co/A/IN': 95.110.68.206#53", + "Sep 8 11:55:42 linux named[22529]: error (unexpected RCODE REFUSED) resolving 'b.akamaiedge.net/A/IN': 95.110.64.205#53"); + + assertEquals(".*?%{SYSLOGTIMESTAMP:timestamp}.+?linux.+?named.+?%{NUMBER:field}.+?error.+?" + + "unexpected.+?RCODE.+?REFUSED.+?resolving.+?%{QUOTEDSTRING:field2}.+?%{IP:ipaddress}.+?%{NUMBER:field3}.*", + GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + } + + public void testFindBestGrokMatchFromExamplesGivenCatalinaLogs() { + + String regex = ".*?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?WARNING.+?Parameters.+?" + + "Invalid.+?chunk.+?ignored.*"; + // The embedded newline ensures the regular expressions we're using are compiled with Pattern.DOTALL + Collection examples = Arrays.asList( + "Aug 29, 2009 12:03:33 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:40 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:45 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored.", + "Aug 29, 2009 12:03:57 AM org.apache.tomcat.util.http.Parameters processParameters\nWARNING: Parameters: " + + "Invalid chunk ignored."); + + assertEquals(".*?%{CATALINA_DATESTAMP:timestamp}.+?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?" 
+ + "WARNING.+?Parameters.+?Invalid.+?chunk.+?ignored.*", + GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + } + + public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() { + + String regex = ".*?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*"; + // Two timestamps: one local, one UTC + Collection examples = Arrays.asList( + "559550912540598297\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t38545844\tserv02nw07\t192.168.114.28\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986880\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t9049724\tserv02nw03\t10.120.48.147\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912548986887\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t884343\tserv02tw03\t192.168.121.189\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp", + "559550912603512850\t2016-04-20T14:06:53\t2016-04-20T21:06:53Z\t8907014\tserv02nw01\t192.168.118.208\tAuthpriv\t" + + "Info\tsshd\tsubsystem request for sftp"); + + assertEquals(".*?%{NUMBER:field}.+?%{TIMESTAMP_ISO8601:timestamp}.+?%{TIMESTAMP_ISO8601:timestamp2}.+?%{NUMBER:field2}.+?" 
+ + "%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*", + GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + } +} diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/persistence/JobProviderTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/persistence/JobProviderTests.java index 485fe44a95fa9..9fea904a99fa1 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/persistence/JobProviderTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/persistence/JobProviderTests.java @@ -61,7 +61,6 @@ import java.util.HashMap; import java.util.List; import java.util.Map; -import java.util.concurrent.ExecutionException; import java.util.concurrent.atomic.AtomicReference; import java.util.function.Consumer; @@ -235,8 +234,7 @@ public void onFailure(Exception e) { }); } - public void testBuckets_OneBucketNoInterim() - throws InterruptedException, ExecutionException, IOException { + public void testBuckets_OneBucketNoInterim() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -268,8 +266,7 @@ public void testBuckets_OneBucketNoInterim() ".*")); } - public void testBuckets_OneBucketInterim() - throws InterruptedException, ExecutionException, IOException { + public void testBuckets_OneBucketInterim() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -302,8 +299,7 @@ public void testBuckets_OneBucketInterim() assertFalse(queryString.matches("(?s).*is_interim.*")); } - public void testBuckets_UsingBuilder() - throws InterruptedException, ExecutionException, IOException { + public void testBuckets_UsingBuilder() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -339,8 +335,7 @@ public void testBuckets_UsingBuilder() 
assertFalse(queryString.matches("(?s).*is_interim.*")); } - public void testBucket_NoBucketNoExpand() - throws InterruptedException, ExecutionException, IOException { + public void testBucket_NoBucketNoExpand() throws IOException { String jobId = "TestJobIdentification"; Long timestamp = 98765432123456789L; List> source = new ArrayList<>(); @@ -357,8 +352,7 @@ public void testBucket_NoBucketNoExpand() assertEquals(ResourceNotFoundException.class, holder[0].getClass()); } - public void testBucket_OneBucketNoExpand() - throws InterruptedException, ExecutionException, IOException { + public void testBucket_OneBucketNoExpand() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -384,7 +378,7 @@ public void testBucket_OneBucketNoExpand() assertEquals(now, b.getTimestamp()); } - public void testRecords() throws InterruptedException, ExecutionException, IOException { + public void testRecords() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -431,8 +425,7 @@ public void testRecords() throws InterruptedException, ExecutionException, IOExc assertEquals("irrascible", records.get(1).getFunction()); } - public void testRecords_UsingBuilder() - throws InterruptedException, ExecutionException, IOException { + public void testRecords_UsingBuilder() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); List> source = new ArrayList<>(); @@ -485,7 +478,7 @@ public void testRecords_UsingBuilder() assertEquals("irrascible", records.get(1).getFunction()); } - public void testBucketRecords() throws InterruptedException, ExecutionException, IOException { + public void testBucketRecords() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); Bucket bucket = mock(Bucket.class); @@ -532,7 +525,7 @@ public void testBucketRecords() throws InterruptedException, ExecutionException, 
assertEquals("irrascible", records.get(1).getFunction()); } - public void testexpandBucket() throws InterruptedException, ExecutionException, IOException { + public void testexpandBucket() throws IOException { String jobId = "TestJobIdentification"; Date now = new Date(); Bucket bucket = new Bucket("foo", now, 22); @@ -559,8 +552,7 @@ public void testexpandBucket() throws InterruptedException, ExecutionException, assertEquals(400L, records); } - public void testCategoryDefinitions() - throws InterruptedException, ExecutionException, IOException { + public void testCategoryDefinitions() throws IOException { String jobId = "TestJobIdentification"; String terms = "the terms and conditions are not valid here"; List> source = new ArrayList<>(); @@ -580,15 +572,14 @@ public void testCategoryDefinitions() JobProvider provider = createProvider(client); @SuppressWarnings({"unchecked", "rawtypes"}) QueryPage[] holder = new QueryPage[1]; - provider.categoryDefinitions(jobId, null, from, size, r -> holder[0] = r, + provider.categoryDefinitions(jobId, null, false, from, size, r -> holder[0] = r, e -> {throw new RuntimeException(e);}, client); QueryPage categoryDefinitions = holder[0]; assertEquals(1L, categoryDefinitions.count()); assertEquals(terms, categoryDefinitions.results().get(0).getTerms()); } - public void testCategoryDefinition() - throws InterruptedException, ExecutionException, IOException { + public void testCategoryDefinition() throws IOException { String jobId = "TestJobIdentification"; String terms = "the terms and conditions are not valid here"; @@ -603,14 +594,14 @@ public void testCategoryDefinition() JobProvider provider = createProvider(client); @SuppressWarnings({"unchecked", "rawtypes"}) QueryPage[] holder = new QueryPage[1]; - provider.categoryDefinitions(jobId, categoryId, null, null, + provider.categoryDefinitions(jobId, categoryId, false, null, null, r -> holder[0] = r, e -> {throw new RuntimeException(e);}, client); QueryPage categoryDefinitions = 
holder[0]; assertEquals(1L, categoryDefinitions.count()); assertEquals(terms, categoryDefinitions.results().get(0).getTerms()); } - public void testInfluencers_NoInterim() throws InterruptedException, ExecutionException, IOException { + public void testInfluencers_NoInterim() throws IOException { String jobId = "TestJobIdentificationForInfluencers"; Date now = new Date(); List> source = new ArrayList<>(); @@ -670,7 +661,7 @@ public void testInfluencers_NoInterim() throws InterruptedException, ExecutionEx assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001); } - public void testInfluencers_WithInterim() throws InterruptedException, ExecutionException, IOException { + public void testInfluencers_WithInterim() throws IOException { String jobId = "TestJobIdentificationForInfluencers"; Date now = new Date(); List> source = new ArrayList<>(); @@ -730,7 +721,7 @@ public void testInfluencers_WithInterim() throws InterruptedException, Execution assertEquals(5.0, records.get(1).getInitialInfluencerScore(), 0.00001); } - public void testModelSnapshots() throws InterruptedException, ExecutionException, IOException { + public void testModelSnapshots() throws IOException { String jobId = "TestJobIdentificationForInfluencers"; Date now = new Date(); List> source = new ArrayList<>(); @@ -851,8 +842,7 @@ private static GetResponse createGetResponse(boolean exists, Map return getResponse; } - private static SearchResponse createSearchResponse(List> source) - throws IOException { + private static SearchResponse createSearchResponse(List> source) throws IOException { SearchResponse response = mock(SearchResponse.class); List list = new ArrayList<>(); diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/CategoryDefinitionTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/CategoryDefinitionTests.java index fdaa28508235a..ee7d4ad4b7add 100644 --- 
a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/CategoryDefinitionTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/results/CategoryDefinitionTests.java @@ -25,6 +25,9 @@ public CategoryDefinition createTestInstance(String jobId) { categoryDefinition.setRegex(randomAlphaOfLength(10)); categoryDefinition.setMaxMatchingLength(randomLong()); categoryDefinition.setExamples(Arrays.asList(generateRandomStringArray(10, 10, false))); + if (randomBoolean()) { + categoryDefinition.setGrokPattern(randomAlphaOfLength(50)); + } return categoryDefinition; } From aebba0485de523de7a3cfa6120bf64849ec458b1 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Tue, 1 May 2018 10:30:21 +0100 Subject: [PATCH 2/6] Address review comments --- .../categorization/GrokPatternCreator.java | 29 +++++++++++-------- .../xpack/ml/job/persistence/JobProvider.java | 3 +- .../GrokPatternCreatorTests.java | 6 ++-- 3 files changed, 22 insertions(+), 16 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java index 8ab521e048204..b4f475cd8de14 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -5,6 +5,7 @@ */ package org.elasticsearch.xpack.ml.job.categorization; +import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.grok.Grok; import java.io.BufferedReader; @@ -87,7 +88,7 @@ private GrokPatternCreator() { * expectation is that a user will adjust the extracted field names based on their domain * knowledge. 
*/ - public static String findBestGrokMatchFromExamples(String regex, Collection examples) { + public static String findBestGrokMatchFromExamples(String jobId, String regex, Collection examples) { // The first string in this array will end up being the empty string, and it doesn't correspond // to an "in between" bit. Although it could be removed for "neatness", it actually makes the @@ -101,7 +102,10 @@ public static String findBestGrokMatchFromExamples(String regex, Collection Pattern (.*?)cat(.+?)sat(.+?)mat(.*) Pattern exampleProcessor = Pattern.compile(regex.replaceAll("(\\.[*+]\\??)", "($1)"), Pattern.DOTALL); - List> inBetweenBits = new ArrayList<>(fixedRegexBits.length); + List> groupsMatchesFromExamples = new ArrayList<>(fixedRegexBits.length); + for (int i = 0; i < fixedRegexBits.length; ++i) { + groupsMatchesFromExamples.add(new ArrayList<>(examples.size())); + } for (String example : examples) { Matcher matcher = exampleProcessor.matcher(example); if (matcher.matches()) { @@ -110,15 +114,14 @@ public static String findBestGrokMatchFromExamples(String regex, Collection(examples.size())); - } - inBetweenBits.get(groupNum - 1).add(matcher.group(groupNum)); + groupsMatchesFromExamples.get(groupNum - 1).add(matcher.group(groupNum)); } } else { // We should never get here. If we do it implies a bug in the original categorization, // as it's produced a regex that doesn't match the examples. 
assert matcher.matches() : exampleProcessor.pattern() + " did not match " + example; + Loggers.getLogger(GrokPatternCreator.class).error("[{}] Pattern [{}] did not match example [{}]", jobId, + exampleProcessor.pattern(), example); } } @@ -126,12 +129,12 @@ public static String findBestGrokMatchFromExamples(String regex, Collection fieldNameCountSto boolean isFirst, boolean isLast, Collection mustMatchStrings) { GrokPatternCandidate bestCandidate = null; - for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) { - if (mustMatchStrings.stream().allMatch(candidate.grok::match)) { - bestCandidate = candidate; - break; + if (mustMatchStrings.isEmpty() == false) { + for (GrokPatternCandidate candidate : ORDERED_CANDIDATE_GROK_PATTERNS) { + if (mustMatchStrings.stream().allMatch(candidate.grok::match)) { + bestCandidate = candidate; + break; + } } } diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java index e75a30ea264a6..791187d9fc589 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java @@ -684,7 +684,8 @@ void augmentWithGrokPattern(CategoryDefinition categoryDefinition) { if (examples.isEmpty() || regex.isEmpty()) { categoryDefinition.setGrokPattern(""); } else { - categoryDefinition.setGrokPattern(GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + categoryDefinition.setGrokPattern(GrokPatternCreator.findBestGrokMatchFromExamples(categoryDefinition.getJobId(), + regex, examples)); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java index d5d8b517c62ea..4189dc35f0caa 
100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreatorTests.java @@ -188,7 +188,7 @@ public void testFindBestGrokMatchFromExamplesGivenNamedLogs() { assertEquals(".*?%{SYSLOGTIMESTAMP:timestamp}.+?linux.+?named.+?%{NUMBER:field}.+?error.+?" + "unexpected.+?RCODE.+?REFUSED.+?resolving.+?%{QUOTEDSTRING:field2}.+?%{IP:ipaddress}.+?%{NUMBER:field3}.*", - GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples)); } public void testFindBestGrokMatchFromExamplesGivenCatalinaLogs() { @@ -208,7 +208,7 @@ public void testFindBestGrokMatchFromExamplesGivenCatalinaLogs() { assertEquals(".*?%{CATALINA_DATESTAMP:timestamp}.+?org\\.apache\\.tomcat\\.util\\.http\\.Parameters.+?processParameters.+?" + "WARNING.+?Parameters.+?Invalid.+?chunk.+?ignored.*", - GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples)); } public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() { @@ -227,6 +227,6 @@ public void testFindBestGrokMatchFromExamplesGivenMultiTimestampLogs() { assertEquals(".*?%{NUMBER:field}.+?%{TIMESTAMP_ISO8601:timestamp}.+?%{TIMESTAMP_ISO8601:timestamp2}.+?%{NUMBER:field2}.+?" 
+ "%{IP:ipaddress}.+?Authpriv.+?Info.+?sshd.+?subsystem.+?request.+?for.+?sftp.*", - GrokPatternCreator.findBestGrokMatchFromExamples(regex, examples)); + GrokPatternCreator.findBestGrokMatchFromExamples("foo", regex, examples)); } } From 3797beb6eb90192ef8ba7641b183169a421aeac5 Mon Sep 17 00:00:00 2001 From: David Roberts Date: Thu, 10 May 2018 17:05:24 +0100 Subject: [PATCH 3/6] Remove redundant imports --- .../xpack/ml/job/categorization/GrokPatternCreator.java | 6 ------ 1 file changed, 6 deletions(-) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java index b4f475cd8de14..04280261b2634 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/GrokPatternCreator.java @@ -8,15 +8,9 @@ import org.elasticsearch.common.logging.Loggers; import org.elasticsearch.grok.Grok; -import java.io.BufferedReader; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.UncheckedIOException; -import java.nio.charset.StandardCharsets; import java.util.ArrayList; import java.util.Arrays; import java.util.Collection; -import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; From 6ad523a29fd9ef3549ad85be31e932c69424ad2a Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 14 May 2018 10:32:45 +0100 Subject: [PATCH 4/6] Update docs --- .../docs/en/rest-api/ml/get-category.asciidoc | 24 +++++++++++-------- .../en/rest-api/ml/resultsresource.asciidoc | 7 ++++++ 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/x-pack/docs/en/rest-api/ml/get-category.asciidoc b/x-pack/docs/en/rest-api/ml/get-category.asciidoc index 37d0a95c14c71..9e69083355bbb 100644 --- a/x-pack/docs/en/rest-api/ml/get-category.asciidoc 
+++ b/x-pack/docs/en/rest-api/ml/get-category.asciidoc @@ -62,11 +62,11 @@ roles provide these privileges. For more information, see ==== Examples The following example gets information about one category for the -`it_ops_new_logs` job: +`esxi_log` job: [source,js] -------------------------------------------------- -GET _xpack/ml/anomaly_detectors/it_ops_new_logs/results/categories +GET _xpack/ml/anomaly_detectors/esxi_log/results/categories { "page":{ "size": 1 @@ -83,14 +83,18 @@ In this example, the API returns the following information: "count": 11, "categories": [ { - "job_id": "it_ops_new_logs", - "category_id": 1, - "terms": "Actual Transaction Already Voided Reversed hostname dbserver.acme.com physicalhost esxserver1.acme.com vmhost app1.acme.com", - "regex": ".*?Actual.+?Transaction.+?Already.+?Voided.+?Reversed.+?hostname.+?dbserver.acme.com.+?physicalhost.+?esxserver1.acme.com.+?vmhost.+?app1.acme.com.*", - "max_matching_length": 137, - "examples": [ - "Actual Transaction Already Voided / Reversed;hostname=dbserver.acme.com;physicalhost=esxserver1.acme.com;vmhost=app1.acme.com" - ] + "job_id" : "esxi_log", + "category_id" : 1, + "terms" : "Vpxa verbose vpxavpxaInvtVm opID VpxaInvtVmChangeListener Guest DiskInfo Changed", + "regex" : ".*?Vpxa.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*", + "max_matching_length": 154, + "examples" : [ + "Oct 19 17:04:44 esxi1.acme.com Vpxa: [3CB3FB90 verbose 'vpxavpxaInvtVm' opID=WFU-33d82c31] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", + "Oct 19 17:04:45 esxi2.acme.com Vpxa: [3CA66B90 verbose 'vpxavpxaInvtVm' opID=WFU-33927856] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", + "Oct 19 17:04:51 esxi1.acme.com Vpxa: [FFDBAB90 verbose 'vpxavpxaInvtVm' opID=WFU-25e0d447] [VpxaInvtVmChangeListener] Guest DiskInfo Changed", + "Oct 19 17:04:58 esxi2.acme.com Vpxa: [FFDDBB90 verbose 'vpxavpxaInvtVm' opID=WFU-bbff0134] [VpxaInvtVmChangeListener] Guest DiskInfo Changed" + 
], + "grok_pattern" : ".*?%{SYSLOGTIMESTAMP:timestamp}.+?Vpxa.+?%{BASE16NUM:field}.+?verbose.+?vpxavpxaInvtVm.+?opID.+?VpxaInvtVmChangeListener.+?Guest.+?DiskInfo.+?Changed.*" } ] } diff --git a/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc b/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc index fba6522141bf7..b5c8875399331 100644 --- a/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc +++ b/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc @@ -405,6 +405,13 @@ A category resource has the following properties: `examples`:: (array) A list of examples of actual values that matched the category. +`grok_pattern`:: + (string) A Grok pattern that could be used in Logstash or an Ingest Pipeline + to extract fields from messages that match the category. This field is + experimental and may be changed or removed in a future version. The Grok + patterns that are found are not optimal, but are often a good starting point + for manual tweaking. + `job_id`:: (string) The unique identifier for the job that these results belong to. From b0f99cc39a5cc2926a17dc48050610c8f5aa311f Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 14 May 2018 12:08:05 +0100 Subject: [PATCH 5/6] Add new argument to Javadoc --- .../org/elasticsearch/xpack/ml/job/persistence/JobProvider.java | 1 + 1 file changed, 1 insertion(+) diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java index 791187d9fc589..d7b10fb622bdf 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/persistence/JobProvider.java @@ -627,6 +627,7 @@ public void bucketRecords(String jobId, Bucket bucket, int from, int size, boole * Get a page of {@linkplain CategoryDefinition}s for the given jobId. 
* Uses a supplied client, so may run as the currently authenticated user * @param jobId the job id + * @param augment Should the category definition be augmented with a Grok pattern? * @param from Skip the first N categories. This parameter is for paging * @param size Take only this number of categories */ From 70dd1d4eae8505891db058d08c132331f83536ba Mon Sep 17 00:00:00 2001 From: David Roberts Date: Mon, 14 May 2018 16:32:43 +0100 Subject: [PATCH 6/6] Add experimental markup in docs --- x-pack/docs/en/rest-api/ml/resultsresource.asciidoc | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc b/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc index b5c8875399331..c28ed72aedb36 100644 --- a/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc +++ b/x-pack/docs/en/rest-api/ml/resultsresource.asciidoc @@ -406,11 +406,11 @@ A category resource has the following properties: (array) A list of examples of actual values that matched the category. `grok_pattern`:: - (string) A Grok pattern that could be used in Logstash or an Ingest Pipeline - to extract fields from messages that match the category. This field is - experimental and may be changed or removed in a future version. The Grok - patterns that are found are not optimal, but are often a good starting point - for manual tweaking. + experimental[] (string) A Grok pattern that could be used in Logstash or an + Ingest Pipeline to extract fields from messages that match the category. This + field is experimental and may be changed or removed in a future release. The + Grok patterns that are found are not optimal, but are often a good starting + point for manual tweaking. `job_id`:: (string) The unique identifier for the job that these results belong to.