diff --git a/x-pack/plugin/build.gradle b/x-pack/plugin/build.gradle index 7daf9babb7494..bb4aab08f9e5c 100644 --- a/x-pack/plugin/build.gradle +++ b/x-pack/plugin/build.gradle @@ -121,6 +121,8 @@ tasks.named("yamlRestCompatTest").configure { 'ml/jobs_get_stats/Test get job stats after uploading data prompting the creation of some stats', 'ml/jobs_get_stats/Test get job stats for closed job', 'ml/jobs_get_stats/Test no exception on get job stats with missing index', + // TODO: remove the next one after backporting https://github.com/elastic/elasticsearch/pull/73828 + 'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines', 'ml/post_data/Test POST data job api, flush, close and verify DataCounts doc', 'ml/post_data/Test flush with skip_time', 'ml/set_upgrade_mode/Setting upgrade mode to disabled from enabled', diff --git a/x-pack/plugin/ml/qa/ml-with-security/build.gradle b/x-pack/plugin/ml/qa/ml-with-security/build.gradle index 6522aed8c5cf9..72f34486ff2cb 100644 --- a/x-pack/plugin/ml/qa/ml-with-security/build.gradle +++ b/x-pack/plugin/ml/qa/ml-with-security/build.gradle @@ -22,6 +22,8 @@ tasks.named("yamlRestTest").configure { 'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code', 'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer', 'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines', + 'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages', + 'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages', // Remove tests that are expected to throw an exception, because we cannot then // know whether to expect an authorization exception or a validation exception 'ml/calendar_crud/Test get calendar given missing', diff --git a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java index 8cfdd76c37fe2..3ea0fbb523432 100644 --- a/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java +++ b/x-pack/plugin/ml/src/main/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilter.java @@ -93,6 +93,9 @@ private CharSequence process(CharSequence input) { } addOffCorrectMap(0, prevNewlineIndex + 1); + if (endIndex < input.length()) { + addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1); + } return input.subSequence(prevNewlineIndex + 1, endIndex); } } diff --git a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java index 9c6982c674959..1ea0cbf8cdf90 100644 --- a/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java +++ b/x-pack/plugin/ml/src/test/java/org/elasticsearch/xpack/ml/job/categorization/FirstNonBlankLineCharFilterTests.java @@ -121,8 +121,11 @@ public void testCorrect() throws IOException { assertThat(new String(output), equalTo(expectedOutput)); int expectedOutputIndex = input.indexOf(expectedOutput); - for (int i = 0; i <= expectedOutput.length(); ++i) { + for (int i = 0; i < expectedOutput.length(); ++i) { assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i)); } + // When the input gets chopped by a char filter immediately after a token, that token must be reported as + // ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets + assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length())); } } diff --git a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml index c2c23162b6d5d..24242db4c1148 100644 --- a/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml +++ b/x-pack/plugin/src/yamlRestTest/resources/rest-api-spec/test/ml/ml_standard_analyze.yml @@ -111,5 +111,107 @@ - match: { tokens.0.position: 0 } - match: { tokens.1.token: "line" } - match: { tokens.1.start_offset: 10 } - - match: { tokens.1.end_offset: 14 } + - match: { tokens.1.end_offset: 26 } - match: { tokens.1.position: 1 } + +--- +"Test 7.14 analyzer with multiple multiline messages": + - do: + indices.analyze: + body: > + { + "char_filter" : [ + "first_non_blank_line" + ], + "tokenizer" : "ml_standard", + "filter" : [ + { "type" : "stop", "stopwords": [ + "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", + "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", + "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + "GMT", "UTC" + ] } + ], + "text" : [ + " \nfirst line\nsecond line", + " \nfirst line of second message\nsecond line of second message" + ] + } + - match: { tokens.0.token: "first" } + - match: { tokens.0.start_offset: 4 } + - match: { tokens.0.end_offset: 9 } + - match: { tokens.0.position: 0 } + - match: { tokens.1.token: "line" } + - match: { tokens.1.start_offset: 10 } + - match: { tokens.1.end_offset: 26 } + - match: { tokens.1.position: 1 } + - match: { tokens.2.token: "first" } + - match: { tokens.2.start_offset: 31 } + - match: { tokens.2.end_offset: 36 } + - match: { tokens.2.position: 102 } + - match: { tokens.3.token: "line" } + - match: { tokens.3.start_offset: 37 } + - match: { tokens.3.end_offset: 41 } + - match: { tokens.3.position: 103 } + - match: { tokens.4.token: "of" } + - match: { tokens.4.start_offset: 42 } + - match: { tokens.4.end_offset: 44 } + - match: { tokens.4.position: 104 } + - match: { tokens.5.token: "second" } + - match: { tokens.5.start_offset: 45 } + - match: { tokens.5.end_offset: 51 } + - match: { tokens.5.position: 105 } + - match: { tokens.6.token: "message" } + - match: { tokens.6.start_offset: 52 } + - match: { tokens.6.end_offset: 89 } + - match: { tokens.6.position: 106 } + +--- +"Test 7.14 analyzer with stop words in messages": + - do: + indices.analyze: + body: > + { + "char_filter" : [ + "first_non_blank_line" + ], + "tokenizer" : "ml_standard", + "filter" : [ + { "type" : "stop", "stopwords": [ + "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday", + "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun", + "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December", + "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec", + "GMT", "UTC" + ] } + ], + "text" : [ + "May 27, 2021 @ 19:51:15.288 UTC log message one", + "May 27, 2021 @ 19:52:25.288 UTC log message two" + ] + } + - match: { tokens.0.token: "log" } + - match: { tokens.0.start_offset: 32 } + - match: { tokens.0.end_offset: 35 } + - match: { tokens.0.position: 7 } + - match: { tokens.1.token: "message" } + - match: { tokens.1.start_offset: 36 } + - match: { tokens.1.end_offset: 43 } + - match: { tokens.1.position: 8 } + - match: { tokens.2.token: "one" } + - match: { tokens.2.start_offset: 44 } + - match: { tokens.2.end_offset: 47 } + - match: { tokens.2.position: 9 } + - match: { tokens.3.token: "log" } + - match: { tokens.3.start_offset: 80 } + - match: { tokens.3.end_offset: 83 } + - match: { tokens.3.position: 117 } + - match: { tokens.4.token: "message" } + - match: { tokens.4.start_offset: 84 } + - match: { tokens.4.end_offset: 91 } + - match: { tokens.4.position: 118 } + - match: { tokens.5.token: "two" } + - match: { tokens.5.start_offset: 92 } + - match: { tokens.5.end_offset: 95 } + - match: { tokens.5.position: 119 }