Skip to content

Commit

Permalink
[ML] Fix end offset for first_non_blank_line char_filter (#73828)
Browse files Browse the repository at this point in the history
When the input gets chopped by a char_filter immediately after
a token, that token must be reported as ending at the very end
of the original input, otherwise analysis will have incorrect
offsets when multiple field values are analyzed in the same
_analyze request.

The pattern_replace filter already works like this. This PR changes
the new first_non_blank_line filter to work in the same way.

Fixes elastic/kibana#101255
  • Loading branch information
droberts195 authored Jun 7, 2021
1 parent 6b7fea0 commit 334ad82
Show file tree
Hide file tree
Showing 5 changed files with 114 additions and 2 deletions.
2 changes: 2 additions & 0 deletions x-pack/plugin/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -121,6 +121,8 @@ tasks.named("yamlRestCompatTest").configure {
'ml/jobs_get_stats/Test get job stats after uploading data prompting the creation of some stats',
'ml/jobs_get_stats/Test get job stats for closed job',
'ml/jobs_get_stats/Test no exception on get job stats with missing index',
// TODO: remove the next one after backporting https://github.com/elastic/elasticsearch/pull/73828
'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
'ml/post_data/Test POST data job api, flush, close and verify DataCounts doc',
'ml/post_data/Test flush with skip_time',
'ml/set_upgrade_mode/Setting upgrade mode to disabled from enabled',
Expand Down
2 changes: 2 additions & 0 deletions x-pack/plugin/ml/qa/ml-with-security/build.gradle
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@ tasks.named("yamlRestTest").configure {
'ml/ml_classic_analyze/Test analyze API with an analyzer that does what we used to do in native code',
'ml/ml_standard_analyze/Test analyze API with the standard 7.14 ML analyzer',
'ml/ml_standard_analyze/Test 7.14 analyzer with blank lines',
'ml/ml_standard_analyze/Test 7.14 analyzer with multiple multiline messages',
'ml/ml_standard_analyze/Test 7.14 analyzer with stop words in messages',
// Remove tests that are expected to throw an exception, because we cannot then
// know whether to expect an authorization exception or a validation exception
'ml/calendar_crud/Test get calendar given missing',
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -93,6 +93,9 @@ private CharSequence process(CharSequence input) {
}

addOffCorrectMap(0, prevNewlineIndex + 1);
if (endIndex < input.length()) {
addOffCorrectMap(endIndex - prevNewlineIndex - 1, input.length() - endIndex + prevNewlineIndex + 1);
}
return input.subSequence(prevNewlineIndex + 1, endIndex);
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -121,8 +121,11 @@ public void testCorrect() throws IOException {
assertThat(new String(output), equalTo(expectedOutput));

int expectedOutputIndex = input.indexOf(expectedOutput);
for (int i = 0; i <= expectedOutput.length(); ++i) {
for (int i = 0; i < expectedOutput.length(); ++i) {
assertThat(filter.correctOffset(i), equalTo(expectedOutputIndex + i));
}
// When the input gets chopped by a char filter immediately after a token, that token must be reported as
// ending at the very end of the original input, otherwise multi-message analysis will have incorrect offsets
assertThat(filter.correctOffset(expectedOutput.length()), equalTo(input.length()));
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -111,5 +111,107 @@
- match: { tokens.0.position: 0 }
- match: { tokens.1.token: "line" }
- match: { tokens.1.start_offset: 10 }
- match: { tokens.1.end_offset: 14 }
- match: { tokens.1.end_offset: 26 }
- match: { tokens.1.position: 1 }

---
# Verifies that when two multiline messages are analyzed in one _analyze call,
# the last token of each message is reported as ending at the end of that
# message's original input, so per-message offsets stay correct.
"Test 7.14 analyzer with multiple multiline messages":
  - do:
      indices.analyze:
        body: >
          {
            "char_filter" : [
              "first_non_blank_line"
            ],
            "tokenizer" : "ml_standard",
            "filter" : [
              { "type" : "stop", "stopwords": [
                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
                "GMT", "UTC"
              ] }
            ],
            "text" : [
              " \nfirst line\nsecond line",
              " \nfirst line of second message\nsecond line of second message"
            ]
          }
  # First message: char_filter keeps only "first line"; the final token's
  # end_offset (26) is the full length of the first input value.
  - match: { tokens.0.token: "first" }
  - match: { tokens.0.start_offset: 4 }
  - match: { tokens.0.end_offset: 9 }
  - match: { tokens.0.position: 0 }
  - match: { tokens.1.token: "line" }
  - match: { tokens.1.start_offset: 10 }
  - match: { tokens.1.end_offset: 26 }
  - match: { tokens.1.position: 1 }
  # Second message: offsets continue after the first value (plus the standard
  # position/offset gap); the last token again ends at the end of the input.
  - match: { tokens.2.token: "first" }
  - match: { tokens.2.start_offset: 31 }
  - match: { tokens.2.end_offset: 36 }
  - match: { tokens.2.position: 102 }
  - match: { tokens.3.token: "line" }
  - match: { tokens.3.start_offset: 37 }
  - match: { tokens.3.end_offset: 41 }
  - match: { tokens.3.position: 103 }
  - match: { tokens.4.token: "of" }
  - match: { tokens.4.start_offset: 42 }
  - match: { tokens.4.end_offset: 44 }
  - match: { tokens.4.position: 104 }
  - match: { tokens.5.token: "second" }
  - match: { tokens.5.start_offset: 45 }
  - match: { tokens.5.end_offset: 51 }
  - match: { tokens.5.position: 105 }
  - match: { tokens.6.token: "message" }
  - match: { tokens.6.start_offset: 52 }
  - match: { tokens.6.end_offset: 89 }
  - match: { tokens.6.position: 106 }

---
# Verifies offset bookkeeping when stop-word filtering removes leading tokens
# (dates/timezones) from each of two messages analyzed in one _analyze call.
"Test 7.14 analyzer with stop words in messages":
  - do:
      indices.analyze:
        body: >
          {
            "char_filter" : [
              "first_non_blank_line"
            ],
            "tokenizer" : "ml_standard",
            "filter" : [
              { "type" : "stop", "stopwords": [
                "Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday",
                "Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun",
                "January", "February", "March", "April", "May", "June", "July", "August", "September", "October", "November", "December",
                "Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
                "GMT", "UTC"
              ] }
            ],
            "text" : [
              "May 27, 2021 @ 19:51:15.288 UTC log message one",
              "May 27, 2021 @ 19:52:25.288 UTC log message two"
            ]
          }
  # First message: the date/timezone tokens are stopped, so the first surviving
  # token ("log") starts at offset 32 within the first value.
  - match: { tokens.0.token: "log" }
  - match: { tokens.0.start_offset: 32 }
  - match: { tokens.0.end_offset: 35 }
  - match: { tokens.0.position: 7 }
  - match: { tokens.1.token: "message" }
  - match: { tokens.1.start_offset: 36 }
  - match: { tokens.1.end_offset: 43 }
  - match: { tokens.1.position: 8 }
  - match: { tokens.2.token: "one" }
  - match: { tokens.2.start_offset: 44 }
  - match: { tokens.2.end_offset: 47 }
  - match: { tokens.2.position: 9 }
  # Second message: offsets and positions continue past the first value with
  # the standard inter-value gap applied.
  - match: { tokens.3.token: "log" }
  - match: { tokens.3.start_offset: 80 }
  - match: { tokens.3.end_offset: 83 }
  - match: { tokens.3.position: 117 }
  - match: { tokens.4.token: "message" }
  - match: { tokens.4.start_offset: 84 }
  - match: { tokens.4.end_offset: 91 }
  - match: { tokens.4.position: 118 }
  - match: { tokens.5.token: "two" }
  - match: { tokens.5.start_offset: 92 }
  - match: { tokens.5.end_offset: 95 }
  - match: { tokens.5.position: 119 }

0 comments on commit 334ad82

Please sign in to comment.