diff --git a/pkg/logql/log/pattern/lexer.go b/pkg/logql/log/pattern/lexer.go
index 2f867f93dc305..febe39a1331e4 100644
--- a/pkg/logql/log/pattern/lexer.go
+++ b/pkg/logql/log/pattern/lexer.go
@@ -1,5 +1,7 @@
 package pattern
 
+import "unicode/utf8"
+
 type lexer struct {
 	data      []byte
 	p, pe, cs int
@@ -57,6 +59,7 @@ func (lex *lexer) identifier(out *exprSymType) (int, error) {
 
 // nolint
 func (lex *lexer) literal(out *exprSymType) (int, error) {
-	out.literal = rune(lex.data[lex.ts])
+	decoded, _ := utf8.DecodeRune(lex.data[lex.ts:lex.te])
+	out.literal = decoded
 	return LITERAL, nil
 }
diff --git a/pkg/logql/log/pattern/lexer.rl b/pkg/logql/log/pattern/lexer.rl
index 7b1d25467c45a..b28579afb424b 100644
--- a/pkg/logql/log/pattern/lexer.rl
+++ b/pkg/logql/log/pattern/lexer.rl
@@ -13,11 +13,25 @@ package pattern
     }
 }%%
 
+%%{
+utf8 = (
+  0x00..0x7F |
+  0xC2..0xDF 0x80..0xBF |
+  0xE0 0xA0..0xBF 0x80..0xBF |
+  0xE1..0xEC 0x80..0xBF 0x80..0xBF |
+  0xED 0x80..0x9F 0x80..0xBF |
+  0xEE..0xEF 0x80..0xBF 0x80..0xBF |
+  0xF0 0x90..0xBF 0x80..0xBF 0x80..0xBF |
+  0xF1..0xF3 0x80..0xBF 0x80..0xBF 0x80..0xBF |
+  0xF4 0x80..0x8F 0x80..0xBF 0x80..0xBF
+);
+}%%
+
 const LEXER_ERROR = 0
 
 %%{
     identifier = '<' (alpha| '_') (alnum | '_' )* '>';
-    literal = any;
+    literal = utf8;
 }%%
 
 func (lex *lexer) Lex(out *exprSymType) int {
diff --git a/pkg/logql/log/pattern/lexer.rl.go b/pkg/logql/log/pattern/lexer.rl.go
index 4e3b3e188fca3..67adde2c3101a 100644
--- a/pkg/logql/log/pattern/lexer.rl.go
+++ b/pkg/logql/log/pattern/lexer.rl.go
@@ -1,251 +1,271 @@
+
 //line pkg/logql/log/pattern/lexer.rl:1
 package pattern
 
+
 //line pkg/logql/log/pattern/lexer.rl.go:7
 var _pattern_actions []byte = []byte{
-	0, 1, 0, 1, 1, 1, 2, 1, 3,
-	1, 4, 1, 5, 1, 6,
+	0, 1, 0, 1, 1, 1, 2, 1, 3,
+	1, 4, 1, 5, 1, 6,
 }
 
 var _pattern_key_offsets []byte = []byte{
-	0, 8, 9,
+	0, 0, 8, 10, 12, 14, 16, 18,
+	20, 22, 37,
 }
 
 var _pattern_trans_keys []byte = []byte{
-	62, 95, 48, 57, 65, 90, 97, 122,
-	60, 95, 65, 90, 97, 122,
+	62, 95, 48, 57, 65, 90, 97, 122,
+	128, 191, 160, 191, 128, 191, 128, 159,
+	144, 191, 128, 191, 128, 143, 60, 224,
+	237, 240, 244, 128, 193, 194, 223, 225,
+	239, 241, 243, 245, 255, 95, 65, 90,
+	97, 122,
 }
 
 var _pattern_single_lengths []byte = []byte{
-	2, 1, 1,
+	0, 2, 0, 0, 0, 0, 0, 0,
+	0, 5, 1,
 }
 
 var _pattern_range_lengths []byte = []byte{
-	3, 0, 2,
+	0, 3, 1, 1, 1, 1, 1, 1,
+	1, 5, 2,
 }
 
 var _pattern_index_offsets []byte = []byte{
-	0, 6, 8,
+	0, 0, 6, 8, 10, 12, 14, 16,
+	18, 20, 31,
+}
+
+var _pattern_indicies []byte = []byte{
+	2, 1, 1, 1, 1, 0, 3, 4,
+	5, 4, 5, 4, 5, 4, 6, 4,
+	6, 4, 6, 4, 7, 8, 9, 10,
+	12, 4, 5, 6, 11, 4, 3, 1,
+	1, 1, 13,
 }
 
 var _pattern_trans_targs []byte = []byte{
-	1, 0, 0, 0, 0, 1, 2, 1,
-	0, 0, 0, 1, 1, 1,
+	9, 1, 9, 9, 0, 2, 4, 10,
+	3, 5, 6, 7, 8, 9,
 }
 
 var _pattern_trans_actions []byte = []byte{
-	7, 0, 0, 0, 0, 13, 5, 9,
-	0, 0, 0, 11, 13, 11,
+	13, 0, 7, 9, 0, 0, 0, 5,
+	0, 0, 0, 0, 0, 11,
 }
 
 var _pattern_to_state_actions []byte = []byte{
-	0, 1, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 1, 0,
 }
 
 var _pattern_from_state_actions []byte = []byte{
-	0, 3, 0,
+	0, 0, 0, 0, 0, 0, 0, 0,
+	0, 3, 0,
 }
 
 var _pattern_eof_trans []byte = []byte{
-	13, 0, 14,
+	0, 1, 0, 0, 0, 0, 0, 0,
+	0, 0, 14,
 }
 
-const pattern_start int = 1
-const pattern_first_final int = 1
-const pattern_error int = -1
+const pattern_start int = 9
+const pattern_first_final int = 9
+const pattern_error int = 0
+
+const pattern_en_main int = 9
 
-const pattern_en_main int = 1
 
 //line pkg/logql/log/pattern/lexer.rl:14
+
+
+//line pkg/logql/log/pattern/lexer.rl:28
+
+
 const LEXER_ERROR = 0
 
-//line pkg/logql/log/pattern/lexer.rl:21
+
+//line pkg/logql/log/pattern/lexer.rl:35
+
 func (lex *lexer) Lex(out *exprSymType) int {
-	eof := lex.pe
-	tok := 0
+	eof := lex.pe
+	tok := 0
 
-//line pkg/logql/log/pattern/lexer.rl.go:77
+
+//line pkg/logql/log/pattern/lexer.rl.go:100
 	{
-		var _klen int
-		var _trans int
-		var _acts int
-		var _nacts uint
-		var _keys int
-		if (lex.p) == (lex.pe) {
-			goto _test_eof
-		}
-	_resume:
-		_acts = int(_pattern_from_state_actions[lex.cs])
-		_nacts = uint(_pattern_actions[_acts])
-		_acts++
-		for ; _nacts > 0; _nacts-- {
-			_acts++
-			switch _pattern_actions[_acts-1] {
-			case 1:
-//line NONE:1
-				lex.ts = (lex.p)
-
-//line pkg/logql/log/pattern/lexer.rl.go:97
-			}
-		}
+	var _klen int
+	var _trans int
+	var _acts int
+	var _nacts uint
+	var _keys int
+	if ( lex.p) == ( lex.pe) {
+		goto _test_eof
+	}
+	if lex.cs == 0 {
+		goto _out
+	}
+_resume:
+	_acts = int(_pattern_from_state_actions[ lex.cs])
+	_nacts = uint(_pattern_actions[_acts]); _acts++
+	for ; _nacts > 0; _nacts-- {
+		_acts++
+		switch _pattern_actions[_acts - 1] {
+		case 1:
+//line NONE:1
+	lex.ts = ( lex.p)
+
+//line pkg/logql/log/pattern/lexer.rl.go:123
+		}
+	}
 
-		_keys = int(_pattern_key_offsets[lex.cs])
-		_trans = int(_pattern_index_offsets[lex.cs])
+	_keys = int(_pattern_key_offsets[ lex.cs])
+	_trans = int(_pattern_index_offsets[ lex.cs])
 
-		_klen = int(_pattern_single_lengths[lex.cs])
-		if _klen > 0 {
-			_lower := int(_keys)
-			var _mid int
-			_upper := int(_keys + _klen - 1)
-			for {
-				if _upper < _lower {
-					break
-				}
-
-				_mid = _lower + ((_upper - _lower) >> 1)
-				switch {
-				case lex.data[(lex.p)] < _pattern_trans_keys[_mid]:
-					_upper = _mid - 1
-				case lex.data[(lex.p)] > _pattern_trans_keys[_mid]:
-					_lower = _mid + 1
-				default:
-					_trans += int(_mid - int(_keys))
-					goto _match
-				}
-			}
-			_keys += _klen
-			_trans += _klen
-		}
+	_klen = int(_pattern_single_lengths[ lex.cs])
+	if _klen > 0 {
+		_lower := int(_keys)
+		var _mid int
+		_upper := int(_keys + _klen - 1)
+		for {
+			if _upper < _lower {
+				break
+			}
+
+			_mid = _lower + ((_upper - _lower) >> 1)
+			switch {
+			case lex.data[( lex.p)] < _pattern_trans_keys[_mid]:
+				_upper = _mid - 1
+			case lex.data[( lex.p)] > _pattern_trans_keys[_mid]:
+				_lower = _mid + 1
+			default:
+				_trans += int(_mid - int(_keys))
+				goto _match
+			}
+		}
+		_keys += _klen
+		_trans += _klen
+	}
 
-		_klen = int(_pattern_range_lengths[lex.cs])
-		if _klen > 0 {
-			_lower := int(_keys)
-			var _mid int
-			_upper := int(_keys + (_klen << 1) - 2)
-			for {
-				if _upper < _lower {
-					break
-				}
-
-				_mid = _lower + (((_upper - _lower) >> 1) & ^1)
-				switch {
-				case lex.data[(lex.p)] < _pattern_trans_keys[_mid]:
-					_upper = _mid - 2
-				case lex.data[(lex.p)] > _pattern_trans_keys[_mid+1]:
-					_lower = _mid + 2
-				default:
-					_trans += int((_mid - int(_keys)) >> 1)
-					goto _match
-				}
-			}
-			_trans += _klen
-		}
+	_klen = int(_pattern_range_lengths[ lex.cs])
+	if _klen > 0 {
+		_lower := int(_keys)
+		var _mid int
+		_upper := int(_keys + (_klen << 1) - 2)
+		for {
+			if _upper < _lower {
+				break
+			}
+
+			_mid = _lower + (((_upper - _lower) >> 1) & ^1)
+			switch {
+			case lex.data[( lex.p)] < _pattern_trans_keys[_mid]:
+				_upper = _mid - 2
+			case lex.data[( lex.p)] > _pattern_trans_keys[_mid + 1]:
+				_lower = _mid + 2
+			default:
+				_trans += int((_mid - int(_keys)) >> 1)
+				goto _match
+			}
+		}
+		_trans += _klen
+	}
 
-	_match:
-	_eof_trans:
-		lex.cs = int(_pattern_trans_targs[_trans])
-
-		if _pattern_trans_actions[_trans] == 0 {
-			goto _again
-		}
+_match:
+	_trans = int(_pattern_indicies[_trans])
+_eof_trans:
+	lex.cs = int(_pattern_trans_targs[_trans])
+
+	if _pattern_trans_actions[_trans] == 0 {
+		goto _again
+	}
 
-		_acts = int(_pattern_trans_actions[_trans])
-		_nacts = uint(_pattern_actions[_acts])
-		_acts++
-		for ; _nacts > 0; _nacts-- {
-			_acts++
-			switch _pattern_actions[_acts-1] {
-			case 2:
-//line NONE:1
-				lex.te = (lex.p) + 1
-
-			case 3:
-//line pkg/logql/log/pattern/lexer.rl:30
-				lex.te = (lex.p) + 1
-				{
-					tok = lex.handle(lex.identifier(out))
-					(lex.p)++
-					goto _out
-				}
-			case 4:
-//line pkg/logql/log/pattern/lexer.rl:31
-				lex.te = (lex.p) + 1
-				{
-					tok = lex.handle(lex.literal(out))
-					(lex.p)++
-					goto _out
-				}
-			case 5:
-//line pkg/logql/log/pattern/lexer.rl:31
-				lex.te = (lex.p)
-				(lex.p)--
-				{
-					tok = lex.handle(lex.literal(out))
-					(lex.p)++
-					goto _out
-				}
-			case 6:
-//line pkg/logql/log/pattern/lexer.rl:31
-				(lex.p) = (lex.te) - 1
-				{
-					tok = lex.handle(lex.literal(out))
-					(lex.p)++
-					goto _out
-				}
-//line pkg/logql/log/pattern/lexer.rl.go:191
-			}
-		}
+	_acts = int(_pattern_trans_actions[_trans])
+	_nacts = uint(_pattern_actions[_acts]); _acts++
+	for ; _nacts > 0; _nacts-- {
+		_acts++
+		switch _pattern_actions[_acts-1] {
+		case 2:
+//line NONE:1
+	lex.te = ( lex.p)+1
+
+		case 3:
+//line pkg/logql/log/pattern/lexer.rl:44
+	lex.te = ( lex.p)+1
+	{ tok = lex.handle(lex.identifier(out)); ( lex.p)++; goto _out
+	}
+		case 4:
+//line pkg/logql/log/pattern/lexer.rl:45
+	lex.te = ( lex.p)+1
+	{ tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+	}
+		case 5:
+//line pkg/logql/log/pattern/lexer.rl:45
+	lex.te = ( lex.p)
+	( lex.p)--
+	{ tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+	}
+		case 6:
+//line pkg/logql/log/pattern/lexer.rl:45
+	( lex.p) = ( lex.te) - 1
+	{ tok = lex.handle(lex.literal(out)); ( lex.p)++; goto _out
+	}
+//line pkg/logql/log/pattern/lexer.rl.go:218
+		}
+	}
 
-	_again:
-		_acts = int(_pattern_to_state_actions[lex.cs])
-		_nacts = uint(_pattern_actions[_acts])
-		_acts++
-		for ; _nacts > 0; _nacts-- {
-			_acts++
-			switch _pattern_actions[_acts-1] {
-			case 0:
-//line NONE:1
-				lex.ts = 0
-
-//line pkg/logql/log/pattern/lexer.rl.go:205
-			}
-		}
+_again:
+	_acts = int(_pattern_to_state_actions[ lex.cs])
+	_nacts = uint(_pattern_actions[_acts]); _acts++
+	for ; _nacts > 0; _nacts-- {
+		_acts++
+		switch _pattern_actions[_acts-1] {
+		case 0:
+//line NONE:1
+	lex.ts = 0
+
+//line pkg/logql/log/pattern/lexer.rl.go:232
+		}
+	}
 
-		(lex.p)++
-		if (lex.p) != (lex.pe) {
-			goto _resume
-		}
-	_test_eof:
-		{
-		}
-		if (lex.p) == eof {
-			if _pattern_eof_trans[lex.cs] > 0 {
-				_trans = int(_pattern_eof_trans[lex.cs] - 1)
-				goto _eof_trans
-			}
-		}
+	if lex.cs == 0 {
+		goto _out
+	}
+	( lex.p)++
+	if ( lex.p) != ( lex.pe) {
+		goto _resume
+	}
+	_test_eof: {}
+	if ( lex.p) == eof {
+		if _pattern_eof_trans[ lex.cs] > 0 {
+			_trans = int(_pattern_eof_trans[ lex.cs] - 1)
+			goto _eof_trans
+		}
+	}
 
-	_out:
-		{
-		}
+	_out: {}
 	}
 
-//line pkg/logql/log/pattern/lexer.rl:35
+//line pkg/logql/log/pattern/lexer.rl:49
+
 
-	return tok
+	return tok;
 }
 
-func (lex *lexer) init() {
-//line pkg/logql/log/pattern/lexer.rl.go:233
+func (lex *lexer) init() {
+
+//line pkg/logql/log/pattern/lexer.rl.go:263
 	{
-		lex.cs = pattern_start
-		lex.ts = 0
-		lex.te = 0
-		lex.act = 0
+	lex.cs = pattern_start
+	lex.ts = 0
+	lex.te = 0
+	lex.act = 0
 	}
 
-//line pkg/logql/log/pattern/lexer.rl:43
+//line pkg/logql/log/pattern/lexer.rl:57
 }
diff --git a/pkg/logql/log/pattern/lexer_test.go b/pkg/logql/log/pattern/lexer_test.go
index ff4d61591fb39..3e6bcf1b12e97 100644
--- a/pkg/logql/log/pattern/lexer_test.go
+++ b/pkg/logql/log/pattern/lexer_test.go
@@ -18,6 +18,7 @@ func Test_Lex(t *testing.T) {
 		{`<_1foo>`, []int{IDENTIFIER}},
 		{`<_1foo> bar <buzz>`, []int{IDENTIFIER, LITERAL, LITERAL, LITERAL, LITERAL, LITERAL, IDENTIFIER}},
 		{`<1foo>`, []int{LITERAL, LITERAL, LITERAL, LITERAL, LITERAL, LITERAL}},
+		{`▶`, []int{LITERAL}},
 	} {
 		tc := tc
 		t.Run(tc.input, func(t *testing.T) {
diff --git a/pkg/logql/log/pattern/parser_test.go b/pkg/logql/log/pattern/parser_test.go
index 8a40d983c0a3a..dbcb418fd382b 100644
--- a/pkg/logql/log/pattern/parser_test.go
+++ b/pkg/logql/log/pattern/parser_test.go
@@ -47,6 +47,11 @@ func Test_Parse(t *testing.T) {
 			expr{capture("ip"), literals(" - "), capture("user"), literals(" ["), capture("_"), literals(`] "`), capture("method"), literals(" "), capture("path"), literals(" "), capture("_"), literals(`" `), capture("status"), literals(" "), capture("size"), literals(" "), capture("url"), literals(" "), capture("user_agent")},
 			nil,
 		},
+		{
+			"▶",
+			expr{literals("▶")},
+			nil,
+		},
 	} {
 		tc := tc
 		actual, err := parseExpr(tc.input)
diff --git a/pkg/logql/log/pattern/pattern_test.go b/pkg/logql/log/pattern/pattern_test.go
index 0d1c47f0bea29..ca4f3ea47d96c 100644
--- a/pkg/logql/log/pattern/pattern_test.go
+++ b/pkg/logql/log/pattern/pattern_test.go
@@ -1,6 +1,7 @@
 package pattern
 
 import (
+	"bytes"
 	"fmt"
 	"testing"
 
@@ -151,6 +152,26 @@
 		[]string{"POST", "/api/v1/locations", "204", "154", "0", "226", "100", "10.0.35.28", "nsq2http", "tcp://10.0.2.1:80"},
 		true,
 	},
+	{
+		// UTF-8: Matches a unicode character
+		`unicode <emoji> character`,
+		`unicode 🤷 character`,
+		[]string{`🤷`},
+		true,
+	},
+	{
+		// UTF-8: Parses unicode character as literal
+		"unicode ▶ <what>",
+		"unicode ▶ character",
+		[]string{"character"},
+		true,
+	},
+}
+
+func Test_BytesIndexUnicode(t *testing.T) {
+	data := []byte("Hello ▶ World")
+	index := bytes.Index(data, []byte("▶"))
+	require.Equal(t, 6, index)
 }
 
 func Test_matcher_Matches(t *testing.T) {
diff --git a/pkg/pattern/drain/drain_test.go b/pkg/pattern/drain/drain_test.go
index 7c502a895e7e9..cc16f0b7fd64c 100644
--- a/pkg/pattern/drain/drain_test.go
+++ b/pkg/pattern/drain/drain_test.go
@@ -508,6 +508,19 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
 				` test 4 test test`,
 			},
 		},
+		{
+			name:  "Unicode characters are matchable",
+			drain: New(DefaultConfig(), nil),
+			inputLines: []string{
+				`13:25:18.033470 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_999 0.00 1717075518`,
+				`13:25:18.033422 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_99 0.00 1717075518`,
+				`13:25:18.033394 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_95 0.00 1717075518`,
+				`13:25:18.033364 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_75 0.00 1717075518`,
+				`13:25:18.033335 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_max_50 0.00 1717075518`,
+				`13:25:18.033304 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_std 0.00 1717075518`,
+				`13:25:18.033281 ▶ INFO  route ops sending to dest https://graphite-cortex-ops-blocks-us-east4.grafana.net/graphite/metrics: service_is_carbon-relay-ng.instance_is_carbon-relay-ng-c665b7b-j2trk.mtype_is_gauge.dest_is_https_graphite-cortex-ops-blocks-us-east4_grafana_netgraphitemetrics.unit_is_B.what_is_FlushSize.type_is_manual.stat_is_mean 0.00 1717075518`,
+			},
+		},
 	}
 	for _, tt := range tests {
 		tt := tt
@@ -523,7 +536,8 @@ func TestDrain_TrainGeneratesPatternsMatchableByLokiPatternFilter(t *testing.T)
 
 			for _, line := range tt.inputLines {
 				passes := matcher.Test([]byte(line))
-				require.Truef(t, passes, `Line %q should match extracted pattern`, line)
+				require.Truef(t, passes, "Line should match extracted pattern: \nPatt[%q] \nLine[%q]", cluster.String(), line)
+
 			}
 		})
 	}
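
A minimal, self-contained sketch (not part of the change above) of the bug the lexer.go hunk fixes: the old code cast only the first byte of the token to a rune, so any multi-byte UTF-8 literal was silently corrupted, while utf8.DecodeRune over the full ts:te token span recovers the intended rune. The same byte arithmetic explains the Test_BytesIndexUnicode assertion ("Hello " is 6 bytes, so "▶" starts at byte index 6):

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    func main() {
        // "▶" (U+25B6) is three bytes in UTF-8: 0xE2 0x96 0xB6.
        tok := []byte("▶")

        // Old behaviour: rune(lex.data[lex.ts]) keeps only the first byte.
        // 0xE2 on its own is U+00E2 ('â'), not the rune that was lexed.
        old := rune(tok[0])

        // New behaviour: decode the whole sequence spanning the token.
        decoded, size := utf8.DecodeRune(tok)

        fmt.Printf("old: %q  new: %q (%d bytes)\n", old, decoded, size)
        // Output: old: 'â'  new: '▶' (3 bytes)
    }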
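For reference, the utf8 machine added to lexer.rl is the standard byte-range definition of well-formed UTF-8 (RFC 3629): one alternative per sequence length, with the 0xE0, 0xED, 0xF0 and 0xF4 special cases excluding overlong encodings, UTF-16 surrogates, and code points above U+10FFFF. A standalone sketch (hypothetical helper names matchUTF8/cont, not part of the PR) transcribing those ranges and cross-checking them against Go's own decoder:

    package main

    import (
        "fmt"
        "unicode/utf8"
    )

    // matchUTF8 reports whether b is exactly one well-formed UTF-8 sequence,
    // transcribing the byte ranges of the Ragel "utf8" machine in lexer.rl.
    func matchUTF8(b []byte) bool {
        switch len(b) {
        case 1:
            return b[0] <= 0x7F
        case 2:
            return b[0] >= 0xC2 && b[0] <= 0xDF && cont(b[1])
        case 3:
            switch {
            case b[0] == 0xE0: // no overlong 3-byte forms
                return b[1] >= 0xA0 && b[1] <= 0xBF && cont(b[2])
            case b[0] >= 0xE1 && b[0] <= 0xEC:
                return cont(b[1]) && cont(b[2])
            case b[0] == 0xED: // no UTF-16 surrogates
                return b[1] >= 0x80 && b[1] <= 0x9F && cont(b[2])
            case b[0] >= 0xEE && b[0] <= 0xEF:
                return cont(b[1]) && cont(b[2])
            }
            return false
        case 4:
            switch {
            case b[0] == 0xF0: // no overlong 4-byte forms
                return b[1] >= 0x90 && b[1] <= 0xBF && cont(b[2]) && cont(b[3])
            case b[0] >= 0xF1 && b[0] <= 0xF3:
                return cont(b[1]) && cont(b[2]) && cont(b[3])
            case b[0] == 0xF4: // nothing above U+10FFFF
                return b[1] >= 0x80 && b[1] <= 0x8F && cont(b[2]) && cont(b[3])
            }
            return false
        }
        return false
    }

    // cont matches a UTF-8 continuation byte (0x80..0xBF).
    func cont(c byte) bool { return c >= 0x80 && c <= 0xBF }

    func main() {
        for _, b := range [][]byte{
            []byte("A"),              // 1-byte ASCII: accepted
            []byte("▶"),              // E2 96 B6: accepted
            {0xC0, 0xAF},             // overlong "/": rejected
            {0xED, 0xA0, 0x80},       // surrogate U+D800: rejected
            {0xF4, 0x90, 0x80, 0x80}, // above U+10FFFF: rejected
        } {
            r, size := utf8.DecodeRune(b)
            stdOK := r != utf8.RuneError && size == len(b)
            fmt.Printf("% X  machine=%v  stdlib=%v\n", b, matchUTF8(b), stdOK)
        }
    }

Each rejected case above is input the previous `literal = any;` rule would have happily consumed one byte at a time.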