Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: Add utf8 support to Pattern Lexer to support utf8 chars #13085

Merged
merged 4 commits into from
May 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pkg/logql/log/pattern/lexer.go
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
package pattern

import "unicode/utf8"

type lexer struct {
data []byte
p, pe, cs int
Expand Down Expand Up @@ -57,6 +59,7 @@ func (lex *lexer) identifier(out *exprSymType) (int, error) {

// literal stores the rune decoded from lex.data[lex.ts:lex.te] in out.literal
// and returns the LITERAL token together with a nil error.
// nolint
func (lex *lexer) literal(out *exprSymType) (int, error) {
// NOTE(review): this single-byte assignment is overwritten two lines below;
// it looks like the pre-change line of the diff (the hunk header reports
// 1 deletion) — confirm it is absent from the merged file.
out.literal = rune(lex.data[lex.ts])
// Decode the first UTF-8 sequence in the token span; the byte width returned
// by utf8.DecodeRune is discarded.
decoded, _ := utf8.DecodeRune(lex.data[lex.ts:lex.te])
out.literal = decoded
return LITERAL, nil
}
16 changes: 15 additions & 1 deletion pkg/logql/log/pattern/lexer.rl
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,25 @@ package pattern
}
}%%

%%{
utf8 = (
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This bit was generated by Copilot, but I've manually checked it and I think it's correct. As far as I understand, anyway.

0x00..0x7F |
0xC2..0xDF 0x80..0xBF |
0xE0 0xA0..0xBF 0x80..0xBF |
0xE1..0xEC 0x80..0xBF 0x80..0xBF |
0xED 0x80..0x9F 0x80..0xBF |
0xEE..0xEF 0x80..0xBF 0x80..0xBF |
0xF0 0x90..0xBF 0x80..0xBF 0x80..0xBF |
0xF1..0xF3 0x80..0xBF 0x80..0xBF 0x80..0xBF |
0xF4 0x80..0x8F 0x80..0xBF 0x80..0xBF
);
}%%

const LEXER_ERROR = 0

%%{
# identifier: an opening angle bracket, then a letter or underscore, then any
# run of alphanumerics or underscores, then a closing angle bracket.
identifier = '<' (alpha| '_') (alnum | '_' )* '>';
# NOTE(review): literal is defined twice below; these look like the before and
# after lines of the diff (any replaced by utf8) — confirm that only the utf8
# definition is present in the merged file.
literal = any;
literal = utf8;
}%%

func (lex *lexer) Lex(out *exprSymType) int {
Expand Down
Loading
Loading