From e6a5352c971fcedf4f882673447a9ac5b90ece99 Mon Sep 17 00:00:00 2001 From: apstndb <803393+apstndb@users.noreply.github.com> Date: Tue, 29 Oct 2024 21:36:55 +0900 Subject: [PATCH] Sync lexer punctuations with ZetaSQL (#182) * Sync lexer symbols with ZetaSQL * Revert illegal input character test case --- lexer.go | 42 ++++++++++++++++++++++++++++++++++++++- lexer_test.go | 55 +++++++++++++++++++++++++++++++-------------------- split_test.go | 2 -- 3 files changed, 75 insertions(+), 24 deletions(-) diff --git a/lexer.go b/lexer.go index 26d679b0..632e07f0 100644 --- a/lexer.go +++ b/lexer.go @@ -109,7 +109,9 @@ func (l *Lexer) consumeToken() { } switch l.peek(0) { - case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '+', '-': + case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '%', ':', + // The characters below are not yet used in Spanner. + '?', '\\', '$': l.Token.Kind = token.TokenKind([]byte{l.skip()}) return case '.': @@ -151,6 +153,32 @@ func (l *Lexer) consumeToken() { l.Token.Kind = ">" } return + case '+': + switch { + // KW_ADD_ASSIGN in ZetaSQL + case l.peekIs(1, '='): + l.skipN(2) + l.Token.Kind = "+=" + default: + l.skip() + l.Token.Kind = "+" + } + return + case '-': + switch { + // KW_SUB_ASSIGN in ZetaSQL + case l.peekIs(1, '='): + l.skipN(2) + l.Token.Kind = "-=" + // KW_LAMBDA_ARROW in ZetaSQL + case l.peekIs(1, '>'): + l.skipN(2) + l.Token.Kind = "->" + default: + l.skip() + l.Token.Kind = "-" + } + return case '=': switch { case l.peekIs(1, '>'): @@ -163,6 +191,9 @@ func (l *Lexer) consumeToken() { return case '|': switch { + case l.peekIs(1, '>'): + l.skipN(2) + l.Token.Kind = "|>" case l.peekIs(1, '|'): l.skipN(2) l.Token.Kind = "||" @@ -177,7 +208,16 @@ func (l *Lexer) consumeToken() { l.Token.Kind = "!=" return } + l.skip() + l.Token.Kind = "!" + return case '@': + // KW_DOUBLE_AT is not yet used in Cloud Spanner, but used in BigQuery. 
+ if l.peekIs(1, '@') { + l.skipN(2) + l.Token.Kind = "@@" + return + } if l.peekOk(1) && char.IsIdentStart(l.peek(1)) { i := 1 for l.peekOk(i) && char.IsIdentPart(l.peek(i)) { diff --git a/lexer_test.go b/lexer_test.go index 4919c0d5..6ad623e3 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -11,35 +11,48 @@ import ( . "github.com/cloudspannerecosystem/memefish/token" ) +// Keep same order https://github.com/google/zetasql/blob/master/zetasql/parser/flex_tokenizer.l var symbols = []string{ - ".", - ",", - ";", "(", - ")", - "{", - "}", "[", + "{", + ")", "]", - "@", - "~", - "+", - "-", + "}", "*", - "/", - "&", - "^", - "|", - "||", + ",", "=", - "<", - "<<", + "+=", + "-=", + "!=", "<=", - "<>", + "<<", + "=>", + "->", + "<", ">", - ">>", ">=", - "!=", + "||", + "|", + "^", + "&", + "+", + "-", + "/", + "~", + "?", + "!", + "%", + "|>", + "@", + "@@", + ".", + ":", + "\\", + ";", + "$", + "<>", // <> is not a valid token in ZetaSQL, but it is a token in memefish + ">>", // >> is not a valid token in ZetaSQL, but it is a token in memefish. } var lexerTestCases = []struct { @@ -132,7 +145,7 @@ var lexerWrongTestCase = []struct { pos Pos message string }{ - {"?", 0, "illegal input character: '?'"}, + {"\b", 0, "illegal input character: '\\b'"}, {`"foo`, 0, "unclosed string literal"}, {`R"foo`, 1, "unclosed raw string literal"}, {"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"}, diff --git a/split_test.go b/split_test.go index 14d838fd..0e7533bf 100644 --- a/split_test.go +++ b/split_test.go @@ -69,8 +69,6 @@ func TestSplitRawStatements(t *testing.T) { want: []*memefish.RawStatement{ {Statement: "SELECT `1;2;3`", End: token.Pos(14)}, }}, - // $` may become a valid token in the future, but it's reasonable to check its current behavior. - {desc: "unknown token", input: "SELECT $;", errRe: regexp.MustCompile(`illegal input character: '\$'`)}, } { t.Run(test.desc, func(t *testing.T) { stmts, err := memefish.SplitRawStatements("", test.input)