Skip to content

Commit

Permalink
Sync lexer punctuations with ZetaSQL (#182)
Browse files Browse the repository at this point in the history
* Sync lexer symbols with ZetaSQL

* Revert illegal input character test case
  • Loading branch information
apstndb authored Oct 29, 2024
1 parent 560e46c commit e6a5352
Show file tree
Hide file tree
Showing 3 changed files with 75 additions and 24 deletions.
42 changes: 41 additions & 1 deletion lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -109,7 +109,9 @@ func (l *Lexer) consumeToken() {
}

switch l.peek(0) {
case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '+', '-':
case '(', ')', '{', '}', ';', ',', '[', ']', '~', '*', '/', '&', '^', '%', ':',
	// The following are not yet used in Spanner.
'?', '\\', '$':
l.Token.Kind = token.TokenKind([]byte{l.skip()})
return
case '.':
Expand Down Expand Up @@ -151,6 +153,32 @@ func (l *Lexer) consumeToken() {
l.Token.Kind = ">"
}
return
case '+':
switch {
// KW_ADD_ASSIGN in ZetaSQL
case l.peekIs(1, '='):
l.skipN(2)
l.Token.Kind = "+="
default:
l.skip()
l.Token.Kind = "+"
}
return
case '-':
switch {
// KW_SUB_ASSIGN in ZetaSQL
case l.peekIs(1, '='):
l.skipN(2)
l.Token.Kind = "-="
// KW_LAMBDA_ARROW in ZetaSQL
case l.peekIs(1, '>'):
l.skipN(2)
l.Token.Kind = "->"
default:
l.skip()
l.Token.Kind = "-"
}
return
case '=':
switch {
case l.peekIs(1, '>'):
Expand All @@ -163,6 +191,9 @@ func (l *Lexer) consumeToken() {
return
case '|':
switch {
case l.peekIs(1, '>'):
l.skipN(2)
l.Token.Kind = "|>"
case l.peekIs(1, '|'):
l.skipN(2)
l.Token.Kind = "||"
Expand All @@ -177,7 +208,16 @@ func (l *Lexer) consumeToken() {
l.Token.Kind = "!="
return
}
l.skip()
l.Token.Kind = "!"
return
case '@':
// KW_DOUBLE_AT is not yet used in Cloud Spanner, but used in BigQuery.
if l.peekIs(1, '@') {
l.skipN(2)
l.Token.Kind = "@@"
return
}
if l.peekOk(1) && char.IsIdentStart(l.peek(1)) {
i := 1
for l.peekOk(i) && char.IsIdentPart(l.peek(i)) {
Expand Down
55 changes: 34 additions & 21 deletions lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -11,35 +11,48 @@ import (
. "github.com/cloudspannerecosystem/memefish/token"
)

// Keep same order https://github.com/google/zetasql/blob/master/zetasql/parser/flex_tokenizer.l
var symbols = []string{
".",
",",
";",
"(",
")",
"{",
"}",
"[",
"{",
")",
"]",
"@",
"~",
"+",
"-",
"}",
"*",
"/",
"&",
"^",
"|",
"||",
",",
"=",
"<",
"<<",
"+=",
"-=",
"!=",
"<=",
"<>",
"<<",
"=>",
"->",
"<",
">",
">>",
">=",
"!=",
"||",
"|",
"^",
"&",
"+",
"-",
"/",
"~",
"?",
"!",
"%",
"|>",
"@",
"@@",
".",
":",
"\\",
";",
"$",
"<>", // <> is not a valid token in ZetaSQL, but it is a token in memefish
">>", // >> is not a valid token in ZetaSQL, but it is a token in memefish.
}

var lexerTestCases = []struct {
Expand Down Expand Up @@ -132,7 +145,7 @@ var lexerWrongTestCase = []struct {
pos Pos
message string
}{
{"?", 0, "illegal input character: '?'"},
{"\b", 0, "illegal input character: '\\b'"},
{`"foo`, 0, "unclosed string literal"},
{`R"foo`, 1, "unclosed raw string literal"},
{"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"},
Expand Down
2 changes: 0 additions & 2 deletions split_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,6 @@ func TestSplitRawStatements(t *testing.T) {
want: []*memefish.RawStatement{
{Statement: "SELECT `1;2;3`", End: token.Pos(14)},
}},
// $` may become a valid token in the future, but it's reasonable to check its current behavior.
{desc: "unknown token", input: "SELECT $;", errRe: regexp.MustCompile(`illegal input character: '\$'`)},
} {
t.Run(test.desc, func(t *testing.T) {
stmts, err := memefish.SplitRawStatements("", test.input)
Expand Down

0 comments on commit e6a5352

Please sign in to comment.