Improve error position reported by the lexer (#186)

This introduces two helper methods, `errorfAtPosition` and `panicfAtPosition`, which take an explicit start and end position, and uses them to report errors raised while tokenizing string-like literals and skipping comments.
cloudspannerecosystem · Oct 29, 2024 · e50eded · e50eded
1 parent e6a5352
commit e50eded
Show file tree

Hide file tree

Showing 2 changed files with 55 additions and 36 deletions.
diff --git a/lexer.go b/lexer.go
@@ -418,7 +418,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 	for l.peekOk(i) {
 		if l.slice(i, i+len(q)) == q {
 			if len(content) == 0 && name == "identifier" {
-				l.panicf("invalid empty identifier")
+				l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+len(q)), "invalid empty identifier")
 			}
 			l.skipN(i + len(q))
 			return string(content)
@@ -428,7 +428,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 		if c == '\\' {
 			i++
 			if !l.peekOk(i) {
-				l.panicf("invalid escape sequence: \\<eof>")
+				l.panicfAtPosition(token.Pos(l.pos+i-1), token.Pos(l.pos+i), "invalid escape sequence: \\<eof>")
 			}
 
 			c := l.peek(i)
@@ -457,65 +457,69 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
 			case '\\', '?', '"', '\'', '`':
 				content = append(content, c)
 			case 'x', 'X':
-				if !(l.peekOk(i+1) && char.IsHexDigit(l.peek(i)) && char.IsHexDigit(l.peek(i+1))) {
-					l.panicf("invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
+				for j := 0; j < 2; j++ {
+					if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
+					}
 				}
 				u, err := strconv.ParseUint(l.slice(i, i+2), 16, 8)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
 				}
 				content = append(content, byte(u))
 				i += 2
 			case 'u', 'U':
 				if !unicode {
-					l.panicf("invalid escape sequence: \\%c is not allowed in %s", c, name)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c is not allowed in %s", c, name)
 				}
 				size := 4
 				if c == 'U' {
 					size = 8
 				}
 				for j := 0; j < size; j++ {
 					if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
-						l.panicf("invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
 					}
 				}
 				u, err := strconv.ParseUint(l.slice(i, i+size), 16, 32)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: %v", err)
 				}
 				if 0xD800 <= u && u <= 0xDFFF || 0x10FFFF < u {
-					l.panicf("invalid escape sequence: invalid code point: U+%04X", u)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: invalid code point: U+%04X", u)
 				}
 				var buf [utf8.MaxRune]byte
 				n := utf8.EncodeRune(buf[:], rune(u))
 				content = append(content, buf[:n]...)
 				i += size
 			case '0', '1', '2', '3':
-				if !(l.peekOk(i+1) && char.IsOctalDigit(l.peek(i)) && char.IsOctalDigit(l.peek(i+1))) {
-					l.panicf("invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
+				for j := 0; j < 2; j++ {
+					if !(l.peekOk(i+j) && char.IsOctalDigit(l.peek(i+j))) {
+						l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
+					}
 				}
 				u, err := strconv.ParseUint(l.slice(i-1, i+2), 8, 8)
 				if err != nil {
-					l.panicf("invalid escape sequence: %v", err)
+					l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
 				}
 				content = append(content, byte(u))
 				i += 2
 			default:
-				l.panicf("invalid escape sequence: \\%c", c)
+				l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c", c)
 			}
 
 			continue
 		}
 
 		if c == '\n' && len(q) != 3 {
-			l.panicf("unclosed %s: newline appears in non triple-quoted", name)
+			l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+1), "unclosed %s: newline appears in non triple-quoted", name)
 		}
 
 		content = append(content, c)
 		i++
 	}
 
-	panic(l.errorf("unclosed %s", name))
+	panic(l.errorfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i), "unclosed %s", name))
 }
 
 func (l *Lexer) skipSpaces() {
@@ -543,6 +547,7 @@ func (l *Lexer) skipComment() {
 }
 
 func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
+	pos := token.Pos(l.pos)
 	for !l.eof() {
 		if l.slice(0, len(end)) == end {
 			l.skipN(len(end))
@@ -551,8 +556,7 @@ func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
 		l.skip()
 	}
 	if mustEnd {
-		// TODO: improve error position
-		l.panicf("unclosed comment")
+		l.panicfAtPosition(pos, token.Pos(l.pos), "unclosed comment")
 	}
 }
 
@@ -596,6 +600,17 @@ func (l *Lexer) errorf(msg string, param ...interface{}) *Error {
 	}
 }
 
+func (l *Lexer) errorfAtPosition(pos, end token.Pos, msg string, param ...interface{}) *Error {
+	return &Error{
+		Message:  fmt.Sprintf(msg, param...),
+		Position: l.Position(pos, end),
+	}
+}
+
 func (l *Lexer) panicf(msg string, param ...interface{}) {
 	panic(l.errorf(msg, param...))
 }
+
+func (l *Lexer) panicfAtPosition(pos, end token.Pos, msg string, param ...interface{}) {
+	panic(l.errorfAtPosition(pos, end, msg, param...))
+}
diff --git a/lexer_test.go b/lexer_test.go
@@ -143,26 +143,27 @@ var lexerTestCases = []struct {
 var lexerWrongTestCase = []struct {
 	source  string
 	pos     Pos
+	end     Pos
 	message string
 }{
-	{"\b", 0, "illegal input character: '\\b'"},
-	{`"foo`, 0, "unclosed string literal"},
-	{`R"foo`, 1, "unclosed raw string literal"},
-	{"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"},
-	{"R'foo\n", 1, "unclosed raw string literal: newline appears in non triple-quoted"},
-	{"R'foo\\", 1, "invalid escape sequence: \\<eof>"},
-	{`"\400"`, 0, "invalid escape sequence: \\4"},
-	{`"\3xx"`, 0, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
-	{`"\xZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
-	{`"\XZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
-	{`B"\u0031"`, 1, "invalid escape sequence: \\u is not allowed in bytes literal"},
-	{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
-	{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
-	{`"\UFFFFFFFF"`, 0, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
-	{"``", 0, "invalid empty identifier"},
-	{"1from", 1, "number literal cannot follow identifier without any spaces"},
-	{`'''0`, 0, "unclosed triple-quoted string literal"},
-	{`/*`, 2, "unclosed comment"},
+	{"\b", 0, 0, "illegal input character: '\\b'"},
+	{`"foo`, 0, 4, "unclosed string literal"},
+	{`R"foo`, 1, 5, "unclosed raw string literal"},
+	{"'foo\n", 0, 5, "unclosed string literal: newline appears in non triple-quoted"},
+	{"R'foo\n", 1, 6, "unclosed raw string literal: newline appears in non triple-quoted"},
+	{"R'foo\\", 5, 6, "invalid escape sequence: \\<eof>"},
+	{`"\400"`, 1, 3, "invalid escape sequence: \\4"},
+	{`"\3xx"`, 1, 4, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
+	{`"\xZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
+	{`"\XZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
+	{`B"\u0031"`, 2, 4, "invalid escape sequence: \\u is not allowed in bytes literal"},
+	{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
+	{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
+	{`"\UFFFFFFFF"`, 1, 11, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
+	{"``", 0, 2, "invalid empty identifier"},
+	{"1from", 1, 1, "number literal cannot follow identifier without any spaces"},
+	{`'''0`, 0, 4, "unclosed triple-quoted string literal"},
+	{`/*`, 0, 2, "unclosed comment"},
 }
 
 func testLexer(t *testing.T, source string, tokens []*Token) {
@@ -240,7 +241,10 @@ func TestLexerWrong(t *testing.T) {
 					t.Errorf("expected error message: %q, but: %q", tc.message, e.Message)
 				}
 				if e.Position.Pos != tc.pos {
-					t.Errorf("expected error position: %v, but: %v", tc.pos, e.Position.Pos)
+					t.Errorf("expected error position (pos): %v, but: %v", tc.pos, e.Position.Pos)
+				}
+				if e.Position.End != tc.end {
+					t.Errorf("expected error position (end): %v, but: %v", tc.end, e.Position.End)
 				}
 			} else {
 				t.Errorf("unexpected error: %v", err)