From 957fd8c6d99f74bd254f7d4e18a3e668bd4f18e8 Mon Sep 17 00:00:00 2001 From: TSUYUSATO Kitsune Date: Mon, 28 Oct 2024 20:34:03 +0900 Subject: [PATCH] Improve error position reported by the lexer This introduces two helper methods `errorfAtPosition` and `panicfAtPosition`, and uses them for reporting errors caused by tokenizing string-like literals and skipping comments. --- lexer.go | 49 ++++++++++++++++++++++++++++++++----------------- lexer_test.go | 42 +++++++++++++++++++++++------------------- 2 files changed, 55 insertions(+), 36 deletions(-) diff --git a/lexer.go b/lexer.go index 632e07f0..4b96e4b4 100644 --- a/lexer.go +++ b/lexer.go @@ -418,7 +418,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s for l.peekOk(i) { if l.slice(i, i+len(q)) == q { if len(content) == 0 && name == "identifier" { - l.panicf("invalid empty identifier") + l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+len(q)), "invalid empty identifier") } l.skipN(i + len(q)) return string(content) @@ -428,7 +428,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s if c == '\\' { i++ if !l.peekOk(i) { - l.panicf("invalid escape sequence: \\") + l.panicfAtPosition(token.Pos(l.pos+i-1), token.Pos(l.pos+i), "invalid escape sequence: \\") } c := l.peek(i) @@ -457,18 +457,20 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s case '\\', '?', '"', '\'', '`': content = append(content, c) case 'x', 'X': - if !(l.peekOk(i+1) && char.IsHexDigit(l.peek(i)) && char.IsHexDigit(l.peek(i+1))) { - l.panicf("invalid escape sequence: hex escape sequence must be follwed by 2 hex digits") + for j := 0; j < 2; j++ { + if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) { + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits") + } } u, err := strconv.ParseUint(l.slice(i, i+2), 16, 8) if err != nil { - l.panicf("invalid
escape sequence: %v", err) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err) } content = append(content, byte(u)) i += 2 case 'u', 'U': if !unicode { - l.panicf("invalid escape sequence: \\%c is not allowed in %s", c, name) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c is not allowed in %s", c, name) } size := 4 if c == 'U' { @@ -476,46 +478,48 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s } for j := 0; j < size; j++ { if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) { - l.panicf("invalid escape sequence: \\%c must be followed by %d hex digits", c, size) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: \\%c must be followed by %d hex digits", c, size) } } u, err := strconv.ParseUint(l.slice(i, i+size), 16, 32) if err != nil { - l.panicf("invalid escape sequence: %v", err) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: %v", err) } if 0xD800 <= u && u <= 0xDFFF || 0x10FFFF < u { - l.panicf("invalid escape sequence: invalid code point: U+%04X", u) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: invalid code point: U+%04X", u) } var buf [utf8.MaxRune]byte n := utf8.EncodeRune(buf[:], rune(u)) content = append(content, buf[:n]...) 
i += size case '0', '1', '2', '3': - if !(l.peekOk(i+1) && char.IsOctalDigit(l.peek(i)) && char.IsOctalDigit(l.peek(i+1))) { - l.panicf("invalid escape sequence: octal escape sequence must be follwed by 3 octal digits") + for j := 0; j < 2; j++ { + if !(l.peekOk(i+j) && char.IsOctalDigit(l.peek(i+j))) { + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits") + } } u, err := strconv.ParseUint(l.slice(i-1, i+2), 8, 8) if err != nil { - l.panicf("invalid escape sequence: %v", err) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err) } content = append(content, byte(u)) i += 2 default: - l.panicf("invalid escape sequence: \\%c", c) + l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c", c) } continue } if c == '\n' && len(q) != 3 { - l.panicf("unclosed %s: newline appears in non triple-quoted", name) + l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+1), "unclosed %s: newline appears in non triple-quoted", name) } content = append(content, c) i++ } - panic(l.errorf("unclosed %s", name)) + panic(l.errorfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i), "unclosed %s", name)) } func (l *Lexer) skipSpaces() { @@ -543,6 +547,7 @@ func (l *Lexer) skipComment() { } func (l *Lexer) skipCommentUntil(end string, mustEnd bool) { + pos := token.Pos(l.pos) for !l.eof() { if l.slice(0, len(end)) == end { l.skipN(len(end)) @@ -551,8 +556,7 @@ func (l *Lexer) skipCommentUntil(end string, mustEnd bool) { l.skip() } if mustEnd { - // TODO: improve error position - l.panicf("unclosed comment") + l.panicfAtPosition(pos, token.Pos(l.pos), "unclosed comment") } } @@ -596,6 +600,17 @@ func (l *Lexer) errorf(msg string, param ...interface{}) *Error { } } +func (l *Lexer) errorfAtPosition(pos, end token.Pos, msg string, param ...interface{}) *Error { + return &Error{ + Message: fmt.Sprintf(msg, param...), + 
Position: l.Position(pos, end), + } +} + func (l *Lexer) panicf(msg string, param ...interface{}) { panic(l.errorf(msg, param...)) } + +func (l *Lexer) panicfAtPosition(pos, end token.Pos, msg string, param ...interface{}) { + panic(l.errorfAtPosition(pos, end, msg, param...)) +} diff --git a/lexer_test.go b/lexer_test.go index 6ad623e3..7f9fc38c 100644 --- a/lexer_test.go +++ b/lexer_test.go @@ -143,26 +143,27 @@ var lexerTestCases = []struct { var lexerWrongTestCase = []struct { source string pos Pos + end Pos message string }{ - {"\b", 0, "illegal input character: '\\b'"}, - {`"foo`, 0, "unclosed string literal"}, - {`R"foo`, 1, "unclosed raw string literal"}, - {"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"}, - {"R'foo\n", 1, "unclosed raw string literal: newline appears in non triple-quoted"}, - {"R'foo\\", 1, "invalid escape sequence: \\"}, - {`"\400"`, 0, "invalid escape sequence: \\4"}, - {`"\3xx"`, 0, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"}, - {`"\xZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"}, - {`"\XZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"}, - {`B"\u0031"`, 1, "invalid escape sequence: \\u is not allowed in bytes literal"}, - {`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"}, - {`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"}, - {`"\UFFFFFFFF"`, 0, "invalid escape sequence: invalid code point: U+FFFFFFFF"}, - {"``", 0, "invalid empty identifier"}, - {"1from", 1, "number literal cannot follow identifier without any spaces"}, - {`'''0`, 0, "unclosed triple-quoted string literal"}, - {`/*`, 2, "unclosed comment"}, + {"\b", 0, 0, "illegal input character: '\\b'"}, + {`"foo`, 0, 4, "unclosed string literal"}, + {`R"foo`, 1, 5, "unclosed raw string literal"}, + {"'foo\n", 0, 5, "unclosed string literal: newline appears in non 
triple-quoted"}, + {"R'foo\n", 1, 6, "unclosed raw string literal: newline appears in non triple-quoted"}, + {"R'foo\\", 5, 6, "invalid escape sequence: \\"}, + {`"\400"`, 1, 3, "invalid escape sequence: \\4"}, + {`"\3xx"`, 1, 4, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"}, + {`"\xZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"}, + {`"\XZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"}, + {`B"\u0031"`, 2, 4, "invalid escape sequence: \\u is not allowed in bytes literal"}, + {`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"}, + {`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"}, + {`"\UFFFFFFFF"`, 1, 11, "invalid escape sequence: invalid code point: U+FFFFFFFF"}, + {"``", 0, 2, "invalid empty identifier"}, + {"1from", 1, 1, "number literal cannot follow identifier without any spaces"}, + {`'''0`, 0, 4, "unclosed triple-quoted string literal"}, + {`/*`, 0, 2, "unclosed comment"}, } func testLexer(t *testing.T, source string, tokens []*Token) { @@ -240,7 +241,10 @@ func TestLexerWrong(t *testing.T) { t.Errorf("expected error message: %q, but: %q", tc.message, e.Message) } if e.Position.Pos != tc.pos { - t.Errorf("expected error position: %v, but: %v", tc.pos, e.Position.Pos) + t.Errorf("expected error position (pos): %v, but: %v", tc.pos, e.Position.Pos) + } + if e.Position.End != tc.end { + t.Errorf("expected error position (end): %v, but: %v", tc.end, e.Position.End) } } else { t.Errorf("unexpected error: %v", err)