Skip to content

Commit

Permalink
Improve error position reported by the lexer (#186)
Browse files Browse the repository at this point in the history
This introduces two helper methods `errorfAtPosition` and
`panicfAtPosition`, and uses them for reporting errors caused by
tokenizing string-like literals and skipping comments.
  • Loading branch information
makenowjust authored Oct 29, 2024
1 parent e6a5352 commit e50eded
Show file tree
Hide file tree
Showing 2 changed files with 55 additions and 36 deletions.
49 changes: 32 additions & 17 deletions lexer.go
Original file line number Diff line number Diff line change
Expand Up @@ -418,7 +418,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
for l.peekOk(i) {
if l.slice(i, i+len(q)) == q {
if len(content) == 0 && name == "identifier" {
l.panicf("invalid empty identifier")
l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+len(q)), "invalid empty identifier")
}
l.skipN(i + len(q))
return string(content)
Expand All @@ -428,7 +428,7 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
if c == '\\' {
i++
if !l.peekOk(i) {
l.panicf("invalid escape sequence: \\<eof>")
l.panicfAtPosition(token.Pos(l.pos+i-1), token.Pos(l.pos+i), "invalid escape sequence: \\<eof>")
}

c := l.peek(i)
Expand Down Expand Up @@ -457,65 +457,69 @@ func (l *Lexer) consumeQuotedContent(q string, raw, unicode bool, name string) s
case '\\', '?', '"', '\'', '`':
content = append(content, c)
case 'x', 'X':
if !(l.peekOk(i+1) && char.IsHexDigit(l.peek(i)) && char.IsHexDigit(l.peek(i+1))) {
l.panicf("invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
for j := 0; j < 2; j++ {
if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits")
}
}
u, err := strconv.ParseUint(l.slice(i, i+2), 16, 8)
if err != nil {
l.panicf("invalid escape sequence: %v", err)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
}
content = append(content, byte(u))
i += 2
case 'u', 'U':
if !unicode {
l.panicf("invalid escape sequence: \\%c is not allowed in %s", c, name)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c is not allowed in %s", c, name)
}
size := 4
if c == 'U' {
size = 8
}
for j := 0; j < size; j++ {
if !(l.peekOk(i+j) && char.IsHexDigit(l.peek(i+j))) {
l.panicf("invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: \\%c must be followed by %d hex digits", c, size)
}
}
u, err := strconv.ParseUint(l.slice(i, i+size), 16, 32)
if err != nil {
l.panicf("invalid escape sequence: %v", err)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: %v", err)
}
if 0xD800 <= u && u <= 0xDFFF || 0x10FFFF < u {
l.panicf("invalid escape sequence: invalid code point: U+%04X", u)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+size), "invalid escape sequence: invalid code point: U+%04X", u)
}
var buf [utf8.MaxRune]byte
n := utf8.EncodeRune(buf[:], rune(u))
content = append(content, buf[:n]...)
i += size
case '0', '1', '2', '3':
if !(l.peekOk(i+1) && char.IsOctalDigit(l.peek(i)) && char.IsOctalDigit(l.peek(i+1))) {
l.panicf("invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
for j := 0; j < 2; j++ {
if !(l.peekOk(i+j) && char.IsOctalDigit(l.peek(i+j))) {
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+j+1), "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits")
}
}
u, err := strconv.ParseUint(l.slice(i-1, i+2), 8, 8)
if err != nil {
l.panicf("invalid escape sequence: %v", err)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i+2), "invalid escape sequence: %v", err)
}
content = append(content, byte(u))
i += 2
default:
l.panicf("invalid escape sequence: \\%c", c)
l.panicfAtPosition(token.Pos(l.pos+i-2), token.Pos(l.pos+i), "invalid escape sequence: \\%c", c)
}

continue
}

if c == '\n' && len(q) != 3 {
l.panicf("unclosed %s: newline appears in non triple-quoted", name)
l.panicfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i+1), "unclosed %s: newline appears in non triple-quoted", name)
}

content = append(content, c)
i++
}

panic(l.errorf("unclosed %s", name))
panic(l.errorfAtPosition(token.Pos(l.pos), token.Pos(l.pos+i), "unclosed %s", name))
}

func (l *Lexer) skipSpaces() {
Expand Down Expand Up @@ -543,6 +547,7 @@ func (l *Lexer) skipComment() {
}

func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
pos := token.Pos(l.pos)
for !l.eof() {
if l.slice(0, len(end)) == end {
l.skipN(len(end))
Expand All @@ -551,8 +556,7 @@ func (l *Lexer) skipCommentUntil(end string, mustEnd bool) {
l.skip()
}
if mustEnd {
// TODO: improve error position
l.panicf("unclosed comment")
l.panicfAtPosition(pos, token.Pos(l.pos), "unclosed comment")
}
}

Expand Down Expand Up @@ -596,6 +600,17 @@ func (l *Lexer) errorf(msg string, param ...interface{}) *Error {
}
}

// errorfAtPosition builds an *Error for an explicitly given source span.
// The message is msg formatted with param (fmt.Sprintf semantics), and the
// position is whatever l.Position resolves for pos and end.
func (l *Lexer) errorfAtPosition(pos, end token.Pos, msg string, param ...interface{}) *Error {
	e := new(Error)
	e.Message = fmt.Sprintf(msg, param...)
	e.Position = l.Position(pos, end)
	return e
}

// panicf formats msg with param into a lexer error via errorf and panics
// with that error as the panic value.
func (l *Lexer) panicf(msg string, param ...interface{}) {
	err := l.errorf(msg, param...)
	panic(err)
}

// panicfAtPosition is the panicking counterpart of errorfAtPosition: it
// builds the error for the span given by pos and end, then panics with it.
func (l *Lexer) panicfAtPosition(pos, end token.Pos, msg string, param ...interface{}) {
	err := l.errorfAtPosition(pos, end, msg, param...)
	panic(err)
}
42 changes: 23 additions & 19 deletions lexer_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -143,26 +143,27 @@ var lexerTestCases = []struct {
var lexerWrongTestCase = []struct {
source string
pos Pos
end Pos
message string
}{
{"\b", 0, "illegal input character: '\\b'"},
{`"foo`, 0, "unclosed string literal"},
{`R"foo`, 1, "unclosed raw string literal"},
{"'foo\n", 0, "unclosed string literal: newline appears in non triple-quoted"},
{"R'foo\n", 1, "unclosed raw string literal: newline appears in non triple-quoted"},
{"R'foo\\", 1, "invalid escape sequence: \\<eof>"},
{`"\400"`, 0, "invalid escape sequence: \\4"},
{`"\3xx"`, 0, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
{`"\xZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
{`"\XZZ"`, 0, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
{`B"\u0031"`, 1, "invalid escape sequence: \\u is not allowed in bytes literal"},
{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
{`B"\U00000031"`, 1, "invalid escape sequence: \\U is not allowed in bytes literal"},
{`"\UFFFFFFFF"`, 0, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
{"``", 0, "invalid empty identifier"},
{"1from", 1, "number literal cannot follow identifier without any spaces"},
{`'''0`, 0, "unclosed triple-quoted string literal"},
{`/*`, 2, "unclosed comment"},
{"\b", 0, 0, "illegal input character: '\\b'"},
{`"foo`, 0, 4, "unclosed string literal"},
{`R"foo`, 1, 5, "unclosed raw string literal"},
{"'foo\n", 0, 5, "unclosed string literal: newline appears in non triple-quoted"},
{"R'foo\n", 1, 6, "unclosed raw string literal: newline appears in non triple-quoted"},
{"R'foo\\", 5, 6, "invalid escape sequence: \\<eof>"},
{`"\400"`, 1, 3, "invalid escape sequence: \\4"},
{`"\3xx"`, 1, 4, "invalid escape sequence: octal escape sequence must be follwed by 3 octal digits"},
{`"\xZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
{`"\XZZ"`, 1, 4, "invalid escape sequence: hex escape sequence must be follwed by 2 hex digits"},
{`B"\u0031"`, 2, 4, "invalid escape sequence: \\u is not allowed in bytes literal"},
{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
{`B"\U00000031"`, 2, 4, "invalid escape sequence: \\U is not allowed in bytes literal"},
{`"\UFFFFFFFF"`, 1, 11, "invalid escape sequence: invalid code point: U+FFFFFFFF"},
{"``", 0, 2, "invalid empty identifier"},
{"1from", 1, 1, "number literal cannot follow identifier without any spaces"},
{`'''0`, 0, 4, "unclosed triple-quoted string literal"},
{`/*`, 0, 2, "unclosed comment"},
}

func testLexer(t *testing.T, source string, tokens []*Token) {
Expand Down Expand Up @@ -240,7 +241,10 @@ func TestLexerWrong(t *testing.T) {
t.Errorf("expected error message: %q, but: %q", tc.message, e.Message)
}
if e.Position.Pos != tc.pos {
t.Errorf("expected error position: %v, but: %v", tc.pos, e.Position.Pos)
t.Errorf("expected error position (pos): %v, but: %v", tc.pos, e.Position.Pos)
}
if e.Position.End != tc.end {
t.Errorf("expected error position (end): %v, but: %v", tc.end, e.Position.End)
}
} else {
t.Errorf("unexpected error: %v", err)
Expand Down

0 comments on commit e50eded

Please sign in to comment.