From 7d3c9677b0941b162366e8ef6a57fdd5c70b42bf Mon Sep 17 00:00:00 2001
From: KristofferC
Date: Thu, 18 Aug 2022 15:42:55 +0200
Subject: [PATCH] get rid of the startpos field

This makes untokenization slightly more annoying: a token no longer records
its start byte, so callers must recover it from the end byte of the previous
token.

---
 src/tokenize.jl  |  36 +----
 test/tokenize.jl | 388 +++++++++++++++++++++++++++--------------------
 2 files changed, 231 insertions(+), 193 deletions(-)

diff --git a/src/tokenize.jl b/src/tokenize.jl
index df7d8e62..3c762320 100644
--- a/src/tokenize.jl
+++ b/src/tokenize.jl
@@ -25,32 +25,23 @@ TOKEN_ERROR_DESCRIPTION = Dict{Kind, String}(
 struct Token
     kind::Kind
     # Offsets into a string or buffer
-    startbyte::Int # The byte where the token start in the buffer
     endbyte::Int # The byte where the token ended in the buffer
     dotop::Bool
     suffix::Bool
 end
-function Token(kind::Kind, startbyte::Int, endbyte::Int)
-    Token(kind, startbyte, endbyte, false, false)
+function Token(kind::Kind, endbyte::Int)
+    Token(kind, endbyte, false, false)
 end
-Token() = Token(K"error", 0, 0, false, false)
+Token() = Token(K"error", 0, false, false)
 
 const EMPTY_TOKEN = Token()
 
 kind(t::Token) = t.kind
-startbyte(t::Token) = t.startbyte
 endbyte(t::Token) = t.endbyte
 
-function untokenize(t::Token, str::String)
-    String(codeunits(str)[1 .+ (t.startbyte:t.endbyte)])
-end
 
-function Base.show(io::IO, t::Token)
-    print(io, rpad(string(startbyte(t), "-", endbyte(t)), 11, " "))
-    print(io, rpad(kind(t), 15, " "))
-end
 
 #-------------------------------------------------------------------------------
 # Lexer
@@ -77,9 +68,7 @@ Ideally a lexer is stateless but some state is needed here for:
 """
 mutable struct Lexer{IO_t <: IO}
     io::IO_t
-    token_startpos::Int
-
     last_token::Kind
     string_states::Vector{StringState}
     chars::Tuple{Char,Char,Char,Char}
@@ -156,13 +145,6 @@ Return the latest `Token`'s starting position.
 """
 startpos(l::Lexer) = l.token_startpos
 
-"""
-    startpos!(l::Lexer, i::Integer)
-
-Set a new starting position.
-"""
-startpos!(l::Lexer, i::Integer) = l.token_startpos = i
-
 """
     peekchar(l::Lexer)
 
 Returns the next character without changing the lexer's state.
 """
 peekchar(l::Lexer) = l.chars[2]
 
 """
-dpeekchar(l::Lexer)
+    dpeekchar(l::Lexer)
 
 Returns the next two characters without changing the lexer's state.
 """
 dpeekchar(l::Lexer) = l.chars[2], l.chars[3]
 
 """
-peekchar3(l::Lexer)
+    peekchar3(l::Lexer)
 
 Returns the next three characters without changing the lexer's state.
 """
@@ -198,8 +180,6 @@ Determine whether the end of the lexer's underlying buffer has been reached.
 """
 Base.eof(l::Lexer) = eof(l.io)
 
-Base.seek(l::Lexer, pos) = seek(l.io, pos)
-
 """
     start_token!(l::Lexer)
 
@@ -215,9 +195,6 @@
 
 Returns the next character and increments the current position.
""" -function readchar end - - function readchar(l::Lexer) c = readchar(l.io) l.chars = (l.chars[2], l.chars[3], l.chars[4], c) @@ -271,8 +248,7 @@ function emit(l::Lexer, kind::Kind) suffix = true end end - - tok = Token(kind, startpos(l), position(l) - 1, l.dotop, suffix) + tok = Token(kind, position(l) - 1, l.dotop, suffix) l.dotop = false l.last_token = kind diff --git a/test/tokenize.jl b/test/tokenize.jl index c0e81278..25e2be2c 100644 --- a/test/tokenize.jl +++ b/test/tokenize.jl @@ -13,12 +13,30 @@ using JuliaSyntax: using JuliaSyntax.Tokenize: Tokenize, tokenize, - untokenize, Token tok(str, i = 1) = collect(tokenize(str))[i] -strtok(str) = untokenize.(collect(tokenize(str)), str) +function untokenize(tok::Token, str, offset=0) + String(codeunits(str)[offset+1:tok.endbyte+1]) +end + +function strtok(str::AbstractString) + tokens = collect(tokenize(str)) + return strtok(tokens, str) +end + +function strtok(tokens::AbstractVector{Token}, str::AbstractString) + strs = String[] + + offset = 0 + for tok in tokens + push!(strs, untokenize(tok, str, offset)) + offset = tok.endbyte+1 + end + return strs +end + @testset "tokens" begin for s in ["a", IOBuffer("a")] @@ -36,13 +54,14 @@ end # testset @testset "tokenize unicode" begin str = "𝘋 =2β" for s in [str, IOBuffer(str)] - l = tokenize(s) kinds = [K"Identifier", K"Whitespace", K"=", K"Integer", K"Identifier", K"EndMarker"] token_strs = ["𝘋", " ", "=", "2", "β", ""] - for (i, n) in enumerate(l) + tokens = collect(tokenize(s)) + strs = strtok(tokens, str) + for (i, n) in enumerate(kinds) @test kind(n) == kinds[i] - @test untokenize(n, str) == token_strs[i] + @test strs[i] == token_strs[i] end end end # testset @@ -136,10 +155,11 @@ end # testset end @testset "roundtrippability" begin - @test join(untokenize.(collect(tokenize(str)), str)) == str + @test join(strtok(str)) == str end - @test all((t.endbyte - t.startbyte + 1)==sizeof(untokenize(t, str)) for t in tokenize(str)) + # TODO: Rewrite based on the fact that `.startbyte` no longer + # @test all((t.endbyte - t.startbyte + 1)==sizeof(untokenize(t, str)) for t in tokenize(str)) end # testset @testset "issue 5, '..'" begin @@ -226,8 +246,8 @@ end ImageMagick.save(fn, reinterpret(ARGB32, [0xf0884422]'')) D = ImageMagick.load(fn) """ - tokens = collect(tokenize(str)) - @test string(untokenize(tokens[16], str))==string(untokenize(tokens[17], str))=="'" + strs = strtok(str) + @test strs[16] == strs[17] == "'" test_roundtrip("'a'", K"Char", "'a'") test_roundtrip("''", K"Char", "''") @@ -305,218 +325,259 @@ end @test tok("0b0101").kind==K"BinInt" end -@testset "show" begin - io = IOBuffer() - show(io, collect(tokenize("\"abc\nd\"ef"))[2]) - @test String(take!(io)) == "1-5 String " +function check_lexing(str, results::Vector{Pair{Kind, String}}) + offset = 0 + ts = collect(tokenize(str)) + for (tok, res) in zip(ts, results) + @test tok.kind == res[1] + @test untokenize(tok, str, offset) == res[2] + offset = tok.endbyte+1 + end end -~(tok::Token, t::Tuple) = tok.kind == t[1] && untokenize(tok, t[3]) == t[2] - @testset "raw strings" begin str = raw""" str"x $ \ y" """ - ts = collect(tokenize(str)) - @test ts[1] ~ (K"Whitespace" , " " , str) - @test ts[2] ~ (K"Identifier" , "str" , str) - @test ts[3] ~ (K"\"" , "\"" , str) - @test ts[4] ~ (K"String" , "x \$ \\ y", str) - @test ts[5] ~ (K"\"" , "\"" , str) - @test ts[6] ~ (K"Whitespace" , " " , str) - @test ts[7] ~ (K"EndMarker" , "" , str) + + check_lexing(str, + [ + K"Whitespace" => " " , + K"Identifier" => "str" , + K"\"" => "\"" , + K"String" 
=> "x \$ \\ y", + K"\"" => "\"" , + K"Whitespace" => " " , + K"EndMarker" => "" , + ]) str = raw"""`x $ \ y`""" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"`" , "`" , str) - @test ts[2] ~ (K"CmdString" , "x \$ \\ y" , str) - @test ts[3] ~ (K"`" , "`" , str) - @test ts[4] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"`" => "`" , + K"CmdString" => "x \$ \\ y", + K"`" => "`" , + K"EndMarker" => "" , + ]) # str"\\" str = "str\"\\\\\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"Identifier" , "str" , str) - @test ts[2] ~ (K"\"" , "\"" , str) - @test ts[3] ~ (K"String" , "\\\\" , str) - @test ts[4] ~ (K"\"" , "\"" , str) - @test ts[5] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"Identifier" => "str" , + K"\"" => "\"" , + K"String" => "\\\\", + K"\"" => "\"" , + K"EndMarker" => "" , + ]) # str"\\\"" str = "str\"\\\\\\\"\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"Identifier" , "str" , str) - @test ts[2] ~ (K"\"" , "\"" , str) - @test ts[3] ~ (K"String" , "\\\\\\\"" , str) - @test ts[4] ~ (K"\"" , "\"" , str) - @test ts[5] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"Identifier" => "str" , + K"\"" => "\"" , + K"String" => "\\\\\\\"", + K"\"" => "\"" , + K"EndMarker" => "" , + ]) # Contextual keywords and operators allowed as raw string prefixes - str = raw""" var"x $ \ y" """ - ts = collect(tokenize(str)) - @test ts[2] ~ (K"var" , "var", str) - @test ts[4] ~ (K"String" , "x \$ \\ y", str) - - str = raw""" outer"x $ \ y" """ - ts = collect(tokenize(str)) - @test ts[2] ~ (K"outer" , "outer", str) - @test ts[4] ~ (K"String" , "x \$ \\ y", str) - - str = raw""" isa"x $ \ y" """ - ts = collect(tokenize(str)) - @test ts[2] ~ (K"isa" , "isa", str) - @test ts[4] ~ (K"String" , "x \$ \\ y", str) + str = raw"""var"x $ \ y" """ + check_lexing(str, + [ + K"var" => "var", + K"\"" => "\"", + K"String" => "x \$ \\ y", + ]) + + str = raw"""outer"x $ \ y" """ + check_lexing(str, + [ + K"outer" => "outer", + K"\"" => "\"", + K"String" => "x \$ \\ y", + ]) + + str = raw"""isa"x $ \ y" """ + check_lexing(str, + [ + K"isa" => "isa", + K"\"" => "\"", + K"String" => "x \$ \\ y", + ]) end @testset "string escaped newline whitespace" begin str = "\"x\\\n \ty\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"\"", "\"", str) - @test ts[2] ~ (K"String", "x", str) - @test ts[3] ~ (K"Whitespace", "\\\n \t", str) - @test ts[4] ~ (K"String", "y", str) - @test ts[5] ~ (K"\"", "\"", str) + check_lexing(str, + [ + K"\""=> "\"", + K"String"=> "x", + K"Whitespace"=> "\\\n \t", + K"String"=> "y", + K"\""=> "\"", + ]) # No newline escape for raw strings str = "r\"x\\\ny\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"Identifier", "r", str) - @test ts[2] ~ (K"\"", "\"", str) - @test ts[3] ~ (K"String", "x\\\ny", str) - @test ts[4] ~ (K"\"", "\"", str) + check_lexing(str, + [ + K"Identifier"=> "r", + K"\""=> "\"", + K"String"=> "x\\\ny", + K"\""=> "\"", + ]) end @testset "triple quoted string line splitting" begin str = "\"\"\"\nx\r\ny\rz\n\r\"\"\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"\"\"\"" , "\"\"\"", str) - @test ts[2] ~ (K"String" , "\n", str) - @test ts[3] ~ (K"String" , "x\r\n", str) - @test ts[4] ~ (K"String" , "y\r", str) - @test ts[5] ~ (K"String" , "z\n", str) - @test ts[6] ~ (K"String" , "\r", str) - @test ts[7] ~ (K"\"\"\"" , "\"\"\"", str) + check_lexing(str, + [ + K"\"\"\"" => "\"\"\"", + K"String" => "\n", + K"String" => "x\r\n", + K"String" => "y\r", + K"String" => "z\n", + K"String" => "\r", + K"\"\"\"" => "\"\"\"", + ]) # Also for raw strings str = 
"r\"\"\"\nx\ny\"\"\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"Identifier" , "r", str) - @test ts[2] ~ (K"\"\"\"" , "\"\"\"", str) - @test ts[3] ~ (K"String" , "\n", str) - @test ts[4] ~ (K"String" , "x\n", str) - @test ts[5] ~ (K"String" , "y", str) - @test ts[6] ~ (K"\"\"\"" , "\"\"\"", str) + check_lexing(str, + [ + K"Identifier" => "r", + K"\"\"\"" => "\"\"\"", + K"String" => "\n", + K"String" => "x\n", + K"String" => "y", + K"\"\"\"" => "\"\"\"", + ]) end @testset "interpolation" begin @testset "basic" begin str = "\"\$x \$y\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"\"" , "\"", str) - @test ts[2] ~ (K"$" , "\$", str) - @test ts[3] ~ (K"Identifier" , "x" , str) - @test ts[4] ~ (K"String" , " " , str) - @test ts[5] ~ (K"$" , "\$", str) - @test ts[6] ~ (K"Identifier" , "y" , str) - @test ts[7] ~ (K"\"" , "\"", str) - @test ts[8] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"\"" => "\"", + K"$" => "\$", + K"Identifier" => "x", + K"String" => " ", + K"$" => "\$", + K"Identifier" => "y", + K"\"" => "\"", + K"EndMarker" => "" , + ]) end @testset "nested" begin str = """"str: \$(g("str: \$(h("str"))"))" """ - ts = collect(tokenize(str)) - @test length(ts) == 23 - @test ts[1] ~ (K"\"" , "\"" , str) - @test ts[2] ~ (K"String" , "str: ", str) - @test ts[3] ~ (K"$" , "\$" , str) - @test ts[4] ~ (K"(" , "(" , str) - @test ts[5] ~ (K"Identifier", "g" , str) - @test ts[6] ~ (K"(" , "(" , str) - @test ts[7] ~ (K"\"" , "\"" , str) - @test ts[8] ~ (K"String" , "str: ", str) - @test ts[9] ~ (K"$" , "\$" , str) - @test ts[10] ~ (K"(" , "(" , str) - @test ts[11] ~ (K"Identifier", "h" , str) - @test ts[12] ~ (K"(" , "(" , str) - @test ts[13] ~ (K"\"" , "\"" , str) - @test ts[14] ~ (K"String" , "str" , str) - @test ts[15] ~ (K"\"" , "\"" , str) - @test ts[16] ~ (K")" , ")" , str) - @test ts[17] ~ (K")" , ")" , str) - @test ts[18] ~ (K"\"" , "\"" , str) - @test ts[19] ~ (K")" , ")" , str) - @test ts[20] ~ (K")" , ")" , str) - @test ts[21] ~ (K"\"" , "\"" , str) - @test ts[22] ~ (K"Whitespace", " " , str) - @test ts[23] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"\"" => "\"" , + K"String" => "str: " , + K"$" => "\$" , + K"(" => "(" , + K"Identifier"=> "g" , + K"(" => "(" , + K"\"" => "\"" , + K"String" => "str: " , + K"$" => "\$" , + K"(" => "(" , + K"Identifier"=> "h" , + K"(" => "(" , + K"\"" => "\"" , + K"String" => "str" , + K"\"" => "\"" , + K")" => ")" , + K")" => ")" , + K"\"" => "\"" , + K")" => ")" , + K")" => ")" , + K"\"" => "\"" , + K"Whitespace"=> " " , + K"EndMarker" => "" , + ]) end @testset "duplicate \$" begin str = "\"\$\$\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"\"" , "\"", str) - @test ts[2] ~ (K"$" , "\$", str) - @test ts[3] ~ (K"$" , "\$", str) - @test ts[4] ~ (K"\"" , "\"", str) - @test ts[5] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"\"" => "\"", + K"$" => "\$", + K"$" => "\$", + K"\"" => "\"", + ]) end @testset "Unmatched parens" begin # issue 73: https://github.com/JuliaLang/Tokenize.jl/issues/73 str = "\"\$(fdsf\"" - ts = collect(tokenize(str)) - @test ts[1] ~ (K"\"" , "\"" , str) - @test ts[2] ~ (K"$" , "\$" , str) - @test ts[3] ~ (K"(" , "(" , str) - @test ts[4] ~ (K"Identifier" , "fdsf" , str) - @test ts[5] ~ (K"\"" , "\"" , str) - @test ts[6] ~ (K"EndMarker" , "" , str) + check_lexing(str, + [ + K"\"" => "\"" , + K"$" => "\$" , + K"(" => "(" , + K"Identifier" => "fdsf", + K"\"" => "\"" , + K"EndMarker" => "" , + ]) end @testset "Unicode" begin # issue 178: 
        str = """ "\$uₕx \$(uₕx - ux)" """
-        ts = collect(tokenize(str))
-        @test ts[ 1] ~ (K"Whitespace" , " " , str)
-        @test ts[ 2] ~ (K"\"" , "\"" , str)
-        @test ts[ 3] ~ (K"$" , "\$" , str)
-        @test ts[ 4] ~ (K"Identifier" , "uₕx" , str)
-        @test ts[ 5] ~ (K"String" , " " , str)
-        @test ts[ 6] ~ (K"$" , "\$" , str)
-        @test ts[ 7] ~ (K"(" , "(" , str)
-        @test ts[ 8] ~ (K"Identifier" , "uₕx" , str)
-        @test ts[ 9] ~ (K"Whitespace" , " " , str)
-        @test ts[10] ~ (K"-" , "-" , str)
-        @test ts[11] ~ (K"Whitespace" , " " , str)
-        @test ts[12] ~ (K"Identifier" , "ux" , str)
-        @test ts[13] ~ (K")" , ")" , str)
-        @test ts[14] ~ (K"\"" , "\"" , str)
-        @test ts[15] ~ (K"Whitespace" , " " , str)
-        @test ts[16] ~ (K"EndMarker" , "" , str)
+        check_lexing(str,
+            [
+                K"Whitespace" => " ",
+                K"\""         => "\"",
+                K"$"          => "\$",
+                K"Identifier" => "uₕx",
+                K"String"     => " ",
+                K"$"          => "\$",
+                K"("          => "(",
+                K"Identifier" => "uₕx",
+                K"Whitespace" => " ",
+                K"-"          => "-",
+                K"Whitespace" => " ",
+                K"Identifier" => "ux",
+                K")"          => ")",
+                K"\""         => "\"",
+                K"Whitespace" => " ",
+                K"EndMarker"  => "",
+            ])
    end
 
    @testset "var\"...\" disabled in interpolations" begin
        str = """ "\$var"x" " """
-        ts = collect(tokenize(str))
-        @test ts[ 1] ~ (K"Whitespace" , " " , str)
-        @test ts[ 2] ~ (K"\"" , "\"" , str)
-        @test ts[ 3] ~ (K"$" , "\$" , str)
-        @test ts[ 4] ~ (K"var" , "var" , str)
-        @test ts[ 5] ~ (K"\"" , "\"" , str)
-        @test ts[ 6] ~ (K"Identifier" , "x" , str)
-        @test ts[ 7] ~ (K"\"" , "\"" , str)
-        @test ts[ 8] ~ (K"String" , " " , str)
-        @test ts[ 9] ~ (K"\"" , "\"" , str)
-        @test ts[10] ~ (K"Whitespace" , " " , str)
-        @test ts[11] ~ (K"EndMarker" , "" , str)
+        check_lexing(str,
+            [
+                K"Whitespace" => " ",
+                K"\""         => "\"",
+                K"$"          => "\$",
+                K"var"        => "var",
+                K"\""         => "\"",
+                K"Identifier" => "x",
+                K"\""         => "\"",
+                K"String"     => " ",
+                K"\""         => "\"",
+                K"Whitespace" => " ",
+                K"EndMarker"  => "",
+            ])
    end
 
    @testset "invalid chars after identifier" begin
        str = """ "\$x෴" """
-        ts = collect(tokenize(str))
-        @test ts[4] ~ (K"Identifier" , "x" , str)
-        @test ts[5] ~ (K"ErrorInvalidInterpolationTerminator" , "" , str)
-        @test ts[6] ~ (K"String" , "෴" , str)
-        @test is_error(ts[5].kind)
-        @test ts[5].kind == K"ErrorInvalidInterpolationTerminator"
+        check_lexing(str,
+            [
+                K"Whitespace" => " ",
+                K"\""         => "\"",
+                K"$"          => "\$",
+                K"Identifier" => "x",
+                K"ErrorInvalidInterpolationTerminator" => "",
+                K"String"     => "෴",
+            ])
    end
end
 
@@ -705,7 +766,8 @@ for op in ops
            tokens = collect(tokenize(str))
            exop = expr.head == :call ? expr.args[1] : expr.head
            #println(str)
-            @test Symbol(Tokenize.untokenize(tokens[arity == 1 ? 1 : 3], str)) == exop
+            strs = strtok(tokens, str)
+            @test Symbol(strs[arity == 1 ? 1 : 3]) == exop
        else
            break
        end
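
A minimal usage sketch (not part of the patch), assuming the `Token`/`tokenize`
API above: with `startbyte` gone, a caller that wants token text keeps a
running byte offset itself, since each token starts immediately after the
previous token's `endbyte`. The helper name `token_text` and the sample input
`src` are illustrative, not defined by this patch:

    using JuliaSyntax.Tokenize: tokenize

    # Text of token `t` in `src`, given the byte offset just before `t`.
    token_text(t, src, offset) = String(codeunits(src)[offset+1:t.endbyte+1])

    src = "x + 1"
    offset = 0
    for t in tokenize(src)
        println(rpad(string(t.kind), 12), repr(token_text(t, src, offset)))
        offset = t.endbyte + 1  # next token starts right after this one
    end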