diff --git a/NEWS.md b/NEWS.md index 63027b9aabf7c..fd1e959b33572 100644 --- a/NEWS.md +++ b/NEWS.md @@ -7,6 +7,8 @@ New language features * It is now possible to assign to bindings in another module using `setproperty!(::Module, ::Symbol, x)`. ([#44137]) * Slurping in assignments is now also allowed in non-final position. This is handled via `Base.split_rest`. ([#42902]) +* Character literals now support the same syntax allowed in string literals; i.e. the syntax can + represent invalid UTF-8 sequences as allowed by the `Char` type ([#44989]). Language changes ---------------- diff --git a/src/ast.c b/src/ast.c index 14a6e21e54bbe..70ee915475651 100644 --- a/src/ast.c +++ b/src/ast.c @@ -506,6 +506,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m return jl_true; else if (hd == jl_ast_ctx(fl_ctx)->false_sym && llength(e) == 1) return jl_false; + else if (hd == fl_ctx->jl_char_sym && llength(e) == 2) { + value_t v = car_(cdr_(e)); + if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type)) + jl_error("malformed julia char"); + uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v)); + return jl_box_char(c); + } } if (issymbol(hd)) sym = scmsym_to_julia(fl_ctx, hd); diff --git a/src/flisp/flisp.c b/src/flisp/flisp.c index 86421f6d966cf..32c0008025559 100644 --- a/src/flisp/flisp.c +++ b/src/flisp/flisp.c @@ -2396,6 +2396,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize) #endif fl_ctx->jl_sym = symbol(fl_ctx, "julia_value"); + fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char"); fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR); vector_setsize(fl_ctx->the_empty_vector, 0); diff --git a/src/flisp/flisp.h b/src/flisp/flisp.h index 209a4f2d4fcdb..e77904a32d1f2 100644 --- a/src/flisp/flisp.h +++ b/src/flisp/flisp.h @@ -502,6 +502,7 @@ struct _fl_context_t { value_t apply_func, apply_v, apply_e; value_t jl_sym; + value_t jl_char_sym; // persistent buffer (avoid repeated malloc/free) // 
for julia_extensions.c: normalize size_t jlbuflen; diff --git a/src/flisp/julia_extensions.c b/src/flisp/julia_extensions.c index 9fcd3e9789af4..f29e3972755c5 100644 --- a/src/flisp/julia_extensions.c +++ b/src/flisp/julia_extensions.c @@ -361,6 +361,55 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0]))); } +static uint32_t _iterate_continued(uint8_t *s, size_t n, size_t *i, uint32_t u) { /* Continue decoding one character whose lead byte is already packed into the top 8 bits of u. ORs up to three following continuation bytes (10xxxxxx) into u at shifts 16, 8 and 0, stopping at the end of the n-byte buffer, at a non-continuation byte, or once the lead byte's range says the sequence is complete. On return *i is the index of the first byte that was not consumed. */ + if (u < 0xc0000000) { ++*i; return u; } /* lead byte below 0xc0: it stands alone, so consume just that byte */ + uint8_t b; + + if (++*i >= n) return u; + b = s[*i]; // cont byte 1 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b << 16; + + if (++*i >= n || u < 0xe0000000) return u; + b = s[*i]; // cont byte 2 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b << 8; + + if (++*i >= n || u < 0xf0000000) return u; + b = s[*i]; // cont byte 3 + if ((b & 0xc0) != 0x80) return u; + u |= (uint32_t)b; ++*i; + + return u; +} + +static uint32_t _string_only_julia_char(uint8_t *s, size_t n) { /* Returns the packed 32-bit value (first byte in the most significant 8 bits) when the n bytes at s form exactly one character, or (uint32_t)-1 when n is outside 1..4 or bytes remain after the first character. */ + if (!(0 < n && n <= 4)) + return -1; /* empty input, or too long to fit in one 32-bit character */ + size_t i = 0; + uint8_t b = s[i]; + uint32_t u = (uint32_t)b << 24; + if (0x80 <= b && b <= 0xf7) + u = _iterate_continued(s, n, &i, u); + else + i = 1; /* first byte is below 0x80 or above 0xf7: treated as a one-byte character */ + if (i < n) + return -1; /* leftover bytes: the input holds more than one character */ + return u; +} + +value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) { /* flisp builtin "string.only-julia-char": requires exactly one argument, which must be a string. Returns fl_ctx->F when the string does not hold exactly one character; otherwise returns a list of fl_ctx->jl_char_sym followed by the packed value wrapped with mk_uint32. */ + argcount(fl_ctx, "string.only-julia-char", nargs, 1); + if (!fl_isstring(fl_ctx, args[0])) + type_error(fl_ctx, "string.only-julia-char", "string", args[0]); + uint8_t *s = (uint8_t*)cvalue_data(args[0]); + size_t len = cv_len((cvalue_t*)ptr(args[0])); + uint32_t u = _string_only_julia_char(s, len); + if (u == (uint32_t)-1) + return fl_ctx->F; + return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, u)); +} + static const builtinspec_t julia_flisp_func_info[] = { { "skip-ws", fl_skipws }, { "accum-julia-symbol", fl_accum_julia_symbol }, @@ -371,6 +420,7 @@ static const builtinspec_t julia_flisp_func_info[] = { { 
"strip-op-suffix", fl_julia_strip_op_suffix }, { "underscore-symbol?", fl_julia_underscore_symbolp }, { "string->normsymbol", fl_string2normsymbol }, + { "string.only-julia-char", fl_string_only_julia_char }, { NULL, NULL } }; diff --git a/src/julia-parser.scm b/src/julia-parser.scm index 22d677b8bdaa2..38969faf5caf4 100644 --- a/src/julia-parser.scm +++ b/src/julia-parser.scm @@ -2495,13 +2495,12 @@ (write-char (not-eof-1 (read-char (ts:port s))) b)) (loop (read-char (ts:port s)))))) - (let ((str (unescape-string (io.tostring! b)))) - (let ((len (string-length str))) - (if (= len 1) - (string.char str 0) - (if (= len 0) - (error "invalid empty character literal") - (error "character literal contains multiple characters"))))))))) + (let* ((str (unescape-string (io.tostring! b))) + (c (string.only-julia-char str))) + (or c + (if (= (string-length str) 0) + (error "invalid empty character literal") + (error "character literal contains multiple characters")))))))) ;; symbol/expression quote ((eq? 
t ':) diff --git a/test/syntax.jl b/test/syntax.jl index 3009727fa61bf..3d306e8c2d780 100644 --- a/test/syntax.jl +++ b/test/syntax.jl @@ -276,9 +276,6 @@ end @test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42' # issue #24558 -@test_throws ParseError Meta.parse("'\\xff'") -@test_throws ParseError Meta.parse("'\\x80'") -@test_throws ParseError Meta.parse("'ab'") @test '\u2200' == "\u2200"[1] @test_throws ParseError Meta.parse("f(2x for x=1:10, y") @@ -317,19 +314,16 @@ let p = 15 @test 2p+1 == 31 # not a hex float literal end -function test_parseerror(str, msg) - try - Meta.parse(str) - @test false - catch e - @test isa(e,ParseError) && e.msg == msg - end +macro test_parseerror(str, msg) #= Expands to `@test_throws ParseError(msg) Meta.parse(str)`, with both arguments escaped. =# + ex = :(@test_throws ParseError($(esc(msg))) Meta.parse($(esc(str)))) + ex.args[2] = __source__ #= NOTE(review): args[2] of a macrocall Expr is its source-location node; overwriting it with __source__ presumably makes a failure report the caller's line - confirm =# + return ex end -test_parseerror("0x", "invalid numeric constant \"0x\"") -test_parseerror("0b", "invalid numeric constant \"0b\"") -test_parseerror("0o", "invalid numeric constant \"0o\"") -test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"") -test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") +@test_parseerror("0x", "invalid numeric constant \"0x\"") +@test_parseerror("0b", "invalid numeric constant \"0b\"") +@test_parseerror("0o", "invalid numeric constant \"0o\"") +@test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"") +@test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") # issue #15798 @test Meta.lower(Main, Base.parse_input_line(""" @@ -345,8 +339,8 @@ test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"") """)::Expr) == 23341 # issue #15763 -test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1") -test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2") +@test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1") +@test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2") # issue #15828 @test 
Meta.lower(Main, Meta.parse("x...")) == Expr(:error, "\"...\" expression outside call") @@ -2059,8 +2053,8 @@ end == 1 # issue #29982 @test Meta.parse("'a'") == 'a' @test Meta.parse("'\U0061'") == 'a' -test_parseerror("''", "invalid empty character literal") -test_parseerror("'abc'", "character literal contains multiple characters") +@test_parseerror("''", "invalid empty character literal") +@test_parseerror("'abc'", "character literal contains multiple characters") # optional soft scope: #28789, #33864 @@ -3379,3 +3373,25 @@ f45162(f) = f(x=1) @test Meta.lower(@__MODULE__, :(global const x::Int)) == Expr(:error, "expected assignment after \"const\"") @test Meta.lower(@__MODULE__, :(const global x)) == Expr(:error, "expected assignment after \"const\"") @test Meta.lower(@__MODULE__, :(const global x::Int)) == Expr(:error, "expected assignment after \"const\"") + +@testset "issue 25072" begin #= Character literals: an escape sequence that forms a single Char (including byte sequences that are not valid UTF-8) must equal the Char with the matching bit pattern; empty or multi-character literals must raise ParseError with the exact message; an unterminated literal must parse as :incomplete. =# + @test '\xc0\x80' == reinterpret(Char, 0xc0800000) + @test '\x80' == reinterpret(Char, 0x80000000) + @test '\xff' == reinterpret(Char, 0xff000000) + @test_parseerror "'\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" # == reinterpret(Char, 0xffffffff) + @test '\uffff' == Char(0xffff) + @test '\U00002014' == Char(0x2014) + @test '\100' == reinterpret(Char, UInt32(0o100) << 24) + @test_parseerror "'\\100\\42'" "character literal contains multiple characters" # == reinterpret(Char, (UInt32(0o100) << 24) | (UInt32(0o42) << 16)) + @test_parseerror "''" "invalid empty character literal" + @test_parseerror "'\\xff\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" + @test_parseerror "'abcd'" "character literal contains multiple characters" + @test_parseerror "'\\uff\\xff'" "character literal contains multiple characters" + @test_parseerror "'\\xff\\uff'" "character literal contains multiple characters" + @test_parseerror "'\\xffa'" "character literal contains multiple characters" + @test_parseerror "'\\uffffa'" "character literal contains 
multiple characters" + @test_parseerror "'\\U00002014a'" "character literal contains multiple characters" + @test_parseerror "'\\1000'" "character literal contains multiple characters" + @test Meta.isexpr(Meta.parse("'a"), :incomplete) + @test ''' == "'"[1] +end