Skip to content

Commit

Permalink
support malformed chars in char literal syntax (#44989)
Browse files Browse the repository at this point in the history
Make the syntax for character literals the same as what is allowed in
single-character string literals.

Alternative to #44765

fixes #25072
  • Loading branch information
simeonschaub authored May 25, 2022
1 parent ba4a4b2 commit 991190f
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 26 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ New language features
* It is now possible to assign to bindings in another module using `setproperty!(::Module, ::Symbol, x)`. ([#44137])
* Slurping in assignments is now also allowed in non-final position. This is
handled via `Base.split_rest`. ([#42902])
* Character literals now support the same syntax allowed in string literals; i.e. the syntax can
represent invalid UTF-8 sequences as allowed by the `Char` type ([#44989]).

Language changes
----------------
Expand Down
7 changes: 7 additions & 0 deletions src/ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
return jl_true;
else if (hd == jl_ast_ctx(fl_ctx)->false_sym && llength(e) == 1)
return jl_false;
else if (hd == fl_ctx->jl_char_sym && llength(e) == 2) {
value_t v = car_(cdr_(e));
if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type))
jl_error("malformed julia char");
uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v));
return jl_box_char(c);
}
}
if (issymbol(hd))
sym = scmsym_to_julia(fl_ctx, hd);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2396,6 +2396,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
#endif

fl_ctx->jl_sym = symbol(fl_ctx, "julia_value");
fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char");

fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR);
vector_setsize(fl_ctx->the_empty_vector, 0);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ struct _fl_context_t {
value_t apply_func, apply_v, apply_e;

value_t jl_sym;
value_t jl_char_sym;
// persistent buffer (avoid repeated malloc/free)
// for julia_extensions.c: normalize
size_t jlbuflen;
Expand Down
50 changes: 50 additions & 0 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,55 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs
return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0])));
}

/*
 * Fold up to three UTF-8 continuation bytes that follow a lead byte into `u`.
 *
 *   s, n - byte buffer and its length
 *   i    - in: index of the lead byte; out: index one past the last byte
 *          consumed, or the index of the byte that stopped decoding
 *   u    - the lead byte already placed in the top 8 bits
 *
 * Continuation bytes are OR-ed into bits 23..16, 15..8 and 7..0 in turn.
 * Decoding stops early when the buffer ends, when the next byte is not of the
 * form 10xxxxxx, or when the lead byte does not call for another byte.
 * Returns the (possibly partial) packed value.
 */
static uint32_t _iterate_continued(uint8_t *s, size_t n, size_t *i, uint32_t u) {
    /* Smallest value of `u` for which the k-th continuation byte is wanted. */
    static const uint32_t lead_floor[3] = { 0xc0000000, 0xe0000000, 0xf0000000 };
    size_t pos = *i;

    if (u >= lead_floor[0]) {
        for (int k = 0; k < 3; k++) {
            pos++;
            if (pos >= n || u < lead_floor[k])
                break;
            uint8_t cont = s[pos];
            if ((cont & 0xc0) != 0x80)
                break; /* not a continuation byte: leave pos pointing at it */
            u |= (uint32_t)cont << (16 - 8 * k);
            if (k == 2)
                pos++; /* full 4-byte sequence: step past the final byte */
        }
    }
    else {
        /* ASCII-range or stray continuation lead: a single-byte character */
        pos++;
    }

    *i = pos;
    return u;
}

/*
 * Interpret the `n` bytes at `s` as exactly one character in the packed
 * 32-bit layout (lead byte in the top 8 bits, continuation bytes below it).
 *
 * Returns the packed value when the whole buffer forms a single character,
 * or (uint32_t)-1 when the buffer is empty, longer than 4 bytes, or has
 * bytes left over after the first character.
 */
static uint32_t _string_only_julia_char(uint8_t *s, size_t n) {
    const uint32_t not_a_char = (uint32_t)-1;

    if (n == 0 || n > 4)
        return not_a_char;

    uint8_t lead = s[0];
    uint32_t packed = (uint32_t)lead << 24;
    size_t consumed = 0;

    if (lead < 0x80 || lead > 0xf7) {
        /* ASCII, or a byte that can never start a sequence: one byte wide */
        consumed = 1;
    }
    else {
        packed = _iterate_continued(s, n, &consumed, packed);
    }

    /* Any unconsumed byte means the input holds more than one character. */
    return consumed < n ? not_a_char : packed;
}

/*
 * flisp builtin `string.only-julia-char`.
 *
 * Takes exactly one argument, which must be a string. If the string's bytes
 * form exactly one character according to _string_only_julia_char, returns
 * the two-element list (julia_char <uint32>) holding the packed value;
 * otherwise returns the flisp false value.
 */
value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) {
    static const char *const fname = "string.only-julia-char";

    argcount(fl_ctx, fname, nargs, 1);
    if (!fl_isstring(fl_ctx, args[0]))
        type_error(fl_ctx, fname, "string", args[0]);

    size_t nbytes = cv_len((cvalue_t*)ptr(args[0]));
    uint8_t *bytes = (uint8_t*)cvalue_data(args[0]);
    uint32_t packed = _string_only_julia_char(bytes, nbytes);

    /* (uint32_t)-1 is the helper's "not exactly one character" sentinel */
    if (packed != (uint32_t)-1)
        return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, packed));
    return fl_ctx->F;
}

static const builtinspec_t julia_flisp_func_info[] = {
{ "skip-ws", fl_skipws },
{ "accum-julia-symbol", fl_accum_julia_symbol },
Expand All @@ -371,6 +420,7 @@ static const builtinspec_t julia_flisp_func_info[] = {
{ "strip-op-suffix", fl_julia_strip_op_suffix },
{ "underscore-symbol?", fl_julia_underscore_symbolp },
{ "string->normsymbol", fl_string2normsymbol },
{ "string.only-julia-char", fl_string_only_julia_char },
{ NULL, NULL }
};

Expand Down
13 changes: 6 additions & 7 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2495,13 +2495,12 @@
(write-char (not-eof-1 (read-char (ts:port s)))
b))
(loop (read-char (ts:port s))))))
(let ((str (unescape-string (io.tostring! b))))
(let ((len (string-length str)))
(if (= len 1)
(string.char str 0)
(if (= len 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters")))))))))
(let* ((str (unescape-string (io.tostring! b)))
(c (string.only-julia-char str)))
(or c
(if (= (string-length str) 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters"))))))))

;; symbol/expression quote
((eq? t ':)
Expand Down
54 changes: 35 additions & 19 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,6 @@ end
@test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42'

# issue #24558
@test_throws ParseError Meta.parse("'\\xff'")
@test_throws ParseError Meta.parse("'\\x80'")
@test_throws ParseError Meta.parse("'ab'")
@test '\u2200' == "\u2200"[1]

@test_throws ParseError Meta.parse("f(2x for x=1:10, y")
Expand Down Expand Up @@ -317,19 +314,16 @@ let p = 15
@test 2p+1 == 31 # not a hex float literal
end

function test_parseerror(str, msg)
try
Meta.parse(str)
@test false
catch e
@test isa(e,ParseError) && e.msg == msg
end
# Assert that parsing `str` throws a `ParseError` equal to `ParseError(msg)`.
# Expands to `@test_throws ParseError(msg) Meta.parse(str)`; both arguments are
# escaped so they are evaluated in the caller's scope.
macro test_parseerror(str, msg)
ex = :(@test_throws ParseError($(esc(msg))) Meta.parse($(esc(str))))
# args[2] of a macro-call expression is its source-location node; overwrite it
# with this macro's own call site — presumably so that a failing test is
# reported at the `@test_parseerror` line rather than at this definition.
ex.args[2] = __source__
return ex
end
test_parseerror("0x", "invalid numeric constant \"0x\"")
test_parseerror("0b", "invalid numeric constant \"0b\"")
test_parseerror("0o", "invalid numeric constant \"0o\"")
test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
@test_parseerror("0x", "invalid numeric constant \"0x\"")
@test_parseerror("0b", "invalid numeric constant \"0b\"")
@test_parseerror("0o", "invalid numeric constant \"0o\"")
@test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
@test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")

# issue #15798
@test Meta.lower(Main, Base.parse_input_line("""
Expand All @@ -345,8 +339,8 @@ test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
""")::Expr) == 23341

# issue #15763
test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")
@test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
@test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")

# issue #15828
@test Meta.lower(Main, Meta.parse("x...")) == Expr(:error, "\"...\" expression outside call")
Expand Down Expand Up @@ -2059,8 +2053,8 @@ end == 1
# issue #29982
@test Meta.parse("'a'") == 'a'
@test Meta.parse("'\U0061'") == 'a'
test_parseerror("''", "invalid empty character literal")
test_parseerror("'abc'", "character literal contains multiple characters")
@test_parseerror("''", "invalid empty character literal")
@test_parseerror("'abc'", "character literal contains multiple characters")

# optional soft scope: #28789, #33864

Expand Down Expand Up @@ -3379,3 +3373,25 @@ f45162(f) = f(x=1)
@test Meta.lower(@__MODULE__, :(global const x::Int)) == Expr(:error, "expected assignment after \"const\"")
@test Meta.lower(@__MODULE__, :(const global x)) == Expr(:error, "expected assignment after \"const\"")
@test Meta.lower(@__MODULE__, :(const global x::Int)) == Expr(:error, "expected assignment after \"const\"")

# Character literals accept the same escape syntax as single-character string
# literals, including byte sequences that are not valid UTF-8 (issue #25072).
@testset "issue 25072" begin
# Escapes that form a single (possibly malformed) character are accepted; the
# bytes fill the UInt32 representation starting from the most significant byte.
@test '\xc0\x80' == reinterpret(Char, 0xc0800000)
@test '\x80' == reinterpret(Char, 0x80000000)
@test '\xff' == reinterpret(Char, 0xff000000)
@test_parseerror "'\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" # == reinterpret(Char, 0xffffffff)
@test '\uffff' == Char(0xffff)
@test '\U00002014' == Char(0x2014)
# Octal escape: a single byte placed in the top 8 bits.
@test '\100' == reinterpret(Char, UInt32(0o100) << 24)
@test_parseerror "'\\100\\42'" "character literal contains multiple characters" # == reinterpret(Char, (UInt32(0o100) << 24) | (UInt32(0o42) << 16))
# Empty literals and literals holding more than one character are parse errors.
@test_parseerror "''" "invalid empty character literal"
@test_parseerror "'\\xff\\xff\\xff\\xff\\xff'" "character literal contains multiple characters"
@test_parseerror "'abcd'" "character literal contains multiple characters"
@test_parseerror "'\\uff\\xff'" "character literal contains multiple characters"
@test_parseerror "'\\xff\\uff'" "character literal contains multiple characters"
@test_parseerror "'\\xffa'" "character literal contains multiple characters"
@test_parseerror "'\\uffffa'" "character literal contains multiple characters"
@test_parseerror "'\\U00002014a'" "character literal contains multiple characters"
@test_parseerror "'\\1000'" "character literal contains multiple characters"
# An unterminated literal is reported as incomplete input, not as an error.
@test Meta.isexpr(Meta.parse("'a"), :incomplete)
# A bare single quote between quotes is the quote character itself.
@test ''' == "'"[1]
end

0 comments on commit 991190f

Please sign in to comment.