Skip to content

Commit

Permalink
support malformed chars in char literal syntax
Browse files Browse the repository at this point in the history
Make the syntax for character literals the same as what is allowed in
single-character string literals.

Alternative to #44765

fixes #25072
  • Loading branch information
simeonschaub authored and JeffBezanson committed May 24, 2022
1 parent 8bb973a commit a1ce793
Show file tree
Hide file tree
Showing 7 changed files with 102 additions and 26 deletions.
2 changes: 2 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,8 @@ New language features
* It is now possible to assign to bindings in another module using `setproperty!(::Module, ::Symbol, x)`. ([#44137])
* Slurping in assignments is now also allowed in non-final position. This is
handled via `Base.split_rest`. ([#42902])
* Character literals now support the same syntax allowed in string literals; i.e. the syntax can
represent invalid UTF-8 sequences as allowed by the `Char` type ([#44989]).

Language changes
----------------
Expand Down
7 changes: 7 additions & 0 deletions src/ast.c
Original file line number Diff line number Diff line change
Expand Up @@ -506,6 +506,13 @@ static jl_value_t *scm_to_julia_(fl_context_t *fl_ctx, value_t e, jl_module_t *m
return jl_true;
else if (hd == jl_ast_ctx(fl_ctx)->false_sym && llength(e) == 1)
return jl_false;
else if (hd == fl_ctx->jl_char_sym && llength(e) == 2) {
value_t v = car_(cdr_(e));
if (!(iscprim(v) && cp_class((cprim_t*)ptr(v)) == fl_ctx->uint32type))
jl_error("malformed julia char");
uint32_t c = *(uint32_t*)cp_data((cprim_t*)ptr(v));
return jl_box_char(c);
}
}
if (issymbol(hd))
sym = scmsym_to_julia(fl_ctx, hd);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.c
Original file line number Diff line number Diff line change
Expand Up @@ -2396,6 +2396,7 @@ static void lisp_init(fl_context_t *fl_ctx, size_t initial_heapsize)
#endif

fl_ctx->jl_sym = symbol(fl_ctx, "julia_value");
fl_ctx->jl_char_sym = symbol(fl_ctx, "julia_char");

fl_ctx->the_empty_vector = tagptr(alloc_words(fl_ctx, 1), TAG_VECTOR);
vector_setsize(fl_ctx->the_empty_vector, 0);
Expand Down
1 change: 1 addition & 0 deletions src/flisp/flisp.h
Original file line number Diff line number Diff line change
Expand Up @@ -502,6 +502,7 @@ struct _fl_context_t {
value_t apply_func, apply_v, apply_e;

value_t jl_sym;
value_t jl_char_sym;
// persistent buffer (avoid repeated malloc/free)
// for julia_extensions.c: normalize
size_t jlbuflen;
Expand Down
50 changes: 50 additions & 0 deletions src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -361,6 +361,55 @@ value_t fl_string2normsymbol(fl_context_t *fl_ctx, value_t *args, uint32_t nargs
return symbol(fl_ctx, normalize(fl_ctx, (char*)cvalue_data(args[0])));
}

// Decode the continuation bytes that follow a lead byte into the packed
// 32-bit character representation.
//
//   s, n : byte buffer and its length
//   i    : in/out index; on entry the position of the lead byte, on return
//          the position just past the last byte consumed for this character
//   u    : accumulator; the caller passes the lead byte already placed in
//          bits 31..24
//
// Returns `u` with each accepted continuation byte OR-ed into the next lower
// byte (bits 23..16, then 15..8, then 7..0). Decoding stops early, without
// consuming the offending byte, when the buffer ends, when the lead byte
// does not call for another continuation byte, or when the next byte is not
// of the form 10xxxxxx.
static uint32_t _iterate_continued(uint8_t *s, size_t n, size_t *i, uint32_t u) {
    // smallest accumulator value for which continuation byte k+1 is expected
    static const uint32_t min_lead[3] = { 0xc0000000, 0xe0000000, 0xf0000000 };

    // the lead byte itself is always consumed
    size_t pos = *i + 1;

    if (u >= 0xc0000000) {
        for (int k = 0; k < 3; k++) {
            if (pos >= n || u < min_lead[k])
                break;
            uint8_t byte = s[pos];
            if ((byte & 0xc0) != 0x80)
                break; // not a continuation byte; leave it for the caller
            u |= (uint32_t)byte << (16 - 8 * k);
            ++pos;
        }
    }

    *i = pos;
    return u;
}

// Interpret the `n` bytes at `s` as exactly one character and return its
// packed 32-bit representation (first byte in bits 31..24).
//
// Returns (uint32_t)-1 when the buffer is empty, longer than 4 bytes, or
// when bytes remain after the first character has been decoded.
static uint32_t _string_only_julia_char(uint8_t *s, size_t n) {
    if (n == 0 || n > 4)
        return (uint32_t)-1;

    uint8_t lead = s[0];
    uint32_t bits = (uint32_t)lead << 24;
    size_t consumed = 1; // bytes outside 0x80..0xf7 stand alone

    if (lead >= 0x80 && lead <= 0xf7) {
        consumed = 0;
        bits = _iterate_continued(s, n, &consumed, bits);
    }

    // leftover bytes mean the input holds more than one character
    return consumed < n ? (uint32_t)-1 : bits;
}

// flisp builtin `string.only-julia-char`.
//
// Takes exactly one argument, which must be a string. If the string's bytes
// form a single character (as decided by _string_only_julia_char), returns
// the two-element list (<fl_ctx->jl_char_sym> <uint32 bits>); otherwise
// returns fl_ctx->F.
value_t fl_string_only_julia_char(fl_context_t *fl_ctx, value_t *args, uint32_t nargs) {
    argcount(fl_ctx, "string.only-julia-char", nargs, 1);
    if (!fl_isstring(fl_ctx, args[0]))
        type_error(fl_ctx, "string.only-julia-char", "string", args[0]);

    size_t nbytes = cv_len((cvalue_t*)ptr(args[0]));
    uint8_t *bytes = (uint8_t*)cvalue_data(args[0]);
    uint32_t bits = _string_only_julia_char(bytes, nbytes);

    // (uint32_t)-1 is the helper's "not exactly one character" sentinel
    if (bits != (uint32_t)-1)
        return fl_list2(fl_ctx, fl_ctx->jl_char_sym, mk_uint32(fl_ctx, bits));
    return fl_ctx->F;
}

static const builtinspec_t julia_flisp_func_info[] = {
{ "skip-ws", fl_skipws },
{ "accum-julia-symbol", fl_accum_julia_symbol },
Expand All @@ -371,6 +420,7 @@ static const builtinspec_t julia_flisp_func_info[] = {
{ "strip-op-suffix", fl_julia_strip_op_suffix },
{ "underscore-symbol?", fl_julia_underscore_symbolp },
{ "string->normsymbol", fl_string2normsymbol },
{ "string.only-julia-char", fl_string_only_julia_char },
{ NULL, NULL }
};

Expand Down
13 changes: 6 additions & 7 deletions src/julia-parser.scm
Original file line number Diff line number Diff line change
Expand Up @@ -2495,13 +2495,12 @@
(write-char (not-eof-1 (read-char (ts:port s)))
b))
(loop (read-char (ts:port s))))))
(let ((str (unescape-string (io.tostring! b))))
(let ((len (string-length str)))
(if (= len 1)
(string.char str 0)
(if (= len 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters")))))))))
(let* ((str (unescape-string (io.tostring! b)))
(c (string.only-julia-char str)))
(or c
(if (= (string-length str) 0)
(error "invalid empty character literal")
(error "character literal contains multiple characters"))))))))

;; symbol/expression quote
((eq? t ':)
Expand Down
54 changes: 35 additions & 19 deletions test/syntax.jl
Original file line number Diff line number Diff line change
Expand Up @@ -276,9 +276,6 @@ end
@test Meta.parse("'\"'") == Meta.parse("'\\\"'") == '"' == "\""[1] == '\42'

# issue #24558
@test_throws ParseError Meta.parse("'\\xff'")
@test_throws ParseError Meta.parse("'\\x80'")
@test_throws ParseError Meta.parse("'ab'")
@test '\u2200' == "\u2200"[1]

@test_throws ParseError Meta.parse("f(2x for x=1:10, y")
Expand Down Expand Up @@ -317,19 +314,16 @@ let p = 15
@test 2p+1 == 31 # not a hex float literal
end

function test_parseerror(str, msg)
try
Meta.parse(str)
@test false
catch e
@test isa(e,ParseError) && e.msg == msg
end
# Expands to `@test_throws ParseError(msg) Meta.parse(str)`, i.e. asserts that
# parsing `str` throws a ParseError carrying message `msg`.
macro test_parseerror(str, msg)
ex = :(@test_throws ParseError($(esc(msg))) Meta.parse($(esc(str))))
# Replace the source-location slot of the generated `@test_throws` macrocall
# with this macro's call site, so results point at the caller's line.
ex.args[2] = __source__
return ex
end
test_parseerror("0x", "invalid numeric constant \"0x\"")
test_parseerror("0b", "invalid numeric constant \"0b\"")
test_parseerror("0o", "invalid numeric constant \"0o\"")
test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
@test_parseerror("0x", "invalid numeric constant \"0x\"")
@test_parseerror("0b", "invalid numeric constant \"0b\"")
@test_parseerror("0o", "invalid numeric constant \"0o\"")
@test_parseerror("0x0.1", "hex float literal must contain \"p\" or \"P\"")
@test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")

# issue #15798
@test Meta.lower(Main, Base.parse_input_line("""
Expand All @@ -345,8 +339,8 @@ test_parseerror("0x1.0p", "invalid numeric constant \"0x1.0\"")
""")::Expr) == 23341

# issue #15763
test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")
@test_parseerror("if\nfalse\nend", "missing condition in \"if\" at none:1")
@test_parseerror("if false\nelseif\nend", "missing condition in \"elseif\" at none:2")

# issue #15828
@test Meta.lower(Main, Meta.parse("x...")) == Expr(:error, "\"...\" expression outside call")
Expand Down Expand Up @@ -2059,8 +2053,8 @@ end == 1
# issue #29982
@test Meta.parse("'a'") == 'a'
@test Meta.parse("'\U0061'") == 'a'
test_parseerror("''", "invalid empty character literal")
test_parseerror("'abc'", "character literal contains multiple characters")
@test_parseerror("''", "invalid empty character literal")
@test_parseerror("'abc'", "character literal contains multiple characters")

# optional soft scope: #28789, #33864

Expand Down Expand Up @@ -3379,3 +3373,25 @@ f45162(f) = f(x=1)
@test Meta.lower(@__MODULE__, :(global const x::Int)) == Expr(:error, "expected assignment after \"const\"")
@test Meta.lower(@__MODULE__, :(const global x)) == Expr(:error, "expected assignment after \"const\"")
@test Meta.lower(@__MODULE__, :(const global x::Int)) == Expr(:error, "expected assignment after \"const\"")

# Character literals accept the same escapes as single-character string
# literals, including byte sequences that are not valid UTF-8. Input that is
# empty or holds more than one character must fail to parse with the
# messages checked below.
@testset "issue 25072" begin
@test '\xc0\x80' == reinterpret(Char, 0xc0800000)
@test '\x80' == reinterpret(Char, 0x80000000)
@test '\xff' == reinterpret(Char, 0xff000000)
@test_parseerror "'\\xff\\xff\\xff\\xff'" "character literal contains multiple characters" # == reinterpret(Char, 0xffffffff)
@test '\uffff' == Char(0xffff)
@test '\U00002014' == Char(0x2014)
@test '\100' == reinterpret(Char, UInt32(0o100) << 24)
@test_parseerror "'\\100\\42'" "character literal contains multiple characters" # == reinterpret(Char, (UInt32(0o100) << 24) | (UInt32(0o42) << 16))
@test_parseerror "''" "invalid empty character literal"
@test_parseerror "'\\xff\\xff\\xff\\xff\\xff'" "character literal contains multiple characters"
@test_parseerror "'abcd'" "character literal contains multiple characters"
@test_parseerror "'\\uff\\xff'" "character literal contains multiple characters"
@test_parseerror "'\\xff\\uff'" "character literal contains multiple characters"
@test_parseerror "'\\xffa'" "character literal contains multiple characters"
@test_parseerror "'\\uffffa'" "character literal contains multiple characters"
@test_parseerror "'\\U00002014a'" "character literal contains multiple characters"
@test_parseerror "'\\1000'" "character literal contains multiple characters"
# an unterminated literal is reported as incomplete input, not as an error
@test Meta.isexpr(Meta.parse("'a"), :incomplete)
@test ''' == "'"[1]
end

0 comments on commit a1ce793

Please sign in to comment.