Skip to content

Commit

Permalink
Disallow surrogate halves in string and char literals (#10443)
Browse files: browse the repository at this point in the history
  • Loading branch information
HertzDevil authored Mar 19, 2021
1 parent 3386930 commit 395d0bf
Show file tree
Hide file tree
Showing 4 changed files with 14 additions and 1 deletion.
4 changes: 4 additions & 0 deletions spec/compiler/lexer/lexer_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -528,6 +528,10 @@ describe "Lexer" do
assert_syntax_error "'\\uFEDZ'", "expected hexadecimal character in unicode escape"
assert_syntax_error "'\\u{}'", "expected hexadecimal character in unicode escape"
assert_syntax_error "'\\u{110000}'", "invalid unicode codepoint (too large)"
assert_syntax_error "'\\uD800'", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "'\\uDFFF'", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "'\\u{D800}'", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "'\\u{DFFF}'", "invalid unicode codepoint (surrogate half)"
assert_syntax_error ":+1", "unexpected token"

assert_syntax_error "'\\1'", "invalid char escape sequence"
Expand Down
4 changes: 4 additions & 0 deletions spec/compiler/lexer/lexer_string_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -288,6 +288,10 @@ describe "Lexer string" do
assert_syntax_error "\"\\uFEDZ\"", "expected hexadecimal character in unicode escape"
assert_syntax_error "\"\\u{}\"", "expected hexadecimal character in unicode escape"
assert_syntax_error "\"\\u{110000}\"", "invalid unicode codepoint (too large)"
assert_syntax_error "\"\\uD800\"", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "\"\\uDFFF\"", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "\"\\u{D800}\"", "invalid unicode codepoint (surrogate half)"
assert_syntax_error "\"\\u{DFFF}\"", "invalid unicode codepoint (surrogate half)"

it "lexes backtick string" do
lexer = Lexer.new(%(`hello`))
Expand Down
2 changes: 1 addition & 1 deletion spec/std/string/utf16_spec.cr
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,7 @@ describe "String UTF16" do
end

it "in the range U+D800..U+DFFF" do
- encoded = "\u{D800}\u{DFFF}".to_utf16
+ encoded = String.new(Bytes[0xED, 0xA0, 0x80, 0xED, 0xBF, 0xBF]).to_utf16
encoded.should eq(Slice[0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16, 0xFFFD_u16])
encoded.unsafe_fetch(encoded.size).should eq 0_u16
end
Expand Down
5 changes: 5 additions & 0 deletions src/compiler/crystal/syntax/lexer.cr
Original file line number Diff line number Diff line change
Expand Up @@ -2739,6 +2739,9 @@ module Crystal
hex_value = char_to_hex(next_char) { expected_hexacimal_character_in_unicode_escape }
codepoint = 16 * codepoint + hex_value
end
if 0xD800 <= codepoint <= 0xDFFF
raise "invalid unicode codepoint (surrogate half)"
end
codepoint
end

Expand Down Expand Up @@ -2773,6 +2776,8 @@ module Crystal
expected_hexacimal_character_in_unicode_escape
elsif codepoint > 0x10FFFF
raise "invalid unicode codepoint (too large)"
elsif 0xD800 <= codepoint <= 0xDFFF
raise "invalid unicode codepoint (surrogate half)"
end

unless found_space
Expand Down

0 comments on commit 395d0bf

Please sign in to comment.