From 0735010ef8a7b50b0fbfa761d3209c13a19b33fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Sun, 31 Oct 2021 04:14:52 +0100 Subject: [PATCH 1/3] Fix: disallow Unicode bi-directional control characters String literals, symbol names and comments are no longer allowed to contain bi-directional control characters in order to prevent the Trojan Source vulnerability (CVE-2021-42574). --- spec/compiler/lexer/lexer_spec.cr | 12 ++++++++++++ spec/support/syntax.cr | 2 +- src/compiler/crystal/syntax/lexer.cr | 14 +++++++++++--- 3 files changed, 24 insertions(+), 4 deletions(-) diff --git a/spec/compiler/lexer/lexer_spec.cr b/spec/compiler/lexer/lexer_spec.cr index 30910c406354..1c601d7aa87b 100644 --- a/spec/compiler/lexer/lexer_spec.cr +++ b/spec/compiler/lexer/lexer_spec.cr @@ -558,4 +558,16 @@ describe "Lexer" do assert_syntax_error %("\\x1z"), "invalid hex escape" assert_syntax_error %("hi\\) + + # CVE-2021-42574 + describe "trojan source" do + ['\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char| + assert_syntax_error %("#{char}"), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(%w(#{char})), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(:#{char}), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(%i(#{char})), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(##{char}), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(macro foo\n##{char}\nend), "Invalid unicode control character: #{char.dump}" + end + end end diff --git a/spec/support/syntax.cr b/spec/support/syntax.cr index 67634180f565..b27ec339b032 100644 --- a/spec/support/syntax.cr +++ b/spec/support/syntax.cr @@ -130,7 +130,7 @@ class Crystal::ASTNode end def assert_syntax_error(str, message = nil, line = nil, column = nil, metafile = __FILE__, metaline = __LINE__, metaendline = __END_LINE__) - it "says syntax error on 
#{str.inspect}", metafile, metaline, metaendline do + it "says syntax error on #{str.dump}", metafile, metaline, metaendline do begin parse str fail "Expected SyntaxException to be raised", metafile, metaline diff --git a/src/compiler/crystal/syntax/lexer.cr b/src/compiler/crystal/syntax/lexer.cr index 5efa3490cca8..093d2d02d85e 100644 --- a/src/compiler/crystal/syntax/lexer.cr +++ b/src/compiler/crystal/syntax/lexer.cr @@ -1349,9 +1349,7 @@ module Crystal start_pos = current_pos end - while char != '\n' && char != '\0' - char = next_char_no_column_increment - end + skip_comment if doc_buffer = @token.doc_buffer doc_buffer << '\n' @@ -1365,6 +1363,7 @@ module Crystal def skip_comment char = current_char while char != '\n' && char != '\0' + ensure_no_unicode_control char = next_char_no_column_increment end end @@ -2166,6 +2165,7 @@ module Crystal current_char != '#' && current_char != '\r' && current_char != '\n' + ensure_no_unicode_control next_char end @@ -2338,6 +2338,7 @@ module Crystal when '\0' raise "unterminated macro" else + ensure_no_unicode_control char = next_char end end @@ -2909,6 +2910,7 @@ module Crystal sub_start = current_pos + 1 end + ensure_no_unicode_control next_char end @@ -3235,6 +3237,12 @@ module Crystal raise "unknown token: #{current_char.inspect}", @line_number, @column_number end + def ensure_no_unicode_control + if current_char.in?('\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069') + raise "Invalid unicode control character: #{current_char.dump}" + end + end + def set_token_raw_from_start(start) @token.raw = string_range(start) if @wants_raw end From 15ebc9c6f2c69ebef092105dca86b3fd2bf5bca3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Sun, 31 Oct 2021 06:09:08 +0100 Subject: [PATCH 2/3] Disallow bidi control characters anywhere in source code --- spec/compiler/lexer/lexer_spec.cr | 2 ++ src/compiler/crystal/syntax/lexer.cr | 13 +++---------- 2 files changed, 5 
insertions(+), 10 deletions(-) diff --git a/spec/compiler/lexer/lexer_spec.cr b/spec/compiler/lexer/lexer_spec.cr index 1c601d7aa87b..f51ef3694dc8 100644 --- a/spec/compiler/lexer/lexer_spec.cr +++ b/spec/compiler/lexer/lexer_spec.cr @@ -562,9 +562,11 @@ describe "Lexer" do # CVE-2021-42574 describe "trojan source" do ['\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069'].each do |char| + assert_syntax_error %(f#{char}), "Invalid unicode control character: #{char.dump}" assert_syntax_error %("#{char}"), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(%w(#{char})), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(:#{char}), "Invalid unicode control character: #{char.dump}" + assert_syntax_error %(:"#{char}"), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(%i(#{char})), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(##{char}), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(macro foo\n##{char}\nend), "Invalid unicode control character: #{char.dump}" diff --git a/src/compiler/crystal/syntax/lexer.cr b/src/compiler/crystal/syntax/lexer.cr index 093d2d02d85e..e7adc97ee72e 100644 --- a/src/compiler/crystal/syntax/lexer.cr +++ b/src/compiler/crystal/syntax/lexer.cr @@ -1363,7 +1363,6 @@ module Crystal def skip_comment char = current_char while char != '\n' && char != '\0' - ensure_no_unicode_control char = next_char_no_column_increment end end @@ -2165,7 +2164,6 @@ module Crystal current_char != '#' && current_char != '\r' && current_char != '\n' - ensure_no_unicode_control next_char end @@ -2338,7 +2336,6 @@ module Crystal when '\0' raise "unterminated macro" else - ensure_no_unicode_control char = next_char end end @@ -2910,7 +2907,6 @@ module Crystal sub_start = current_pos + 1 end - ensure_no_unicode_control next_char end @@ -3080,6 +3076,9 @@ module Crystal if error = @reader.error ::raise 
InvalidByteSequenceError.new("Unexpected byte 0x#{error.to_s(16)} at position #{@reader.pos}, malformed UTF-8") end + if current_char.in?('\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069') + raise "Invalid unicode control character: #{current_char.dump}" + end char end @@ -3237,12 +3236,6 @@ module Crystal raise "unknown token: #{current_char.inspect}", @line_number, @column_number end - def ensure_no_unicode_control - if current_char.in?('\u202A', '\u202B', '\u202C', '\u202D', '\u202E', '\u2066', '\u2067', '\u2068', '\u2069') - raise "Invalid unicode control character: #{current_char.dump}" - end - end - def set_token_raw_from_start(start) @token.raw = string_range(start) if @wants_raw end From 68d09ed6e60b259ce4536d37975145d321b8df09 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Johannes=20M=C3=BCller?= Date: Sun, 31 Oct 2021 23:42:11 +0100 Subject: [PATCH 3/3] Add specs for escape sequences --- spec/compiler/lexer/lexer_spec.cr | 3 +++ 1 file changed, 3 insertions(+) diff --git a/spec/compiler/lexer/lexer_spec.cr b/spec/compiler/lexer/lexer_spec.cr index f51ef3694dc8..9f2be5dfcc51 100644 --- a/spec/compiler/lexer/lexer_spec.cr +++ b/spec/compiler/lexer/lexer_spec.cr @@ -570,6 +570,9 @@ describe "Lexer" do assert_syntax_error %(%i(#{char})), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(##{char}), "Invalid unicode control character: #{char.dump}" assert_syntax_error %(macro foo\n##{char}\nend), "Invalid unicode control character: #{char.dump}" + + it_lexes_string char.to_s.dump, char.to_s + it_lexes_char char.dump, char end end end