From d2c66e9fa9a0b35f9065644e24d80568744a06ce Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Sun, 15 Sep 2024 05:59:52 -0400 Subject: [PATCH] =?UTF-8?q?Allow=20including=20arbitrary=20characters=20in?= =?UTF-8?q?=20strings=20with=20`\u{=E2=80=A6}`=20(#2360)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/compile_error.rs | 19 +++++ src/compile_error_kind.rs | 14 ++++ src/parser.rs | 108 ++++++++++++++++++++------- tests/string.rs | 153 ++++++++++++++++++++++++++++++++++++++ 4 files changed, 269 insertions(+), 25 deletions(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index b2396219b8..94df9e24c9 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -259,6 +259,25 @@ impl Display for CompileError<'_> { ref expected, found, } => write!(f, "Expected {}, but found {found}", List::or(expected)), + UnicodeEscapeCharacter { character } => { + write!(f, "expected hex digit [0-9A-Fa-f] but found `{character}`") + } + UnicodeEscapeDelimiter { character } => write!( + f, + "expected unicode escape sequence delimiter `{{` but found `{character}`" + ), + UnicodeEscapeEmpty => write!(f, "unicode escape sequences must not be empty"), + UnicodeEscapeLength { hex } => write!( + f, + "unicode escape sequence starting with `\\u{{{hex}` longer than six hex digits" + ), + UnicodeEscapeRange { hex } => { + write!( + f, + "unicode escape sequence value `{hex}` greater than maximum valid code point `10FFFF`", + ) + } + UnicodeEscapeUnterminated => write!(f, "unterminated unicode escape sequence"), UnknownAliasTarget { alias, target } => { write!(f, "Alias `{alias}` has an unknown target `{target}`") } diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index 73c3960e26..c99cabac1b 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -120,6 +120,20 @@ pub(crate) enum CompileErrorKind<'src> { expected: Vec, found: TokenKind, }, + UnicodeEscapeCharacter { + character: char, + }, + 
UnicodeEscapeDelimiter { + character: char, + }, + UnicodeEscapeEmpty, + UnicodeEscapeLength { + hex: String, + }, + UnicodeEscapeRange { + hex: String, + }, + UnicodeEscapeUnterminated, UnknownAliasTarget { alias: &'src str, target: &'src str, diff --git a/src/parser.rs b/src/parser.rs index b896cd0b13..da2ff76f3f 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -676,31 +676,7 @@ impl<'run, 'src> Parser<'run, 'src> { }; let cooked = if kind.processes_escape_sequences() { - let mut cooked = String::new(); - let mut escape = false; - for c in unindented.chars() { - if escape { - match c { - 'n' => cooked.push('\n'), - 'r' => cooked.push('\r'), - 't' => cooked.push('\t'), - '\\' => cooked.push('\\'), - '\n' => {} - '"' => cooked.push('"'), - other => { - return Err( - token.error(CompileErrorKind::InvalidEscapeSequence { character: other }), - ); - } - } - escape = false; - } else if c == '\\' { - escape = true; - } else { - cooked.push(c); - } - } - cooked + Self::cook_string(token, &unindented)? 
} else {
       unindented
     };
@@ -724,6 +700,88 @@ impl<'run, 'src> Parser<'run, 'src> {
     ))
   }
 
+  /// Transform escape sequences in string literal `token` with content `text`.
+  ///
+  /// Returns a compile error created from `token` if an escape sequence is invalid.
+  fn cook_string(token: Token<'src>, text: &str) -> CompileResult<'src, String> {
+    enum State {
+      Initial,
+      Backslash,
+      Unicode,
+      UnicodeValue { hex: String },
+    }
+
+    // No escape sequence is shorter than its expansion, so `text.len()` is enough.
+    let mut cooked = String::with_capacity(text.len());
+    let mut state = State::Initial;
+
+    for c in text.chars() {
+      match state {
+        State::Initial => {
+          if c == '\\' {
+            state = State::Backslash;
+          } else {
+            cooked.push(c);
+          }
+        }
+        State::Backslash if c == 'u' => state = State::Unicode,
+        State::Backslash => {
+          match c {
+            'n' => cooked.push('\n'),
+            'r' => cooked.push('\r'),
+            't' => cooked.push('\t'),
+            '\\' => cooked.push('\\'),
+            '\n' => {}
+            '"' => cooked.push('"'),
+            character => {
+              return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character }))
+            }
+          }
+          state = State::Initial;
+        }
+        State::Unicode => match c {
+          '{' => state = State::UnicodeValue { hex: String::new() },
+          character => {
+            return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character }));
+          }
+        },
+        State::UnicodeValue { ref mut hex } => match c {
+          '}' => {
+            if hex.is_empty() {
+              return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty));
+            }
+
+            let codepoint = u32::from_str_radix(hex, 16).expect("`hex` holds 1 to 6 hex digits");
+
+            // `char::from_u32` returns `None` for values above `10FFFF` and for
+            // surrogates (`D800` to `DFFF`); both are reported as a range error.
+            cooked.push(char::from_u32(codepoint).ok_or_else(|| {
+              token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() })
+            })?);
+
+            state = State::Initial;
+          }
+          '0'..='9' | 'A'..='F' | 'a'..='f' => {
+            hex.push(c);
+            if hex.len() > 6 {
+              return Err(token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }));
+            }
+          }
+          _ => {
+            return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c }));
+          }
+        },
+      }
+    }
+
+    // Input ended while an escape sequence was still open.
+    if !matches!(state, State::Initial) {
+      return Err(token.error(CompileErrorKind::UnicodeEscapeUnterminated));
+    }
+
+    Ok(cooked)
+  }
+
   /// Parse a string
literal, e.g. `"FOO"`
   fn parse_string_literal(&mut self) -> CompileResult<'src, StringLiteral<'src>> {
     let (_token, string_literal) = self.parse_string_literal_token()?;
diff --git a/tests/string.rs b/tests/string.rs
index 1803877a31..12bcbac0f7 100644
--- a/tests/string.rs
+++ b/tests/string.rs
@@ -391,3 +391,156 @@ test! {
   ",
   status: EXIT_FAILURE,
 }
+
+#[test]
+fn valid_unicode_escape() {
+  Test::new()
+    .justfile(r#"x := "\u{1f916}\u{1F916}""#)
+    .args(["--evaluate", "x"])
+    .stdout("🤖🤖")
+    .run();
+}
+
+#[test]
+fn unicode_escapes_with_all_hex_digits() {
+  Test::new()
+    .justfile(r#"x := "\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}""#)
+    .args(["--evaluate", "x"])
+    .stdout("\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}")
+    .run();
+}
+
+#[test]
+fn maximum_valid_unicode_escape() {
+  Test::new()
+    .justfile(r#"x := "\u{10FFFF}""#)
+    .args(["--evaluate", "x"])
+    .stdout("\u{10FFFF}")
+    .run();
+}
+
+#[test]
+fn unicode_escape_no_braces() {
+  Test::new()
+    .justfile(r#"x := "\u1234""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected unicode escape sequence delimiter `{` but found `1`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u1234"
+  │      ^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_empty() {
+  Test::new()
+    .justfile(r#"x := "\u{}""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequences must not be empty
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{}"
+  │      ^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_requires_immediate_opening_brace() {
+  Test::new()
+    .justfile(r#"x := "\u {1f916}""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected unicode escape sequence delimiter `{` but found ` `
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u {1f916}"
+  │      ^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_non_hex() {
+  Test::new()
+    .justfile(r#"x := "\u{foo}""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected hex digit [0-9A-Fa-f] but found `o`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{foo}"
+  │      ^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_invalid_character() {
+  Test::new()
+    .justfile(r#"x := "\u{BadBad}""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequence value `BadBad` greater than maximum valid code point `10FFFF`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{BadBad}"
+  │      ^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_too_long() {
+  Test::new()
+    .justfile(r#"x := "\u{FFFFFFFFFF}""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequence starting with `\u{FFFFFFF` longer than six hex digits
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{FFFFFFFFFF}"
+  │      ^^^^^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_unterminated() {
+  Test::new()
+    .justfile(r#"x := "\u{1f917""#)
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unterminated unicode escape sequence
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{1f917"
+  │      ^^^^^^^^^^
+"#,
+    )
+    .run();
+}