Skip to content

Commit

Permalink
Allow including arbitrary characters in strings with \u{…} (#2360)
Browse files Browse the repository at this point in the history
  • Loading branch information
laniakea64 authored Sep 15, 2024
1 parent d4e1799 commit d2c66e9
Show file tree
Hide file tree
Showing 4 changed files with 269 additions and 25 deletions.
19 changes: 19 additions & 0 deletions src/compile_error.rs
Original file line number Diff line number Diff line change
Expand Up @@ -259,6 +259,25 @@ impl Display for CompileError<'_> {
ref expected,
found,
} => write!(f, "Expected {}, but found {found}", List::or(expected)),
UnicodeEscapeCharacter { character } => {
write!(f, "expected hex digit [0-9A-Fa-f] but found `{character}`")
}
UnicodeEscapeDelimiter { character } => write!(
f,
"expected unicode escape sequence delimiter `{{` but found `{character}`"
),
UnicodeEscapeEmpty => write!(f, "unicode escape sequences must not be empty"),
UnicodeEscapeLength { hex } => write!(
f,
"unicode escape sequence starting with `\\u{{{hex}` longer than six hex digits"
),
UnicodeEscapeRange { hex } => {
write!(
f,
"unicode escape sequence value `{hex}` greater than maximum valid code point `10FFFF`",
)
}
UnicodeEscapeUnterminated => write!(f, "unterminated unicode escape sequence"),
UnknownAliasTarget { alias, target } => {
write!(f, "Alias `{alias}` has an unknown target `{target}`")
}
Expand Down
14 changes: 14 additions & 0 deletions src/compile_error_kind.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,20 @@ pub(crate) enum CompileErrorKind<'src> {
expected: Vec<TokenKind>,
found: TokenKind,
},
UnicodeEscapeCharacter {
character: char,
},
UnicodeEscapeDelimiter {
character: char,
},
UnicodeEscapeEmpty,
UnicodeEscapeLength {
hex: String,
},
UnicodeEscapeRange {
hex: String,
},
UnicodeEscapeUnterminated,
UnknownAliasTarget {
alias: &'src str,
target: &'src str,
Expand Down
108 changes: 83 additions & 25 deletions src/parser.rs
Original file line number Diff line number Diff line change
Expand Up @@ -676,31 +676,7 @@ impl<'run, 'src> Parser<'run, 'src> {
};

let cooked = if kind.processes_escape_sequences() {
let mut cooked = String::new();
let mut escape = false;
for c in unindented.chars() {
if escape {
match c {
'n' => cooked.push('\n'),
'r' => cooked.push('\r'),
't' => cooked.push('\t'),
'\\' => cooked.push('\\'),
'\n' => {}
'"' => cooked.push('"'),
other => {
return Err(
token.error(CompileErrorKind::InvalidEscapeSequence { character: other }),
);
}
}
escape = false;
} else if c == '\\' {
escape = true;
} else {
cooked.push(c);
}
}
cooked
Self::cook_string(token, &unindented)?
} else {
unindented
};
Expand All @@ -724,6 +700,88 @@ impl<'run, 'src> Parser<'run, 'src> {
))
}

// Transform escape sequences in from string literal `token` with content `text`
fn cook_string(token: Token<'src>, text: &str) -> CompileResult<'src, String> {
#[derive(PartialEq, Eq)]
enum State {
Initial,
Backslash,
Unicode,
UnicodeValue { hex: String },
}

let mut cooked = String::new();

let mut state = State::Initial;

for c in text.chars() {
match state {
State::Initial => {
if c == '\\' {
state = State::Backslash;
} else {
cooked.push(c);
}
}
State::Backslash if c == 'u' => {
state = State::Unicode;
}
State::Backslash => {
match c {
'n' => cooked.push('\n'),
'r' => cooked.push('\r'),
't' => cooked.push('\t'),
'\\' => cooked.push('\\'),
'\n' => {}
'"' => cooked.push('"'),
character => {
return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character }))
}
}
state = State::Initial;
}
State::Unicode => match c {
'{' => {
state = State::UnicodeValue { hex: String::new() };
}
character => {
return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character }));
}
},
State::UnicodeValue { ref mut hex } => match c {
'}' => {
if hex.is_empty() {
return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty));
}

let codepoint = u32::from_str_radix(hex, 16).unwrap();

cooked.push(char::from_u32(codepoint).ok_or_else(|| {
token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() })
})?);

state = State::Initial;
}
'0'..='9' | 'A'..='F' | 'a'..='f' => {
hex.push(c);
if hex.len() > 6 {
return Err(token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }));
}
}
_ => {
return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c }));
}
},
}
}

if state != State::Initial {
return Err(token.error(CompileErrorKind::UnicodeEscapeUnterminated));
}

Ok(cooked)
}

/// Parse a string literal, e.g. `"FOO"`
fn parse_string_literal(&mut self) -> CompileResult<'src, StringLiteral<'src>> {
let (_token, string_literal) = self.parse_string_literal_token()?;
Expand Down
153 changes: 153 additions & 0 deletions tests/string.rs
Original file line number Diff line number Diff line change
Expand Up @@ -391,3 +391,156 @@ test! {
",
status: EXIT_FAILURE,
}

/// `\u{…}` escapes are decoded regardless of the case of their hex digits:
/// both spellings of U+1F916 must evaluate to the same character.
#[test]
fn valid_unicode_escape() {
  Test::new()
    .justfile(r#"x := "\u{1f916}\u{1F916}""#)
    .args(["--evaluate", "x"])
    .stdout("🤖🤖")
    .run();
}

/// Every hex digit, in both upper and lower case, is accepted inside a
/// `\u{…}` escape, as are escapes of differing lengths.
#[test]
fn unicode_escapes_with_all_hex_digits() {
  // The raw string hands the escapes to `just` verbatim, while the expected
  // output relies on Rust decoding the very same escapes.
  let justfile = r#"x := "\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}""#;
  let expected = "\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}";

  Test::new()
    .justfile(justfile)
    .args(["--evaluate", "x"])
    .stdout(expected)
    .run();
}

/// The largest code point, U+10FFFF, is still a valid escape value.
#[test]
fn maximum_valid_unicode_escape() {
  let justfile = r#"x := "\u{10FFFF}""#;
  let expected = "\u{10FFFF}";

  Test::new()
    .justfile(justfile)
    .args(["--evaluate", "x"])
    .stdout(expected)
    .run();
}

/// A `\u` escape whose value is not wrapped in braces is rejected.
#[test]
fn unicode_escape_no_braces() {
  Test::new()
    .justfile("x := \"\\u1234\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: expected unicode escape sequence delimiter `{` but found `1`
 ——▶ justfile:1:6
  │
1 │ x := "\u1234"
  │      ^^^^^^^^
"#,
    )
    .run();
}

/// A `\u{}` escape without any hex digits is rejected.
#[test]
fn unicode_escape_empty() {
  Test::new()
    .justfile("x := \"\\u{}\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: unicode escape sequences must not be empty
 ——▶ justfile:1:6
  │
1 │ x := "\u{}"
  │      ^^^^^^
"#,
    )
    .run();
}

/// The opening brace must directly follow `\u`; whitespace in between is
/// rejected.
#[test]
fn unicode_escape_requires_immediate_opening_brace() {
  Test::new()
    .justfile("x := \"\\u {1f916}\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: expected unicode escape sequence delimiter `{` but found ` `
 ——▶ justfile:1:6
  │
1 │ x := "\u {1f916}"
  │      ^^^^^^^^^^^^
"#,
    )
    .run();
}

/// The first character inside the braces that is not a hex digit is reported
/// (`f` is a hex digit, so the error names `o`).
#[test]
fn unicode_escape_non_hex() {
  Test::new()
    .justfile("x := \"\\u{foo}\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: expected hex digit [0-9A-Fa-f] but found `o`
 ——▶ justfile:1:6
  │
1 │ x := "\u{foo}"
  │      ^^^^^^^^^
"#,
    )
    .run();
}

/// A six-digit value above the maximum code point `10FFFF` is rejected.
#[test]
fn unicode_escape_invalid_character() {
  Test::new()
    .justfile("x := \"\\u{BadBad}\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: unicode escape sequence value `BadBad` greater than maximum valid code point `10FFFF`
 ——▶ justfile:1:6
  │
1 │ x := "\u{BadBad}"
  │      ^^^^^^^^^^^^
"#,
    )
    .run();
}

/// An escape with more than six hex digits is rejected; the error quotes the
/// escape up to and including the seventh digit.
#[test]
fn unicode_escape_too_long() {
  Test::new()
    .justfile("x := \"\\u{FFFFFFFFFF}\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: unicode escape sequence starting with `\u{FFFFFFF` longer than six hex digits
 ——▶ justfile:1:6
  │
1 │ x := "\u{FFFFFFFFFF}"
  │      ^^^^^^^^^^^^^^^^
"#,
    )
    .run();
}

/// A string that ends before the closing `}` of an escape is rejected.
#[test]
fn unicode_escape_unterminated() {
  Test::new()
    .justfile("x := \"\\u{1f917\"")
    .args(["--evaluate", "x"])
    .status(1)
    .stderr(
      r#"
error: unterminated unicode escape sequence
 ——▶ justfile:1:6
  │
1 │ x := "\u{1f917"
  │      ^^^^^^^^^^
"#,
    )
    .run();
}

0 comments on commit d2c66e9

Please sign in to comment.