From eeeacc92e2ca9e8d54478bea99150376cdbb262d Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Tue, 10 Sep 2024 19:12:29 -0400 Subject: [PATCH 01/20] Initial implementation of `\u{...}` escape sequence --- src/compile_error.rs | 13 ++++++++++ src/compile_error_kind.rs | 10 ++++++++ src/parser.rs | 51 ++++++++++++++++++++++++++++++++++++++- 3 files changed, 73 insertions(+), 1 deletion(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index b2396219b8..e12cd89790 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -181,6 +181,7 @@ impl Display for CompileError<'_> { "{item_kind} `{item_name}` has invalid attribute `{}`", attribute.name(), ), + InvalidCharacter { hex } => write!(f, "`{hex}` does not represent a valid character"), InvalidEscapeSequence { character } => write!( f, "`\\{}` is not a valid escape sequence", @@ -192,6 +193,14 @@ impl Display for CompileError<'_> { _ => character.escape_default().collect(), } ), + InvalidUEscapeSequence { expected, found } => write!( + f, + "expected {expected} but found {}", + match found { + Some(c) => format!("`{c}`"), + None => String::from("end of string"), + } + ), MismatchedClosingDelimiter { open, open_line, @@ -247,6 +256,10 @@ impl Display for CompileError<'_> { f, "Non-default parameter `{parameter}` follows default parameter" ), + UEscapeSequenceTooLong { hex } => write!( + f, + "more than 6 hex digits in escape sequence starting with `\\u{{{hex}`" + ), UndefinedVariable { variable } => write!(f, "Variable `{variable}` not defined"), UnexpectedCharacter { expected } => write!(f, "Expected character `{expected}`"), UnexpectedClosingDelimiter { close } => { diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index 73c3960e26..bb144a8e77 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -80,9 +80,16 @@ pub(crate) enum CompileErrorKind<'src> { item_name: &'src str, attribute: Attribute<'src>, }, + InvalidCharacter { + hex: String, + }, InvalidEscapeSequence { character: char, }, + InvalidUEscapeSequence { + expected: &'src str, + found: Option, + }, MismatchedClosingDelimiter { close: Delimiter, open: Delimiter, @@ -104,6 +111,9 @@ pub(crate) enum CompileErrorKind<'src> { ShellExpansion { err: shellexpand::LookupError, }, + UEscapeSequenceTooLong { + hex: String, + }, UndefinedVariable { variable: &'src str, }, diff --git a/src/parser.rs b/src/parser.rs index b896cd0b13..c331c4da19 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -678,12 +678,61 @@ impl<'run, 'src> Parser<'run, 'src> { let cooked = if kind.processes_escape_sequences() { let mut cooked = String::new(); let mut escape = false; - for c in unindented.chars() { + let mut chars = unindented.chars(); + while let Some(c) = chars.next() { if escape { match c { 'n' => cooked.push('\n'), 'r' => cooked.push('\r'), 't' => cooked.push('\t'), + 'u' => { + let should_be_opening_brace = chars.next(); + if should_be_opening_brace != Some('{') { + return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { + expected: "{", + found: should_be_opening_brace, + })); + } + let mut hex = String::new(); + loop { + if let Some(c) = chars.next() { + if "0123456789ABCDEFabcdef".contains(c) { + hex.push(c); + if hex.len() > 6 { + return Err(token.error(CompileErrorKind::UEscapeSequenceTooLong { hex })); + } + } else if c == '}' { + if hex.is_empty() { + return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { + expected: "hex digit (0-9A-Fa-f)", + found: Some(c), + })); + } + break; + } else { + return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { + expected: "hex digit (0-9A-Fa-f) or `}`", + found: Some(c), + })); + } + } else { + return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { + expected: "hex digit (0-9A-Fa-f) or `}`", + found: None, + })); + } + } + + // We know this will be Ok(...): + // - empty string and invalid hex digits were filtered out already + // - u32::MAX is 8 hex digits, only up to 6 are allowed here, so this won't overflow + let char_u32 = u32::from_str_radix(hex.as_str(), 16).unwrap(); + + cooked.push(match char::from_u32(char_u32) { + Some(c) => c, + None => return Err(token.error(CompileErrorKind::InvalidCharacter { hex })), + }); + } '\\' => cooked.push('\\'), '\n' => {} '"' => cooked.push('"'), From a5d332026f106e13bd971280dbe3a4c2027a8b4c Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Tue, 10 Sep 2024 19:42:13 -0400 Subject: [PATCH 02/20] Add tests --- tests/string.rs | 135 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 135 insertions(+) diff --git a/tests/string.rs b/tests/string.rs index 1803877a31..9f00b31dd6 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -391,3 +391,138 @@ test! { ", status: EXIT_FAILURE, } + +#[test] +fn valid_unicode_escape() { + Test::new() + .justfile("x := \"\\u{1f916}\\u{1F916}\"") + .args(["--evaluate", "x"]) + .stdout("🤖🤖") + .run(); +} + +#[test] +fn u_escape_no_braces() { + Test::new() + .justfile("x := \"\\u1234\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: expected { but found `1` + ——▶ justfile:1:6 + │ +1 │ x := "\u1234" + │ ^^^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_empty() { + Test::new() + .justfile("x := \"\\u{}\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: expected hex digit (0-9A-Fa-f) but found `}` + ——▶ justfile:1:6 + │ +1 │ x := "\u{}" + │ ^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_requires_immediate_opening_brace() { + Test::new() + .justfile("x := \"\\u {1f916}\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: expected { but found ` ` + ——▶ justfile:1:6 + │ +1 │ x := "\u {1f916}" + │ ^^^^^^^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_non_hex() { + Test::new() + .justfile("x := \"\\u{foo}\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: expected hex digit (0-9A-Fa-f) or `}` but found `o` + ——▶ justfile:1:6 + │ +1 │ x := "\u{foo}" + │ ^^^^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_invalid_character() { + Test::new() + .justfile("x := \"\\u{BadBad}\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: `BadBad` does not represent a valid character + ——▶ justfile:1:6 + │ +1 │ x := "\u{BadBad}" + │ ^^^^^^^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_too_long() { + Test::new() + .justfile("x := \"\\u{FFFFFFFFFF}\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: more than 6 hex digits in escape sequence starting with `\u{FFFFFFF` + ——▶ justfile:1:6 + │ +1 │ x := "\u{FFFFFFFFFF}" + │ ^^^^^^^^^^^^^^^^ +"#, + ) + .run(); +} + +#[test] +fn u_escape_unterminated() { + Test::new() + .justfile("x := \"\\u{1f917\"") + .args(["--evaluate", "x"]) + .status(1) + .stderr( + r#" +error: expected hex digit (0-9A-Fa-f) or `}` but found end of string + ——▶ justfile:1:6 + │ +1 │ x := "\u{1f917" + │ ^^^^^^^^^^ +"#, + ) + .run(); +} From 4c582233eccca3edff4d737f79044dfe240456f6 Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Wed, 11 Sep 2024 14:37:03 -0400 Subject: [PATCH 03/20] Use enum and use `u32::str_from_radix` error messages --- src/compile_error.rs | 13 ++-- src/compile_error_kind.rs | 9 ++- src/parser.rs | 135 +++++++++++++++++++++----------------- tests/string.rs | 10 +-- 4 files changed, 92 insertions(+), 75 deletions(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index e12cd89790..25b3e75bd0 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -193,14 +193,10 @@ impl Display for CompileError<'_> { _ => character.escape_default().collect(), } ), - InvalidUEscapeSequence { expected, found } => write!( - f, - "expected {expected} but found {}", - match found { - Some(c) => format!("`{c}`"), - None => String::from("end of string"), - } - ), + InvalidHex { hex, error } => write!(f, "`{hex}` is not a valid hexadecimal number: {error}"), + InvalidUEscapeSequence { expected, found } => { + write!(f, "expected `{expected}` but found `{found}`") + } MismatchedClosingDelimiter { open, open_line, @@ -284,6 +280,7 @@ impl Display for CompileError<'_> { UnknownStartOfToken => write!(f, "Unknown start of token:"), UnpairedCarriageReturn => write!(f, "Unpaired carriage return"), UnterminatedBacktick => write!(f, "Unterminated backtick"), + UnterminatedEscapeSequence => write!(f, "Unterminated escape sequence"), UnterminatedInterpolation => write!(f, "Unterminated interpolation"), UnterminatedString => write!(f, "Unterminated string"), } diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index bb144a8e77..cb056224c4 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -86,9 +86,13 @@ pub(crate) enum CompileErrorKind<'src> { InvalidEscapeSequence { character: char, }, + InvalidHex { + hex: String, + error: std::num::ParseIntError, + }, InvalidUEscapeSequence { - expected: &'src str, - found: Option, + expected: char, + found: char, }, MismatchedClosingDelimiter { close: Delimiter, @@ -150,6 +154,7 @@ pub(crate) enum CompileErrorKind<'src> { UnknownStartOfToken, UnpairedCarriageReturn, UnterminatedBacktick, + UnterminatedEscapeSequence, UnterminatedInterpolation, UnterminatedString, } diff --git a/src/parser.rs b/src/parser.rs index c331c4da19..b2e28d1ba1 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -676,79 +676,94 @@ impl<'run, 'src> Parser<'run, 'src> { }; let cooked = if kind.processes_escape_sequences() { + #[derive(PartialEq, Eq)] + enum State { + Initial, + Backslash, + Unicode, + UnicodeValue { hex: String }, + } let mut cooked = String::new(); - let mut escape = false; + let mut state = State::Initial; let mut chars = unindented.chars(); while let Some(c) = chars.next() { - if escape { - match c { - 'n' => cooked.push('\n'), - 'r' => cooked.push('\r'), - 't' => cooked.push('\t'), - 'u' => { - let should_be_opening_brace = chars.next(); - if should_be_opening_brace != Some('{') { - return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { - expected: "{", - found: should_be_opening_brace, - })); + match state { + State::Initial => { + if c == '\\' { + state = State::Backslash; + } else { + cooked.push(c); + } + } + State::Backslash => { + match c { + 'n' => cooked.push('\n'), + 'r' => cooked.push('\r'), + 't' => cooked.push('\t'), + 'u' => { + state = State::Unicode; + continue; } - let mut hex = String::new(); - loop { - if let Some(c) = chars.next() { - if "0123456789ABCDEFabcdef".contains(c) { - hex.push(c); - if hex.len() > 6 { - return Err(token.error(CompileErrorKind::UEscapeSequenceTooLong { hex })); - } - } else if c == '}' { - if hex.is_empty() { - return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { - expected: "hex digit (0-9A-Fa-f)", - found: Some(c), - })); - } - break; - } else { - return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { - expected: "hex digit (0-9A-Fa-f) or `}`", - found: Some(c), - })); - } - } else { - return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { - expected: "hex digit (0-9A-Fa-f) or `}`", - found: None, - })); - } + '\\' => cooked.push('\\'), + '\n' => {} + '"' => cooked.push('"'), + other => { + return Err( + token.error(CompileErrorKind::InvalidEscapeSequence { character: other }), + ); } - - // We know this will be Ok(...): - // - empty string and invalid hex digits were filtered out already - // - u32::MAX is 8 hex digits, only up to 6 are allowed here, so this won't overflow - let char_u32 = u32::from_str_radix(hex.as_str(), 16).unwrap(); + } + state = State::Initial; + } + State::Unicode => match c { + '{' => { + state = State::UnicodeValue { hex: String::new() }; + continue; + } + other => { + return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { + expected: '{', + found: other, + })); + } + }, + State::UnicodeValue { ref mut hex } => { + if c == '}' { + let char_u32 = match u32::from_str_radix(hex.as_str(), 16) { + Ok(c) => c, + Err(error) => { + return Err(token.error(CompileErrorKind::InvalidHex { + hex: hex.clone(), + error, + })) + } + }; cooked.push(match char::from_u32(char_u32) { Some(c) => c, - None => return Err(token.error(CompileErrorKind::InvalidCharacter { hex })), + None => { + return Err(token.error(CompileErrorKind::InvalidCharacter { hex: hex.clone() })) + } }); + } else { + hex.push(c); + if hex.len() > 6 { + return Err( + token.error(CompileErrorKind::UEscapeSequenceTooLong { hex: hex.clone() }), + ); + } + continue; } - '\\' => cooked.push('\\'), - '\n' => {} - '"' => cooked.push('"'), - other => { - return Err( - token.error(CompileErrorKind::InvalidEscapeSequence { character: other }), - ); - } + + state = State::Initial; } - escape = false; - } else if c == '\\' { - escape = true; - } else { - cooked.push(c); } } + + if state != State::Initial { + return Err(token.error(CompileErrorKind::UnterminatedEscapeSequence)); + } + cooked } else { unindented diff --git a/tests/string.rs b/tests/string.rs index 9f00b31dd6..494fbe610b 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -409,7 +409,7 @@ fn u_escape_no_braces() { .status(1) .stderr( r#" -error: expected { but found `1` +error: expected `{` but found `1` ——▶ justfile:1:6 │ 1 │ x := "\u1234" @@ -427,7 +427,7 @@ fn u_escape_empty() { .status(1) .stderr( r#" -error: expected hex digit (0-9A-Fa-f) but found `}` +error: `` is not a valid hexadecimal number: cannot parse integer from empty string ——▶ justfile:1:6 │ 1 │ x := "\u{}" @@ -445,7 +445,7 @@ fn u_escape_requires_immediate_opening_brace() { .status(1) .stderr( r#" -error: expected { but found ` ` +error: expected `{` but found ` ` ——▶ justfile:1:6 │ 1 │ x := "\u {1f916}" @@ -463,7 +463,7 @@ fn u_escape_non_hex() { .status(1) .stderr( r#" -error: expected hex digit (0-9A-Fa-f) or `}` but found `o` +error: `foo` is not a valid hexadecimal number: invalid digit found in string ——▶ justfile:1:6 │ 1 │ x := "\u{foo}" @@ -517,7 +517,7 @@ fn u_escape_unterminated() { .status(1) .stderr( r#" -error: expected hex digit (0-9A-Fa-f) or `}` but found end of string +error: Unterminated escape sequence ——▶ justfile:1:6 │ 1 │ x := "\u{1f917" From aecee511d6e8efcae3d860d7ec26c572fa18e038 Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Wed, 11 Sep 2024 14:40:24 -0400 Subject: [PATCH 04/20] Clippy --- src/parser.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index b2e28d1ba1..23a625d4dd 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -685,8 +685,7 @@ impl<'run, 'src> Parser<'run, 'src> { } let mut cooked = String::new(); let mut state = State::Initial; - let mut chars = unindented.chars(); - while let Some(c) = chars.next() { + for c in unindented.chars() { match state { State::Initial => { if c == '\\' { From 5f6d0a940855cdcdb63f8b22ed32d36a503d90c8 Mon Sep 17 00:00:00 2001 From: laniakea64 Date: Sat, 14 Sep 2024 13:36:52 -0400 Subject: [PATCH 05/20] Don't use `u32::from_str_radix` error messages --- src/compile_error.rs | 29 ++++++++++++++++++++--------- src/compile_error_kind.rs | 23 +++++++++++------------ src/parser.rs | 31 +++++++++++++++---------------- tests/string.rs | 6 +++--- 4 files changed, 49 insertions(+), 40 deletions(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index 25b3e75bd0..76993b0b9b 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -181,7 +181,6 @@ impl Display for CompileError<'_> { "{item_kind} `{item_name}` has invalid attribute `{}`", attribute.name(), ), - InvalidCharacter { hex } => write!(f, "`{hex}` does not represent a valid character"), InvalidEscapeSequence { character } => write!( f, "`\\{}` is not a valid escape sequence", @@ -193,10 +192,7 @@ impl Display for CompileError<'_> { _ => character.escape_default().collect(), } ), - InvalidHex { hex, error } => write!(f, "`{hex}` is not a valid hexadecimal number: {error}"), - InvalidUEscapeSequence { expected, found } => { - write!(f, "expected `{expected}` but found `{found}`") - } + InvalidUEscapeSequence { character } => write!(f, "expected `{{` but found `{character}`"), MismatchedClosingDelimiter { open, open_line, @@ -252,10 +248,6 @@ impl Display for CompileError<'_> { f, "Non-default parameter `{parameter}` follows default parameter" ), - UEscapeSequenceTooLong { hex } => write!( - f, - "more than 6 hex digits in escape sequence starting with `\\u{{{hex}`" - ), UndefinedVariable { variable } => write!(f, "Variable `{variable}` not defined"), UnexpectedCharacter { expected } => write!(f, "Expected character `{expected}`"), UnexpectedClosingDelimiter { close } => { @@ -268,6 +260,25 @@ impl Display for CompileError<'_> { ref expected, found, } => write!(f, "Expected {}, but found {found}", List::or(expected)), + UnicodeEscapeCharacter { character } => { + write!(f, "expected hex digit (0-9A-Fa-f), found `{character}`") + } + UnicodeEscapeEmpty => write!(f, "expected hex digit (0-9A-Fa-f) but found `}}`"), + UnicodeEscapeLength { hex } => write!( + f, + "more than 6 hex digits in escape sequence starting with `\\u{{{hex}`" + ), + UnicodeEscapeRange { hex } => { + write!( + f, + "`{hex}` does not represent a valid character{}", + if u32::from_str_radix(hex, 16).unwrap() > 1_114_111 { + ": maximum valid code point is 10FFFF" + } else { + "" + } + ) + } UnknownAliasTarget { alias, target } => { write!(f, "Alias `{alias}` has an unknown target `{target}`") } diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index cb056224c4..27f88c3628 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -80,19 +80,11 @@ pub(crate) enum CompileErrorKind<'src> { item_name: &'src str, attribute: Attribute<'src>, }, - InvalidCharacter { - hex: String, - }, InvalidEscapeSequence { character: char, }, - InvalidHex { - hex: String, - error: std::num::ParseIntError, - }, InvalidUEscapeSequence { - expected: char, - found: char, + character: char, }, MismatchedClosingDelimiter { close: Delimiter, @@ -115,9 +107,6 @@ pub(crate) enum CompileErrorKind<'src> { ShellExpansion { err: shellexpand::LookupError, }, - UEscapeSequenceTooLong { - hex: String, - }, UndefinedVariable { variable: &'src str, }, @@ -134,6 +123,16 @@ pub(crate) enum CompileErrorKind<'src> { expected: Vec, found: TokenKind, }, + UnicodeEscapeCharacter { + character: char, + }, + UnicodeEscapeEmpty, + UnicodeEscapeLength { + hex: String, + }, + UnicodeEscapeRange { + hex: String, + }, UnknownAliasTarget { alias: &'src str, target: &'src str, diff --git a/src/parser.rs b/src/parser.rs index 23a625d4dd..7ff3d29304 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -720,38 +720,37 @@ impl<'run, 'src> Parser<'run, 'src> { continue; } other => { - return Err(token.error(CompileErrorKind::InvalidUEscapeSequence { - expected: '{', - found: other, - })); + return Err( + token.error(CompileErrorKind::InvalidUEscapeSequence { character: other }), + ); } }, State::UnicodeValue { ref mut hex } => { if c == '}' { - let char_u32 = match u32::from_str_radix(hex.as_str(), 16) { - Ok(c) => c, - Err(error) => { - return Err(token.error(CompileErrorKind::InvalidHex { - hex: hex.clone(), - error, - })) - } - }; + if hex.is_empty() { + return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); + } + + let char_u32 = u32::from_str_radix(hex.as_str(), 16).unwrap(); cooked.push(match char::from_u32(char_u32) { Some(c) => c, None => { - return Err(token.error(CompileErrorKind::InvalidCharacter { hex: hex.clone() })) + return Err( + token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }), + ) } }); - } else { + } else if "0123456789ABCDEFabcdef".contains(c) { hex.push(c); if hex.len() > 6 { return Err( - token.error(CompileErrorKind::UEscapeSequenceTooLong { hex: hex.clone() }), + token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }), ); } continue; + } else { + return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c })); } state = State::Initial; diff --git a/tests/string.rs b/tests/string.rs index 494fbe610b..3849a3b8e7 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -427,7 +427,7 @@ fn u_escape_empty() { .status(1) .stderr( r#" -error: `` is not a valid hexadecimal number: cannot parse integer from empty string +error: expected hex digit (0-9A-Fa-f) but found `}` ——▶ justfile:1:6 │ 1 │ x := "\u{}" @@ -463,7 +463,7 @@ fn u_escape_non_hex() { .status(1) .stderr( r#" -error: `foo` is not a valid hexadecimal number: invalid digit found in string +error: expected hex digit (0-9A-Fa-f), found `o` ——▶ justfile:1:6 │ 1 │ x := "\u{foo}" @@ -481,7 +481,7 @@ fn u_escape_invalid_character() { .status(1) .stderr( r#" -error: `BadBad` does not represent a valid character +error: `BadBad` does not represent a valid character: maximum valid code point is 10FFFF ——▶ justfile:1:6 │ 1 │ x := "\u{BadBad}" From 6246f255c6a0d23a15da947ac45481260511097a Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:15:54 +0700 Subject: [PATCH 06/20] Rename InvalidUEscapeSequence to UnicodeEscapeDelimiter --- src/compile_error.rs | 2 +- src/compile_error_kind.rs | 6 +++--- src/parser.rs | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index 76993b0b9b..aac219acf8 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -192,7 +192,6 @@ impl Display for CompileError<'_> { _ => character.escape_default().collect(), } ), - InvalidUEscapeSequence { character } => write!(f, "expected `{{` but found `{character}`"), MismatchedClosingDelimiter { open, open_line, @@ -263,6 +262,7 @@ impl Display for CompileError<'_> { UnicodeEscapeCharacter { character } => { write!(f, "expected hex digit (0-9A-Fa-f), found `{character}`") } + UnicodeEscapeDelimiter { character } => write!(f, "expected `{{` but found `{character}`"), UnicodeEscapeEmpty => write!(f, "expected hex digit (0-9A-Fa-f) but found `}}`"), UnicodeEscapeLength { hex } => write!( f, diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index 27f88c3628..4f39e43006 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -83,9 +83,6 @@ pub(crate) enum CompileErrorKind<'src> { InvalidEscapeSequence { character: char, }, - InvalidUEscapeSequence { - character: char, - }, MismatchedClosingDelimiter { close: Delimiter, open: Delimiter, @@ -126,6 +123,9 @@ pub(crate) enum CompileErrorKind<'src> { UnicodeEscapeCharacter { character: char, }, + UnicodeEscapeDelimiter { + character: char, + }, UnicodeEscapeEmpty, UnicodeEscapeLength { hex: String, diff --git a/src/parser.rs b/src/parser.rs index 7ff3d29304..c4ebbb103b 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -721,7 +721,7 @@ impl<'run, 'src> Parser<'run, 'src> { } other => { return Err( - token.error(CompileErrorKind::InvalidUEscapeSequence { character: other }), + token.error(CompileErrorKind::UnicodeEscapeDelimiter { character: other }), ); } }, From e16462b18eb19fd9150e445e942acc0aa23c9954 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:16:21 +0700 Subject: [PATCH 07/20] Rename other to use shorthand syntax --- src/parser.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index c4ebbb103b..536386a0d2 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -719,10 +719,8 @@ impl<'run, 'src> Parser<'run, 'src> { state = State::UnicodeValue { hex: String::new() }; continue; } - other => { - return Err( - token.error(CompileErrorKind::UnicodeEscapeDelimiter { character: other }), - ); + character => { + return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); } }, State::UnicodeValue { ref mut hex } => { From 7436b6a10ca25e1ec3a7d81c5775eb11f9738b11 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:17:21 +0700 Subject: [PATCH 08/20] char_u32 -> codepoint --- src/parser.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 536386a0d2..3e8d5a26d7 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -729,9 +729,9 @@ impl<'run, 'src> Parser<'run, 'src> { return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); } - let char_u32 = u32::from_str_radix(hex.as_str(), 16).unwrap(); + let codepoint = u32::from_str_radix(hex.as_str(), 16).unwrap(); - cooked.push(match char::from_u32(char_u32) { + cooked.push(match char::from_u32(codepoint) { Some(c) => c, None => { return Err( From 43aaf46c58f4f67d46f3226cb856ae1b2a647965 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:19:36 +0700 Subject: [PATCH 09/20] Use ok_or_else --- src/parser.rs | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 3e8d5a26d7..06eac819db 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -731,14 +731,9 @@ impl<'run, 'src> Parser<'run, 'src> { let codepoint = u32::from_str_radix(hex.as_str(), 16).unwrap(); - cooked.push(match char::from_u32(codepoint) { - Some(c) => c, - None => { - return Err( - token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }), - ) - } - }); + cooked.push(char::from_u32(codepoint).ok_or_else(|| { + token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) + })?); } else if "0123456789ABCDEFabcdef".contains(c) { hex.push(c); if hex.len() > 6 { From c2cb6a5648941011ba100c71d16cb323c1f0321a Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:26:54 +0700 Subject: [PATCH 10/20] Handle \u separately to avoid continue --- src/parser.rs | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 06eac819db..15cadcfed7 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -694,15 +694,18 @@ impl<'run, 'src> Parser<'run, 'src> { cooked.push(c); } } + State::Backslash if c == 'u' => { + state = State::Unicode; + } State::Backslash => { match c { - 'n' => cooked.push('\n'), - 'r' => cooked.push('\r'), - 't' => cooked.push('\t'), - 'u' => { - state = State::Unicode; - continue; + 'n' => { + cooked.push('\n'); } + 'r' => { + cooked.push('\r'); + } + 't' => cooked.push('\t'), '\\' => cooked.push('\\'), '\n' => {} '"' => cooked.push('"'), From ae0f383f304bb5bfa8fff72f1aaa52d209ac1853 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:27:15 +0700 Subject: [PATCH 11/20] Shorthand intitializer --- src/parser.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 15cadcfed7..ff4abf6dd0 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -709,10 +709,8 @@ impl<'run, 'src> Parser<'run, 'src> { '\\' => cooked.push('\\'), '\n' => {} '"' => cooked.push('"'), - other => { - return Err( - token.error(CompileErrorKind::InvalidEscapeSequence { character: other }), - ); + character => { + return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character })); } } state = State::Initial; From d7963482ebcf5e609789639f3ff10f428a7b904b Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:29:08 +0700 Subject: [PATCH 12/20] Avoid continue --- src/parser.rs | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index ff4abf6dd0..f6a85a6374 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -718,7 +718,6 @@ impl<'run, 'src> Parser<'run, 'src> { State::Unicode => match c { '{' => { state = State::UnicodeValue { hex: String::new() }; - continue; } character => { return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); @@ -735,6 +734,8 @@ impl<'run, 'src> Parser<'run, 'src> { cooked.push(char::from_u32(codepoint).ok_or_else(|| { token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) })?); + + state = State::Initial; } else if "0123456789ABCDEFabcdef".contains(c) { hex.push(c); if hex.len() > 6 { @@ -742,12 +743,9 @@ impl<'run, 'src> Parser<'run, 'src> { token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }), ); } - continue; } else { return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c })); } - - state = State::Initial; } } } From 894ee882b43db3f1ecb73e807a843faac0a173f9 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:33:57 +0700 Subject: [PATCH 13/20] Move escape processing into dedicated function --- src/parser.rs | 161 +++++++++++++++++++++++++------------------------- 1 file changed, 82 insertions(+), 79 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index f6a85a6374..d560fa2662 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -676,85 +676,7 @@ impl<'run, 'src> Parser<'run, 'src> { }; let cooked = if kind.processes_escape_sequences() { - #[derive(PartialEq, Eq)] - enum State { - Initial, - Backslash, - Unicode, - UnicodeValue { hex: String }, - } - let mut cooked = String::new(); - let mut state = State::Initial; - for c in unindented.chars() { - match state { - State::Initial => { - if c == '\\' { - state = State::Backslash; - } else { - cooked.push(c); - } - } - State::Backslash if c == 'u' => { - state = State::Unicode; - } - State::Backslash => { - match c { - 'n' => { - cooked.push('\n'); - } - 'r' => { - cooked.push('\r'); - } - 't' => cooked.push('\t'), - '\\' => cooked.push('\\'), - '\n' => {} - '"' => cooked.push('"'), - character => { - return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character })); - } - } - state = State::Initial; - } - State::Unicode => match c { - '{' => { - state = State::UnicodeValue { hex: String::new() }; - } - character => { - return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); - } - }, - State::UnicodeValue { ref mut hex } => { - if c == '}' { - if hex.is_empty() { - return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); - } - - let codepoint = u32::from_str_radix(hex.as_str(), 16).unwrap(); - - cooked.push(char::from_u32(codepoint).ok_or_else(|| { - token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) - })?); - - state = State::Initial; - } else if "0123456789ABCDEFabcdef".contains(c) { - hex.push(c); - if hex.len() > 6 { - return Err( - token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }), - ); - } - } else { - return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c })); - } - } - } - } - - if state != State::Initial { - return Err(token.error(CompileErrorKind::UnterminatedEscapeSequence)); - } - - cooked + Self::cook_string(token, &unindented)? } else { unindented }; @@ -778,6 +700,87 @@ impl<'run, 'src> Parser<'run, 'src> { )) } + // Transform escape sequences in from string literal `token` with content `text` + fn cook_string(token: Token<'src>, text: &str) -> CompileResult<'src, String> { + #[derive(PartialEq, Eq)] + enum State { + Initial, + Backslash, + Unicode, + UnicodeValue { hex: String }, + } + let mut cooked = String::new(); + let mut state = State::Initial; + for c in text.chars() { + match state { + State::Initial => { + if c == '\\' { + state = State::Backslash; + } else { + cooked.push(c); + } + } + State::Backslash if c == 'u' => { + state = State::Unicode; + } + State::Backslash => { + match c { + 'n' => { + cooked.push('\n'); + } + 'r' => { + cooked.push('\r'); + } + 't' => cooked.push('\t'), + '\\' => cooked.push('\\'), + '\n' => {} + '"' => cooked.push('"'), + character => { + return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character })); + } + } + state = State::Initial; + } + State::Unicode => match c { + '{' => { + state = State::UnicodeValue { hex: String::new() }; + } + character => { + return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); + } + }, + State::UnicodeValue { ref mut hex } => { + if c == '}' { + if hex.is_empty() { + return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); + } + + let codepoint = u32::from_str_radix(hex.as_str(), 16).unwrap(); + + cooked.push(char::from_u32(codepoint).ok_or_else(|| { + token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) + })?); + + state = State::Initial; + } else if "0123456789ABCDEFabcdef".contains(c) { + hex.push(c); + if hex.len() > 6 { + return Err(token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() })); + } + } else { + return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c })); + } + } + } + } + + if state != State::Initial { + return Err(token.error(CompileErrorKind::UnterminatedEscapeSequence)); + } + + Ok(cooked) + } + /// Parse a string literal, e.g. `"FOO"` fn parse_string_literal(&mut self) -> CompileResult<'src, StringLiteral<'src>> { let (_token, string_literal) = self.parse_string_literal_token()?; From 0abd691f546a3454d466653f95e267f59b2e7304 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:36:22 +0700 Subject: [PATCH 14/20] Use if instead of match --- src/parser.rs | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index d560fa2662..248baf4a30 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -709,8 +709,11 @@ impl<'run, 'src> Parser<'run, 'src> { Unicode, UnicodeValue { hex: String }, } + let mut cooked = String::new(); + let mut state = State::Initial; + for c in text.chars() { match state { State::Initial => { @@ -725,30 +728,25 @@ impl<'run, 'src> Parser<'run, 'src> { } State::Backslash => { match c { - 'n' => { - cooked.push('\n'); - } - 'r' => { - cooked.push('\r'); - } + 'n' => cooked.push('\n'), + 'r' => cooked.push('\r'), 't' => cooked.push('\t'), '\\' => cooked.push('\\'), '\n' => {} '"' => cooked.push('"'), character => { - return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character })); + return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character })) } } state = State::Initial; } - State::Unicode => match c { - '{' => { + State::Unicode => { + if c == '{' { state = State::UnicodeValue { hex: String::new() }; + } else { + return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character: c })); } - character => { - return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); - } - }, + } State::UnicodeValue { ref mut hex } => { if c == '}' { if hex.is_empty() { From 08761839d8ee9728db02889b5738d4ffb0e26617 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:36:43 +0700 Subject: [PATCH 15/20] Use & instead of as_str --- src/parser.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/parser.rs b/src/parser.rs index 248baf4a30..228fabf469 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -753,7 +753,7 @@ impl<'run, 'src> Parser<'run, 'src> { return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); } - let codepoint = u32::from_str_radix(hex.as_str(), 16).unwrap(); + let codepoint = u32::from_str_radix(&hex, 16).unwrap(); cooked.push(char::from_u32(codepoint).ok_or_else(|| { token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) From ae8b77912b40518d3e3974bf0fb11bee568f0e04 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:38:58 +0700 Subject: [PATCH 16/20] Use pattern matching instead of contains --- src/parser.rs | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 228fabf469..816619de68 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -747,8 +747,8 @@ impl<'run, 'src> Parser<'run, 'src> { return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character: c })); } } - State::UnicodeValue { ref mut hex } => { - if c == '}' { + State::UnicodeValue { ref mut hex } => match c { + '}' => { if hex.is_empty() { return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); } @@ -760,15 +760,17 @@ impl<'run, 'src> Parser<'run, 'src> { })?); state = State::Initial; - } else if "0123456789ABCDEFabcdef".contains(c) { + } + '0'..='9' | 'A'..='F' | 'a'..='f' => { hex.push(c); if hex.len() > 6 { return Err(token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() })); } - } else { + } + _ => { return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c })); } - } + }, } } From 2ea4f84dd5835c818b7b52c507606faf6bbe23e1 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:51:54 +0700 Subject: [PATCH 17/20] Tweak error messages and test names --- src/compile_error.rs | 20 +++++++++----------- src/compile_error_kind.rs | 2 +- src/parser.rs | 4 ++-- tests/string.rs | 28 ++++++++++++++-------------- 4 files changed, 26 insertions(+), 28 deletions(-) diff --git a/src/compile_error.rs b/src/compile_error.rs index aac219acf8..94df9e24c9 100644 --- a/src/compile_error.rs +++ b/src/compile_error.rs @@ -260,25 +260,24 @@ impl Display for CompileError<'_> { found, } => write!(f, "Expected {}, but found {found}", List::or(expected)), UnicodeEscapeCharacter { character } => { - write!(f, "expected hex digit (0-9A-Fa-f), found `{character}`") + write!(f, "expected hex digit [0-9A-Fa-f] but found `{character}`") } - UnicodeEscapeDelimiter { character } => write!(f, "expected `{{` but found `{character}`"), - UnicodeEscapeEmpty => write!(f, "expected hex digit (0-9A-Fa-f) but found `}}`"), + UnicodeEscapeDelimiter { character } => write!( + f, + "expected unicode escape sequence delimiter `{{` but found `{character}`" + ), + UnicodeEscapeEmpty => write!(f, "unicode escape sequences must not be empty"), UnicodeEscapeLength { hex } => write!( f, - "more than 6 hex digits in escape sequence starting with `\\u{{{hex}`" + "unicode escape sequence starting with `\\u{{{hex}` longer than six hex digits" ), UnicodeEscapeRange { hex } => { write!( f, - "`{hex}` does not represent a valid character{}", - if u32::from_str_radix(hex, 16).unwrap() > 1_114_111 { - ": maximum valid code point is 10FFFF" - } else { - "" - } + "unicode escape sequence value `{hex}` greater than maximum valid code point `10FFFF`", ) } + UnicodeEscapeUnterminated => write!(f, "unterminated unicode escape sequence"), UnknownAliasTarget { alias, target } => { write!(f, "Alias `{alias}` has an unknown target `{target}`") } @@ -291,7 +290,6 @@ impl Display for CompileError<'_> { UnknownStartOfToken => write!(f, "Unknown start of token:"), UnpairedCarriageReturn => write!(f, "Unpaired carriage return"), UnterminatedBacktick => write!(f, "Unterminated backtick"), - UnterminatedEscapeSequence => write!(f, "Unterminated escape sequence"), UnterminatedInterpolation => write!(f, "Unterminated interpolation"), UnterminatedString => write!(f, "Unterminated string"), } diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs index 4f39e43006..c99cabac1b 100644 --- a/src/compile_error_kind.rs +++ b/src/compile_error_kind.rs @@ -133,6 +133,7 @@ pub(crate) enum CompileErrorKind<'src> { UnicodeEscapeRange { hex: String, }, + UnicodeEscapeUnterminated, UnknownAliasTarget { alias: &'src str, target: &'src str, @@ -153,7 +154,6 @@ pub(crate) enum CompileErrorKind<'src> { UnknownStartOfToken, UnpairedCarriageReturn, UnterminatedBacktick, - UnterminatedEscapeSequence, UnterminatedInterpolation, UnterminatedString, } diff --git a/src/parser.rs b/src/parser.rs index 816619de68..825b4f06fa 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -753,7 +753,7 @@ impl<'run, 'src> Parser<'run, 'src> { return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty)); } - let codepoint = u32::from_str_radix(&hex, 16).unwrap(); + let codepoint = u32::from_str_radix(hex, 16).unwrap(); cooked.push(char::from_u32(codepoint).ok_or_else(|| { token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() }) @@ -775,7 +775,7 @@ impl<'run, 'src> Parser<'run, 'src> { } if state != State::Initial { - return Err(token.error(CompileErrorKind::UnterminatedEscapeSequence)); + return Err(token.error(CompileErrorKind::UnicodeEscapeUnterminated)); } Ok(cooked) diff --git a/tests/string.rs b/tests/string.rs index 3849a3b8e7..b9782ae520 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -402,14 +402,14 @@ fn valid_unicode_escape() { } #[test] -fn u_escape_no_braces() { +fn unicode_escape_no_braces() { Test::new() .justfile("x := \"\\u1234\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: expected `{` but found `1` +error: expected unicode escape sequence delimiter `{` but found `1` ——▶ justfile:1:6 │ 1 │ x := "\u1234" @@ -420,14 +420,14 @@ error: expected `{` but found `1` } #[test] -fn u_escape_empty() { +fn unicode_escape_empty() { Test::new() .justfile("x := \"\\u{}\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: expected hex digit (0-9A-Fa-f) but found `}` +error: unicode escape sequences must not be empty ——▶ justfile:1:6 │ 1 │ x := "\u{}" @@ -438,14 +438,14 @@ error: expected hex digit (0-9A-Fa-f) but found `}` } #[test] -fn u_escape_requires_immediate_opening_brace() { +fn unicode_escape_requires_immediate_opening_brace() { Test::new() .justfile("x := \"\\u {1f916}\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: expected `{` but found ` ` +error: expected unicode escape sequence delimiter `{` but found ` ` ——▶ justfile:1:6 │ 1 │ x := "\u {1f916}" @@ -456,14 +456,14 @@ error: expected `{` but found ` ` } #[test] -fn u_escape_non_hex() { +fn unicode_escape_non_hex() { Test::new() .justfile("x := \"\\u{foo}\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: expected hex digit (0-9A-Fa-f), found `o` +error: expected hex digit [0-9A-Fa-f] but found `o` ——▶ justfile:1:6 │ 1 │ x := "\u{foo}" @@ -474,14 +474,14 @@ error: expected hex digit (0-9A-Fa-f), found `o` } #[test] -fn u_escape_invalid_character() { +fn unicode_escape_invalid_character() { Test::new() .justfile("x := \"\\u{BadBad}\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: `BadBad` does not represent a valid character: maximum valid code point is 10FFFF +error: unicode escape sequence value `BadBad` greater than maximum valid code point `10FFFF` ——▶ justfile:1:6 │ 1 │ x := "\u{BadBad}" @@ -492,14 +492,14 @@ error: `BadBad` does not represent a valid character: maximum valid code point i } #[test] -fn u_escape_too_long() { +fn unicode_escape_too_long() { Test::new() .justfile("x := \"\\u{FFFFFFFFFF}\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: more than 6 hex digits in escape sequence starting with `\u{FFFFFFF` +error: unicode escape sequence starting with `\u{FFFFFFF` longer than six hex digits ——▶ justfile:1:6 │ 1 │ x := "\u{FFFFFFFFFF}" @@ -510,14 +510,14 @@ error: more than 6 hex digits in escape sequence starting with `\u{FFFFFFF` } #[test] -fn u_escape_unterminated() { +fn unicode_escape_unterminated() { Test::new() .justfile("x := \"\\u{1f917\"") .args(["--evaluate", "x"]) .status(1) .stderr( r#" -error: Unterminated escape sequence +error: unterminated unicode escape sequence ——▶ justfile:1:6 │ 1 │ x := "\u{1f917" From f27ab6d9fde85176f2b7b9df16248287b8af7fa4 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:54:28 +0700 Subject: [PATCH 18/20] Reform --- src/parser.rs | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/parser.rs b/src/parser.rs index 825b4f06fa..da2ff76f3f 100644 --- a/src/parser.rs +++ b/src/parser.rs @@ -740,13 +740,14 @@ impl<'run, 'src> Parser<'run, 'src> { } state = State::Initial; } - State::Unicode => { - if c == '{' { + State::Unicode => match c { + '{' => { state = State::UnicodeValue { hex: String::new() }; - } else { - return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character: c })); } - } + character => { + return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character })); + } + }, State::UnicodeValue { ref mut hex } => match c { '}' => { if hex.is_empty() { From c568fd0c8900166ff96e7298b7f6e637d4288023 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:55:36 +0700 Subject: [PATCH 19/20] Test maximum valid char --- tests/string.rs | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/tests/string.rs b/tests/string.rs index b9782ae520..457e6779ea 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -401,6 +401,15 @@ fn valid_unicode_escape() { .run(); } +#[test] +fn maximum_valid_unicode_escape() { + Test::new() + .justfile(r#"x := "\u{10FFFF}""#) + .args(["--evaluate", "x"]) + .stdout("\u{10FFFF}") + .run(); +} + #[test] fn unicode_escape_no_braces() { Test::new() From 8b714b81d7c3f0ae4c9c695a8821b5348d6c9b03 Mon Sep 17 00:00:00 2001 From: Casey Rodarmor Date: Sun, 15 Sep 2024 16:57:36 +0700 Subject: [PATCH 20/20] Test all hex digits --- tests/string.rs | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/tests/string.rs b/tests/string.rs index 457e6779ea..12bcbac0f7 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -395,12 +395,21 @@ test! { #[test] fn valid_unicode_escape() { Test::new() - .justfile("x := \"\\u{1f916}\\u{1F916}\"") + .justfile(r#"x := "\u{1f916}\u{1F916}""#) .args(["--evaluate", "x"]) .stdout("🤖🤖") .run(); } +#[test] +fn unicode_escapes_with_all_hex_digits() { + Test::new() + .justfile(r#"x := "\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}""#) + .args(["--evaluate", "x"]) + .stdout("\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}") + .run(); +} + #[test] fn maximum_valid_unicode_escape() { Test::new()