From d2c66e9fa9a0b35f9065644e24d80568744a06ce Mon Sep 17 00:00:00 2001
From: laniakea64 <laniakea64@users.noreply.github.com>
Date: Sun, 15 Sep 2024 05:59:52 -0400
Subject: [PATCH] =?UTF-8?q?Allow=20including=20arbitrary=20characters=20in?=
 =?UTF-8?q?=20strings=20with=20`\u{=E2=80=A6}`=20(#2360)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 src/compile_error.rs      |  19 +++++
 src/compile_error_kind.rs |  14 ++++
 src/parser.rs             | 108 ++++++++++++++++++++-------
 tests/string.rs           | 153 ++++++++++++++++++++++++++++++++++++++
 4 files changed, 269 insertions(+), 25 deletions(-)
diff --git a/src/compile_error.rs b/src/compile_error.rs
index b2396219b8..94df9e24c9 100644
--- a/src/compile_error.rs
+++ b/src/compile_error.rs
@@ -259,6 +259,25 @@ impl Display for CompileError<'_> {
         ref expected,
         found,
       } => write!(f, "Expected {}, but found {found}", List::or(expected)),
+      UnicodeEscapeCharacter { character } => {
+        write!(f, "expected hex digit [0-9A-Fa-f] but found `{character}`")
+      }
+      UnicodeEscapeDelimiter { character } => write!(
+        f,
+        "expected unicode escape sequence delimiter `{{` but found `{character}`"
+      ),
+      UnicodeEscapeEmpty => write!(f, "unicode escape sequences must not be empty"),
+      UnicodeEscapeLength { hex } => write!(
+        f,
+        "unicode escape sequence starting with `\\u{{{hex}` longer than six hex digits"
+      ),
+      UnicodeEscapeRange { hex } => {
+        write!(
+          f,
+          "unicode escape sequence value `{hex}` greater than maximum valid code point `10FFFF`",
+        )
+      }
+      UnicodeEscapeUnterminated => write!(f, "unterminated unicode escape sequence"),
       UnknownAliasTarget { alias, target } => {
         write!(f, "Alias `{alias}` has an unknown target `{target}`")
       }
diff --git a/src/compile_error_kind.rs b/src/compile_error_kind.rs
index 73c3960e26..c99cabac1b 100644
--- a/src/compile_error_kind.rs
+++ b/src/compile_error_kind.rs
@@ -120,6 +120,20 @@ pub(crate) enum CompileErrorKind<'src> {
     expected: Vec<TokenKind>,
     found: TokenKind,
   },
+  UnicodeEscapeCharacter {
+    character: char,
+  },
+  UnicodeEscapeDelimiter {
+    character: char,
+  },
+  UnicodeEscapeEmpty,
+  UnicodeEscapeLength {
+    hex: String,
+  },
+  UnicodeEscapeRange {
+    hex: String,
+  },
+  UnicodeEscapeUnterminated,
   UnknownAliasTarget {
     alias: &'src str,
     target: &'src str,
diff --git a/src/parser.rs b/src/parser.rs
index b896cd0b13..da2ff76f3f 100644
--- a/src/parser.rs
+++ b/src/parser.rs
@@ -676,31 +676,7 @@ impl<'run, 'src> Parser<'run, 'src> {
     };
 
     let cooked = if kind.processes_escape_sequences() {
-      let mut cooked = String::new();
-      let mut escape = false;
-      for c in unindented.chars() {
-        if escape {
-          match c {
-            'n' => cooked.push('\n'),
-            'r' => cooked.push('\r'),
-            't' => cooked.push('\t'),
-            '\\' => cooked.push('\\'),
-            '\n' => {}
-            '"' => cooked.push('"'),
-            other => {
-              return Err(
-                token.error(CompileErrorKind::InvalidEscapeSequence { character: other }),
-              );
-            }
-          }
-          escape = false;
-        } else if c == '\\' {
-          escape = true;
-        } else {
-          cooked.push(c);
-        }
-      }
-      cooked
+      Self::cook_string(token, &unindented)?
     } else {
       unindented
     };
@@ -724,6 +700,88 @@ impl<'run, 'src> Parser<'run, 'src> {
     ))
   }
 
+  // Transform escape sequences in string literal `token` with content `text`
+  fn cook_string(token: Token<'src>, text: &str) -> CompileResult<'src, String> {
+    #[derive(PartialEq, Eq)]
+    enum State {
+      Initial, // not inside an escape sequence
+      Backslash, // consumed `\`
+      Unicode, // consumed `\u`, expecting `{`
+      UnicodeValue { hex: String }, // inside `\u{`; `hex` holds the digits read so far
+    }
+
+    let mut cooked = String::new();
+
+    let mut state = State::Initial;
+
+    for c in text.chars() {
+      match state {
+        State::Initial => {
+          if c == '\\' {
+            state = State::Backslash;
+          } else {
+            cooked.push(c);
+          }
+        }
+        State::Backslash if c == 'u' => {
+          state = State::Unicode;
+        }
+        State::Backslash => {
+          match c {
+            'n' => cooked.push('\n'),
+            'r' => cooked.push('\r'),
+            't' => cooked.push('\t'),
+            '\\' => cooked.push('\\'),
+            '\n' => {} // backslash-newline contributes nothing to the cooked string
+            '"' => cooked.push('"'),
+            character => {
+              return Err(token.error(CompileErrorKind::InvalidEscapeSequence { character }))
+            }
+          }
+          state = State::Initial;
+        }
+        State::Unicode => match c {
+          '{' => {
+            state = State::UnicodeValue { hex: String::new() };
+          }
+          character => {
+            return Err(token.error(CompileErrorKind::UnicodeEscapeDelimiter { character }));
+          }
+        },
+        State::UnicodeValue { ref mut hex } => match c {
+          '}' => {
+            if hex.is_empty() {
+              return Err(token.error(CompileErrorKind::UnicodeEscapeEmpty));
+            }
+
+            let codepoint = u32::from_str_radix(hex, 16).unwrap(); // can't fail: 1-6 hex digits
+
+            cooked.push(char::from_u32(codepoint).ok_or_else(|| {
+              token.error(CompileErrorKind::UnicodeEscapeRange { hex: hex.clone() })
+            })?); // NOTE(review): from_u32 also rejects surrogates D800-DFFF; same error kind
+
+            state = State::Initial;
+          }
+          '0'..='9' | 'A'..='F' | 'a'..='f' => {
+            hex.push(c);
+            if hex.len() > 6 { // checked after the push, so the reported `hex` has seven digits
+              return Err(token.error(CompileErrorKind::UnicodeEscapeLength { hex: hex.clone() }));
+            }
+          }
+          _ => {
+            return Err(token.error(CompileErrorKind::UnicodeEscapeCharacter { character: c }));
+          }
+        },
+      }
+    }
+
+    if state != State::Initial { // `text` ended in the middle of an escape sequence
+      return Err(token.error(CompileErrorKind::UnicodeEscapeUnterminated));
+    }
+
+    Ok(cooked)
+  }
+
   /// Parse a string literal, e.g. `"FOO"`
   fn parse_string_literal(&mut self) -> CompileResult<'src, StringLiteral<'src>> {
     let (_token, string_literal) = self.parse_string_literal_token()?;
diff --git a/tests/string.rs b/tests/string.rs
index 1803877a31..12bcbac0f7 100644
--- a/tests/string.rs
+++ b/tests/string.rs
@@ -391,3 +391,156 @@ test! {
   ",
   status:   EXIT_FAILURE,
 }
+
+#[test]
+fn valid_unicode_escape() { // lower- and uppercase hex digits both decode to U+1F916
+  Test::new()
+    .justfile(r#"x := "\u{1f916}\u{1F916}""#)
+    .args(["--evaluate", "x"])
+    .stdout("🤖🤖")
+    .run();
+}
+
+#[test]
+fn unicode_escapes_with_all_hex_digits() { // the escapes together use 0-9, a-f and A-F
+  Test::new()
+    .justfile(r#"x := "\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}""#)
+    .args(["--evaluate", "x"])
+    .stdout("\u{012345}\u{6789a}\u{bcdef}\u{ABCDE}\u{F}") // non-raw: Rust decodes these escapes
+    .run();
+}
+
+#[test]
+fn maximum_valid_unicode_escape() { // 10FFFF is the largest value that is accepted
+  Test::new()
+    .justfile(r#"x := "\u{10FFFF}""#)
+    .args(["--evaluate", "x"])
+    .stdout("\u{10FFFF}") // non-raw: Rust decodes this escape
+    .run();
+}
+
+#[test]
+fn unicode_escape_no_braces() { // `\u` followed by a digit instead of `{` is rejected
+  Test::new()
+    .justfile("x := \"\\u1234\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected unicode escape sequence delimiter `{` but found `1`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u1234"
+  │      ^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_empty() { // `\u{}` with no digits between the braces is rejected
+  Test::new()
+    .justfile("x := \"\\u{}\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequences must not be empty
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{}"
+  │      ^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_requires_immediate_opening_brace() { // a space between `\u` and `{` is rejected
+  Test::new()
+    .justfile("x := \"\\u {1f916}\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected unicode escape sequence delimiter `{` but found ` `
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u {1f916}"
+  │      ^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_non_hex() { // `f` is a hex digit, so the first `o` is the character reported
+  Test::new()
+    .justfile("x := \"\\u{foo}\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: expected hex digit [0-9A-Fa-f] but found `o`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{foo}"
+  │      ^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_invalid_character() { // value out of range: `BadBad` is valid hex above 10FFFF
+  Test::new()
+    .justfile("x := \"\\u{BadBad}\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequence value `BadBad` greater than maximum valid code point `10FFFF`
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{BadBad}"
+  │      ^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_too_long() { // the message quotes the first seven of the ten digits
+  Test::new()
+    .justfile("x := \"\\u{FFFFFFFFFF}\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unicode escape sequence starting with `\u{FFFFFFF` longer than six hex digits
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{FFFFFFFFFF}"
+  │      ^^^^^^^^^^^^^^^^
+"#,
+    )
+    .run();
+}
+
+#[test]
+fn unicode_escape_unterminated() { // the string ends before the closing `}` of the escape
+  Test::new()
+    .justfile("x := \"\\u{1f917\"")
+    .args(["--evaluate", "x"])
+    .status(1)
+    .stderr(
+      r#"
+error: unterminated unicode escape sequence
+ ——▶ justfile:1:6
+  │
+1 │ x := "\u{1f917"
+  │      ^^^^^^^^^^
+"#,
+    )
+    .run();
+}