From 62d98065f6c4887ddb4f2222fd762c1f0a65a325 Mon Sep 17 00:00:00 2001 From: Logan Hodgson Date: Mon, 26 Aug 2024 18:58:07 -0700 Subject: [PATCH 1/3] Added raw string literal support and 2 tests --- src/tokenizer.rs | 152 ++++++++++++++++++++++++++++++++++++++++++++++- tests/string.rs | 3 + 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c8c627316..a969dd3fe 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -10,7 +10,7 @@ use std::prelude::v1::*; use std::{ cell::RefCell, char, fmt, - iter::{FusedIterator, Peekable}, + iter::{repeat, FusedIterator, Peekable}, rc::Rc, str::{Chars, FromStr}, }; @@ -1177,6 +1177,146 @@ pub trait InputStream { } } +/// _(internals)_ Parse a raw string literal. Raw string literals do not process any escapes. +/// Raw string literals do not process any escapes. They start with the character +/// `U+0072` (`r`), followed by fewer than 256 of the character `U+0023` (`#`) and a +/// `U+0022` (double-quote) character. +/// +/// The _raw string body_ can contain any sequence of Unicode characters other than `U+000D` (CR). +/// It is terminated only by another `U+0022` (double-quote) character, followed by the same number of `U+0023` (`#`) characters that preceded the opening `U+0022` (double-quote) character. +/// +/// All Unicode characters contained in the raw string body represent themselves, +/// the characters `U+0022` (double-quote) (except when followed by at least as +/// many `U+0023` (`#`) characters as were used to start the raw string literal) or +/// `U+005C` (`\`) do not have any special meaning. +/// +/// Returns the parsed string. 
+/// +/// # Returns +/// +/// | Type | Return Value | +/// |---------------------------|:-----------------------------------:| +/// |`r"hello"` |`StringConstant("hello")` | +/// |`r"hello`_{EOF}_ |`LexError` | +/// |`r#" "hello" "`_{EOF}_ |`LexError` | +/// |`r#""hello""#` |`StringConstant(""hello"")` | +/// |`r##"hello #"# world"##` |`StringConstant("hello #\"# world")` | +/// |`r"R"` |`StringConstant("R")` | +/// |`r"\x52"` |`StringConstant("\x52")` | +/// +/// This function does not throw a `LexError` for the following conditions: +/// +/// * Unterminated literal string at _{EOF}_ +/// +/// * Unterminated normal string with continuation at _{EOF}_ +/// +/// This is to facilitate using this function to parse a script line-by-line, where the end of the +/// line (i.e. _{EOF}_) is not necessarily the end of the script. +/// +/// Any time a [`StringConstant`][`Token::StringConstant`] is returned with +/// `state.is_within_text_terminated_by` set to `Some(_)` is one of the above conditions. 
+pub fn parse_raw_string_literal( + stream: &mut (impl InputStream + ?Sized), + state: &mut TokenizeState, + pos: &mut Position, +) -> Result<(SmartString, Position), (LexError, Position)> { + let start = *pos; + let mut first_char = Position::NONE; + + // Count the number of '#'s + let mut hash_count = 0; + while let Some('#') = stream.peek_next() { + stream.eat_next_and_advance(pos); + hash_count += 1; + } + + // Match '"' + match stream.get_next() { + Some('"') => pos.advance(), + Some(c) => return Err((LERR::UnexpectedInput(c.to_string()), start)), + None => { + return Err((LERR::UnterminatedString, start)); + } + } + + let mut seen_hashes: Option = None; + // Match everything until the same number of '#'s are seen, prepended by a '"' + let mut result = SmartString::new_const(); + + loop { + let next_char = match stream.get_next() { + Some(ch) => { + pos.advance(); + ch + } + None => { + pos.advance(); + return Err((LERR::UnterminatedString, start)); + } + }; + pos.advance(); + + match (next_char, &mut seen_hashes) { + // Begin attempt to close string + ('"', None) => { + if hash_count == 0 { + return Ok((result, first_char)); + } else { + seen_hashes = Some(0); + } + } + // Restart attempt to close string + ('"', Some(count)) => { + if hash_count == 0 { + return Ok((result, first_char)); + } else { + // result.reserve(*count as usize+c.len()); + result.push('"'); + result.extend(repeat('#').take(*count as usize)); + seen_hashes = Some(0); + } + } + // Continue attempt to close string + ('#', Some(count)) => { + *count += 1; + if *count == hash_count { + return Ok((result, first_char)); + } + } + // Fail to close the string - add previous quote and hashes + (c, Some(count)) => { + // result.reserve(*count as usize +1+c.len()); + result.push('"'); + result.extend(repeat('#').take(*count as usize)); + result.push(c); + seen_hashes = None; + } + // Normal new character seen + (c, None) => { + result.push(c); + } + } + + if next_char == '\n' { + pos.new_line(); + } 
else { + pos.advance(); + } + + // Check string length + #[cfg(not(feature = "unchecked"))] + if let Some(max) = state.max_string_len { + if result.len() > max.get() { + return Err((LexError::StringTooLong(max.get()), start)); + } + } + + if first_char.is_none() { + first_char = *pos; + } + } +} + /// _(internals)_ Parse a string literal ended by a specified termination character. /// Exported under the `internals` feature only. /// @@ -1194,7 +1334,7 @@ pub trait InputStream { /// |`` `hello``_{LF}{EOF}_ |`StringConstant("hello\n")` |``Some('`')`` | /// |`` `hello ${`` |`InterpolatedString("hello ")`
next token is `{`|`None` | /// |`` } hello` `` |`StringConstant(" hello")` |`None` | -/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | +/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | | /// /// This function does not throw a `LexError` for the following conditions: /// @@ -1795,6 +1935,14 @@ fn get_next_token_inner( ); } + // r - raw string literal + ('r', '"' | '#') => { + return parse_raw_string_literal(stream, state, pos).map_or_else( + |(err, err_pos)| (Token::LexError(err.into()), err_pos), + |(result, ..)| (Token::StringConstant(result.into()), start_pos), + ); + } + // ' - character literal ('\'', '\'') => { return ( diff --git a/tests/string.rs b/tests/string.rs index b1addd725..6547c5e0a 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -17,6 +17,9 @@ fn test_string() { assert_eq!(engine.eval::(" `\r\nTest string: \\u2764\nhello,\\nworld!`").unwrap(), "Test string: \\u2764\nhello,\\nworld!"); assert_eq!(engine.eval::(r#""Test string: \x58""#).unwrap(), "Test string: X"); assert_eq!(engine.eval::(r#""\"hello\"""#).unwrap(), r#""hello""#); + assert_eq!(engine.eval::(r#"r"Test""#).unwrap(), "Test"); + assert_eq!(engine.eval::(r##"r"Test string: \\u2764\nhello,\nworld!""##).unwrap(), r"Test string: \\u2764\nhello,\nworld!"); + assert_eq!(engine.eval::(r###"r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##"###).unwrap(), r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##); assert_eq!(engine.eval::(r#""foo" + "bar""#).unwrap(), "foobar"); From 758b1e64f592eb629253c4a09f12c8b0c35fd27a Mon Sep 17 00:00:00 2001 From: Logan Hodgson <9051940+cellomath@users.noreply.github.com> Date: Mon, 26 Aug 2024 19:02:33 -0700 Subject: [PATCH 2/3] Update raw string literal tokenizer.rs docs --- src/tokenizer.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a969dd3fe..68e3ad4cb 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1199,22 +1199,12 @@ pub trait 
InputStream { /// |`r"hello"` |`StringConstant("hello")` | /// |`r"hello`_{EOF}_ |`LexError` | /// |`r#" "hello" "`_{EOF}_ |`LexError` | -/// |`r#""hello""#` |`StringConstant(""hello"")` | +/// |`r#""hello""#` |`StringConstant("\"hello\"")` | /// |`r##"hello #"# world"##` |`StringConstant("hello #\"# world")` | /// |`r"R"` |`StringConstant("R")` | -/// |`r"\x52"` |`StringConstant("\x52")` | +/// |`r"\x52"` |`StringConstant("\\x52")` | /// -/// This function does not throw a `LexError` for the following conditions: -/// -/// * Unterminated literal string at _{EOF}_ -/// -/// * Unterminated normal string with continuation at _{EOF}_ -/// -/// This is to facilitate using this function to parse a script line-by-line, where the end of the -/// line (i.e. _{EOF}_) is not necessarily the end of the script. -/// -/// Any time a [`StringConstant`][`Token::StringConstant`] is returned with -/// `state.is_within_text_terminated_by` set to `Some(_)` is one of the above conditions. +/// This function throws a `LexError` for an unterminated literal string at _{EOF}_. pub fn parse_raw_string_literal( stream: &mut (impl InputStream + ?Sized), state: &mut TokenizeState, From 72c84fdfa2a557638b870100dce9281fb87da28f Mon Sep 17 00:00:00 2001 From: Logan Hodgson Date: Tue, 27 Aug 2024 09:17:23 -0700 Subject: [PATCH 3/3] Added more tests. Added comments. 
--- src/tokenizer.rs | 27 ++++++++++----------------- tests/string.rs | 24 ++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 68e3ad4cb..cc9407a1b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1224,27 +1224,22 @@ pub fn parse_raw_string_literal( match stream.get_next() { Some('"') => pos.advance(), Some(c) => return Err((LERR::UnexpectedInput(c.to_string()), start)), - None => { - return Err((LERR::UnterminatedString, start)); - } + None => return Err((LERR::UnterminatedString, start)) } - let mut seen_hashes: Option = None; // Match everything until the same number of '#'s are seen, prepended by a '"' + + // Counts the number of '#' characters seen after a quotation mark. + // Becomes Some(0) after a quote is seen, but resets to None if a hash doesn't follow. + let mut seen_hashes: Option = None; let mut result = SmartString::new_const(); + loop { let next_char = match stream.get_next() { - Some(ch) => { - pos.advance(); - ch - } - None => { - pos.advance(); - return Err((LERR::UnterminatedString, start)); - } + Some(ch) => ch, + None => return Err((LERR::UnterminatedString, start)) }; - pos.advance(); match (next_char, &mut seen_hashes) { // Begin attempt to close string @@ -1282,9 +1277,7 @@ pub fn parse_raw_string_literal( seen_hashes = None; } // Normal new character seen - (c, None) => { - result.push(c); - } + (c, None) => result.push(c) } if next_char == '\n' { @@ -1324,7 +1317,7 @@ pub fn parse_raw_string_literal( /// |`` `hello``_{LF}{EOF}_ |`StringConstant("hello\n")` |``Some('`')`` | /// |`` `hello ${`` |`InterpolatedString("hello ")`
next token is `{`|`None` | /// |`` } hello` `` |`StringConstant(" hello")` |`None` | -/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | | +/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | /// /// This function does not throw a `LexError` for the following conditions: /// diff --git a/tests/string.rs b/tests/string.rs index 6547c5e0a..69ae0e5d3 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -1,4 +1,4 @@ -use rhai::{Engine, EvalAltResult, ImmutableString, Scope, INT}; +use rhai::{Engine, EvalAltResult, ImmutableString, LexError, ParseErrorType, Position, Scope, INT}; #[test] fn test_string() { @@ -18,8 +18,28 @@ fn test_string() { assert_eq!(engine.eval::(r#""Test string: \x58""#).unwrap(), "Test string: X"); assert_eq!(engine.eval::(r#""\"hello\"""#).unwrap(), r#""hello""#); assert_eq!(engine.eval::(r#"r"Test""#).unwrap(), "Test"); - assert_eq!(engine.eval::(r##"r"Test string: \\u2764\nhello,\nworld!""##).unwrap(), r"Test string: \\u2764\nhello,\nworld!"); + assert_eq!(engine.eval::(r#"r"Test string: \\u2764\nhello,\nworld!""#).unwrap(), r#"Test string: \\u2764\nhello,\nworld!"#); assert_eq!(engine.eval::(r###"r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##"###).unwrap(), r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##); + assert_eq!(engine.eval::(r###"r##"Test string: "## + "\u2764""###).unwrap(), "Test string: ❤"); + let bad_result = *engine.eval::(r###"r#"Test string: \"##"###).unwrap_err(); + if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result { + assert_eq!(parse_error, ParseErrorType::UnknownOperator("#".to_string())); + assert_eq!(pos, Position::new(1, 19)); + } else { + panic!("Wrong error type: {}", bad_result); + } + let bad_result = *engine + .eval::( + r###"r##"Test string: + \"#"###, + ) + .unwrap_err(); + if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result { + assert_eq!(parse_error, ParseErrorType::BadInput(LexError::UnterminatedString)); + assert_eq!(pos, Position::new(1, 1)); 
+ } else {
+ panic!("Wrong error type: {}", bad_result);
+ }
 assert_eq!(engine.eval::<String>(r#""foo" + "bar""#).unwrap(), "foobar");