From 62d98065f6c4887ddb4f2222fd762c1f0a65a325 Mon Sep 17 00:00:00 2001 From: Logan Hodgson Date: Mon, 26 Aug 2024 18:58:07 -0700 Subject: [PATCH 1/3] Added raw string literal support and 2 tests --- src/tokenizer.rs | 152 ++++++++++++++++++++++++++++++++++++++++++++++- tests/string.rs | 3 + 2 files changed, 153 insertions(+), 2 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index c8c627316..a969dd3fe 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -10,7 +10,7 @@ use std::prelude::v1::*; use std::{ cell::RefCell, char, fmt, - iter::{FusedIterator, Peekable}, + iter::{repeat, FusedIterator, Peekable}, rc::Rc, str::{Chars, FromStr}, }; @@ -1177,6 +1177,146 @@ pub trait InputStream { } } +/// _(internals)_ Parse a raw string literal. Raw string literals do not process any escapes. +/// Raw string literals do not process any escapes. They start with the character +/// `U+0072` (`r`), followed by fewer than 256 of the character `U+0023` (`#`) and a +/// `U+0022` (double-quote) character. +/// +/// The _raw string body_ can contain any sequence of Unicode characters other than `U+000D` (CR). +/// It is terminated only by another `U+0022` (double-quote) character, followed by the same number of `U+0023` (`#`) characters that preceded the opening `U+0022` (double-quote) character. +/// +/// All Unicode characters contained in the raw string body represent themselves, +/// the characters `U+0022` (double-quote) (except when followed by at least as +/// many `U+0023` (`#`) characters as were used to start the raw string literal) or +/// `U+005C` (`\`) do not have any special meaning. +/// +/// Returns the parsed string. 
+/// +/// # Returns +/// +/// | Type | Return Value | +/// |---------------------------|:-----------------------------------:| +/// |`r"hello"` |`StringConstant("hello")` | +/// |`r"hello`_{EOF}_ |`LexError` | +/// |`r#" "hello" "`_{EOF}_ |`LexError` | +/// |`r#""hello""#` |`StringConstant(""hello"")` | +/// |`r##"hello #"# world"##` |`StringConstant("hello #\"# world")` | +/// |`r"R"` |`StringConstant("R")` | +/// |`r"\x52"` |`StringConstant("\x52")` | +/// +/// This function does not throw a `LexError` for the following conditions: +/// +/// * Unterminated literal string at _{EOF}_ +/// +/// * Unterminated normal string with continuation at _{EOF}_ +/// +/// This is to facilitate using this function to parse a script line-by-line, where the end of the +/// line (i.e. _{EOF}_) is not necessarily the end of the script. +/// +/// Any time a [`StringConstant`][`Token::StringConstant`] is returned with +/// `state.is_within_text_terminated_by` set to `Some(_)` is one of the above conditions. 
+pub fn parse_raw_string_literal( + stream: &mut (impl InputStream + ?Sized), + state: &mut TokenizeState, + pos: &mut Position, +) -> Result<(SmartString, Position), (LexError, Position)> { + let start = *pos; + let mut first_char = Position::NONE; + + // Count the number of '#'s + let mut hash_count = 0; + while let Some('#') = stream.peek_next() { + stream.eat_next_and_advance(pos); + hash_count += 1; + } + + // Match '"' + match stream.get_next() { + Some('"') => pos.advance(), + Some(c) => return Err((LERR::UnexpectedInput(c.to_string()), start)), + None => { + return Err((LERR::UnterminatedString, start)); + } + } + + let mut seen_hashes: Option = None; + // Match everything until the same number of '#'s are seen, prepended by a '"' + let mut result = SmartString::new_const(); + + loop { + let next_char = match stream.get_next() { + Some(ch) => { + pos.advance(); + ch + } + None => { + pos.advance(); + return Err((LERR::UnterminatedString, start)); + } + }; + pos.advance(); + + match (next_char, &mut seen_hashes) { + // Begin attempt to close string + ('"', None) => { + if hash_count == 0 { + return Ok((result, first_char)); + } else { + seen_hashes = Some(0); + } + } + // Restart attempt to close string + ('"', Some(count)) => { + if hash_count == 0 { + return Ok((result, first_char)); + } else { + // result.reserve(*count as usize+c.len()); + result.push('"'); + result.extend(repeat('#').take(*count as usize)); + seen_hashes = Some(0); + } + } + // Continue attempt to close string + ('#', Some(count)) => { + *count += 1; + if *count == hash_count { + return Ok((result, first_char)); + } + } + // Fail to close the string - add previous quote and hashes + (c, Some(count)) => { + // result.reserve(*count as usize +1+c.len()); + result.push('"'); + result.extend(repeat('#').take(*count as usize)); + result.push(c); + seen_hashes = None; + } + // Normal new character seen + (c, None) => { + result.push(c); + } + } + + if next_char == '\n' { + pos.new_line(); + } 
else { + pos.advance(); + } + + // Check string length + #[cfg(not(feature = "unchecked"))] + if let Some(max) = state.max_string_len { + if result.len() > max.get() { + return Err((LexError::StringTooLong(max.get()), start)); + } + } + + if first_char.is_none() { + first_char = *pos; + } + } +} + /// _(internals)_ Parse a string literal ended by a specified termination character. /// Exported under the `internals` feature only. /// @@ -1194,7 +1334,7 @@ pub trait InputStream { /// |`` `hello``_{LF}{EOF}_ |`StringConstant("hello\n")` |``Some('`')`` | /// |`` `hello ${`` |`InterpolatedString("hello ")`
next token is `{`|`None` | /// |`` } hello` `` |`StringConstant(" hello")` |`None` | -/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | +/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | | /// /// This function does not throw a `LexError` for the following conditions: /// @@ -1795,6 +1935,14 @@ fn get_next_token_inner( ); } + // r - raw string literal + ('r', '"' | '#') => { + return parse_raw_string_literal(stream, state, pos).map_or_else( + |(err, err_pos)| (Token::LexError(err.into()), err_pos), + |(result, ..)| (Token::StringConstant(result.into()), start_pos), + ); + } + // ' - character literal ('\'', '\'') => { return ( diff --git a/tests/string.rs b/tests/string.rs index b1addd725..6547c5e0a 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -17,6 +17,9 @@ fn test_string() { assert_eq!(engine.eval::(" `\r\nTest string: \\u2764\nhello,\\nworld!`").unwrap(), "Test string: \\u2764\nhello,\\nworld!"); assert_eq!(engine.eval::(r#""Test string: \x58""#).unwrap(), "Test string: X"); assert_eq!(engine.eval::(r#""\"hello\"""#).unwrap(), r#""hello""#); + assert_eq!(engine.eval::(r#"r"Test""#).unwrap(), "Test"); + assert_eq!(engine.eval::(r##"r"Test string: \\u2764\nhello,\nworld!""##).unwrap(), r"Test string: \\u2764\nhello,\nworld!"); + assert_eq!(engine.eval::(r###"r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##"###).unwrap(), r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##); assert_eq!(engine.eval::(r#""foo" + "bar""#).unwrap(), "foobar"); From 758b1e64f592eb629253c4a09f12c8b0c35fd27a Mon Sep 17 00:00:00 2001 From: Logan Hodgson <9051940+cellomath@users.noreply.github.com> Date: Mon, 26 Aug 2024 19:02:33 -0700 Subject: [PATCH 2/3] Update raw string literal tokenizer.rs docs --- src/tokenizer.rs | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index a969dd3fe..68e3ad4cb 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1199,22 +1199,12 @@ pub trait 
InputStream { /// |`r"hello"` |`StringConstant("hello")` | /// |`r"hello`_{EOF}_ |`LexError` | /// |`r#" "hello" "`_{EOF}_ |`LexError` | -/// |`r#""hello""#` |`StringConstant(""hello"")` | +/// |`r#""hello""#` |`StringConstant("\"hello\"")` | /// |`r##"hello #"# world"##` |`StringConstant("hello #\"# world")` | /// |`r"R"` |`StringConstant("R")` | -/// |`r"\x52"` |`StringConstant("\x52")` | +/// |`r"\x52"` |`StringConstant("\\x52")` | /// -/// This function does not throw a `LexError` for the following conditions: -/// -/// * Unterminated literal string at _{EOF}_ -/// -/// * Unterminated normal string with continuation at _{EOF}_ -/// -/// This is to facilitate using this function to parse a script line-by-line, where the end of the -/// line (i.e. _{EOF}_) is not necessarily the end of the script. -/// -/// Any time a [`StringConstant`][`Token::StringConstant`] is returned with -/// `state.is_within_text_terminated_by` set to `Some(_)` is one of the above conditions. +/// This function throws a `LexError` for an unterminated literal string at _{EOF}_. pub fn parse_raw_string_literal( stream: &mut (impl InputStream + ?Sized), state: &mut TokenizeState, From 72c84fdfa2a557638b870100dce9281fb87da28f Mon Sep 17 00:00:00 2001 From: Logan Hodgson Date: Tue, 27 Aug 2024 09:17:23 -0700 Subject: [PATCH 3/3] Added more tests. Added comments. 
--- src/tokenizer.rs | 27 ++++++++++----------------- tests/string.rs | 24 ++++++++++++++++++++++-- 2 files changed, 32 insertions(+), 19 deletions(-) diff --git a/src/tokenizer.rs b/src/tokenizer.rs index 68e3ad4cb..cc9407a1b 100644 --- a/src/tokenizer.rs +++ b/src/tokenizer.rs @@ -1224,27 +1224,22 @@ pub fn parse_raw_string_literal( match stream.get_next() { Some('"') => pos.advance(), Some(c) => return Err((LERR::UnexpectedInput(c.to_string()), start)), - None => { - return Err((LERR::UnterminatedString, start)); - } + None => return Err((LERR::UnterminatedString, start)) } - let mut seen_hashes: Option = None; // Match everything until the same number of '#'s are seen, prepended by a '"' + + // Counts the number of '#' characters seen after a quotation mark. + // Becomes Some(0) after a quote is seen, but resets to None if a hash doesn't follow. + let mut seen_hashes: Option = None; let mut result = SmartString::new_const(); + loop { let next_char = match stream.get_next() { - Some(ch) => { - pos.advance(); - ch - } - None => { - pos.advance(); - return Err((LERR::UnterminatedString, start)); - } + Some(ch) => ch, + None => return Err((LERR::UnterminatedString, start)) }; - pos.advance(); match (next_char, &mut seen_hashes) { // Begin attempt to close string @@ -1282,9 +1277,7 @@ pub fn parse_raw_string_literal( seen_hashes = None; } // Normal new character seen - (c, None) => { - result.push(c); - } + (c, None) => result.push(c) } if next_char == '\n' { @@ -1324,7 +1317,7 @@ pub fn parse_raw_string_literal( /// |`` `hello``_{LF}{EOF}_ |`StringConstant("hello\n")` |``Some('`')`` | /// |`` `hello ${`` |`InterpolatedString("hello ")`
next token is `{`|`None` | /// |`` } hello` `` |`StringConstant(" hello")` |`None` | -/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | | +/// |`} hello`_{EOF}_ |`StringConstant(" hello")` |``Some('`')`` | /// /// This function does not throw a `LexError` for the following conditions: /// diff --git a/tests/string.rs b/tests/string.rs index 6547c5e0a..69ae0e5d3 100644 --- a/tests/string.rs +++ b/tests/string.rs @@ -1,4 +1,4 @@ -use rhai::{Engine, EvalAltResult, ImmutableString, Scope, INT}; +use rhai::{Engine, EvalAltResult, ImmutableString, LexError, ParseErrorType, Position, Scope, INT}; #[test] fn test_string() { @@ -18,8 +18,28 @@ fn test_string() { assert_eq!(engine.eval::(r#""Test string: \x58""#).unwrap(), "Test string: X"); assert_eq!(engine.eval::(r#""\"hello\"""#).unwrap(), r#""hello""#); assert_eq!(engine.eval::(r#"r"Test""#).unwrap(), "Test"); - assert_eq!(engine.eval::(r##"r"Test string: \\u2764\nhello,\nworld!""##).unwrap(), r"Test string: \\u2764\nhello,\nworld!"); + assert_eq!(engine.eval::(r#"r"Test string: \\u2764\nhello,\nworld!""#).unwrap(), r#"Test string: \\u2764\nhello,\nworld!"#); assert_eq!(engine.eval::(r###"r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##"###).unwrap(), r##"Test string: r#"\\u2764\nhello,\\nworld!"#"##); + assert_eq!(engine.eval::(r###"r##"Test string: "## + "\u2764""###).unwrap(), "Test string: ❤"); + let bad_result = *engine.eval::(r###"r#"Test string: \"##"###).unwrap_err(); + if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result { + assert_eq!(parse_error, ParseErrorType::UnknownOperator("#".to_string())); + assert_eq!(pos, Position::new(1, 19)); + } else { + panic!("Wrong error type: {}", bad_result); + } + let bad_result = *engine + .eval::( + r###"r##"Test string: + \"#"###, + ) + .unwrap_err(); + if let EvalAltResult::ErrorParsing(parse_error, pos) = bad_result { + assert_eq!(parse_error, ParseErrorType::BadInput(LexError::UnterminatedString)); + assert_eq!(pos, Position::new(1, 1)); 
+ } else {
+ panic!("Wrong error type: {}", bad_result);
+ }
 assert_eq!(engine.eval::<String>(r#""foo" + "bar""#).unwrap(), "foobar");