From 314dbc7f2250cc7bece306254f6bfafc96bf913f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Mon, 22 Jan 2024 16:58:39 +1100 Subject: [PATCH 1/7] Avoid useless checking in `from_token_lit`. The parser already does a check-only unescaping which catches all errors. So the checking done in `from_token_lit` never hits. But literals causing warnings can still occur in `from_token_lit`. So the commit changes `str-escape.rs` to use byte string literals and C string literals as well, to give better coverage and ensure the new assertions in `from_token_lit` are correct. --- compiler/rustc_ast/src/util/literal.rs | 83 +++++++------------------- tests/ui/str/str-escape.rs | 9 +-- tests/ui/str/str-escape.stderr | 14 ++--- 3 files changed, 33 insertions(+), 73 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 7b781ba1e1121..852d49fc5b628 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -48,6 +48,9 @@ impl LitKind { return Err(LitError::InvalidSuffix); } + // For byte/char/string literals, chars and escapes have already been + // checked in the lexer (in `cook_lexer_literal`). So we can assume all + // chars and escapes are valid here. Ok(match kind { token::Bool => { assert!(symbol.is_bool_lit()); @@ -56,12 +59,12 @@ impl LitKind { token::Byte => { return unescape_byte(symbol.as_str()) .map(LitKind::Byte) - .map_err(|_| LitError::LexerError); + .map_err(|_| panic!("failed to unescape byte literal")); } token::Char => { return unescape_char(symbol.as_str()) .map(LitKind::Char) - .map_err(|_| LitError::LexerError); + .map_err(|_| panic!("failed to unescape char literal")); } // There are some valid suffixes for integer and float literals, @@ -77,26 +80,22 @@ impl LitKind { let s = symbol.as_str(); // Vanilla strings are so common we optimize for the common case where no chars // requiring special behaviour are present. 
- let symbol = if s.contains(['\\', '\r']) { + let symbol = if s.contains('\\') { let mut buf = String::with_capacity(s.len()); - let mut error = Ok(()); // Force-inlining here is aggressive but the closure is - // called on every char in the string, so it can be - // hot in programs with many long strings. + // called on every char in the string, so it can be hot in + // programs with many long strings containing escapes. unescape_literal( s, Mode::Str, &mut #[inline(always)] - |_, unescaped_char| match unescaped_char { + |_, c| match c { Ok(c) => buf.push(c), Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + assert!(!err.is_fatal(), "failed to unescape string literal") } }, ); - error?; Symbol::intern(&buf) } else { symbol @@ -104,86 +103,46 @@ impl LitKind { LitKind::Str(symbol, ast::StrStyle::Cooked) } token::StrRaw(n) => { - // Raw strings have no escapes, so we only need to check for invalid chars, and we - // can reuse the symbol on success. - let mut error = Ok(()); - unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| { - match unescaped_char { - Ok(_) => {} - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } - } - }); - error?; + // Raw strings have no escapes so no work is needed here. LitKind::Str(symbol, ast::StrStyle::Raw(n)) } token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - let mut error = Ok(()); unescape_literal(s, Mode::ByteStr, &mut |_, c| match c { Ok(c) => buf.push(byte_from_char(c)), Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + assert!(!err.is_fatal(), "failed to unescape string literal") } }); - error?; LitKind::ByteStr(buf.into(), StrStyle::Cooked) } token::ByteStrRaw(n) => { - // Raw strings have no escapes, so we only need to check for invalid chars, and we - // can convert the symbol directly to a `Lrc` on success. 
- let s = symbol.as_str(); - let mut error = Ok(()); - unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c { - Ok(_) => {} - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } - }); - LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n)) + // Raw strings have no escapes so we can convert the symbol + // directly to a `Lrc`. + let buf = symbol.as_str().to_owned().into_bytes(); + LitKind::ByteStr(buf.into(), StrStyle::Raw(n)) } token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - let mut error = Ok(()); unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { Ok(CStrUnit::Byte(b)) => buf.push(b), Ok(CStrUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } + assert!(!err.is_fatal(), "failed to unescape C string literal") } }); - error?; buf.push(0); LitKind::CStr(buf.into(), StrStyle::Cooked) } token::CStrRaw(n) => { - // Raw strings have no escapes, so we only need to check for invalid chars, and we - // can convert the symbol directly to a `Lrc` on success. - let s = symbol.as_str(); - let mut error = Ok(()); - unescape_c_string(s, Mode::RawCStr, &mut |_, c| match c { - Ok(_) => {} - Err(err) => { - if err.is_fatal() { - error = Err(LitError::LexerError); - } - } - }); - error?; - let mut buf = s.to_owned().into_bytes(); + // Raw strings have no escapes so we can convert the symbol + // directly to a `Lrc` after appending the terminating NUL + // char. 
+ let mut buf = symbol.as_str().to_owned().into_bytes(); buf.push(0); LitKind::CStr(buf.into(), StrStyle::Raw(n)) } diff --git a/tests/ui/str/str-escape.rs b/tests/ui/str/str-escape.rs index 10a72421f24a7..89a8217106391 100644 --- a/tests/ui/str/str-escape.rs +++ b/tests/ui/str/str-escape.rs @@ -1,5 +1,6 @@ // check-pass // ignore-tidy-tab +// edition: 2021 fn main() { let s = "\ @@ -8,11 +9,11 @@ fn main() { //~^^^ WARNING multiple lines skipped by escaped newline assert_eq!(s, ""); - let s = "foo\ + let s = c"foo\   bar "; //~^^^ WARNING whitespace symbol '\u{a0}' is not skipped - assert_eq!(s, "foo  bar\n "); + assert_eq!(s, c"foo  bar\n "); let s = "a\ b"; @@ -22,10 +23,10 @@ fn main() { b"; assert_eq!(s, "ab"); - let s = "a\ + let s = b"a\ b"; //~^^ WARNING whitespace symbol '\u{c}' is not skipped // '\x0c' is ASCII whitespace, but it may not need skipped // discussion: https://github.com/rust-lang/rust/pull/108403 - assert_eq!(s, "a\x0cb"); + assert_eq!(s, b"a\x0cb"); } diff --git a/tests/ui/str/str-escape.stderr b/tests/ui/str/str-escape.stderr index 43b4f7e36f6ab..00fe5444e1a4e 100644 --- a/tests/ui/str/str-escape.stderr +++ b/tests/ui/str/str-escape.stderr @@ -1,5 +1,5 @@ warning: multiple lines skipped by escaped newline - --> $DIR/str-escape.rs:5:14 + --> $DIR/str-escape.rs:6:14 | LL | let s = "\ | ______________^ @@ -8,20 +8,20 @@ LL | | "; | |_____________^ skipping everything up to and including this point warning: whitespace symbol '\u{a0}' is not skipped - --> $DIR/str-escape.rs:11:17 + --> $DIR/str-escape.rs:12:18 | -LL | let s = "foo\ - | _________________^ +LL | let s = c"foo\ + | __________________^ LL | |   bar | | ^ whitespace symbol '\u{a0}' is not skipped | |___| | warning: whitespace symbol '\u{c}' is not skipped - --> $DIR/str-escape.rs:25:15 + --> $DIR/str-escape.rs:26:16 | -LL | let s = "a\ - | _______________^ +LL | let s = b"a\ + | ________________^ LL | | b"; | | ^- whitespace symbol '\u{c}' is not skipped | |____| From 
4b4bdb575b8498b22729a522885ffadd0e646c0f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 24 Jan 2024 16:46:57 +1100 Subject: [PATCH 2/7] Fix copy/paste error. The `CString` handling code is erroneously identical to the `ByteString` handling code. --- src/tools/rust-analyzer/crates/syntax/src/validation.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index 69dffbf79f191..fadcbaef14330 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,7 +5,7 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{self, unescape_literal, Mode}; +use rustc_lexer::unescape::{self, unescape_c_string, unescape_literal, Mode}; use crate::{ algo, @@ -162,7 +162,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_c_string(without_quotes, Mode::CStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } From ef1e2228cfd9df4059aa44740b0659fea7c5a52f Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 23 Jan 2024 10:37:27 +1100 Subject: [PATCH 3/7] Use `from` instead of `into` in unescaping code. The `T` type in these functions took me some time to understand, and I find the explicit `T` in the use of `from` makes the code easier to read, as does the `u8` annotation in `scan_escape`. 
--- compiler/rustc_lexer/src/unescape.rs | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 0a632c4d12ad5..a5ab3fcdd34a0 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -222,7 +222,7 @@ fn scan_escape + From>( mode: Mode, ) -> Result { // Previous character was '\\', unescape what follows. - let res = match chars.next().ok_or(EscapeError::LoneSlash)? { + let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? { '"' => b'"', 'n' => b'\n', 'r' => b'\r', @@ -249,10 +249,10 @@ fn scan_escape + From>( value as u8 } - 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into), + 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; - Ok(res.into()) + Ok(T::from(res)) } fn scan_unicode( @@ -366,7 +366,7 @@ where } '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, chars_should_be_ascii).map(Into::into), + _ => ascii_check(c, chars_should_be_ascii).map(T::from), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); From a1c07214f0f7988cbc5a645a499bb8f7dd9cbed7 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 23 Jan 2024 12:27:56 +1100 Subject: [PATCH 4/7] Rework `CStrUnit`. - Rename it as `MixedUnit`, because it will soon be used in more than just C string literals. - Change the `Byte` variant to `HighByte` and use it only for `\x80`..`\xff` cases. This fixes the old inexactness where ASCII chars could be encoded with either `Byte` or `Char`. - Add useful comments. - Remove `is_ascii`, in favour of `u8::is_ascii`. 
--- compiler/rustc_ast/src/util/literal.rs | 6 +- compiler/rustc_lexer/src/unescape.rs | 79 +++++++++++-------- .../crates/syntax/src/ast/token_ext.rs | 9 +-- 3 files changed, 52 insertions(+), 42 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index 852d49fc5b628..c3995c7776f94 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,7 +3,7 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; use rustc_lexer::unescape::{ - byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, + byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode, }; use rustc_span::symbol::{kw, sym, Symbol}; @@ -127,10 +127,10 @@ impl LitKind { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { - Ok(CStrUnit::Byte(b)) => buf.push(b), - Ok(CStrUnit::Char(c)) => { + Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } + Ok(MixedUnit::HighByte(b)) => buf.push(b), Err(err) => { assert!(!err.is_fatal(), "failed to unescape C string literal") } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index a5ab3fcdd34a0..3c23af58f376a 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -101,32 +101,45 @@ where } } -/// A unit within CStr. Must not be a nul character. -pub enum CStrUnit { - Byte(u8), +/// Used for mixed utf8 string literals, i.e. those that allow both unicode +/// chars and high bytes. +pub enum MixedUnit { + /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes) + /// and Unicode chars (written directly or via `\u` escapes). 
+ /// + /// For example, if '¥' appears in a string it is represented here as + /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte + /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]` Char(char), + + /// Used for high bytes (`\x80`..`\xff`). + /// + /// For example, if `\xa5` appears in a string it is represented here as + /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant + /// byte string as the single byte `0xa5`. + HighByte(u8), } -impl From for CStrUnit { - fn from(value: u8) -> Self { - CStrUnit::Byte(value) +impl From for MixedUnit { + fn from(c: char) -> Self { + MixedUnit::Char(c) } } -impl From for CStrUnit { - fn from(value: char) -> Self { - CStrUnit::Char(value) +impl From for MixedUnit { + fn from(n: u8) -> Self { + if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) } } } pub fn unescape_c_string(src: &str, mode: Mode, callback: &mut F) where - F: FnMut(Range, Result), + F: FnMut(Range, Result), { match mode { CStr => { unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result { + if let Ok(MixedUnit::Char('\0')) = result { result = Err(EscapeError::NulInCStr); } callback(r, result) @@ -137,7 +150,8 @@ where if let Ok('\0') = result { result = Err(EscapeError::NulInCStr); } - callback(r, result.map(CStrUnit::Char)) + // High bytes aren't possible in raw strings. + callback(r, result.map(MixedUnit::Char)) }); } Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), @@ -217,20 +231,19 @@ impl Mode { } } -fn scan_escape + From>( +fn scan_escape + From>( chars: &mut Chars<'_>, mode: Mode, ) -> Result { // Previous character was '\\', unescape what follows. - let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? { - '"' => b'"', - 'n' => b'\n', - 'r' => b'\r', - 't' => b'\t', - '\\' => b'\\', - '\'' => b'\'', - '0' => b'\0', - + let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? 
{ + '"' => '"', + 'n' => '\n', + 'r' => '\r', + 't' => '\t', + '\\' => '\\', + '\'' => '\'', + '0' => '\0', 'x' => { // Parse hexadecimal character code. @@ -240,15 +253,17 @@ fn scan_escape + From>( let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?; let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?; - let value = hi * 16 + lo; - - if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) { - return Err(EscapeError::OutOfRangeHexEscape); - } + let value = (hi * 16 + lo) as u8; - value as u8 + return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() { + Err(EscapeError::OutOfRangeHexEscape) + } else { + // This may be a high byte, but that will only happen if `T` is + // `MixedUnit`, because of the `ascii_escapes_should_be_ascii` + // check above. + Ok(T::from(value as u8)) + }; } - 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; @@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result + From>(src: &str, mode: Mode, callback: &mut F) +fn unescape_non_raw_common + From>(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 { debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr"); res as u8 } - -fn is_ascii(x: u32) -> bool { - x <= 0x7F -} diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index b39006e2ff265..2f75e9677ec89 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -6,7 +6,7 @@ use std::{ }; use rustc_lexer::unescape::{ - unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode, + unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode, }; use crate::{ @@ -336,10 +336,9 @@ impl ast::CString { 
let mut buf = Vec::new(); let mut prev_end = 0; let mut has_error = false; - let mut char_buf = [0u8; 4]; - let mut extend_unit = |buf: &mut Vec, unit: CStrUnit| match unit { - CStrUnit::Byte(b) => buf.push(b), - CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()), + let extend_unit = |buf: &mut Vec, unit: MixedUnit| match unit { + MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), + MixedUnit::HighByte(b) => buf.push(b), }; unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( unescaped, From 5e5aa6d556c273141a37c8cb542a022e3e9fae67 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Tue, 23 Jan 2024 14:25:34 +1100 Subject: [PATCH 5/7] Rename and invert sense of `Mode` predicates. I find it easier if they describe what's allowed, rather than what's forbidden. Also, consistent naming makes them easier to understand. --- compiler/rustc_lexer/src/unescape.rs | 56 ++++++++++++---------------- 1 file changed, 24 insertions(+), 32 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 3c23af58f376a..e4cf7439b97da 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -195,29 +195,29 @@ impl Mode { } } - /// Non-byte literals should have `\xXX` escapes that are within the ASCII range. - fn ascii_escapes_should_be_ascii(self) -> bool { + /// Are `\x80`..`\xff` allowed? + fn allow_high_bytes(self) -> bool { match self { - Char | Str => true, - Byte | ByteStr | CStr => false, + Char | Str => false, + Byte | ByteStr | CStr => true, RawStr | RawByteStr | RawCStr => unreachable!(), } } - /// Whether characters within the literal must be within the ASCII range. + /// Are unicode (non-ASCII) chars allowed? 
#[inline] - fn chars_should_be_ascii(self) -> bool { + fn allow_unicode_chars(self) -> bool { match self { - Byte | ByteStr | RawByteStr => true, - Char | Str | RawStr | CStr | RawCStr => false, + Byte | ByteStr | RawByteStr => false, + Char | Str | RawStr | CStr | RawCStr => true, } } - /// Byte literals do not allow unicode escape. - fn is_unicode_escape_disallowed(self) -> bool { + /// Are unicode escapes (`\u`) allowed? + fn allow_unicode_escapes(self) -> bool { match self { - Byte | ByteStr => true, - Char | Str | CStr => false, + Byte | ByteStr => false, + Char | Str | CStr => true, RawByteStr | RawStr | RawCStr => unreachable!(), } } @@ -255,25 +255,21 @@ fn scan_escape + From>( let value = (hi * 16 + lo) as u8; - return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() { + return if !mode.allow_high_bytes() && !value.is_ascii() { Err(EscapeError::OutOfRangeHexEscape) } else { // This may be a high byte, but that will only happen if `T` is - // `MixedUnit`, because of the `ascii_escapes_should_be_ascii` - // check above. + // `MixedUnit`, because of the `allow_high_bytes` check above. Ok(T::from(value as u8)) }; } - 'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from), + 'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from), _ => return Err(EscapeError::InvalidEscape), }; Ok(T::from(res)) } -fn scan_unicode( - chars: &mut Chars<'_>, - is_unicode_escape_disallowed: bool, -) -> Result { +fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result { // We've parsed '\u', now we have to parse '{..}'. if chars.next() != Some('{') { @@ -301,7 +297,7 @@ fn scan_unicode( // Incorrect syntax has higher priority for error reporting // than unallowed value for a literal. 
- if is_unicode_escape_disallowed { + if !allow_unicode_escapes { return Err(EscapeError::UnicodeEscapeInByte); } @@ -327,12 +323,8 @@ fn scan_unicode( } #[inline] -fn ascii_check(c: char, chars_should_be_ascii: bool) -> Result { - if chars_should_be_ascii && !c.is_ascii() { - Err(EscapeError::NonAsciiCharInByte) - } else { - Ok(c) - } +fn ascii_check(c: char, allow_unicode_chars: bool) -> Result { + if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) } } fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result { @@ -341,7 +333,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result scan_escape(chars, mode), '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, mode.chars_should_be_ascii()), + _ => ascii_check(c, mode.allow_unicode_chars()), }?; if chars.next().is_some() { return Err(EscapeError::MoreThanOneChar); @@ -356,7 +348,7 @@ where F: FnMut(Range, Result), { let mut chars = src.chars(); - let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop + let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop // The `start` and `end` computation here is complicated because // `skip_ascii_whitespace` makes us to skip over chars without counting @@ -381,7 +373,7 @@ where } '"' => Err(EscapeError::EscapeOnlyChar), '\r' => Err(EscapeError::BareCarriageReturn), - _ => ascii_check(c, chars_should_be_ascii).map(T::from), + _ => ascii_check(c, allow_unicode_chars).map(T::from), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); @@ -423,7 +415,7 @@ where F: FnMut(Range, Result), { let mut chars = src.chars(); - let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop + let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop // The `start` and `end` computation here matches the one in // 
`unescape_non_raw_common` for consistency, even though this function @@ -432,7 +424,7 @@ where let start = src.len() - chars.as_str().len() - c.len_utf8(); let res = match c { '\r' => Err(EscapeError::BareCarriageReturnInRawString), - _ => ascii_check(c, chars_should_be_ascii), + _ => ascii_check(c, allow_unicode_chars), }; let end = src.len() - chars.as_str().len(); callback(start..end, res); From 86f371ed59e4172c9891bc9b976362a73689fb12 Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 24 Jan 2024 15:24:58 +1100 Subject: [PATCH 6/7] Rename the unescaping functions. `unescape_literal` becomes `unescape_unicode`, and `unescape_c_string` becomes `unescape_mixed`. Because rfc3349 will mean that C string literals will no longer be the only mixed utf8 literals. --- compiler/rustc_ast/src/util/literal.rs | 9 ++++--- compiler/rustc_lexer/src/unescape.rs | 12 ++++++---- compiler/rustc_lexer/src/unescape/tests.rs | 10 ++++---- compiler/rustc_parse/src/lexer/mod.rs | 24 +++++++++---------- compiler/rustc_parse_format/src/lib.rs | 2 +- .../clippy/clippy_dev/src/update_lints.rs | 2 +- .../crates/parser/src/lexed_str.rs | 4 ++-- .../crates/syntax/src/ast/token_ext.rs | 12 +++++----- .../crates/syntax/src/validation.rs | 12 +++++----- 9 files changed, 45 insertions(+), 42 deletions(-) diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs index c3995c7776f94..aaeb1bb9bff82 100644 --- a/compiler/rustc_ast/src/util/literal.rs +++ b/compiler/rustc_ast/src/util/literal.rs @@ -3,8 +3,7 @@ use crate::ast::{self, LitKind, MetaItemLit, StrStyle}; use crate::token::{self, Token}; use rustc_lexer::unescape::{ - byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, - Mode, + byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, }; use rustc_span::symbol::{kw, sym, Symbol}; use rustc_span::Span; @@ -85,7 +84,7 @@ impl LitKind { // Force-inlining here is 
aggressive but the closure is // called on every char in the string, so it can be hot in // programs with many long strings containing escapes. - unescape_literal( + unescape_unicode( s, Mode::Str, &mut #[inline(always)] @@ -109,7 +108,7 @@ impl LitKind { token::ByteStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_literal(s, Mode::ByteStr, &mut |_, c| match c { + unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c { Ok(c) => buf.push(byte_from_char(c)), Err(err) => { assert!(!err.is_fatal(), "failed to unescape string literal") @@ -126,7 +125,7 @@ impl LitKind { token::CStr => { let s = symbol.as_str(); let mut buf = Vec::with_capacity(s.len()); - unescape_c_string(s, Mode::CStr, &mut |_span, c| match c { + unescape_mixed(s, Mode::CStr, &mut |_span, c| match c { Ok(MixedUnit::Char(c)) => { buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes()) } diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index e4cf7439b97da..4da6d35727cb5 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -80,12 +80,12 @@ impl EscapeError { } } -/// Takes a contents of a literal (without quotes) and produces a sequence of -/// escaped characters or errors. +/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without +/// quotes) and produces a sequence of escaped characters or errors. /// /// Values are returned by invoking `callback`. For `Char` and `Byte` modes, /// the callback will be called exactly once. -pub fn unescape_literal(src: &str, mode: Mode, callback: &mut F) +pub fn unescape_unicode(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { @@ -132,7 +132,11 @@ impl From for MixedUnit { } } -pub fn unescape_c_string(src: &str, mode: Mode, callback: &mut F) +/// Takes the contents of a mixed-utf8 literal (without quotes) and produces +/// a sequence of escaped characters or errors. 
+/// +/// Values are returned by invoking `callback`. +pub fn unescape_mixed(src: &str, mode: Mode, callback: &mut F) where F: FnMut(Range, Result), { diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs index 1c25b03fdb22e..5b99495f47581 100644 --- a/compiler/rustc_lexer/src/unescape/tests.rs +++ b/compiler/rustc_lexer/src/unescape/tests.rs @@ -100,7 +100,7 @@ fn test_unescape_char_good() { fn test_unescape_str_warn() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -124,7 +124,7 @@ fn test_unescape_str_warn() { fn test_unescape_str_good() { fn check(literal_text: &str, expected: &str) { let mut buf = Ok(String::with_capacity(literal_text.len())); - unescape_literal(literal_text, Mode::Str, &mut |range, c| { + unescape_unicode(literal_text, Mode::Str, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(c), @@ -241,7 +241,7 @@ fn test_unescape_byte_good() { fn test_unescape_byte_str_good() { fn check(literal_text: &str, expected: &[u8]) { let mut buf = Ok(Vec::with_capacity(literal_text.len())); - unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| { + unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| { if let Ok(b) = &mut buf { match c { Ok(c) => b.push(byte_from_char(c)), @@ -264,7 +264,7 @@ fn test_unescape_byte_str_good() { fn test_unescape_raw_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } @@ -276,7 +276,7 @@ fn 
test_unescape_raw_str() { fn test_unescape_raw_byte_str() { fn check(literal: &str, expected: &[(Range, Result)]) { let mut unescaped = Vec::with_capacity(literal.len()); - unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); + unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res))); assert_eq!(unescaped, expected); } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index d7ecf577ed676..a491d1969bd5c 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -400,7 +400,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0762)) .emit() } - self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' ' + self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' ' } rustc_lexer::LiteralKind::Byte { terminated } => { if !terminated { @@ -412,7 +412,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0763)) .emit() } - self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' + self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' ' } rustc_lexer::LiteralKind::Str { terminated } => { if !terminated { @@ -424,7 +424,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0765)) .emit() } - self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " " + self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " " } rustc_lexer::LiteralKind::ByteStr { terminated } => { if !terminated { @@ -436,7 +436,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0766)) .emit() } - self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " + self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" " } rustc_lexer::LiteralKind::CStr { terminated } => { if !terminated { @@ -448,13 +448,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> { .with_code(error_code!(E0767)) 
.emit() } - self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" " + self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" " } rustc_lexer::LiteralKind::RawStr { n_hashes } => { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::StrRaw(n_hashes); - self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## + self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "## } else { self.report_raw_str_error(start, 1); } @@ -463,7 +463,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::ByteStrRaw(n_hashes); - self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## + self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "## } else { self.report_raw_str_error(start, 2); } @@ -472,7 +472,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::CStrRaw(n_hashes); - self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## + self.cook_mixed(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## } else { self.report_raw_str_error(start, 2); } @@ -735,7 +735,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { } } - fn cook_quoted( + fn cook_unicode( &self, kind: token::LitKind, mode: Mode, @@ -745,13 +745,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> { postfix_len: u32, ) -> (token::LitKind, Symbol) { self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { - unescape::unescape_literal(src, mode, &mut |span, result| { + unescape::unescape_unicode(src, mode, &mut |span, result| { callback(span, result.map(drop)) }) }) } - fn cook_c_string( + fn cook_mixed( &self, kind: token::LitKind, mode: Mode, @@ -761,7 +761,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { postfix_len: u32, ) -> (token::LitKind, Symbol) { 
self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| { - unescape::unescape_c_string(src, mode, &mut |span, result| { + unescape::unescape_mixed(src, mode, &mut |span, result| { callback(span, result.map(drop)) }) }) diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs index 625764876a6b6..d76ee161da6fd 100644 --- a/compiler/rustc_parse_format/src/lib.rs +++ b/compiler/rustc_parse_format/src/lib.rs @@ -1056,7 +1056,7 @@ fn find_width_map_from_snippet( fn unescape_string(string: &str) -> Option { let mut buf = string::String::new(); let mut ok = true; - unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| { + unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| { match unescaped_char { Ok(c) => buf.push(c), Err(_) => ok = false, diff --git a/src/tools/clippy/clippy_dev/src/update_lints.rs b/src/tools/clippy/clippy_dev/src/update_lints.rs index 6b76a44debff7..f598f5d3d50f8 100644 --- a/src/tools/clippy/clippy_dev/src/update_lints.rs +++ b/src/tools/clippy/clippy_dev/src/update_lints.rs @@ -928,7 +928,7 @@ fn remove_line_splices(s: &str) -> String { .and_then(|s| s.strip_suffix('"')) .unwrap_or_else(|| panic!("expected quoted string, found `{s}`")); let mut res = String::with_capacity(s.len()); - unescape::unescape_literal(s, unescape::Mode::Str, &mut |range, ch| { + unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| { if ch.is_ok() { res.push_str(&s[range]); } diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs index aa25f82ae1d8d..bf1feb9a7eb07 100644 --- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs +++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs @@ -379,14 +379,14 @@ fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str { let mut error_message = ""; match mode { Mode::CStr => { - 
rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| { + rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } }); } Mode::ByteStr | Mode::Str => { - rustc_lexer::unescape::unescape_literal(text, mode, &mut |_, res| { + rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| { if let Err(e) = res { error_message = error_to_diagnostic_message(e, mode); } diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs index 2f75e9677ec89..7cd1f1550b988 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs @@ -6,7 +6,7 @@ use std::{ }; use rustc_lexer::unescape::{ - unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode, + unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode, }; use crate::{ @@ -193,7 +193,7 @@ pub trait IsString: AstToken { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_literal(text, Self::MODE, &mut |range, unescaped_char| { + unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); cb(text_range + offset, unescaped_char); @@ -226,7 +226,7 @@ impl ast::String { let mut buf = String::new(); let mut prev_end = 0; let mut has_error = false; - unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match ( + unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -270,7 +270,7 @@ impl ast::ByteString { let mut buf: Vec = Vec::new(); let mut prev_end = 0; let mut has_error = false; - unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match ( + unescape_unicode(text, 
Self::MODE, &mut |char_range, unescaped_char| match ( unescaped_char, buf.capacity() == 0, ) { @@ -311,7 +311,7 @@ impl IsString for ast::CString { let text = &self.text()[text_range_no_quotes - start]; let offset = text_range_no_quotes.start() - start; - unescape_c_string(text, Self::MODE, &mut |range, unescaped_char| { + unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| { let text_range = TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap()); // XXX: This method should only be used for highlighting ranges. The unescaped @@ -340,7 +340,7 @@ impl ast::CString { MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()), MixedUnit::HighByte(b) => buf.push(b), }; - unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match ( + unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match ( unescaped, buf.capacity() == 0, ) { diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs index fadcbaef14330..5c5b26f525f66 100644 --- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs +++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs @@ -5,7 +5,7 @@ mod block; use rowan::Direction; -use rustc_lexer::unescape::{self, unescape_c_string, unescape_literal, Mode}; +use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode}; use crate::{ algo, @@ -140,7 +140,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::String(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 1, '"') { - unescape_literal(without_quotes, Mode::Str, &mut |range, char| { + unescape_unicode(without_quotes, Mode::Str, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -151,7 +151,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::ByteString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - 
unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| { + unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -162,7 +162,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { ast::LiteralKind::CString(s) => { if !s.is_raw() { if let Some(without_quotes) = unquote(text, 2, '"') { - unescape_c_string(without_quotes, Mode::CStr, &mut |range, char| { + unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -172,7 +172,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Char(_) => { if let Some(without_quotes) = unquote(text, 1, '\'') { - unescape_literal(without_quotes, Mode::Char, &mut |range, char| { + unescape_unicode(without_quotes, Mode::Char, &mut |range, char| { if let Err(err) = char { push_err(1, range.start, err); } @@ -181,7 +181,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec) { } ast::LiteralKind::Byte(_) => { if let Some(without_quotes) = unquote(text, 2, '\'') { - unescape_literal(without_quotes, Mode::Byte, &mut |range, char| { + unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| { if let Err(err) = char { push_err(2, range.start, err); } From 6be2e5623cb7ae63ca1796759150c0cbc845bbcd Mon Sep 17 00:00:00 2001 From: Nicholas Nethercote Date: Wed, 24 Jan 2024 16:00:10 +1100 Subject: [PATCH 7/7] Use `unescape_unicode` for raw C string literals. They can't contain `\x` escapes, which means they can't contain high bytes, which means we can use `unescape_unicode` instead of `unescape_mixed` to unescape them. This avoids unnecessary use of `MixedUnit`. 
--- compiler/rustc_lexer/src/unescape.rs | 33 ++++++++++++--------------- compiler/rustc_parse/src/lexer/mod.rs | 2 +- 2 files changed, 15 insertions(+), 20 deletions(-) diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs index 4da6d35727cb5..03d178eb266a4 100644 --- a/compiler/rustc_lexer/src/unescape.rs +++ b/compiler/rustc_lexer/src/unescape.rs @@ -97,7 +97,13 @@ where } Str | ByteStr => unescape_non_raw_common(src, mode, callback), RawStr | RawByteStr => check_raw_common(src, mode, callback), - CStr | RawCStr => unreachable!(), + RawCStr => check_raw_common(src, mode, &mut |r, mut result| { + if let Ok('\0') = result { + result = Err(EscapeError::NulInCStr); + } + callback(r, result) + }), + CStr => unreachable!(), } } @@ -141,24 +147,13 @@ where F: FnMut(Range, Result), { match mode { - CStr => { - unescape_non_raw_common(src, mode, &mut |r, mut result| { - if let Ok(MixedUnit::Char('\0')) = result { - result = Err(EscapeError::NulInCStr); - } - callback(r, result) - }); - } - RawCStr => { - check_raw_common(src, mode, &mut |r, mut result| { - if let Ok('\0') = result { - result = Err(EscapeError::NulInCStr); - } - // High bytes aren't possible in raw strings. 
- callback(r, result.map(MixedUnit::Char)) - }); - } - Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(), + CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| { + if let Ok(MixedUnit::Char('\0')) = result { + result = Err(EscapeError::NulInCStr); + } + callback(r, result) + }), + Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(), } } diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs index a491d1969bd5c..20ec4a300c1f8 100644 --- a/compiler/rustc_parse/src/lexer/mod.rs +++ b/compiler/rustc_parse/src/lexer/mod.rs @@ -472,7 +472,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> { if let Some(n_hashes) = n_hashes { let n = u32::from(n_hashes); let kind = token::CStrRaw(n_hashes); - self.cook_mixed(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## + self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "## } else { self.report_raw_str_error(start, 2); }