From 314dbc7f2250cc7bece306254f6bfafc96bf913f Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Mon, 22 Jan 2024 16:58:39 +1100
Subject: [PATCH 1/7] Avoid useless checking in `from_token_lit`.

The parser already does a check-only unescaping which catches all
errors. So the checking done in `from_token_lit` never hits.

But literals causing warnings can still occur in `from_token_lit`. So
the commit changes `str-escape.rs` to use byte string literals and C
string literals as well, to give better coverage and ensure the new
assertions in `from_token_lit` are correct.
---
 compiler/rustc_ast/src/util/literal.rs | 83 +++++++-------------------
 tests/ui/str/str-escape.rs             |  9 +--
 tests/ui/str/str-escape.stderr         | 14 ++---
 3 files changed, 33 insertions(+), 73 deletions(-)

diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index 7b781ba1e1121..852d49fc5b628 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -48,6 +48,9 @@ impl LitKind {
             return Err(LitError::InvalidSuffix);
         }
 
+        // For byte/char/string literals, chars and escapes have already been
+        // checked in the lexer (in `cook_lexer_literal`). So we can assume all
+        // chars and escapes are valid here.
         Ok(match kind {
             token::Bool => {
                 assert!(symbol.is_bool_lit());
@@ -56,12 +59,12 @@ impl LitKind {
             token::Byte => {
                 return unescape_byte(symbol.as_str())
                     .map(LitKind::Byte)
-                    .map_err(|_| LitError::LexerError);
+                    .map_err(|_| panic!("failed to unescape byte literal"));
             }
             token::Char => {
                 return unescape_char(symbol.as_str())
                     .map(LitKind::Char)
-                    .map_err(|_| LitError::LexerError);
+                    .map_err(|_| panic!("failed to unescape char literal"));
             }
 
             // There are some valid suffixes for integer and float literals,
@@ -77,26 +80,22 @@ impl LitKind {
                 let s = symbol.as_str();
                 // Vanilla strings are so common we optimize for the common case where no chars
                 // requiring special behaviour are present.
-                let symbol = if s.contains(['\\', '\r']) {
+                let symbol = if s.contains('\\') {
                     let mut buf = String::with_capacity(s.len());
-                    let mut error = Ok(());
                     // Force-inlining here is aggressive but the closure is
-                    // called on every char in the string, so it can be
-                    // hot in programs with many long strings.
+                    // called on every char in the string, so it can be hot in
+                    // programs with many long strings containing escapes.
                     unescape_literal(
                         s,
                         Mode::Str,
                         &mut #[inline(always)]
-                        |_, unescaped_char| match unescaped_char {
+                        |_, c| match c {
                             Ok(c) => buf.push(c),
                             Err(err) => {
-                                if err.is_fatal() {
-                                    error = Err(LitError::LexerError);
-                                }
+                                assert!(!err.is_fatal(), "failed to unescape string literal")
                             }
                         },
                     );
-                    error?;
                     Symbol::intern(&buf)
                 } else {
                     symbol
@@ -104,86 +103,46 @@ impl LitKind {
                 LitKind::Str(symbol, ast::StrStyle::Cooked)
             }
             token::StrRaw(n) => {
-                // Raw strings have no escapes, so we only need to check for invalid chars, and we
-                // can reuse the symbol on success.
-                let mut error = Ok(());
-                unescape_literal(symbol.as_str(), Mode::RawStr, &mut |_, unescaped_char| {
-                    match unescaped_char {
-                        Ok(_) => {}
-                        Err(err) => {
-                            if err.is_fatal() {
-                                error = Err(LitError::LexerError);
-                            }
-                        }
-                    }
-                });
-                error?;
+                // Raw strings have no escapes so no work is needed here.
                 LitKind::Str(symbol, ast::StrStyle::Raw(n))
             }
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                let mut error = Ok(());
                 unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
                     Ok(c) => buf.push(byte_from_char(c)),
                     Err(err) => {
-                        if err.is_fatal() {
-                            error = Err(LitError::LexerError);
-                        }
+                        assert!(!err.is_fatal(), "failed to unescape string literal")
                     }
                 });
-                error?;
                 LitKind::ByteStr(buf.into(), StrStyle::Cooked)
             }
             token::ByteStrRaw(n) => {
-                // Raw strings have no escapes, so we only need to check for invalid chars, and we
-                // can convert the symbol directly to a `Lrc<u8>` on success.
-                let s = symbol.as_str();
-                let mut error = Ok(());
-                unescape_literal(s, Mode::RawByteStr, &mut |_, c| match c {
-                    Ok(_) => {}
-                    Err(err) => {
-                        if err.is_fatal() {
-                            error = Err(LitError::LexerError);
-                        }
-                    }
-                });
-                LitKind::ByteStr(s.to_owned().into_bytes().into(), StrStyle::Raw(n))
+                // Raw strings have no escapes so we can convert the symbol
+                // directly to a `Lrc<u8>`.
+                let buf = symbol.as_str().to_owned().into_bytes();
+                LitKind::ByteStr(buf.into(), StrStyle::Raw(n))
             }
             token::CStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                let mut error = Ok(());
                 unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
                     Ok(CStrUnit::Byte(b)) => buf.push(b),
                     Ok(CStrUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
                     Err(err) => {
-                        if err.is_fatal() {
-                            error = Err(LitError::LexerError);
-                        }
+                        assert!(!err.is_fatal(), "failed to unescape C string literal")
                     }
                 });
-                error?;
                 buf.push(0);
                 LitKind::CStr(buf.into(), StrStyle::Cooked)
             }
             token::CStrRaw(n) => {
-                // Raw strings have no escapes, so we only need to check for invalid chars, and we
-                // can convert the symbol directly to a `Lrc<u8>` on success.
-                let s = symbol.as_str();
-                let mut error = Ok(());
-                unescape_c_string(s, Mode::RawCStr, &mut |_, c| match c {
-                    Ok(_) => {}
-                    Err(err) => {
-                        if err.is_fatal() {
-                            error = Err(LitError::LexerError);
-                        }
-                    }
-                });
-                error?;
-                let mut buf = s.to_owned().into_bytes();
+                // Raw strings have no escapes so we can convert the symbol
+                // directly to a `Lrc<u8>` after appending the terminating NUL
+                // char.
+                let mut buf = symbol.as_str().to_owned().into_bytes();
                 buf.push(0);
                 LitKind::CStr(buf.into(), StrStyle::Raw(n))
             }
diff --git a/tests/ui/str/str-escape.rs b/tests/ui/str/str-escape.rs
index 10a72421f24a7..89a8217106391 100644
--- a/tests/ui/str/str-escape.rs
+++ b/tests/ui/str/str-escape.rs
@@ -1,5 +1,6 @@
 // check-pass
 // ignore-tidy-tab
+// edition: 2021
 
 fn main() {
     let s = "\
@@ -8,11 +9,11 @@ fn main() {
     //~^^^ WARNING multiple lines skipped by escaped newline
     assert_eq!(s, "");
 
-    let s = "foo\
+    let s = c"foo\
              bar
              ";
     //~^^^ WARNING whitespace symbol '\u{a0}' is not skipped
-    assert_eq!(s, "foo           bar\n             ");
+    assert_eq!(s, c"foo           bar\n             ");
 
     let s = "a\
  b";
@@ -22,10 +23,10 @@ fn main() {
 	b";
     assert_eq!(s, "ab");
 
-    let s = "a\
+    let s = b"a\
     b";
     //~^^ WARNING whitespace symbol '\u{c}' is not skipped
     // '\x0c' is ASCII whitespace, but it may not need skipped
     // discussion: https://github.com/rust-lang/rust/pull/108403
-    assert_eq!(s, "a\x0cb");
+    assert_eq!(s, b"a\x0cb");
 }
diff --git a/tests/ui/str/str-escape.stderr b/tests/ui/str/str-escape.stderr
index 43b4f7e36f6ab..00fe5444e1a4e 100644
--- a/tests/ui/str/str-escape.stderr
+++ b/tests/ui/str/str-escape.stderr
@@ -1,5 +1,5 @@
 warning: multiple lines skipped by escaped newline
-  --> $DIR/str-escape.rs:5:14
+  --> $DIR/str-escape.rs:6:14
    |
 LL |       let s = "\
    |  ______________^
@@ -8,20 +8,20 @@ LL | |              ";
    | |_____________^ skipping everything up to and including this point
 
 warning: whitespace symbol '\u{a0}' is not skipped
-  --> $DIR/str-escape.rs:11:17
+  --> $DIR/str-escape.rs:12:18
    |
-LL |       let s = "foo\
-   |  _________________^
+LL |       let s = c"foo\
+   |  __________________^
 LL | |              bar
    | |   ^ whitespace symbol '\u{a0}' is not skipped
    | |___|
    | 
 
 warning: whitespace symbol '\u{c}' is not skipped
-  --> $DIR/str-escape.rs:25:15
+  --> $DIR/str-escape.rs:26:16
    |
-LL |       let s = "a\
-   |  _______________^
+LL |       let s = b"a\
+   |  ________________^
 LL | |     b";
    | |    ^- whitespace symbol '\u{c}' is not skipped
    | |____|

From 4b4bdb575b8498b22729a522885ffadd0e646c0f Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 24 Jan 2024 16:46:57 +1100
Subject: [PATCH 2/7] Fix copy/paste error.

The `CString` handling code is erroneously identical to the `ByteString`
handling code.
---
 src/tools/rust-analyzer/crates/syntax/src/validation.rs | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
index 69dffbf79f191..fadcbaef14330 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
@@ -5,7 +5,7 @@
 mod block;
 
 use rowan::Direction;
-use rustc_lexer::unescape::{self, unescape_literal, Mode};
+use rustc_lexer::unescape::{self, unescape_c_string, unescape_literal, Mode};
 
 use crate::{
     algo,
@@ -162,7 +162,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::CString(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 2, '"') {
-                    unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| {
+                    unescape_c_string(without_quotes, Mode::CStr, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }

From ef1e2228cfd9df4059aa44740b0659fea7c5a52f Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 23 Jan 2024 10:37:27 +1100
Subject: [PATCH 3/7] Use `from` instead of `into` in unescaping code.

The `T` type in these functions took me some time to understand, and I
find the explicit `T` in the use of `from` makes the code easier to
read, as does the `u8` annotation in `scan_escape`.
---
 compiler/rustc_lexer/src/unescape.rs | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 0a632c4d12ad5..a5ab3fcdd34a0 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -222,7 +222,7 @@ fn scan_escape<T: From<u8> + From<char>>(
     mode: Mode,
 ) -> Result<T, EscapeError> {
     // Previous character was '\\', unescape what follows.
-    let res = match chars.next().ok_or(EscapeError::LoneSlash)? {
+    let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? {
         '"' => b'"',
         'n' => b'\n',
         'r' => b'\r',
@@ -249,10 +249,10 @@ fn scan_escape<T: From<u8> + From<char>>(
             value as u8
         }
 
-        'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(Into::into),
+        'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from),
         _ => return Err(EscapeError::InvalidEscape),
     };
-    Ok(res.into())
+    Ok(T::from(res))
 }
 
 fn scan_unicode(
@@ -366,7 +366,7 @@ where
             }
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, chars_should_be_ascii).map(Into::into),
+            _ => ascii_check(c, chars_should_be_ascii).map(T::from),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, res);

From a1c07214f0f7988cbc5a645a499bb8f7dd9cbed7 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 23 Jan 2024 12:27:56 +1100
Subject: [PATCH 4/7] Rework `CStrUnit`.

- Rename it as `MixedUnit`, because it will soon be used in more than
  just C string literals.
- Change the `Byte` variant to `HighByte` and use it only for
  `\x80`..`\xff` cases. This fixes the old inexactness where ASCII chars
  could be encoded with either `Byte` or `Char`.
- Add useful comments.
- Remove `is_ascii`, in favour of `u8::is_ascii`.
---
 compiler/rustc_ast/src/util/literal.rs        |  6 +-
 compiler/rustc_lexer/src/unescape.rs          | 79 +++++++++++--------
 .../crates/syntax/src/ast/token_ext.rs        |  9 +--
 3 files changed, 52 insertions(+), 42 deletions(-)

diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index 852d49fc5b628..c3995c7776f94 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -3,7 +3,7 @@
 use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
 use crate::token::{self, Token};
 use rustc_lexer::unescape::{
-    byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit,
+    byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
     Mode,
 };
 use rustc_span::symbol::{kw, sym, Symbol};
@@ -127,10 +127,10 @@ impl LitKind {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
                 unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
-                    Ok(CStrUnit::Byte(b)) => buf.push(b),
-                    Ok(CStrUnit::Char(c)) => {
+                    Ok(MixedUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
+                    Ok(MixedUnit::HighByte(b)) => buf.push(b),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape C string literal")
                     }
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index a5ab3fcdd34a0..3c23af58f376a 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -101,32 +101,45 @@ where
     }
 }
 
-/// A unit within CStr. Must not be a nul character.
-pub enum CStrUnit {
-    Byte(u8),
+/// Used for mixed utf8 string literals, i.e. those that allow both unicode
+/// chars and high bytes.
+pub enum MixedUnit {
+    /// Used for ASCII chars (written directly or via `\x00`..`\x7f` escapes)
+    /// and Unicode chars (written directly or via `\u` escapes).
+    ///
+    /// For example, if '¥' appears in a string it is represented here as
+    /// `MixedUnit::Char('¥')`, and it will be appended to the relevant byte
+    /// string as the two-byte UTF-8 sequence `[0xc2, 0xa5]`
     Char(char),
+
+    /// Used for high bytes (`\x80`..`\xff`).
+    ///
+    /// For example, if `\xa5` appears in a string it is represented here as
+    /// `MixedUnit::HighByte(0xa5)`, and it will be appended to the relevant
+    /// byte string as the single byte `0xa5`.
+    HighByte(u8),
 }
 
-impl From<u8> for CStrUnit {
-    fn from(value: u8) -> Self {
-        CStrUnit::Byte(value)
+impl From<char> for MixedUnit {
+    fn from(c: char) -> Self {
+        MixedUnit::Char(c)
     }
 }
 
-impl From<char> for CStrUnit {
-    fn from(value: char) -> Self {
-        CStrUnit::Char(value)
+impl From<u8> for MixedUnit {
+    fn from(n: u8) -> Self {
+        if n.is_ascii() { MixedUnit::Char(n as char) } else { MixedUnit::HighByte(n) }
     }
 }
 
 pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
 where
-    F: FnMut(Range<usize>, Result<CStrUnit, EscapeError>),
+    F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 {
     match mode {
         CStr => {
             unescape_non_raw_common(src, mode, &mut |r, mut result| {
-                if let Ok(CStrUnit::Byte(0) | CStrUnit::Char('\0')) = result {
+                if let Ok(MixedUnit::Char('\0')) = result {
                     result = Err(EscapeError::NulInCStr);
                 }
                 callback(r, result)
@@ -137,7 +150,8 @@ where
                 if let Ok('\0') = result {
                     result = Err(EscapeError::NulInCStr);
                 }
-                callback(r, result.map(CStrUnit::Char))
+                // High bytes aren't possible in raw strings.
+                callback(r, result.map(MixedUnit::Char))
             });
         }
         Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
@@ -217,20 +231,19 @@ impl Mode {
     }
 }
 
-fn scan_escape<T: From<u8> + From<char>>(
+fn scan_escape<T: From<char> + From<u8>>(
     chars: &mut Chars<'_>,
     mode: Mode,
 ) -> Result<T, EscapeError> {
     // Previous character was '\\', unescape what follows.
-    let res: u8 = match chars.next().ok_or(EscapeError::LoneSlash)? {
-        '"' => b'"',
-        'n' => b'\n',
-        'r' => b'\r',
-        't' => b'\t',
-        '\\' => b'\\',
-        '\'' => b'\'',
-        '0' => b'\0',
-
+    let res: char = match chars.next().ok_or(EscapeError::LoneSlash)? {
+        '"' => '"',
+        'n' => '\n',
+        'r' => '\r',
+        't' => '\t',
+        '\\' => '\\',
+        '\'' => '\'',
+        '0' => '\0',
         'x' => {
             // Parse hexadecimal character code.
 
@@ -240,15 +253,17 @@ fn scan_escape<T: From<u8> + From<char>>(
             let lo = chars.next().ok_or(EscapeError::TooShortHexEscape)?;
             let lo = lo.to_digit(16).ok_or(EscapeError::InvalidCharInHexEscape)?;
 
-            let value = hi * 16 + lo;
-
-            if mode.ascii_escapes_should_be_ascii() && !is_ascii(value) {
-                return Err(EscapeError::OutOfRangeHexEscape);
-            }
+            let value = (hi * 16 + lo) as u8;
 
-            value as u8
+            return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() {
+                Err(EscapeError::OutOfRangeHexEscape)
+            } else {
+                // This may be a high byte, but that will only happen if `T` is
+                // `MixedUnit`, because of the `ascii_escapes_should_be_ascii`
+                // check above.
+                Ok(T::from(value as u8))
+            };
         }
-
         'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from),
         _ => return Err(EscapeError::InvalidEscape),
     };
@@ -336,7 +351,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
 
 /// Takes a contents of a string literal (without quotes) and produces a
 /// sequence of escaped characters or errors.
-fn unescape_non_raw_common<F, T: From<u8> + From<char>>(src: &str, mode: Mode, callback: &mut F)
+fn unescape_non_raw_common<F, T: From<char> + From<u8>>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<T, EscapeError>),
 {
@@ -430,7 +445,3 @@ pub fn byte_from_char(c: char) -> u8 {
     debug_assert!(res <= u8::MAX as u32, "guaranteed because of ByteStr");
     res as u8
 }
-
-fn is_ascii(x: u32) -> bool {
-    x <= 0x7F
-}
diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
index b39006e2ff265..2f75e9677ec89 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
@@ -6,7 +6,7 @@ use std::{
 };
 
 use rustc_lexer::unescape::{
-    unescape_byte, unescape_c_string, unescape_char, unescape_literal, CStrUnit, Mode,
+    unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode,
 };
 
 use crate::{
@@ -336,10 +336,9 @@ impl ast::CString {
         let mut buf = Vec::new();
         let mut prev_end = 0;
         let mut has_error = false;
-        let mut char_buf = [0u8; 4];
-        let mut extend_unit = |buf: &mut Vec<u8>, unit: CStrUnit| match unit {
-            CStrUnit::Byte(b) => buf.push(b),
-            CStrUnit::Char(c) => buf.extend(c.encode_utf8(&mut char_buf).as_bytes()),
+        let extend_unit = |buf: &mut Vec<u8>, unit: MixedUnit| match unit {
+            MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
+            MixedUnit::HighByte(b) => buf.push(b),
         };
         unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match (
             unescaped,

From 5e5aa6d556c273141a37c8cb542a022e3e9fae67 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Tue, 23 Jan 2024 14:25:34 +1100
Subject: [PATCH 5/7] Rename and invert sense of `Mode` predicates.

I find it easier if they describe what's allowed, rather than what's
forbidden. Also, consistent naming makes them easier to understand.
---
 compiler/rustc_lexer/src/unescape.rs | 56 ++++++++++++----------------
 1 file changed, 24 insertions(+), 32 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 3c23af58f376a..e4cf7439b97da 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -195,29 +195,29 @@ impl Mode {
         }
     }
 
-    /// Non-byte literals should have `\xXX` escapes that are within the ASCII range.
-    fn ascii_escapes_should_be_ascii(self) -> bool {
+    /// Are `\x80`..`\xff` allowed?
+    fn allow_high_bytes(self) -> bool {
         match self {
-            Char | Str => true,
-            Byte | ByteStr | CStr => false,
+            Char | Str => false,
+            Byte | ByteStr | CStr => true,
             RawStr | RawByteStr | RawCStr => unreachable!(),
         }
     }
 
-    /// Whether characters within the literal must be within the ASCII range.
+    /// Are unicode (non-ASCII) chars allowed?
     #[inline]
-    fn chars_should_be_ascii(self) -> bool {
+    fn allow_unicode_chars(self) -> bool {
         match self {
-            Byte | ByteStr | RawByteStr => true,
-            Char | Str | RawStr | CStr | RawCStr => false,
+            Byte | ByteStr | RawByteStr => false,
+            Char | Str | RawStr | CStr | RawCStr => true,
         }
     }
 
-    /// Byte literals do not allow unicode escape.
-    fn is_unicode_escape_disallowed(self) -> bool {
+    /// Are unicode escapes (`\u`) allowed?
+    fn allow_unicode_escapes(self) -> bool {
         match self {
-            Byte | ByteStr => true,
-            Char | Str | CStr => false,
+            Byte | ByteStr => false,
+            Char | Str | CStr => true,
             RawByteStr | RawStr | RawCStr => unreachable!(),
         }
     }
@@ -255,25 +255,21 @@ fn scan_escape<T: From<char> + From<u8>>(
 
             let value = (hi * 16 + lo) as u8;
 
-            return if mode.ascii_escapes_should_be_ascii() && !value.is_ascii() {
+            return if !mode.allow_high_bytes() && !value.is_ascii() {
                 Err(EscapeError::OutOfRangeHexEscape)
             } else {
                 // This may be a high byte, but that will only happen if `T` is
-                // `MixedUnit`, because of the `ascii_escapes_should_be_ascii`
-                // check above.
+                // `MixedUnit`, because of the `allow_high_bytes` check above.
                 Ok(T::from(value as u8))
             };
         }
-        'u' => return scan_unicode(chars, mode.is_unicode_escape_disallowed()).map(T::from),
+        'u' => return scan_unicode(chars, mode.allow_unicode_escapes()).map(T::from),
         _ => return Err(EscapeError::InvalidEscape),
     };
     Ok(T::from(res))
 }
 
-fn scan_unicode(
-    chars: &mut Chars<'_>,
-    is_unicode_escape_disallowed: bool,
-) -> Result<char, EscapeError> {
+fn scan_unicode(chars: &mut Chars<'_>, allow_unicode_escapes: bool) -> Result<char, EscapeError> {
     // We've parsed '\u', now we have to parse '{..}'.
 
     if chars.next() != Some('{') {
@@ -301,7 +297,7 @@ fn scan_unicode(
 
                 // Incorrect syntax has higher priority for error reporting
                 // than unallowed value for a literal.
-                if is_unicode_escape_disallowed {
+                if !allow_unicode_escapes {
                     return Err(EscapeError::UnicodeEscapeInByte);
                 }
 
@@ -327,12 +323,8 @@ fn scan_unicode(
 }
 
 #[inline]
-fn ascii_check(c: char, chars_should_be_ascii: bool) -> Result<char, EscapeError> {
-    if chars_should_be_ascii && !c.is_ascii() {
-        Err(EscapeError::NonAsciiCharInByte)
-    } else {
-        Ok(c)
-    }
+fn ascii_check(c: char, allow_unicode_chars: bool) -> Result<char, EscapeError> {
+    if allow_unicode_chars || c.is_ascii() { Ok(c) } else { Err(EscapeError::NonAsciiCharInByte) }
 }
 
 fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, EscapeError> {
@@ -341,7 +333,7 @@ fn unescape_char_or_byte(chars: &mut Chars<'_>, mode: Mode) -> Result<char, Esca
         '\\' => scan_escape(chars, mode),
         '\n' | '\t' | '\'' => Err(EscapeError::EscapeOnlyChar),
         '\r' => Err(EscapeError::BareCarriageReturn),
-        _ => ascii_check(c, mode.chars_should_be_ascii()),
+        _ => ascii_check(c, mode.allow_unicode_chars()),
     }?;
     if chars.next().is_some() {
         return Err(EscapeError::MoreThanOneChar);
@@ -356,7 +348,7 @@ where
     F: FnMut(Range<usize>, Result<T, EscapeError>),
 {
     let mut chars = src.chars();
-    let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
+    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
 
     // The `start` and `end` computation here is complicated because
     // `skip_ascii_whitespace` makes us to skip over chars without counting
@@ -381,7 +373,7 @@ where
             }
             '"' => Err(EscapeError::EscapeOnlyChar),
             '\r' => Err(EscapeError::BareCarriageReturn),
-            _ => ascii_check(c, chars_should_be_ascii).map(T::from),
+            _ => ascii_check(c, allow_unicode_chars).map(T::from),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, res);
@@ -423,7 +415,7 @@ where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
     let mut chars = src.chars();
-    let chars_should_be_ascii = mode.chars_should_be_ascii(); // get this outside the loop
+    let allow_unicode_chars = mode.allow_unicode_chars(); // get this outside the loop
 
     // The `start` and `end` computation here matches the one in
     // `unescape_non_raw_common` for consistency, even though this function
@@ -432,7 +424,7 @@ where
         let start = src.len() - chars.as_str().len() - c.len_utf8();
         let res = match c {
             '\r' => Err(EscapeError::BareCarriageReturnInRawString),
-            _ => ascii_check(c, chars_should_be_ascii),
+            _ => ascii_check(c, allow_unicode_chars),
         };
         let end = src.len() - chars.as_str().len();
         callback(start..end, res);

From 86f371ed59e4172c9891bc9b976362a73689fb12 Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 24 Jan 2024 15:24:58 +1100
Subject: [PATCH 6/7] Rename the unescaping functions.

`unescape_literal` becomes `unescape_unicode`, and `unescape_c_string`
becomes `unescape_mixed`. Because rfc3349 will mean that C string
literals will no longer be the only mixed utf8 literals.
---
 compiler/rustc_ast/src/util/literal.rs        |  9 ++++---
 compiler/rustc_lexer/src/unescape.rs          | 12 ++++++----
 compiler/rustc_lexer/src/unescape/tests.rs    | 10 ++++----
 compiler/rustc_parse/src/lexer/mod.rs         | 24 +++++++++----------
 compiler/rustc_parse_format/src/lib.rs        |  2 +-
 .../clippy/clippy_dev/src/update_lints.rs     |  2 +-
 .../crates/parser/src/lexed_str.rs            |  4 ++--
 .../crates/syntax/src/ast/token_ext.rs        | 12 +++++-----
 .../crates/syntax/src/validation.rs           | 12 +++++-----
 9 files changed, 45 insertions(+), 42 deletions(-)

diff --git a/compiler/rustc_ast/src/util/literal.rs b/compiler/rustc_ast/src/util/literal.rs
index c3995c7776f94..aaeb1bb9bff82 100644
--- a/compiler/rustc_ast/src/util/literal.rs
+++ b/compiler/rustc_ast/src/util/literal.rs
@@ -3,8 +3,7 @@
 use crate::ast::{self, LitKind, MetaItemLit, StrStyle};
 use crate::token::{self, Token};
 use rustc_lexer::unescape::{
-    byte_from_char, unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit,
-    Mode,
+    byte_from_char, unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
 };
 use rustc_span::symbol::{kw, sym, Symbol};
 use rustc_span::Span;
@@ -85,7 +84,7 @@ impl LitKind {
                     // Force-inlining here is aggressive but the closure is
                     // called on every char in the string, so it can be hot in
                     // programs with many long strings containing escapes.
-                    unescape_literal(
+                    unescape_unicode(
                         s,
                         Mode::Str,
                         &mut #[inline(always)]
@@ -109,7 +108,7 @@ impl LitKind {
             token::ByteStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_literal(s, Mode::ByteStr, &mut |_, c| match c {
+                unescape_unicode(s, Mode::ByteStr, &mut |_, c| match c {
                     Ok(c) => buf.push(byte_from_char(c)),
                     Err(err) => {
                         assert!(!err.is_fatal(), "failed to unescape string literal")
@@ -126,7 +125,7 @@ impl LitKind {
             token::CStr => {
                 let s = symbol.as_str();
                 let mut buf = Vec::with_capacity(s.len());
-                unescape_c_string(s, Mode::CStr, &mut |_span, c| match c {
+                unescape_mixed(s, Mode::CStr, &mut |_span, c| match c {
                     Ok(MixedUnit::Char(c)) => {
                         buf.extend_from_slice(c.encode_utf8(&mut [0; 4]).as_bytes())
                     }
diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index e4cf7439b97da..4da6d35727cb5 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -80,12 +80,12 @@ impl EscapeError {
     }
 }
 
-/// Takes a contents of a literal (without quotes) and produces a sequence of
-/// escaped characters or errors.
+/// Takes the contents of a unicode-only (non-mixed-utf8) literal (without
+/// quotes) and produces a sequence of escaped characters or errors.
 ///
 /// Values are returned by invoking `callback`. For `Char` and `Byte` modes,
 /// the callback will be called exactly once.
-pub fn unescape_literal<F>(src: &str, mode: Mode, callback: &mut F)
+pub fn unescape_unicode<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<char, EscapeError>),
 {
@@ -132,7 +132,11 @@ impl From<u8> for MixedUnit {
     }
 }
 
-pub fn unescape_c_string<F>(src: &str, mode: Mode, callback: &mut F)
+/// Takes the contents of a mixed-utf8 literal (without quotes) and produces
+/// a sequence of escaped characters or errors.
+///
+/// Values are returned by invoking `callback`.
+pub fn unescape_mixed<F>(src: &str, mode: Mode, callback: &mut F)
 where
     F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 {
diff --git a/compiler/rustc_lexer/src/unescape/tests.rs b/compiler/rustc_lexer/src/unescape/tests.rs
index 1c25b03fdb22e..5b99495f47581 100644
--- a/compiler/rustc_lexer/src/unescape/tests.rs
+++ b/compiler/rustc_lexer/src/unescape/tests.rs
@@ -100,7 +100,7 @@ fn test_unescape_char_good() {
 fn test_unescape_str_warn() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
+        unescape_unicode(literal, Mode::Str, &mut |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -124,7 +124,7 @@ fn test_unescape_str_warn() {
 fn test_unescape_str_good() {
     fn check(literal_text: &str, expected: &str) {
         let mut buf = Ok(String::with_capacity(literal_text.len()));
-        unescape_literal(literal_text, Mode::Str, &mut |range, c| {
+        unescape_unicode(literal_text, Mode::Str, &mut |range, c| {
             if let Ok(b) = &mut buf {
                 match c {
                     Ok(c) => b.push(c),
@@ -241,7 +241,7 @@ fn test_unescape_byte_good() {
 fn test_unescape_byte_str_good() {
     fn check(literal_text: &str, expected: &[u8]) {
         let mut buf = Ok(Vec::with_capacity(literal_text.len()));
-        unescape_literal(literal_text, Mode::ByteStr, &mut |range, c| {
+        unescape_unicode(literal_text, Mode::ByteStr, &mut |range, c| {
             if let Ok(b) = &mut buf {
                 match c {
                     Ok(c) => b.push(byte_from_char(c)),
@@ -264,7 +264,7 @@ fn test_unescape_byte_str_good() {
 fn test_unescape_raw_str() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
+        unescape_unicode(literal, Mode::RawStr, &mut |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
@@ -276,7 +276,7 @@ fn test_unescape_raw_str() {
 fn test_unescape_raw_byte_str() {
     fn check(literal: &str, expected: &[(Range<usize>, Result<char, EscapeError>)]) {
         let mut unescaped = Vec::with_capacity(literal.len());
-        unescape_literal(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
+        unescape_unicode(literal, Mode::RawByteStr, &mut |range, res| unescaped.push((range, res)));
         assert_eq!(unescaped, expected);
     }
 
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index d7ecf577ed676..a491d1969bd5c 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -400,7 +400,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                         .with_code(error_code!(E0762))
                         .emit()
                 }
-                self.cook_quoted(token::Char, Mode::Char, start, end, 1, 1) // ' '
+                self.cook_unicode(token::Char, Mode::Char, start, end, 1, 1) // ' '
             }
             rustc_lexer::LiteralKind::Byte { terminated } => {
                 if !terminated {
@@ -412,7 +412,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                         .with_code(error_code!(E0763))
                         .emit()
                 }
-                self.cook_quoted(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
+                self.cook_unicode(token::Byte, Mode::Byte, start, end, 2, 1) // b' '
             }
             rustc_lexer::LiteralKind::Str { terminated } => {
                 if !terminated {
@@ -424,7 +424,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                         .with_code(error_code!(E0765))
                         .emit()
                 }
-                self.cook_quoted(token::Str, Mode::Str, start, end, 1, 1) // " "
+                self.cook_unicode(token::Str, Mode::Str, start, end, 1, 1) // " "
             }
             rustc_lexer::LiteralKind::ByteStr { terminated } => {
                 if !terminated {
@@ -436,7 +436,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                         .with_code(error_code!(E0766))
                         .emit()
                 }
-                self.cook_quoted(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
+                self.cook_unicode(token::ByteStr, Mode::ByteStr, start, end, 2, 1) // b" "
             }
             rustc_lexer::LiteralKind::CStr { terminated } => {
                 if !terminated {
@@ -448,13 +448,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                         .with_code(error_code!(E0767))
                         .emit()
                 }
-                self.cook_c_string(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
+                self.cook_mixed(token::CStr, Mode::CStr, start, end, 2, 1) // c" "
             }
             rustc_lexer::LiteralKind::RawStr { n_hashes } => {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::StrRaw(n_hashes);
-                    self.cook_quoted(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
+                    self.cook_unicode(kind, Mode::RawStr, start, end, 2 + n, 1 + n) // r##" "##
                 } else {
                     self.report_raw_str_error(start, 1);
                 }
@@ -463,7 +463,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::ByteStrRaw(n_hashes);
-                    self.cook_quoted(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
+                    self.cook_unicode(kind, Mode::RawByteStr, start, end, 3 + n, 1 + n) // br##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -472,7 +472,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::CStrRaw(n_hashes);
-                    self.cook_c_string(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
+                    self.cook_mixed(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }
@@ -735,7 +735,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
         }
     }
 
-    fn cook_quoted(
+    fn cook_unicode(
         &self,
         kind: token::LitKind,
         mode: Mode,
@@ -745,13 +745,13 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
         postfix_len: u32,
     ) -> (token::LitKind, Symbol) {
         self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape::unescape_literal(src, mode, &mut |span, result| {
+            unescape::unescape_unicode(src, mode, &mut |span, result| {
                 callback(span, result.map(drop))
             })
         })
     }
 
-    fn cook_c_string(
+    fn cook_mixed(
         &self,
         kind: token::LitKind,
         mode: Mode,
@@ -761,7 +761,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
         postfix_len: u32,
     ) -> (token::LitKind, Symbol) {
         self.cook_common(kind, mode, start, end, prefix_len, postfix_len, |src, mode, callback| {
-            unescape::unescape_c_string(src, mode, &mut |span, result| {
+            unescape::unescape_mixed(src, mode, &mut |span, result| {
                 callback(span, result.map(drop))
             })
         })
diff --git a/compiler/rustc_parse_format/src/lib.rs b/compiler/rustc_parse_format/src/lib.rs
index 625764876a6b6..d76ee161da6fd 100644
--- a/compiler/rustc_parse_format/src/lib.rs
+++ b/compiler/rustc_parse_format/src/lib.rs
@@ -1056,7 +1056,7 @@ fn find_width_map_from_snippet(
 fn unescape_string(string: &str) -> Option<string::String> {
     let mut buf = string::String::new();
     let mut ok = true;
-    unescape::unescape_literal(string, unescape::Mode::Str, &mut |_, unescaped_char| {
+    unescape::unescape_unicode(string, unescape::Mode::Str, &mut |_, unescaped_char| {
         match unescaped_char {
             Ok(c) => buf.push(c),
             Err(_) => ok = false,
diff --git a/src/tools/clippy/clippy_dev/src/update_lints.rs b/src/tools/clippy/clippy_dev/src/update_lints.rs
index 6b76a44debff7..f598f5d3d50f8 100644
--- a/src/tools/clippy/clippy_dev/src/update_lints.rs
+++ b/src/tools/clippy/clippy_dev/src/update_lints.rs
@@ -928,7 +928,7 @@ fn remove_line_splices(s: &str) -> String {
         .and_then(|s| s.strip_suffix('"'))
         .unwrap_or_else(|| panic!("expected quoted string, found `{s}`"));
     let mut res = String::with_capacity(s.len());
-    unescape::unescape_literal(s, unescape::Mode::Str, &mut |range, ch| {
+    unescape::unescape_unicode(s, unescape::Mode::Str, &mut |range, ch| {
         if ch.is_ok() {
             res.push_str(&s[range]);
         }
diff --git a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
index aa25f82ae1d8d..bf1feb9a7eb07 100644
--- a/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
+++ b/src/tools/rust-analyzer/crates/parser/src/lexed_str.rs
@@ -379,14 +379,14 @@ fn unescape_string_error_message(text: &str, mode: Mode) -> &'static str {
     let mut error_message = "";
     match mode {
         Mode::CStr => {
-            rustc_lexer::unescape::unescape_c_string(text, mode, &mut |_, res| {
+            rustc_lexer::unescape::unescape_mixed(text, mode, &mut |_, res| {
                 if let Err(e) = res {
                     error_message = error_to_diagnostic_message(e, mode);
                 }
             });
         }
         Mode::ByteStr | Mode::Str => {
-            rustc_lexer::unescape::unescape_literal(text, mode, &mut |_, res| {
+            rustc_lexer::unescape::unescape_unicode(text, mode, &mut |_, res| {
                 if let Err(e) = res {
                     error_message = error_to_diagnostic_message(e, mode);
                 }
diff --git a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
index 2f75e9677ec89..7cd1f1550b988 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/ast/token_ext.rs
@@ -6,7 +6,7 @@ use std::{
 };
 
 use rustc_lexer::unescape::{
-    unescape_byte, unescape_c_string, unescape_char, unescape_literal, MixedUnit, Mode,
+    unescape_byte, unescape_char, unescape_mixed, unescape_unicode, MixedUnit, Mode,
 };
 
 use crate::{
@@ -193,7 +193,7 @@ pub trait IsString: AstToken {
         let text = &self.text()[text_range_no_quotes - start];
         let offset = text_range_no_quotes.start() - start;
 
-        unescape_literal(text, Self::MODE, &mut |range, unescaped_char| {
+        unescape_unicode(text, Self::MODE, &mut |range, unescaped_char| {
             let text_range =
                 TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
             cb(text_range + offset, unescaped_char);
@@ -226,7 +226,7 @@ impl ast::String {
         let mut buf = String::new();
         let mut prev_end = 0;
         let mut has_error = false;
-        unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match (
+        unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
             unescaped_char,
             buf.capacity() == 0,
         ) {
@@ -270,7 +270,7 @@ impl ast::ByteString {
         let mut buf: Vec<u8> = Vec::new();
         let mut prev_end = 0;
         let mut has_error = false;
-        unescape_literal(text, Self::MODE, &mut |char_range, unescaped_char| match (
+        unescape_unicode(text, Self::MODE, &mut |char_range, unescaped_char| match (
             unescaped_char,
             buf.capacity() == 0,
         ) {
@@ -311,7 +311,7 @@ impl IsString for ast::CString {
         let text = &self.text()[text_range_no_quotes - start];
         let offset = text_range_no_quotes.start() - start;
 
-        unescape_c_string(text, Self::MODE, &mut |range, unescaped_char| {
+        unescape_mixed(text, Self::MODE, &mut |range, unescaped_char| {
             let text_range =
                 TextRange::new(range.start.try_into().unwrap(), range.end.try_into().unwrap());
             // XXX: This method should only be used for highlighting ranges. The unescaped
@@ -340,7 +340,7 @@ impl ast::CString {
             MixedUnit::Char(c) => buf.extend(c.encode_utf8(&mut [0; 4]).as_bytes()),
             MixedUnit::HighByte(b) => buf.push(b),
         };
-        unescape_c_string(text, Self::MODE, &mut |char_range, unescaped| match (
+        unescape_mixed(text, Self::MODE, &mut |char_range, unescaped| match (
             unescaped,
             buf.capacity() == 0,
         ) {
diff --git a/src/tools/rust-analyzer/crates/syntax/src/validation.rs b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
index fadcbaef14330..5c5b26f525f66 100644
--- a/src/tools/rust-analyzer/crates/syntax/src/validation.rs
+++ b/src/tools/rust-analyzer/crates/syntax/src/validation.rs
@@ -5,7 +5,7 @@
 mod block;
 
 use rowan::Direction;
-use rustc_lexer::unescape::{self, unescape_c_string, unescape_literal, Mode};
+use rustc_lexer::unescape::{self, unescape_mixed, unescape_unicode, Mode};
 
 use crate::{
     algo,
@@ -140,7 +140,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::String(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 1, '"') {
-                    unescape_literal(without_quotes, Mode::Str, &mut |range, char| {
+                    unescape_unicode(without_quotes, Mode::Str, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -151,7 +151,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::ByteString(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 2, '"') {
-                    unescape_literal(without_quotes, Mode::ByteStr, &mut |range, char| {
+                    unescape_unicode(without_quotes, Mode::ByteStr, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -162,7 +162,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         ast::LiteralKind::CString(s) => {
             if !s.is_raw() {
                 if let Some(without_quotes) = unquote(text, 2, '"') {
-                    unescape_c_string(without_quotes, Mode::CStr, &mut |range, char| {
+                    unescape_mixed(without_quotes, Mode::CStr, &mut |range, char| {
                         if let Err(err) = char {
                             push_err(1, range.start, err);
                         }
@@ -172,7 +172,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         }
         ast::LiteralKind::Char(_) => {
             if let Some(without_quotes) = unquote(text, 1, '\'') {
-                unescape_literal(without_quotes, Mode::Char, &mut |range, char| {
+                unescape_unicode(without_quotes, Mode::Char, &mut |range, char| {
                     if let Err(err) = char {
                         push_err(1, range.start, err);
                     }
@@ -181,7 +181,7 @@ fn validate_literal(literal: ast::Literal, acc: &mut Vec<SyntaxError>) {
         }
         ast::LiteralKind::Byte(_) => {
             if let Some(without_quotes) = unquote(text, 2, '\'') {
-                unescape_literal(without_quotes, Mode::Byte, &mut |range, char| {
+                unescape_unicode(without_quotes, Mode::Byte, &mut |range, char| {
                     if let Err(err) = char {
                         push_err(2, range.start, err);
                     }

From 6be2e5623cb7ae63ca1796759150c0cbc845bbcd Mon Sep 17 00:00:00 2001
From: Nicholas Nethercote <n.nethercote@gmail.com>
Date: Wed, 24 Jan 2024 16:00:10 +1100
Subject: [PATCH 7/7] Use `unescape_unicode` for raw C string literals.

They can't contain `\x` escapes, which means they can't contain high
bytes, which means we can used `unescape_unicode` instead of
`unescape_mixed` to unescape them. This avoids unnecessary used of
`MixedUnit`.
---
 compiler/rustc_lexer/src/unescape.rs  | 33 ++++++++++++---------------
 compiler/rustc_parse/src/lexer/mod.rs |  2 +-
 2 files changed, 15 insertions(+), 20 deletions(-)

diff --git a/compiler/rustc_lexer/src/unescape.rs b/compiler/rustc_lexer/src/unescape.rs
index 4da6d35727cb5..03d178eb266a4 100644
--- a/compiler/rustc_lexer/src/unescape.rs
+++ b/compiler/rustc_lexer/src/unescape.rs
@@ -97,7 +97,13 @@ where
         }
         Str | ByteStr => unescape_non_raw_common(src, mode, callback),
         RawStr | RawByteStr => check_raw_common(src, mode, callback),
-        CStr | RawCStr => unreachable!(),
+        RawCStr => check_raw_common(src, mode, &mut |r, mut result| {
+            if let Ok('\0') = result {
+                result = Err(EscapeError::NulInCStr);
+            }
+            callback(r, result)
+        }),
+        CStr => unreachable!(),
     }
 }
 
@@ -141,24 +147,13 @@ where
     F: FnMut(Range<usize>, Result<MixedUnit, EscapeError>),
 {
     match mode {
-        CStr => {
-            unescape_non_raw_common(src, mode, &mut |r, mut result| {
-                if let Ok(MixedUnit::Char('\0')) = result {
-                    result = Err(EscapeError::NulInCStr);
-                }
-                callback(r, result)
-            });
-        }
-        RawCStr => {
-            check_raw_common(src, mode, &mut |r, mut result| {
-                if let Ok('\0') = result {
-                    result = Err(EscapeError::NulInCStr);
-                }
-                // High bytes aren't possible in raw strings.
-                callback(r, result.map(MixedUnit::Char))
-            });
-        }
-        Char | Byte | Str | RawStr | ByteStr | RawByteStr => unreachable!(),
+        CStr => unescape_non_raw_common(src, mode, &mut |r, mut result| {
+            if let Ok(MixedUnit::Char('\0')) = result {
+                result = Err(EscapeError::NulInCStr);
+            }
+            callback(r, result)
+        }),
+        Char | Byte | Str | RawStr | ByteStr | RawByteStr | RawCStr => unreachable!(),
     }
 }
 
diff --git a/compiler/rustc_parse/src/lexer/mod.rs b/compiler/rustc_parse/src/lexer/mod.rs
index a491d1969bd5c..20ec4a300c1f8 100644
--- a/compiler/rustc_parse/src/lexer/mod.rs
+++ b/compiler/rustc_parse/src/lexer/mod.rs
@@ -472,7 +472,7 @@ impl<'sess, 'src> StringReader<'sess, 'src> {
                 if let Some(n_hashes) = n_hashes {
                     let n = u32::from(n_hashes);
                     let kind = token::CStrRaw(n_hashes);
-                    self.cook_mixed(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
+                    self.cook_unicode(kind, Mode::RawCStr, start, end, 3 + n, 1 + n) // cr##" "##
                 } else {
                     self.report_raw_str_error(start, 2);
                 }