From a9ad714b18b4daf494db66d148cc2d32e38ad664 Mon Sep 17 00:00:00 2001 From: jakobeha Date: Thu, 2 May 2024 01:06:11 -0400 Subject: [PATCH] fix `Lexer::clone` leak and UB + tests `Lexer::clone` shouldn't clone the inner `ManuallyDrop`, because doing so clones the inner value, which is moved out in `Lexer::next`. This causes use-after-free if the lexer is cloned after the last-returned token is dropped, especially if the token contains an overridden implementation of `Clone` (such as `Rc`) that tries to read the dropped data. It causes a memory leak if the token contains a heap-allocated value, because cloning makes a new allocation. This allocation is in the `ManuallyDrop` and it's guaranteed to be overridden before the call to `ManuallyDrop::take`, so it's never freed. Another thing: https://github.com/maciejhirsz/logos/issues/263 (make `Lexer` implement `Copy`) probably should be added (referencing here because it looks like the issue has been forgotten). --- src/lexer.rs | 2 +- tests/tests/clone.rs | 62 ++++++++++++++++++++++ tests/tests/string.rs | 118 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 181 insertions(+), 1 deletion(-) create mode 100644 tests/tests/clone.rs create mode 100644 tests/tests/string.rs diff --git a/src/lexer.rs b/src/lexer.rs index f5b12b2d..f8db4ce3 100644 --- a/src/lexer.rs +++ b/src/lexer.rs @@ -194,7 +194,7 @@ where fn clone(&self) -> Self { Lexer { extras: self.extras.clone(), - token: self.token.clone(), + token: ManuallyDrop::new(None), ..*self } } diff --git a/tests/tests/clone.rs b/tests/tests/clone.rs new file mode 100644 index 00000000..01a5fcc6 --- /dev/null +++ b/tests/tests/clone.rs @@ -0,0 +1,62 @@ +use std::cell::Cell; + +use logos::Logos as _; +use logos_derive::Logos; + +#[derive(Logos, Clone, Debug, PartialEq)] +pub enum Token { + #[token("a", |_| Evil::default())] + Evil(Evil), +} + +#[derive(Debug, Default, PartialEq)] +pub struct Evil(Box>); + +impl Clone for Evil { + fn clone(&self) -> Self { + self.0.set(self.0.get() + 1); + Self::default() + } +} + +#[test] +fn test_it_works_without_cloning() { + let mut lexer = Token::lexer("aaa"); + assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default())))); + assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default())))); + assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default())))); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_clone_ub() { + let mut lexer = Token::lexer("a"); + assert_eq!(lexer.next(), Some(Ok(Token::Evil(Evil::default())))); + + // In logos 0.14.0, this causes use-after-free (UB), + // because `Clone` dereferences the value returned by the last call to `lexer.next()`, + // which got deallocated. + // A real-life example where this could happen is with `Rc`. + // Note that it may still pass `cargo test`, it will fail with Miri. + let mut lexer2 = lexer.clone(); + + assert_eq!(lexer2.next(), None); +} + +#[test] +fn test_clone_leak() { + let mut lexer = Token::lexer("a"); + let Some(Ok(Token::Evil(evil))) = lexer.next() else { + panic!("Expected Token::Evil"); + }; + assert_eq!(evil.0.get(), 0); + + // In logos 0.14.0, this causes a memory leak because `evil` is cloned with `lexer`. + // This produces `evil.0.get() == 1`. It will fail even on `cargo test`. + let mut lexer2 = lexer.clone(); + assert_eq!(evil.0.get(), 0); + + assert_eq!(lexer2.next(), None); + let _ = evil.clone(); + assert_eq!(evil.0.get(), 1); +} diff --git a/tests/tests/string.rs b/tests/tests/string.rs new file mode 100644 index 00000000..53d82e87 --- /dev/null +++ b/tests/tests/string.rs @@ -0,0 +1,118 @@ +use logos::{Lexer, Logos as _}; +use logos_derive::Logos; + +#[derive(Logos, Clone, Debug, PartialEq)] +#[logos(skip " ")] +pub enum Token { + #[regex(r#""([^"\\]+|\\.)*""#, lex_single_line_string)] + String(String), +} + +#[test] +fn test_it_works_without_cloning() { + let mut lexer = Token::lexer(r#""Hello, world!" "foo😀bar\nbaz \x3F\u{1234}""#); + assert_eq!( + lexer.next(), + Some(Ok(Token::String("Hello, world!".to_string()))) + ); + assert_eq!( + lexer.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!(lexer.next(), None); +} + +#[test] +fn test_it_works_with_cloning() { + let mut lexer = Token::lexer(r#""Hello, world!" "foo😀bar\nbaz \x3F\u{1234}""#); + let mut lexer2 = lexer.clone(); + assert_eq!( + lexer2.next(), + Some(Ok(Token::String("Hello, world!".to_string()))) + ); + let mut lexer3 = lexer2.clone(); + let mut lexer4 = lexer3.clone(); + assert_eq!( + lexer3.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!( + lexer4.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!(lexer4.next(), None); + let mut lexer5 = lexer.clone(); + assert_eq!( + lexer5.next(), + Some(Ok(Token::String("Hello, world!".to_string()))) + ); + assert_eq!( + lexer5.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!( + lexer.next(), + Some(Ok(Token::String("Hello, world!".to_string()))) + ); + assert_eq!(lexer5.next(), None); + assert_eq!( + lexer2.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!(lexer2.next(), None); + assert_eq!(lexer3.next(), None); + assert_eq!( + lexer.next(), + Some(Ok(Token::String("foo😀bar\nbaz \x3F\u{1234}".to_string()))) + ); + assert_eq!(lexer.next(), None); +} + +// Not important +pub fn lex_single_line_string(lexer: &mut Lexer) -> Result { + let mut string = String::new(); + let mut chars = lexer.slice()[1..lexer.slice().len() - 1].chars(); + while let Some(c) = chars.next() { + match c { + '\\' => { + let c = chars.next().ok_or(())?; + match c { + '\n' => {} + 'n' => string.push('\n'), + 'r' => string.push('\r'), + 't' => string.push('\t'), + '0' => string.push('\0'), + '\'' | '"' | '\\' => string.push(c), + 'x' => { + let mut hex = String::new(); + hex.push(chars.next().ok_or(())?); + hex.push(chars.next().ok_or(())?); + let code = u8::from_str_radix(&hex, 16).map_err(|_| ())?; + if code > 0x7F { + return Err(()); + } + string.push(code as char); + } + 'u' => { + if chars.next() != Some('{') { + return Err(()); + } + let mut hex = String::new(); + for _ in 0..6 { + let c = chars.next().ok_or(())?; + if c == '}' { + break; + } + hex.push(c); + } + let code = u32::from_str_radix(&hex, 16).map_err(|_| ())?; + string.push(char::from_u32(code).ok_or(())?); + } + _ => return Err(()), + } + } + _ => string.push(c), + } + } + Ok(string) +}