From 295337d964c9aa7a0402ff5a0474b02a4b45b70b Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Fri, 15 May 2020 14:16:51 -0400 Subject: [PATCH 01/24] Added whitespace token and made -E keep spacing, TODO consume whitespace --- src/data/lex.rs | 4 ++++ src/lex/cpp.rs | 2 +- src/lex/mod.rs | 20 +++++++++++++++++--- src/main.rs | 4 ++-- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/data/lex.rs b/src/data/lex.rs index 813fc03d..b98498ab 100644 --- a/src/data/lex.rs +++ b/src/data/lex.rs @@ -190,6 +190,8 @@ pub enum Token { Literal(Literal), Id(InternedStr), + Whitespace(String), + // Misc Ellipsis, StructDeref, // -> @@ -353,6 +355,8 @@ impl std::fmt::Display for Token { Id(id) => write!(f, "{}", id), Keyword(k) => write!(f, "{}", k), + Whitespace(s) => write!(f, "{}", s), + Ellipsis => write!(f, "..."), StructDeref => write!(f, "->"), Hash => write!(f, "#"), diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 703a0a06..06cb25aa 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -320,7 +320,7 @@ impl<'a> PreProcessor<'a> { self.lexer().span(start) } #[inline] - fn consume_whitespace(&mut self) { + fn consume_whitespace(&mut self) -> String { self.lexer_mut().consume_whitespace() } #[inline] diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 7e1b9aa8..a8ef9460 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -187,12 +187,15 @@ impl Lexer { /// /// Before: b" // some comment\n /*multi comment*/hello " /// After: b"hello " - fn consume_whitespace(&mut self) { + fn consume_whitespace(&mut self) -> String { // there may be comments following whitespace + let mut whitespace = String::new(); loop { // whitespace while self.peek().map_or(false, |c| c.is_ascii_whitespace()) { - self.next_char(); + if let Some(c) = self.next_char() { + whitespace.push(c.into()); + } } // comments if self.peek() == Some(b'/') { @@ -211,6 +214,7 @@ impl Lexer { break; } } + whitespace } /// Remove all characters between now and the next b'\n' character. 
/// @@ -664,7 +668,17 @@ impl Iterator for Lexer { return None; } - self.consume_whitespace(); + { + let span_start = self.location.offset; + let data = self.consume_whitespace(); + if !data.is_empty() { + return Some(Ok(Locatable { + data: Token::Whitespace(data), + location: self.span(span_start), + })); + } + } + let c = self.next_char().and_then(|c| { let span_start = self.location.offset - 1; // this giant switch is most of the logic diff --git a/src/main.rs b/src/main.rs index 00b79891..6ade8767 100644 --- a/src/main.rs +++ b/src/main.rs @@ -46,7 +46,7 @@ FLAGS: Note that preprocessing discards whitespace and comments. There is not currently a way to disable this behavior. -V, --version Prints version information - + OPTIONS: --color When to use color. May be \"never\", \"auto\", or \"always\". [default: auto] -o, --output The output file to use. [default: a.out] @@ -134,7 +134,7 @@ fn real_main(buf: Rc, bin_opt: BinOpt, output: &Path) -> Result<(), (Error, let stdout = io::stdout(); let mut stdout_buf = BufWriter::new(stdout.lock()); for token in rcc_try!(tokens, files) { - write!(stdout_buf, "{} ", token.data).expect("failed to write to stdout"); + write!(stdout_buf, "{}", token.data).expect("failed to write to stdout"); } writeln!(stdout_buf).expect("failed to write to stdout"); From 1514108236ec17507ecf7ebb646087f49e41483e Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Fri, 15 May 2020 15:57:45 -0400 Subject: [PATCH 02/24] Filtered whitespace before parser, TODO debug a lot of test cases --- src/lex/cpp.rs | 14 +++++++++++++- src/lex/mod.rs | 10 ++++++++++ src/lib.rs | 2 +- src/parse/mod.rs | 1 + 4 files changed, 25 insertions(+), 2 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 06cb25aa..f9b722e6 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1285,6 +1285,16 @@ impl<'a> PreProcessor<'a> { } } } + + /// Returns next token in stream which is not whitespace + pub fn next_non_whitespace(&mut self) -> Option> { + loop { + match 
self.next() { + Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, + other => break other, + } + } + } } #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -1407,7 +1417,7 @@ mod tests { macro_rules! assert_err { ($src: expr, $err: pat, $description: expr $(,)?) => { - match cpp($src).next().unwrap().unwrap_err().data { + match cpp($src).next_non_whitespace().unwrap().unwrap_err().data { Error::PreProcessor($err) => {} Error::PreProcessor(other) => panic!("expected {}, got {}", $description, other), _ => panic!("expected cpp err"), @@ -1427,9 +1437,11 @@ mod tests { assert_eq!( cpp(src) .map(|res| res.map(|token| token.data)) + .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) .collect::>(), cpp(cpp_src) .map(|res| res.map(|token| token.data)) + .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) .collect::>(), "{} is not the same as {}", src, diff --git a/src/lex/mod.rs b/src/lex/mod.rs index a8ef9460..e4b8a528 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -648,6 +648,16 @@ impl Lexer { } Ok(Token::Id(InternedStr::get_or_intern(id))) } + + /// Returns next token in stream which is not whitespace + pub fn next_non_whitespace(&mut self) -> Option>> { + loop { + match self.next() { + Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, + other => break other, + } + } + } } impl Iterator for Lexer { diff --git a/src/lib.rs b/src/lib.rs index 884a242b..dd4405da 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -240,7 +240,7 @@ pub fn check_semantics(buf: &str, opt: Opt) -> Program break Some(token), Some(Err(err)) => handle_err!(err), None => break None, diff --git a/src/parse/mod.rs b/src/parse/mod.rs index b74fac2d..4dc4e7ca 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -141,6 +141,7 @@ impl Parser { fn __impl_next_token(&mut self) -> Option> { loop { match self.tokens.next() { + Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, Some(Ok(mut token)) => { self.last_location = 
token.location; // This is _such_ a hack From 33f9f1242b7da6daee6440f46010ebf35bab9eaa Mon Sep 17 00:00:00 2001 From: Joshua Nelson Date: Fri, 15 May 2020 19:11:03 -0400 Subject: [PATCH 03/24] Fix some bugs in whitespace handling - Don't set `seen_line_token` for whitespace - Ignore whitespace in `#if` expressions (since the parser doesn't know what whitespace is) - Run `cargo fmt` --- src/lex/cpp.rs | 9 ++++++++- src/lex/mod.rs | 8 ++++++-- src/parse/mod.rs | 5 ++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index f9b722e6..95d4f1fd 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -927,6 +927,10 @@ impl<'a> PreProcessor<'a> { Ok(token) } }, + Ok(Locatable { + data: Token::Whitespace(_), + .. + }) => continue, _ => token, }; cpp_tokens.push(token); @@ -1290,7 +1294,10 @@ impl<'a> PreProcessor<'a> { pub fn next_non_whitespace(&mut self) -> Option> { loop { match self.next() { - Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, + Some(Ok(Locatable { + data: Token::Whitespace(_), + location: _, + })) => continue, other => break other, } } diff --git a/src/lex/mod.rs b/src/lex/mod.rs index e4b8a528..5379ee8e 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -653,7 +653,10 @@ impl Lexer { pub fn next_non_whitespace(&mut self) -> Option>> { loop { match self.next() { - Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, + Some(Ok(Locatable { + data: Token::Whitespace(_), + location: _, + })) => continue, other => break other, } } @@ -885,7 +888,8 @@ impl Iterator for Lexer { .with(LexError::UnknownToken(x as char)))); } }; - self.seen_line_token |= data != Token::Hash; + // We've seen a token if this isn't # and this isn't whitespace + self.seen_line_token |= data != Token::Hash && !matches!(data, Token::Whitespace(_)); Some(Ok(Locatable { data, location: self.span(span_start), diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 4dc4e7ca..54b64592 100644 --- 
a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -141,7 +141,10 @@ impl Parser { fn __impl_next_token(&mut self) -> Option> { loop { match self.tokens.next() { - Some(Ok(Locatable {data: Token::Whitespace(_), location: _})) => continue, + Some(Ok(Locatable { + data: Token::Whitespace(_), + location: _, + })) => continue, Some(Ok(mut token)) => { self.last_location = token.location; // This is _such_ a hack From 30be8d9df3ef62d040a68c541090331ca522f847 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 00:46:22 -0400 Subject: [PATCH 04/24] Add a oneline whitespace consumtion after #ifdef, #ifndef, #undef and update tests to reflect non-whitespace --- src/lex/cpp.rs | 37 +++++++++++++++++++++++++------------ 1 file changed, 25 insertions(+), 12 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 38d6e48e..e2139431 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -480,10 +480,12 @@ impl<'a> PreProcessor<'a> { self.if_directive(condition, start) } IfNDef => { + self.consume_whitespace_oneline(start, CppError::EmptyExpression)?; let name = self.expect_id()?; self.if_directive(!self.definitions.contains_key(&name.data), start) } IfDef => { + self.consume_whitespace_oneline(start, CppError::EmptyExpression)?; let name = self.expect_id()?; self.if_directive(self.definitions.contains_key(&name.data), start) } @@ -530,6 +532,7 @@ impl<'a> PreProcessor<'a> { } Define => self.define(start), Undef => { + self.consume_whitespace_oneline(start, CppError::EmptyExpression)?; let name = self.expect_id()?; self.definitions.remove(&name.data); Ok(()) @@ -886,11 +889,7 @@ impl<'a> PreProcessor<'a> { .collect::, Locatable>>() }; - let line = self.line(); - self.file_processor.consume_whitespace(); - if self.line() != line { - return Err(self.span(start).error(CppError::EmptyDefine)); - } + self.consume_whitespace_oneline(start, CppError::EmptyDefine)?; let id = self.expect_id()?; // NOTE: does _not_ discard whitespace if self.lexer_mut().match_next(b'(') { @@ 
-1090,6 +1089,20 @@ impl<'a> PreProcessor<'a> { } } } + + /// Consumes whitespace but returns error if it includes a newline + fn consume_whitespace_oneline( + &mut self, + start: u32, + error: CppError, + ) -> Result { + let line = self.line(); + let ret = self.lexer_mut().consume_whitespace(); + if self.line() != line { + return Err(self.span(start).error(error)); + } + Ok(ret) + } } #[derive(Copy, Clone, Debug, PartialEq, Eq)] @@ -1285,12 +1298,12 @@ mod tests { let code = "#ifdef a whatever, doesn't matter #endif"; - assert_eq!(cpp(code).next(), None); + assert_eq!(cpp(code).next_non_whitespace(), None); let code = "#ifdef a\n#endif"; - assert_eq!(cpp(code).next(), None); + assert_eq!(cpp(code).next_non_whitespace(), None); - assert!(cpp("#ifdef").next().unwrap().is_err()); + assert!(cpp("#ifdef").next_non_whitespace().unwrap().is_err()); let nested = "#ifdef a #ifdef b @@ -1299,14 +1312,14 @@ mod tests { #endif char;"; assert_eq!( - cpp(nested).next().unwrap().unwrap().data, + cpp(nested).next_non_whitespace().unwrap().unwrap().data, Token::Keyword(Keyword::Char) ); - assert!(cpp("#endif").next().unwrap().is_err()); + assert!(cpp("#endif").next_non_whitespace().unwrap().is_err()); let same_line = "#ifdef a #endif\nint main() {}"; - assert!(cpp(same_line).next().unwrap().is_err()); + assert!(cpp(same_line).next_non_whitespace().unwrap().is_err()); } #[test] fn ifndef() { @@ -1315,7 +1328,7 @@ mod tests { #define A #endif A"; - assert!(cpp(src).next().is_none()); + assert!(cpp(src).next_non_whitespace().is_none()); } #[test] fn object_macros() { From 60db031d6e8d21f474f10fdc66817ed511e2d308 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 01:04:46 -0400 Subject: [PATCH 05/24] Consume whitespace between function macro args --- src/lex/cpp.rs | 5 +++-- src/lex/files.rs | 13 +++++++++++++ 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index e2139431..47d5fa83 100644 --- a/src/lex/cpp.rs +++ 
b/src/lex/cpp.rs @@ -821,7 +821,7 @@ impl<'a> PreProcessor<'a> { fn fn_args(&mut self, start: u32) -> Result, Locatable> { let mut arguments = Vec::new(); loop { - match self.file_processor.next() { + match self.file_processor.next_non_whitespace() { None => { return Err(CompileError::new( CppError::EndOfFile("identifier or ')'").into(), @@ -1091,13 +1091,14 @@ impl<'a> PreProcessor<'a> { } /// Consumes whitespace but returns error if it includes a newline + #[inline] fn consume_whitespace_oneline( &mut self, start: u32, error: CppError, ) -> Result { let line = self.line(); - let ret = self.lexer_mut().consume_whitespace(); + let ret = self.file_processor.consume_whitespace(); if self.line() != line { return Err(self.span(start).error(error)); } diff --git a/src/lex/files.rs b/src/lex/files.rs index 03a0133c..c79edd8f 100644 --- a/src/lex/files.rs +++ b/src/lex/files.rs @@ -159,4 +159,17 @@ impl FileProcessor { } tokens } + + /// Returns next token in stream which is not whitespace + pub(super) fn next_non_whitespace(&mut self) -> Option>> { + loop { + match self.next() { + Some(Ok(Locatable { + data: Token::Whitespace(_), + location: _, + })) => continue, + other => break other, + } + } + } } From 30eac7f6f355cc2e2e01dbb62b60f7dc8a2307eb Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 02:22:27 -0400 Subject: [PATCH 06/24] Fixed most of lex::tests::* --- src/lex/cpp.rs | 29 ++++++++++++++++------------- src/lex/tests.rs | 12 +++++++++++- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 47d5fa83..c475a0c2 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1242,16 +1242,20 @@ mod tests { _ => panic!("not a keyword: {:?}", token), } } + fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool { + let xs = xs + .map(|res| res.map(|token| token.data)) + .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) + .collect::>(); + let ys = ys + .map(|res| res.map(|token| token.data)) + 
.filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) + .collect::>(); + xs == ys + } fn assert_same(src: &str, cpp_src: &str) { - assert_eq!( - cpp(src) - .map(|res| res.map(|token| token.data)) - .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) - .collect::>(), - cpp(cpp_src) - .map(|res| res.map(|token| token.data)) - .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) - .collect::>(), + assert!( + is_same_preprocessed(cpp(src), cpp(cpp_src)), "{} is not the same as {}", src, cpp_src, @@ -1508,15 +1512,14 @@ c } #[test] fn test_comment_newline() { - let tokens: Vec<_> = cpp_no_newline( + let tokens = cpp_no_newline( " #if 1 // int main() {} #endif ", - ) - .collect(); - assert_eq!(tokens, cpp("int main() {}").collect::>()); + ); + assert!(is_same_preprocessed(tokens, cpp("int main() {}"))); assert_same( " #if 1 /**//**/ diff --git a/src/lex/tests.rs b/src/lex/tests.rs index b6839480..1feb9199 100644 --- a/src/lex/tests.rs +++ b/src/lex/tests.rs @@ -15,7 +15,17 @@ fn lex(input: &str) -> Option { lexed.pop() } fn lex_all(input: &str) -> Vec { - cpp(input).collect() + cpp(input) + .filter(|res| { + !matches!( + res, + Ok(Locatable { + data: Token::Whitespace(_), + .. + }) + ) + }) + .collect() } fn match_data(lexed: Option, closure: T) -> bool From f0c146ef057085442919c5e5fd40ee8b5cb11df4 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 02:33:10 -0400 Subject: [PATCH 07/24] Handle lex::tests::test_no_newline --- src/lex/tests.rs | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/src/lex/tests.rs b/src/lex/tests.rs index 1feb9199..14f984ec 100644 --- a/src/lex/tests.rs +++ b/src/lex/tests.rs @@ -16,18 +16,20 @@ fn lex(input: &str) -> Option { } fn lex_all(input: &str) -> Vec { cpp(input) - .filter(|res| { - !matches!( - res, - Ok(Locatable { - data: Token::Whitespace(_), - .. 
- }) - ) - }) + .filter(is_not_whitespace) .collect() } +pub(crate) fn is_not_whitespace(res: &LexType) -> bool { + !matches!( + res, + Ok(Locatable { + data: Token::Whitespace(_), + .. + }) + ) +} + fn match_data(lexed: Option, closure: T) -> bool where T: FnOnce(Result<&Token, &str>) -> bool, @@ -305,7 +307,7 @@ fn test_strings() { #[test] fn test_no_newline() { assert!(cpp_no_newline("").next().is_none()); - let mut tokens: Vec<_> = cpp_no_newline(" ").collect(); + let mut tokens: Vec<_> = cpp_no_newline(" ").filter(is_not_whitespace).collect(); assert_eq!(tokens.len(), 1); assert!(tokens.remove(0).unwrap_err().is_lex_err()); From 8e0509e45ef518c0c3b2583b92de0cc99aa0e096 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 02:50:07 -0400 Subject: [PATCH 08/24] Changed analyze::test::lol to not have leading newline TODO should parse_all remove leading newlines? --- src/analyze/mod.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/analyze/mod.rs b/src/analyze/mod.rs index ec7d7f64..b0d8f159 100644 --- a/src/analyze/mod.rs +++ b/src/analyze/mod.rs @@ -1925,8 +1925,7 @@ pub(crate) mod test { } #[test] fn lol() { - let lol = " -int *jynelson(int(*fp)(int)) { + let lol = "int *jynelson(int(*fp)(int)) { return 0; } int f(int i) { From 01f040b6b0054bd0d8a6094efb770a079fa3b794 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 02:56:04 -0400 Subject: [PATCH 09/24] Added whitespace between hash and directive --- src/lex/cpp.rs | 2 +- src/lex/tests.rs | 4 +--- 2 files changed, 2 insertions(+), 4 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index c475a0c2..6b88ee12 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -427,7 +427,7 @@ impl<'a> PreProcessor<'a> { }; Some(if is_hash && !self.file_processor.seen_line_token() { let line = self.file_processor.line(); - match self.file_processor.next()? { + match self.file_processor.next_non_whitespace()? 
{ Ok(Locatable { data: Token::Id(id), location, diff --git a/src/lex/tests.rs b/src/lex/tests.rs index 14f984ec..5e067032 100644 --- a/src/lex/tests.rs +++ b/src/lex/tests.rs @@ -15,9 +15,7 @@ fn lex(input: &str) -> Option { lexed.pop() } fn lex_all(input: &str) -> Vec { - cpp(input) - .filter(is_not_whitespace) - .collect() + cpp(input).filter(is_not_whitespace).collect() } pub(crate) fn is_not_whitespace(res: &LexType) -> bool { From d51af7e4b50b918d98bdc3f637f23b5249ac7f32 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 03:19:54 -0400 Subject: [PATCH 10/24] Remove trailing newline for -E --- src/main.rs | 1 - 1 file changed, 1 deletion(-) diff --git a/src/main.rs b/src/main.rs index 6ade8767..8bb6815a 100644 --- a/src/main.rs +++ b/src/main.rs @@ -136,7 +136,6 @@ fn real_main(buf: Rc, bin_opt: BinOpt, output: &Path) -> Result<(), (Error, for token in rcc_try!(tokens, files) { write!(stdout_buf, "{}", token.data).expect("failed to write to stdout"); } - writeln!(stdout_buf).expect("failed to write to stdout"); return Ok(()); } else { From f671f65e898b2a537f56d2396dd347efe21a48c0 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 20:01:40 -0400 Subject: [PATCH 11/24] Handle spaces in defines and whitespace in comments --- src/lex/cpp.rs | 9 +++++++++ src/lex/mod.rs | 23 ++++++++++++++++++----- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 6b88ee12..0410da29 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -886,6 +886,15 @@ impl<'a> PreProcessor<'a> { this.tokens_until_newline() .into_iter() .map(|res| res.map(|loc| loc.data)) + .enumerate() + .map(|(i, x)| { + if i == 0 { + vec![x] + } else { + vec![x, Ok(Token::Whitespace(String::from(" ")))] + } + }) + .flatten() .collect::, Locatable>>() }; diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 3b73f0af..bf1554e7 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -204,12 +204,16 @@ impl Lexer { // comments if self.peek() 
== Some(b'/') { match self.peek_next() { - Some(b'/') => self.consume_line_comment(), + Some(b'/') => { + self.consume_line_comment(); + whitespace.push('\n'); + } Some(b'*') => { self.next_char(); self.next_char(); - if let Err(err) = self.consume_multi_comment() { - self.error_handler.push_back(err); + match self.consume_multi_comment() { + Ok(ws) => whitespace.push_str(&ws), + Err(err) => self.error_handler.push_back(err), } } _ => break, @@ -236,12 +240,21 @@ impl Lexer { /// /// Before: u8s{"hello this is a lot of text */ int main(){}"} /// After: chars{" int main(){}"} - fn consume_multi_comment(&mut self) -> LexResult<()> { + /// + /// Return newlines occupied by the comment or a space if no newlines + fn consume_multi_comment(&mut self) -> LexResult { + let mut whitespace = String::new(); let start = self.location.offset - 2; while let Some(c) = self.next_char() { if c == b'*' && self.peek() == Some(b'/') { self.next_char(); - return Ok(()); + if whitespace.is_empty() { + whitespace.push(' '); // For the case `a/* */b` + } + return Ok(whitespace); + } + if c == b'\n' { + whitespace.push(c.into()); } } Err(Locatable { From 36b7fc3d9a59fda6e029478198cda95d4bc4c40e Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sat, 23 May 2020 21:10:58 -0400 Subject: [PATCH 12/24] Handle lex::tests::test_no_newline (again) --- src/lex/tests.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/lex/tests.rs b/src/lex/tests.rs index 5e067032..2480b7a1 100644 --- a/src/lex/tests.rs +++ b/src/lex/tests.rs @@ -310,7 +310,7 @@ fn test_no_newline() { assert!(tokens.remove(0).unwrap_err().is_lex_err()); // regression test for https://github.com/jyn514/rcc/issues/323 - let tokens: Vec<_> = cpp_no_newline("//").collect(); + let tokens: Vec<_> = cpp_no_newline("//").filter(is_not_whitespace).collect(); assert_eq!(tokens.len(), 1); assert!(tokens[0].as_ref().unwrap_err().is_lex_err()); } From 675989b8e52c043b707640d40f62d90843c16885 Mon Sep 17 00:00:00 2001 From: 
Hunter Damron Date: Sun, 24 May 2020 00:03:10 -0400 Subject: [PATCH 13/24] Fix error messages for macros and #ifdef --- src/data/error.rs | 6 ++++++ src/lex/cpp.rs | 9 ++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/src/data/error.rs b/src/data/error.rs index 9bf7d64e..a5a33867 100644 --- a/src/data/error.rs +++ b/src/data/error.rs @@ -455,6 +455,12 @@ pub enum CppError { #[error("expected expression for #if")] EmptyExpression, + #[error("macro name missing")] + ExpectedMacroId, + + #[error("missing {0} in {1}")] + Expected(&'static str, &'static str), + /// A `#define` occured without an identifier following. #[error("macro name missing")] EmptyDefine, diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 0410da29..5b6aeb84 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -480,12 +480,12 @@ impl<'a> PreProcessor<'a> { self.if_directive(condition, start) } IfNDef => { - self.consume_whitespace_oneline(start, CppError::EmptyExpression)?; + self.consume_whitespace_oneline(start, CppError::ExpectedMacroId)?; let name = self.expect_id()?; self.if_directive(!self.definitions.contains_key(&name.data), start) } IfDef => { - self.consume_whitespace_oneline(start, CppError::EmptyExpression)?; + self.consume_whitespace_oneline(start, CppError::ExpectedMacroId)?; let name = self.expect_id()?; self.if_directive(self.definitions.contains_key(&name.data), start) } @@ -907,7 +907,10 @@ impl<'a> PreProcessor<'a> { // # define identifier lparen identifier-listopt ) replacement-list new-line // # define identifier lparen ... ) replacement-list new-line // # define identifier lparen identifier-list , ... ) replacement-list new-line - self.lexer_mut().consume_whitespace(); + self.consume_whitespace_oneline( + self.file_processor.offset(), + CppError::Expected(")", "macro parameter list"), + )?; let params = if !self.lexer_mut().match_next(b')') { self.fn_args(start)? 
} else { From 1d5578fc6ab61cc2c99ed6bb9391af170a5779ca Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 00:07:28 -0400 Subject: [PATCH 14/24] Rework whitespace in tokens_until_newline --- src/lex/cpp.rs | 46 ++++++++++++++++++++++------------------------ src/lex/files.rs | 17 +++++++++++++++-- 2 files changed, 37 insertions(+), 26 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 5b6aeb84..77bb8442 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -410,8 +410,15 @@ impl<'a> PreProcessor<'a> { self.file_processor.line() } - fn tokens_until_newline(&mut self) -> Vec>> { - self.file_processor.tokens_until_newline() + fn tokens_until_newline(&mut self, whitespace: bool) -> Vec>> { + self.file_processor.tokens_until_newline(whitespace) + } + + fn is_not_whitespace(res: &CppResult) -> bool { + !matches!(res, Ok(Locatable { + data: Token::Whitespace(_), + .. + })) } /// If at the start of the line and we see `#directive`, return that directive. @@ -540,14 +547,14 @@ impl<'a> PreProcessor<'a> { Pragma => { self.error_handler .warn(WarningDiagnostic::IgnoredPragma, self.span(start)); - drop(self.tokens_until_newline()); + drop(self.tokens_until_newline(false)); Ok(()) } // NOTE: #warning is a non-standard extension, but is implemented // by most major compilers including clang and gcc. 
Warning => { let tokens: Vec<_> = self - .tokens_until_newline() + .tokens_until_newline(false) .into_iter() .map(|res| res.map(|l| l.data)) .collect::>()?; @@ -557,7 +564,7 @@ impl<'a> PreProcessor<'a> { } Error => { let tokens: Vec<_> = self - .tokens_until_newline() + .tokens_until_newline(false) .into_iter() .map(|res| res.map(|l| l.data)) .collect::>()?; @@ -570,7 +577,7 @@ impl<'a> PreProcessor<'a> { WarningDiagnostic::Generic("#line is not yet implemented".into()), self.span(start), ); - drop(self.tokens_until_newline()); + drop(self.tokens_until_newline(false)); Ok(()) } Include => self.include(start), @@ -580,7 +587,7 @@ impl<'a> PreProcessor<'a> { fn boolean_expr(&mut self) -> Result { let start = self.file_processor.offset(); let lex_tokens: Vec<_> = self - .tokens_until_newline() + .tokens_until_newline(false) .into_iter() .collect::>()?; let location = self.span(start); @@ -703,6 +710,7 @@ impl<'a> PreProcessor<'a> { .into_iter() .map(|t| replace(definitions, t.data, std::iter::empty(), t.location)) .flatten() + .filter(PreProcessor::is_not_whitespace) .map(|mut token| { if let Ok(tok) = &mut token { expr_location = Some(location.maybe_merge(expr_location)); @@ -883,18 +891,9 @@ impl<'a> PreProcessor<'a> { // `#define f (a) - object macro fn define(&mut self, start: u32) -> Result<(), Locatable> { let body = |this: &mut PreProcessor| { - this.tokens_until_newline() + this.tokens_until_newline(true) .into_iter() .map(|res| res.map(|loc| loc.data)) - .enumerate() - .map(|(i, x)| { - if i == 0 { - vec![x] - } else { - vec![x, Ok(Token::Whitespace(String::from(" ")))] - } - }) - .flatten() .collect::, Locatable>>() }; @@ -1254,16 +1253,15 @@ mod tests { _ => panic!("not a keyword: {:?}", token), } } + fn _is_not_whitespace(res: &CompileResult) -> bool { + !matches!(res, Ok(Token::Whitespace(_))) + } fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool { - let xs = xs - .map(|res| res.map(|token| token.data)) - .filter(|res| !matches!(res, 
Ok(Token::Whitespace(_)))) - .collect::>(); - let ys = ys + let to_vec = |xs: PreProcessor| xs .map(|res| res.map(|token| token.data)) - .filter(|res| !matches!(res, Ok(Token::Whitespace(_)))) + .filter(_is_not_whitespace) .collect::>(); - xs == ys + to_vec(xs) == to_vec(ys) } fn assert_same(src: &str, cpp_src: &str) { assert!( diff --git a/src/lex/files.rs b/src/lex/files.rs index c79edd8f..041917a9 100644 --- a/src/lex/files.rs +++ b/src/lex/files.rs @@ -138,13 +138,20 @@ impl FileProcessor { /// Return all tokens from the current position until the end of the current line. /// + /// * `whitespace` - whether or not to include whitespace tokens + /// /// Note that these are _tokens_ and not bytes, so if there are invalid tokens /// on the current line, this will return a lex error. - pub(super) fn tokens_until_newline(&mut self) -> Vec>> { + pub(super) fn tokens_until_newline( + &mut self, + whitespace: bool, + ) -> Vec>> { let mut tokens = Vec::new(); let line = self.line(); loop { - self.consume_whitespace(); + let ws_start = self.offset(); + let ws = self.consume_whitespace(); + let ws_span = self.span(ws_start); if self.line() != line { // lines should end with a newline, but in case they don't, don't crash assert!(!self.lexer().seen_line_token || self.lexer_mut().peek().is_none(), @@ -152,6 +159,12 @@ impl FileProcessor { self.lexer_mut().peek()); break; } + if whitespace && !ws.is_empty() { + tokens.push(Ok(Locatable { + data: Token::Whitespace(ws), // NOTE: in clang, this is one space + location: ws_span, + })); + } match self.next() { Some(token) => tokens.push(token), None => break, From c1aa84a90bdca994aa6ce2f8b899af4e08f54dae Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 00:09:38 -0400 Subject: [PATCH 15/24] De Morgan --- src/lex/mod.rs | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/lex/mod.rs b/src/lex/mod.rs index bf1554e7..4895ad5e 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -905,8 +905,8 @@ 
impl Iterator for Lexer { .with(LexError::UnknownToken(x as char)))); } }; - // We've seen a token if this isn't # and this isn't whitespace - self.seen_line_token |= data != Token::Hash && !matches!(data, Token::Whitespace(_)); + // We've seen a token if this isn't # or whitespace + self.seen_line_token |= !(data == Token::Hash || matches!(data, Token::Whitespace(_))); Some(Ok(Locatable { data, location: self.span(span_start), From 81d47d9f9ab4d0e7103c77d993a4b015fa2dc4a5 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 00:30:16 -0400 Subject: [PATCH 16/24] cargo fmt --- src/lex/cpp.rs | 20 ++++++++++++-------- 1 file changed, 12 insertions(+), 8 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 77bb8442..bfd67abb 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -415,10 +415,13 @@ impl<'a> PreProcessor<'a> { } fn is_not_whitespace(res: &CppResult) -> bool { - !matches!(res, Ok(Locatable { - data: Token::Whitespace(_), - .. - })) + !matches!( + res, + Ok(Locatable { + data: Token::Whitespace(_), + .. + }) + ) } /// If at the start of the line and we see `#directive`, return that directive. 
@@ -1257,10 +1260,11 @@ mod tests { !matches!(res, Ok(Token::Whitespace(_))) } fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool { - let to_vec = |xs: PreProcessor| xs - .map(|res| res.map(|token| token.data)) - .filter(_is_not_whitespace) - .collect::>(); + let to_vec = |xs: PreProcessor| { + xs.map(|res| res.map(|token| token.data)) + .filter(_is_not_whitespace) + .collect::>() + }; to_vec(xs) == to_vec(ys) } fn assert_same(src: &str, cpp_src: &str) { From bbbb3ab68ec269c09a00b02becb4e36429bbc2b0 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 00:43:56 -0400 Subject: [PATCH 17/24] Add a few tests for preprocess only with exact matching TODO add tests involving preprocessor stuff --- src/lex/cpp.rs | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index bfd67abb..d83a33b0 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1275,6 +1275,15 @@ mod tests { cpp_src, ); } + fn assert_same_exact(src: &str, cpp_src: &str) { + // NOTE make sure `cpp_src` has a trailing newline + let pprint = cpp(src) + .filter_map(|res| res.ok().map(|token| token.data)) + .map(|res| format!("{}", res)) + .collect::>() + .join(""); + assert_eq!(pprint, format!("{}\n", cpp_src)); // Because `cpp` adds newline, do it here too + } #[test] fn keywords() { for keyword in KEYWORDS.values() { @@ -1580,4 +1589,12 @@ int main(){} "; assert_same(original, "a(1)"); } + #[test] + // https://github.com/jyn514/rcc/issues/356 + fn preprocess_only() { + assert_same_exact("int \t\n\r main() {}", "int \t\n\r main() {}"); + assert_same_exact("int/* */main() {}", "int main() {}"); + assert_same_exact("int/*\n\n\n*/main() {}", "int\n\n\nmain() {}"); + // TODO add tests for `assert_same_exact` with preprocessor macros + } } From 46b4943f0412cc23c392088109603354ec526b69 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 21:04:51 -0400 Subject: [PATCH 18/24] Make Whitespace matches consistently use .. 
--- src/lex/cpp.rs | 2 +- src/lex/files.rs | 2 +- src/lex/mod.rs | 2 +- src/parse/mod.rs | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index d83a33b0..89d83a93 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1097,7 +1097,7 @@ impl<'a> PreProcessor<'a> { match self.next() { Some(Ok(Locatable { data: Token::Whitespace(_), - location: _, + .. })) => continue, other => break other, } diff --git a/src/lex/files.rs b/src/lex/files.rs index 041917a9..8f2f6dd5 100644 --- a/src/lex/files.rs +++ b/src/lex/files.rs @@ -179,7 +179,7 @@ impl FileProcessor { match self.next() { Some(Ok(Locatable { data: Token::Whitespace(_), - location: _, + .. })) => continue, other => break other, } diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 4895ad5e..3927cbe6 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -672,7 +672,7 @@ impl Lexer { match self.next() { Some(Ok(Locatable { data: Token::Whitespace(_), - location: _, + .. })) => continue, other => break other, } diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 0cd8d53d..7aed6671 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -150,7 +150,7 @@ impl Parser { match self.tokens.next() { Some(Ok(Locatable { data: Token::Whitespace(_), - location: _, + .. 
})) => continue, Some(Ok(mut token)) => { self.last_location = token.location; From 78ad8fc35cf58d44d6a63fb2e919465dbd1d4f58 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 21:10:39 -0400 Subject: [PATCH 19/24] Fixed issue with whitespace at beginning of analyze::test::lol in `parse_all` --- src/analyze/mod.rs | 3 ++- src/parse/mod.rs | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/src/analyze/mod.rs b/src/analyze/mod.rs index b0d8f159..ec7d7f64 100644 --- a/src/analyze/mod.rs +++ b/src/analyze/mod.rs @@ -1925,7 +1925,8 @@ pub(crate) mod test { } #[test] fn lol() { - let lol = "int *jynelson(int(*fp)(int)) { + let lol = " +int *jynelson(int(*fp)(int)) { return 0; } int f(int i) { diff --git a/src/parse/mod.rs b/src/parse/mod.rs index 7aed6671..615843bf 100644 --- a/src/parse/mod.rs +++ b/src/parse/mod.rs @@ -363,7 +363,7 @@ pub(crate) mod test { pub(crate) fn parser(input: &str) -> Parser { //let mut lexer = Lexer::new((), format!("{}\n", input), false); let mut lexer = cpp(input); - let first: Locatable = lexer.next().unwrap().unwrap(); + let first: Locatable = lexer.next_non_whitespace().unwrap().unwrap(); Parser::new(first, lexer, false) } From 8c4f56873adedad776e27782bc7b27c8793cf8ce Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 21:15:19 -0400 Subject: [PATCH 20/24] Clean up filter_map thanks to @jyn514 --- src/lex/cpp.rs | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 89d83a93..5a19b025 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1278,8 +1278,7 @@ mod tests { fn assert_same_exact(src: &str, cpp_src: &str) { // NOTE make sure `cpp_src` has a trailing newline let pprint = cpp(src) - .filter_map(|res| res.ok().map(|token| token.data)) - .map(|res| format!("{}", res)) + .filter_map(|res| res.ok().map(|token| token.data.to_string())) .collect::>() .join(""); assert_eq!(pprint, format!("{}\n", cpp_src)); // Because `cpp` adds newline, do 
it here too From 0e055b9d16a21ef2b34df0fdf441534b4c788b2b Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 22:57:11 -0400 Subject: [PATCH 21/24] Get tokens_until_newline to do what its name suggests and other whitespace improvements for preprocessor macros --- src/lex/cpp.rs | 17 ++++++++++------- src/lex/files.rs | 17 ++++++++--------- src/lex/mod.rs | 38 +++++++++++++++++++++++++++++--------- 3 files changed, 47 insertions(+), 25 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index 5a19b025..b2a3b5fc 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -896,6 +896,7 @@ impl<'a> PreProcessor<'a> { let body = |this: &mut PreProcessor| { this.tokens_until_newline(true) .into_iter() + .skip(1) // First is always unwanted whitespace .map(|res| res.map(|loc| loc.data)) .collect::, Locatable>>() }; @@ -934,8 +935,8 @@ impl<'a> PreProcessor<'a> { // `#include "file"` - local include, but falls back to system include if `file` is not found. fn include(&mut self, start: u32) -> Result<(), Locatable> { use crate::data::lex::ComparisonToken; + self.consume_whitespace_oneline(start, CppError::EmptyInclude)?; let lexer = self.lexer_mut(); - lexer.consume_whitespace(); let local = if lexer.match_next(b'"') { true } else if lexer.match_next(b'<') { @@ -1484,20 +1485,20 @@ d #[test] fn pragma() { let src = "#pragma gcc __attribute__((inline))"; - assert!(cpp(src).next().is_none()); + assert!(cpp(src).next_non_whitespace().is_none()); } #[test] fn line() { let src = "#line 1"; let mut cpp = cpp(src); - assert!(cpp.next().is_none()); + assert!(cpp.next_non_whitespace().is_none()); assert!(cpp.warnings().pop_front().is_some()); } #[test] fn warning() { let src = "#warning your pants are on file"; let mut cpp = cpp(src); - assert!(cpp.next().is_none()); + assert!(cpp.next_non_whitespace().is_none()); assert!(cpp.warnings().pop_front().is_some()); } #[test] @@ -1508,8 +1509,8 @@ d fn invalid_directive() { assert_err!("#wrong", CppError::InvalidDirective,
"invalid directive",); assert_err!("#1", CppError::UnexpectedToken(_, _), "unexpected token",); - assert_err!("#include", CppError::EndOfFile(_), "end of file"); - assert_err!("#if defined", CppError::EndOfFile(_), "end of file"); + assert_err!("#include", CppError::EmptyInclude, "empty include"); + assert_err!("#if defined", CppError::EndOfFile(_), "unexpected eof"); for s in &[ "#if defined()", "#if defined(+)", @@ -1594,6 +1595,8 @@ int main(){} assert_same_exact("int \t\n\r main() {}", "int \t\n\r main() {}"); assert_same_exact("int/* */main() {}", "int main() {}"); assert_same_exact("int/*\n\n\n*/main() {}", "int\n\n\nmain() {}"); - // TODO add tests for `assert_same_exact` with preprocessor macros + assert_same_exact("#define a(c) c\tc\na(1);a(2)", "\n1\t1;2\t2"); + // assert_same_exact("#define a //\n#if defined a\n x\n#endif", "\n\n x"); + // TODO add mote tests for `assert_same_exact` with preprocessor macros } } diff --git a/src/lex/files.rs b/src/lex/files.rs index 8f2f6dd5..fea21fb1 100644 --- a/src/lex/files.rs +++ b/src/lex/files.rs @@ -123,6 +123,10 @@ impl FileProcessor { self.lexer_mut().consume_whitespace() } #[inline] + pub(super) fn consume_whitespace_preprocessor(&mut self) -> String { + self.lexer_mut().consume_whitespace_preprocessor() + } + #[inline] pub(super) fn seen_line_token(&self) -> bool { self.lexer().seen_line_token } @@ -147,24 +151,19 @@ impl FileProcessor { whitespace: bool, ) -> Vec>> { let mut tokens = Vec::new(); - let line = self.line(); loop { let ws_start = self.offset(); - let ws = self.consume_whitespace(); + let ws = self.consume_whitespace_preprocessor(); let ws_span = self.span(ws_start); - if self.line() != line { - // lines should end with a newline, but in case they don't, don't crash - assert!(!self.lexer().seen_line_token || self.lexer_mut().peek().is_none(), - "expected `tokens_until_newline()` to reset `seen_line_token`, but `lexer.peek()` is {:?}", - self.lexer_mut().peek()); - break; - } if whitespace && 
!ws.is_empty() { tokens.push(Ok(Locatable { data: Token::Whitespace(ws), // NOTE: in clang, this is one space location: ws_span, })); } + if self.lexer_mut().peek().unwrap_or(b'\n') == b'\n' { + break; + } match self.next() { Some(token) => tokens.push(token), None => break, diff --git a/src/lex/mod.rs b/src/lex/mod.rs index 3927cbe6..93fad215 100644 --- a/src/lex/mod.rs +++ b/src/lex/mod.rs @@ -186,17 +186,35 @@ impl Lexer { file: self.location.file, } } + + #[inline] + fn consume_whitespace(&mut self) -> String { + self.consume_whitespace_full(false, true) + } + #[inline] + fn consume_whitespace_preprocessor(&mut self) -> String { + self.consume_whitespace_full(true, false) + } /// Remove all consecutive whitespace pending in the stream. /// This includes comments. /// + /// If `stop_at_newline` this stops at the end of the line (unless there's a comment) + /// If `comments_newlines` then multiline comments are replaced with their newlines else space + /// /// Before: b" // some comment\n /*multi comment*/hello " /// After: b"hello " - fn consume_whitespace(&mut self) -> String { + fn consume_whitespace_full( + &mut self, + stop_at_newline: bool, + comments_newlines: bool, + ) -> String { // there may be comments following whitespace let mut whitespace = String::new(); loop { // whitespace - while self.peek().map_or(false, |c| c.is_ascii_whitespace()) { + while self.peek().map_or(false, |c| { + c.is_ascii_whitespace() && !(stop_at_newline && c == b'\n') + }) { if let Some(c) = self.next_char() { whitespace.push(c.into()); } @@ -204,15 +222,15 @@ impl Lexer { // comments if self.peek() == Some(b'/') { match self.peek_next() { - Some(b'/') => { - self.consume_line_comment(); - whitespace.push('\n'); - } + Some(b'/') => self.consume_line_comment(), Some(b'*') => { self.next_char(); self.next_char(); match self.consume_multi_comment() { - Ok(ws) => whitespace.push_str(&ws), + Ok(ws) => { + let ws = if comments_newlines { &ws } else { " " }; + whitespace.push_str(ws) 
+ } Err(err) => self.error_handler.push_back(err), } } @@ -230,9 +248,11 @@ impl Lexer { /// After: chars{"hello // blah"} fn consume_line_comment(&mut self) { loop { - match self.next_char() { + match self.peek() { None | Some(b'\n') => return, - _ => {} + _ => { + self.next_char(); + } } } } From 3a9cd4aa03840060ef511f5bfaf8057287b12b2f Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 23:34:14 -0400 Subject: [PATCH 22/24] More preprocess_only tests --- src/lex/cpp.rs | 49 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index b2a3b5fc..802e3d3e 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1596,7 +1596,52 @@ int main(){} assert_same_exact("int/* */main() {}", "int main() {}"); assert_same_exact("int/*\n\n\n*/main() {}", "int\n\n\nmain() {}"); assert_same_exact("#define a(c) c\tc\na(1);a(2)", "\n1\t1;2\t2"); - // assert_same_exact("#define a //\n#if defined a\n x\n#endif", "\n\n x"); - // TODO add mote tests for `assert_same_exact` with preprocessor macros + assert_same_exact("#define a //\n#if defined a\n x\n#endif", "\n\n x\n"); + assert_same_exact("#define x\n#undef x\n x", "\n\n x"); + assert_same_exact("#pragma once\n x", "\n x"); + assert_same_exact("#warning dont panic\n x", "\n x"); + assert_same_exact("#error dont panic\n x", "\n x"); + assert_same_exact("#line 1\n x", "\n x"); + assert_same_exact( + "--- +#define a +--- +#if 1 + x + y + z +#endif +--- +#if 0 + x +#endif +--- +#ifdef a + x +#endif +--- +#ifndef a + x +#endif +---", + "--- + +--- + + x + y + z + +--- + +--- + + x + +--- + +---", + ); + // TODO test for #includes } } From 6d084cd61f2e1b5a6e63d7dc18c68c3df80eb979 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Sun, 24 May 2020 23:48:31 -0400 Subject: [PATCH 23/24] Merge two versions of `is_not_whitespace` --- src/lex/cpp.rs | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/lex/cpp.rs 
b/src/lex/cpp.rs index 802e3d3e..c02d72b4 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -1257,13 +1257,10 @@ mod tests { _ => panic!("not a keyword: {:?}", token), } } - fn _is_not_whitespace(res: &CompileResult) -> bool { - !matches!(res, Ok(Token::Whitespace(_))) - } fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool { let to_vec = |xs: PreProcessor| { - xs.map(|res| res.map(|token| token.data)) - .filter(_is_not_whitespace) + xs.filter(PreProcessor::is_not_whitespace) + .map(|res| res.map(|token| token.data)) .collect::>() }; to_vec(xs) == to_vec(ys) From 0d03378729800deb07fe6cb46964921b444dc6f0 Mon Sep 17 00:00:00 2001 From: Hunter Damron Date: Mon, 25 May 2020 00:18:18 -0400 Subject: [PATCH 24/24] Do not assume there is whitespace between define id and body --- src/lex/cpp.rs | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/src/lex/cpp.rs b/src/lex/cpp.rs index c02d72b4..df726862 100644 --- a/src/lex/cpp.rs +++ b/src/lex/cpp.rs @@ -414,8 +414,8 @@ impl<'a> PreProcessor<'a> { self.file_processor.tokens_until_newline(whitespace) } - fn is_not_whitespace(res: &CppResult) -> bool { - !matches!( + fn is_whitespace(res: &CppResult) -> bool { + matches!( res, Ok(Locatable { data: Token::Whitespace(_), @@ -423,6 +423,9 @@ impl<'a> PreProcessor<'a> { }) ) } + fn is_not_whitespace(res: &CppResult) -> bool { + !PreProcessor::is_whitespace(res) + } /// If at the start of the line and we see `#directive`, return that directive. /// Otherwise, if we see a token (or error), return that error. @@ -896,7 +899,7 @@ impl<'a> PreProcessor<'a> { let body = |this: &mut PreProcessor| { this.tokens_until_newline(true) .into_iter() - .skip(1) // First is always unwanted whitespace + .skip_while(PreProcessor::is_whitespace) // TODO warning if nothing skips .map(|res| res.map(|loc| loc.data)) .collect::, Locatable>>() };