Skip to content
This repository has been archived by the owner on Feb 4, 2025. It is now read-only.

Close #356 - Add whitespace token for preprocessor #437

Merged
merged 25 commits into from
May 25, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
295337d
Added whitespace token and made -E keep spacing, TODO consume whitespace
hdamron17 May 15, 2020
1514108
Filtered whitespace before parser, TODO debug a lot of test cases
hdamron17 May 15, 2020
33f9f12
Fix some bugs in whitespace handling
jyn514 May 15, 2020
dd628bf
Merge branch 'master' into preprocessor-whitespace
hdamron17 May 23, 2020
30be8d9
Add a one-line whitespace consumption after #ifdef, #ifndef, #undef
hdamron17 May 23, 2020
60db031
Consume whitespace between function macro args
hdamron17 May 23, 2020
30eac7f
Fixed most of lex::tests::*
hdamron17 May 23, 2020
f0c146e
Handle lex::tests::test_no_newline
hdamron17 May 23, 2020
8e0509e
Changed analyze::test::lol to not have leading newline
hdamron17 May 23, 2020
01f040b
Added whitespace between hash and directive
hdamron17 May 23, 2020
d51af7e
Remove trailing newline for -E
hdamron17 May 23, 2020
f671f65
Handle spaces in defines and whitespace in comments
hdamron17 May 24, 2020
36b7fc3
Handle lex::tests::test_no_newline (again)
hdamron17 May 24, 2020
675989b
Fix error messages for macros and #ifdef
hdamron17 May 24, 2020
1d5578f
Rework whitespace in tokens_until_newline
hdamron17 May 24, 2020
c1aa84a
De Morgan
hdamron17 May 24, 2020
81d47d9
cargo fmt
hdamron17 May 24, 2020
bbbb3ab
Add a few tests for preprocess only with exact matching
hdamron17 May 24, 2020
46b4943
Make Whitespace matches consistently use ..
hdamron17 May 25, 2020
78ad8fc
Fixed issue with whitespace at beginning of analyze::test::lol in `pa…
hdamron17 May 25, 2020
8c4f568
Clean up filter_map thanks to @jyn514
hdamron17 May 25, 2020
0e055b9
Get tokens_until_newline to do what its name suggests and other whit…
hdamron17 May 25, 2020
3a9cd4a
More preprocess_only tests
hdamron17 May 25, 2020
6d084cd
Merge two versions of `is_not_whitespace`
hdamron17 May 25, 2020
0d03378
Do not assume there is whitespace between define id and body
hdamron17 May 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/analyze/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1925,8 +1925,7 @@ pub(crate) mod test {
}
#[test]
fn lol() {
let lol = "
int *jynelson(int(*fp)(int)) {
let lol = "int *jynelson(int(*fp)(int)) {
return 0;
}
int f(int i) {
Expand Down
4 changes: 4 additions & 0 deletions src/data/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ pub enum Token {
Literal(Literal),
Id(InternedStr),

Whitespace(String),

// Misc
Ellipsis,
StructDeref, // ->
Expand Down Expand Up @@ -353,6 +355,8 @@ impl std::fmt::Display for Token {
Id(id) => write!(f, "{}", id),
Keyword(k) => write!(f, "{}", k),

Whitespace(s) => write!(f, "{}", s),

Ellipsis => write!(f, "..."),
StructDeref => write!(f, "->"),
Hash => write!(f, "#"),
Expand Down
93 changes: 67 additions & 26 deletions src/lex/cpp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ impl<'a> PreProcessor<'a> {
};
Some(if is_hash && !self.file_processor.seen_line_token() {
let line = self.file_processor.line();
match self.file_processor.next()? {
match self.file_processor.next_non_whitespace()? {
Ok(Locatable {
data: Token::Id(id),
location,
Expand Down Expand Up @@ -480,10 +480,12 @@ impl<'a> PreProcessor<'a> {
self.if_directive(condition, start)
}
IfNDef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.if_directive(!self.definitions.contains_key(&name.data), start)
}
IfDef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.if_directive(self.definitions.contains_key(&name.data), start)
}
Expand Down Expand Up @@ -530,6 +532,7 @@ impl<'a> PreProcessor<'a> {
}
Define => self.define(start),
Undef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.definitions.remove(&name.data);
Ok(())
Expand Down Expand Up @@ -818,7 +821,7 @@ impl<'a> PreProcessor<'a> {
fn fn_args(&mut self, start: u32) -> Result<Vec<InternedStr>, Locatable<Error>> {
let mut arguments = Vec::new();
loop {
match self.file_processor.next() {
match self.file_processor.next_non_whitespace() {
None => {
return Err(CompileError::new(
CppError::EndOfFile("identifier or ')'").into(),
Expand Down Expand Up @@ -883,14 +886,19 @@ impl<'a> PreProcessor<'a> {
this.tokens_until_newline()
.into_iter()
.map(|res| res.map(|loc| loc.data))
.enumerate()
.map(|(i, x)| {
if i == 0 {
vec![x]
} else {
vec![x, Ok(Token::Whitespace(String::from(" ")))]
}
})
.flatten()
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you add these spaces here in the define? Does tokens_until_newline not return whitespace tokens?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tokens_until_newline (at the moment) does not include whitespace tokens. I did not want to change it since it is somewhat separate from the preprocessor. I can try adding whitespace tokens to it and see what happens now that it is in a stable state. I did notice that clang only puts one space when replacing preprocessor defines, regardless of the original spacing.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm interesting ... I suppose since the behavior is correct we can try to go back and improve the spacing later.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, please add a comment to this effect either here or at tokens_until_newline.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually went back and did it the proper way by changing tokens_until_newline. Unfortunately I had to rework some stuff for boolean_expr because whitespace tokens show up in the replacement stage. I'll push as soon as I double-check the tests. Also, I think the rework will be much less of an eyesore.

.collect::<Result<Vec<_>, Locatable<Error>>>()
};

let line = self.line();
self.file_processor.consume_whitespace();
if self.line() != line {
return Err(self.span(start).error(CppError::EmptyDefine));
}
self.consume_whitespace_oneline(start, CppError::EmptyDefine)?;
let id = self.expect_id()?;
// NOTE: does _not_ discard whitespace
if self.lexer_mut().match_next(b'(') {
Expand Down Expand Up @@ -1077,6 +1085,34 @@ impl<'a> PreProcessor<'a> {
}
}
}

/// Skips whitespace tokens and returns the first significant token, if any.
pub fn next_non_whitespace(&mut self) -> Option<CppResult<Token>> {
    // `PreProcessor` is itself an iterator over tokens, so `find` drives
    // `next()` until the predicate matches or the stream is exhausted.
    // Lex errors are *not* whitespace, so they are returned to the caller.
    self.find(|token| {
        !matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        )
    })
}

/// Consumes whitespace on the current line only.
///
/// Returns the consumed whitespace on success; if the whitespace spanned a
/// line break, returns `error` reported at the span starting at `start`.
#[inline]
fn consume_whitespace_oneline(
    &mut self,
    start: u32,
    error: CppError,
) -> Result<String, CompileError> {
    let line_before = self.line();
    let whitespace = self.file_processor.consume_whitespace();
    // A changed line number means the whitespace contained a newline.
    if self.line() == line_before {
        Ok(whitespace)
    } else {
        Err(self.span(start).error(error))
    }
}
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
Expand Down Expand Up @@ -1199,7 +1235,7 @@ mod tests {

macro_rules! assert_err {
($src: expr, $err: pat, $description: expr $(,)?) => {
match cpp($src).next().unwrap().unwrap_err().data {
match cpp($src).next_non_whitespace().unwrap().unwrap_err().data {
Error::PreProcessor($err) => {}
Error::PreProcessor(other) => panic!("expected {}, got {}", $description, other),
_ => panic!("expected cpp err"),
Expand All @@ -1215,14 +1251,20 @@ mod tests {
_ => panic!("not a keyword: {:?}", token),
}
}
/// Compares two preprocessed token streams, ignoring source locations
/// and whitespace tokens on both sides.
fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool {
    let left: Vec<_> = xs
        .filter_map(|res| match res.map(|token| token.data) {
            Ok(Token::Whitespace(_)) => None,
            other => Some(other),
        })
        .collect();
    let right: Vec<_> = ys
        .filter_map(|res| match res.map(|token| token.data) {
            Ok(Token::Whitespace(_)) => None,
            other => Some(other),
        })
        .collect();
    left == right
}
fn assert_same(src: &str, cpp_src: &str) {
assert_eq!(
cpp(src)
.map(|res| res.map(|token| token.data))
.collect::<Vec<_>>(),
cpp(cpp_src)
.map(|res| res.map(|token| token.data))
.collect::<Vec<_>>(),
assert!(
is_same_preprocessed(cpp(src), cpp(cpp_src)),
"{} is not the same as {}",
src,
cpp_src,
Expand Down Expand Up @@ -1270,12 +1312,12 @@ mod tests {
let code = "#ifdef a
whatever, doesn't matter
#endif";
assert_eq!(cpp(code).next(), None);
assert_eq!(cpp(code).next_non_whitespace(), None);

let code = "#ifdef a\n#endif";
assert_eq!(cpp(code).next(), None);
assert_eq!(cpp(code).next_non_whitespace(), None);

assert!(cpp("#ifdef").next().unwrap().is_err());
assert!(cpp("#ifdef").next_non_whitespace().unwrap().is_err());

let nested = "#ifdef a
#ifdef b
Expand All @@ -1284,14 +1326,14 @@ mod tests {
#endif
char;";
assert_eq!(
cpp(nested).next().unwrap().unwrap().data,
cpp(nested).next_non_whitespace().unwrap().unwrap().data,
Token::Keyword(Keyword::Char)
);

assert!(cpp("#endif").next().unwrap().is_err());
assert!(cpp("#endif").next_non_whitespace().unwrap().is_err());

let same_line = "#ifdef a #endif\nint main() {}";
assert!(cpp(same_line).next().unwrap().is_err());
assert!(cpp(same_line).next_non_whitespace().unwrap().is_err());
}
#[test]
fn ifndef() {
Expand All @@ -1300,7 +1342,7 @@ mod tests {
#define A
#endif
A";
assert!(cpp(src).next().is_none());
assert!(cpp(src).next_non_whitespace().is_none());
}
#[test]
fn object_macros() {
Expand Down Expand Up @@ -1479,15 +1521,14 @@ c
}
#[test]
fn test_comment_newline() {
let tokens: Vec<_> = cpp_no_newline(
let tokens = cpp_no_newline(
"
#if 1 //
int main() {}
#endif
",
)
.collect();
assert_eq!(tokens, cpp("int main() {}").collect::<Vec<_>>());
);
assert!(is_same_preprocessed(tokens, cpp("int main() {}")));
assert_same(
"
#if 1 /**//**/
Expand Down
15 changes: 14 additions & 1 deletion src/lex/files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ impl FileProcessor {
self.lexer().span(start)
}
#[inline]
pub(super) fn consume_whitespace(&mut self) {
pub(super) fn consume_whitespace(&mut self) -> String {
self.lexer_mut().consume_whitespace()
}
#[inline]
Expand Down Expand Up @@ -159,4 +159,17 @@ impl FileProcessor {
}
tokens
}

/// Returns next token in stream which is not whitespace.
///
/// Yields `None` when the underlying stream is exhausted; lex errors are
/// passed straight through to the caller.
pub(super) fn next_non_whitespace(&mut self) -> Option<CompileResult<Locatable<Token>>> {
    loop {
        let token = self.next()?;
        let is_whitespace = matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        );
        if !is_whitespace {
            return Some(token);
        }
    }
}
}
59 changes: 50 additions & 9 deletions src/lex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,22 +191,29 @@ impl Lexer {
///
/// Before: b" // some comment\n /*multi comment*/hello "
/// After: b"hello "
fn consume_whitespace(&mut self) {
fn consume_whitespace(&mut self) -> String {
// there may be comments following whitespace
let mut whitespace = String::new();
loop {
// whitespace
while self.peek().map_or(false, |c| c.is_ascii_whitespace()) {
self.next_char();
if let Some(c) = self.next_char() {
whitespace.push(c.into());
jyn514 marked this conversation as resolved.
Show resolved Hide resolved
}
}
// comments
if self.peek() == Some(b'/') {
match self.peek_next() {
Some(b'/') => self.consume_line_comment(),
Some(b'/') => {
self.consume_line_comment();
whitespace.push('\n');
}
Some(b'*') => {
self.next_char();
self.next_char();
if let Err(err) = self.consume_multi_comment() {
self.error_handler.push_back(err);
match self.consume_multi_comment() {
Ok(ws) => whitespace.push_str(&ws),
Err(err) => self.error_handler.push_back(err),
}
}
_ => break,
Expand All @@ -215,6 +222,7 @@ impl Lexer {
break;
jyn514 marked this conversation as resolved.
Show resolved Hide resolved
}
}
whitespace
}
/// Remove all characters between now and the next b'\n' character.
///
Expand All @@ -232,12 +240,21 @@ impl Lexer {
///
/// Before: u8s{"hello this is a lot of text */ int main(){}"}
/// After: chars{" int main(){}"}
fn consume_multi_comment(&mut self) -> LexResult<()> {
///
/// Return newlines occupied by the comment or a space if no newlines
fn consume_multi_comment(&mut self) -> LexResult<String> {
let mut whitespace = String::new();
let start = self.location.offset - 2;
while let Some(c) = self.next_char() {
if c == b'*' && self.peek() == Some(b'/') {
self.next_char();
return Ok(());
if whitespace.is_empty() {
whitespace.push(' '); // For the case `a/* */b`
}
return Ok(whitespace);
}
if c == b'\n' {
whitespace.push(c.into());
}
}
Err(Locatable {
Expand Down Expand Up @@ -648,6 +665,19 @@ impl Lexer {
}
Ok(Token::Id(InternedStr::get_or_intern(id)))
}

/// Returns the next non-whitespace token, discarding leading whitespace.
pub fn next_non_whitespace(&mut self) -> Option<LexResult<Locatable<Token>>> {
    // `Lexer` implements `Iterator`, so `find` advances past every
    // whitespace token; errors are not whitespace and are returned as-is.
    self.find(|token| {
        !matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        )
    })
}
}

impl Iterator for Lexer {
Expand All @@ -668,7 +698,17 @@ impl Iterator for Lexer {
return None;
}

self.consume_whitespace();
{
let span_start = self.location.offset;
let data = self.consume_whitespace();
if !data.is_empty() {
return Some(Ok(Locatable {
data: Token::Whitespace(data),
location: self.span(span_start),
}));
}
}

let c = self.next_char().and_then(|c| {
let span_start = self.location.offset - 1;
// this giant switch is most of the logic
Expand Down Expand Up @@ -865,7 +905,8 @@ impl Iterator for Lexer {
.with(LexError::UnknownToken(x as char))));
}
};
self.seen_line_token |= data != Token::Hash;
// We've seen a token if this isn't # and this isn't whitespace
self.seen_line_token |= data != Token::Hash && !matches!(data, Token::Whitespace(_));
hdamron17 marked this conversation as resolved.
Show resolved Hide resolved
Some(Ok(Locatable {
data,
location: self.span(span_start),
Expand Down
14 changes: 12 additions & 2 deletions src/lex/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@ fn lex(input: &str) -> Option<LexType> {
lexed.pop()
}
fn lex_all(input: &str) -> Vec<LexType> {
cpp(input).collect()
cpp(input).filter(is_not_whitespace).collect()
}

/// Predicate for filtering lexed tokens: `false` only for a successfully
/// lexed whitespace token, `true` for everything else (including errors,
/// so they survive the filter and reach the caller).
pub(crate) fn is_not_whitespace(res: &LexType) -> bool {
    match res {
        Ok(Locatable {
            data: Token::Whitespace(_),
            ..
        }) => false,
        _ => true,
    }
}

fn match_data<T>(lexed: Option<LexType>, closure: T) -> bool
Expand Down Expand Up @@ -295,7 +305,7 @@ fn test_strings() {
#[test]
fn test_no_newline() {
assert!(cpp_no_newline("").next().is_none());
let mut tokens: Vec<_> = cpp_no_newline(" ").collect();
let mut tokens: Vec<_> = cpp_no_newline(" ").filter(is_not_whitespace).collect();
assert_eq!(tokens.len(), 1);
assert!(tokens.remove(0).unwrap_err().is_lex_err());

Expand Down
Loading