Skip to content
This repository has been archived by the owner on Feb 4, 2025. It is now read-only.

Close #356 - Add whitespace token for preprocessor #437

Merged
merged 25 commits into from
May 25, 2020
Merged
Show file tree
Hide file tree
Changes from 12 commits
Commits
Show all changes
25 commits
Select commit Hold shift + click to select a range
295337d
Added whitespace token and made -E keep spacing, TODO consume whitespace
hdamron17 May 15, 2020
1514108
Filtered whitespace before parser, TODO debug a lot of test cases
hdamron17 May 15, 2020
33f9f12
Fix some bugs in whitespace handling
jyn514 May 15, 2020
dd628bf
Merge branch 'master' into preprocessor-whitespace
hdamron17 May 23, 2020
30be8d9
Add a one-line whitespace consumption after #ifdef, #ifndef, #undef
hdamron17 May 23, 2020
60db031
Consume whitespace between function macro args
hdamron17 May 23, 2020
30eac7f
Fixed most of lex::tests::*
hdamron17 May 23, 2020
f0c146e
Handle lex::tests::test_no_newline
hdamron17 May 23, 2020
8e0509e
Changed analyze::test::lol to not have leading newline
hdamron17 May 23, 2020
01f040b
Added whitespace between hash and directive
hdamron17 May 23, 2020
d51af7e
Remove trailing newline for -E
hdamron17 May 23, 2020
f671f65
Handle spaces in defines and whitespace in comments
hdamron17 May 24, 2020
36b7fc3
Handle lex::tests::test_no_newline (again)
hdamron17 May 24, 2020
675989b
Fix error messages for macros and #ifdef
hdamron17 May 24, 2020
1d5578f
Rework whitespace in tokens_until_newline
hdamron17 May 24, 2020
c1aa84a
De Morgan
hdamron17 May 24, 2020
81d47d9
cargo fmt
hdamron17 May 24, 2020
bbbb3ab
Add a few tests for preprocess only with exact matching
hdamron17 May 24, 2020
46b4943
Make Whitespace matches consistently use ..
hdamron17 May 25, 2020
78ad8fc
Fixed issue with whitespace at beginning of analyze::test::lol in `pa…
hdamron17 May 25, 2020
8c4f568
Clean up filter_map thanks to @jyn514
hdamron17 May 25, 2020
0e055b9
Get tokens_until_newline to do what its name suggests and other whit…
hdamron17 May 25, 2020
3a9cd4a
More preprocess_only tests
hdamron17 May 25, 2020
6d084cd
Merge two versions of `is_not_whitespace`
hdamron17 May 25, 2020
0d03378
Do not assume there is whitespace between define id and body
hdamron17 May 25, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 1 addition & 2 deletions src/analyze/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1925,8 +1925,7 @@ pub(crate) mod test {
}
#[test]
fn lol() {
let lol = "
int *jynelson(int(*fp)(int)) {
let lol = "int *jynelson(int(*fp)(int)) {
return 0;
}
int f(int i) {
Expand Down
4 changes: 4 additions & 0 deletions src/data/lex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -190,6 +190,8 @@ pub enum Token {
Literal(Literal),
Id(InternedStr),

Whitespace(String),

// Misc
Ellipsis,
StructDeref, // ->
Expand Down Expand Up @@ -353,6 +355,8 @@ impl std::fmt::Display for Token {
Id(id) => write!(f, "{}", id),
Keyword(k) => write!(f, "{}", k),

Whitespace(s) => write!(f, "{}", s),

Ellipsis => write!(f, "..."),
StructDeref => write!(f, "->"),
Hash => write!(f, "#"),
Expand Down
93 changes: 67 additions & 26 deletions src/lex/cpp.rs
Original file line number Diff line number Diff line change
Expand Up @@ -427,7 +427,7 @@ impl<'a> PreProcessor<'a> {
};
Some(if is_hash && !self.file_processor.seen_line_token() {
let line = self.file_processor.line();
match self.file_processor.next()? {
match self.file_processor.next_non_whitespace()? {
Ok(Locatable {
data: Token::Id(id),
location,
Expand Down Expand Up @@ -480,10 +480,12 @@ impl<'a> PreProcessor<'a> {
self.if_directive(condition, start)
}
IfNDef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.if_directive(!self.definitions.contains_key(&name.data), start)
}
IfDef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.if_directive(self.definitions.contains_key(&name.data), start)
}
Expand Down Expand Up @@ -530,6 +532,7 @@ impl<'a> PreProcessor<'a> {
}
Define => self.define(start),
Undef => {
self.consume_whitespace_oneline(start, CppError::EmptyExpression)?;
let name = self.expect_id()?;
self.definitions.remove(&name.data);
Ok(())
Expand Down Expand Up @@ -818,7 +821,7 @@ impl<'a> PreProcessor<'a> {
fn fn_args(&mut self, start: u32) -> Result<Vec<InternedStr>, Locatable<Error>> {
let mut arguments = Vec::new();
loop {
match self.file_processor.next() {
match self.file_processor.next_non_whitespace() {
None => {
return Err(CompileError::new(
CppError::EndOfFile("identifier or ')'").into(),
Expand Down Expand Up @@ -883,14 +886,19 @@ impl<'a> PreProcessor<'a> {
this.tokens_until_newline()
.into_iter()
.map(|res| res.map(|loc| loc.data))
.enumerate()
.map(|(i, x)| {
if i == 0 {
vec![x]
} else {
vec![x, Ok(Token::Whitespace(String::from(" ")))]
}
})
.flatten()
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why do you add these spaces here in the define? Does tokens_until_newline not return whitespace tokens?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

tokens_until_newline (at the moment) does not include whitespace tokens. I did not want to change it since it is somewhat separate from the preprocessor. I can try adding whitespace tokens to it and see what happens now that it is in a stable state. I did notice that clang only puts one space when replacing preprocessor defines, regardless of the original spacing.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm interesting ... I suppose since the behavior is correct we can try to go back and improve the spacing later.

Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Also, please add a comment to this effect either here or at tokens_until_newline.

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I actually went back and did it the proper way by changing tokens_until_newline. Unfortunately I had to rework some stuff for boolean_expr because whitespace tokens show up in the replacement stage. I'll push as soon as I double-check the tests. Also, I think the rework will be much less of an eyesore.

.collect::<Result<Vec<_>, Locatable<Error>>>()
};

let line = self.line();
self.file_processor.consume_whitespace();
if self.line() != line {
return Err(self.span(start).error(CppError::EmptyDefine));
}
self.consume_whitespace_oneline(start, CppError::EmptyDefine)?;
let id = self.expect_id()?;
// NOTE: does _not_ discard whitespace
if self.lexer_mut().match_next(b'(') {
Expand Down Expand Up @@ -1077,6 +1085,34 @@ impl<'a> PreProcessor<'a> {
}
}
}

/// Skips whitespace tokens and returns the first significant token, if any.
pub fn next_non_whitespace(&mut self) -> Option<CppResult<Token>> {
    // `PreProcessor` is itself an iterator over tokens, so `find` drives
    // `next()` until the predicate matches or the stream is exhausted.
    // Lex errors are *not* whitespace, so they are returned to the caller.
    self.find(|token| {
        !matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        )
    })
}

/// Consumes whitespace on the current line only.
///
/// Returns the consumed whitespace on success; if the whitespace spanned a
/// line break, returns `error` reported at the span starting at `start`.
#[inline]
fn consume_whitespace_oneline(
    &mut self,
    start: u32,
    error: CppError,
) -> Result<String, CompileError> {
    let line_before = self.line();
    let whitespace = self.file_processor.consume_whitespace();
    // A changed line number means the whitespace contained a newline.
    if self.line() == line_before {
        Ok(whitespace)
    } else {
        Err(self.span(start).error(error))
    }
}
}

#[derive(Copy, Clone, Debug, PartialEq, Eq)]
Expand Down Expand Up @@ -1199,7 +1235,7 @@ mod tests {

macro_rules! assert_err {
($src: expr, $err: pat, $description: expr $(,)?) => {
match cpp($src).next().unwrap().unwrap_err().data {
match cpp($src).next_non_whitespace().unwrap().unwrap_err().data {
Error::PreProcessor($err) => {}
Error::PreProcessor(other) => panic!("expected {}, got {}", $description, other),
_ => panic!("expected cpp err"),
Expand All @@ -1215,14 +1251,20 @@ mod tests {
_ => panic!("not a keyword: {:?}", token),
}
}
/// Compares two preprocessed token streams, ignoring source locations
/// and whitespace tokens on both sides.
fn is_same_preprocessed(xs: PreProcessor, ys: PreProcessor) -> bool {
    let left: Vec<_> = xs
        .filter_map(|res| match res.map(|token| token.data) {
            Ok(Token::Whitespace(_)) => None,
            other => Some(other),
        })
        .collect();
    let right: Vec<_> = ys
        .filter_map(|res| match res.map(|token| token.data) {
            Ok(Token::Whitespace(_)) => None,
            other => Some(other),
        })
        .collect();
    left == right
}
fn assert_same(src: &str, cpp_src: &str) {
assert_eq!(
cpp(src)
.map(|res| res.map(|token| token.data))
.collect::<Vec<_>>(),
cpp(cpp_src)
.map(|res| res.map(|token| token.data))
.collect::<Vec<_>>(),
assert!(
is_same_preprocessed(cpp(src), cpp(cpp_src)),
"{} is not the same as {}",
src,
cpp_src,
Expand Down Expand Up @@ -1270,12 +1312,12 @@ mod tests {
let code = "#ifdef a
whatever, doesn't matter
#endif";
assert_eq!(cpp(code).next(), None);
assert_eq!(cpp(code).next_non_whitespace(), None);

let code = "#ifdef a\n#endif";
assert_eq!(cpp(code).next(), None);
assert_eq!(cpp(code).next_non_whitespace(), None);

assert!(cpp("#ifdef").next().unwrap().is_err());
assert!(cpp("#ifdef").next_non_whitespace().unwrap().is_err());

let nested = "#ifdef a
#ifdef b
Expand All @@ -1284,14 +1326,14 @@ mod tests {
#endif
char;";
assert_eq!(
cpp(nested).next().unwrap().unwrap().data,
cpp(nested).next_non_whitespace().unwrap().unwrap().data,
Token::Keyword(Keyword::Char)
);

assert!(cpp("#endif").next().unwrap().is_err());
assert!(cpp("#endif").next_non_whitespace().unwrap().is_err());

let same_line = "#ifdef a #endif\nint main() {}";
assert!(cpp(same_line).next().unwrap().is_err());
assert!(cpp(same_line).next_non_whitespace().unwrap().is_err());
}
#[test]
fn ifndef() {
Expand All @@ -1300,7 +1342,7 @@ mod tests {
#define A
#endif
A";
assert!(cpp(src).next().is_none());
assert!(cpp(src).next_non_whitespace().is_none());
}
#[test]
fn object_macros() {
Expand Down Expand Up @@ -1479,15 +1521,14 @@ c
}
#[test]
fn test_comment_newline() {
let tokens: Vec<_> = cpp_no_newline(
let tokens = cpp_no_newline(
"
#if 1 //
int main() {}
#endif
",
)
.collect();
assert_eq!(tokens, cpp("int main() {}").collect::<Vec<_>>());
);
assert!(is_same_preprocessed(tokens, cpp("int main() {}")));
assert_same(
"
#if 1 /**//**/
Expand Down
15 changes: 14 additions & 1 deletion src/lex/files.rs
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ impl FileProcessor {
self.lexer().span(start)
}
#[inline]
pub(super) fn consume_whitespace(&mut self) {
pub(super) fn consume_whitespace(&mut self) -> String {
self.lexer_mut().consume_whitespace()
}
#[inline]
Expand Down Expand Up @@ -159,4 +159,17 @@ impl FileProcessor {
}
tokens
}

/// Returns next token in stream which is not whitespace.
///
/// Yields `None` when the underlying stream is exhausted; lex errors are
/// passed straight through to the caller.
pub(super) fn next_non_whitespace(&mut self) -> Option<CompileResult<Locatable<Token>>> {
    loop {
        let token = self.next()?;
        let is_whitespace = matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        );
        if !is_whitespace {
            return Some(token);
        }
    }
}
}
59 changes: 50 additions & 9 deletions src/lex/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -191,22 +191,29 @@ impl Lexer {
///
/// Before: b" // some comment\n /*multi comment*/hello "
/// After: b"hello "
fn consume_whitespace(&mut self) {
fn consume_whitespace(&mut self) -> String {
// there may be comments following whitespace
let mut whitespace = String::new();
loop {
// whitespace
while self.peek().map_or(false, |c| c.is_ascii_whitespace()) {
self.next_char();
if let Some(c) = self.next_char() {
whitespace.push(c.into());
jyn514 marked this conversation as resolved.
Show resolved Hide resolved
}
}
// comments
if self.peek() == Some(b'/') {
match self.peek_next() {
Some(b'/') => self.consume_line_comment(),
Some(b'/') => {
self.consume_line_comment();
whitespace.push('\n');
}
Some(b'*') => {
self.next_char();
self.next_char();
if let Err(err) = self.consume_multi_comment() {
self.error_handler.push_back(err);
match self.consume_multi_comment() {
Ok(ws) => whitespace.push_str(&ws),
Err(err) => self.error_handler.push_back(err),
}
}
_ => break,
Expand All @@ -215,6 +222,7 @@ impl Lexer {
break;
jyn514 marked this conversation as resolved.
Show resolved Hide resolved
}
}
whitespace
}
/// Remove all characters between now and the next b'\n' character.
///
Expand All @@ -232,12 +240,21 @@ impl Lexer {
///
/// Before: u8s{"hello this is a lot of text */ int main(){}"}
/// After: chars{" int main(){}"}
fn consume_multi_comment(&mut self) -> LexResult<()> {
///
/// Return newlines occupied by the comment or a space if no newlines
fn consume_multi_comment(&mut self) -> LexResult<String> {
let mut whitespace = String::new();
let start = self.location.offset - 2;
while let Some(c) = self.next_char() {
if c == b'*' && self.peek() == Some(b'/') {
self.next_char();
return Ok(());
if whitespace.is_empty() {
whitespace.push(' '); // For the case `a/* */b`
}
return Ok(whitespace);
}
if c == b'\n' {
whitespace.push(c.into());
}
}
Err(Locatable {
Expand Down Expand Up @@ -648,6 +665,19 @@ impl Lexer {
}
Ok(Token::Id(InternedStr::get_or_intern(id)))
}

/// Returns the next non-whitespace token, discarding leading whitespace.
pub fn next_non_whitespace(&mut self) -> Option<LexResult<Locatable<Token>>> {
    // `Lexer` implements `Iterator`, so `find` advances past every
    // whitespace token; errors are not whitespace and are returned as-is.
    self.find(|token| {
        !matches!(
            token,
            Ok(Locatable {
                data: Token::Whitespace(_),
                ..
            })
        )
    })
}
}

impl Iterator for Lexer {
Expand All @@ -668,7 +698,17 @@ impl Iterator for Lexer {
return None;
}

self.consume_whitespace();
{
let span_start = self.location.offset;
let data = self.consume_whitespace();
if !data.is_empty() {
return Some(Ok(Locatable {
data: Token::Whitespace(data),
location: self.span(span_start),
}));
}
}

let c = self.next_char().and_then(|c| {
let span_start = self.location.offset - 1;
// this giant switch is most of the logic
Expand Down Expand Up @@ -865,7 +905,8 @@ impl Iterator for Lexer {
.with(LexError::UnknownToken(x as char))));
}
};
self.seen_line_token |= data != Token::Hash;
// We've seen a token if this isn't # and this isn't whitespace
self.seen_line_token |= data != Token::Hash && !matches!(data, Token::Whitespace(_));
hdamron17 marked this conversation as resolved.
Show resolved Hide resolved
Some(Ok(Locatable {
data,
location: self.span(span_start),
Expand Down
14 changes: 12 additions & 2 deletions src/lex/tests.rs
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,17 @@ fn lex(input: &str) -> Option<LexType> {
lexed.pop()
}
fn lex_all(input: &str) -> Vec<LexType> {
cpp(input).collect()
cpp(input).filter(is_not_whitespace).collect()
}

/// Predicate for filtering lexed tokens: `false` only for a successfully
/// lexed whitespace token, `true` for everything else (including errors,
/// so they survive the filter and reach the caller).
pub(crate) fn is_not_whitespace(res: &LexType) -> bool {
    match res {
        Ok(Locatable {
            data: Token::Whitespace(_),
            ..
        }) => false,
        _ => true,
    }
}

fn match_data<T>(lexed: Option<LexType>, closure: T) -> bool
Expand Down Expand Up @@ -295,7 +305,7 @@ fn test_strings() {
#[test]
fn test_no_newline() {
assert!(cpp_no_newline("").next().is_none());
let mut tokens: Vec<_> = cpp_no_newline(" ").collect();
let mut tokens: Vec<_> = cpp_no_newline(" ").filter(is_not_whitespace).collect();
assert_eq!(tokens.len(), 1);
assert!(tokens.remove(0).unwrap_err().is_lex_err());

Expand Down
Loading