Skip to content

Commit

Permalink
Correctly handle nested comments
Browse files Browse the repository at this point in the history
The tokenizer currently throws EOF error for

`select 'foo' /*/**/*/`

`last_ch` causes problems when tokenizing nested comments, we have to
consume the combination of /* or */

The existing `tokenize_nested_multiline_comment` test fails after fixing
this logic:

/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/
^^ Start                                      ^^ End nested comment

Relevant: #726
  • Loading branch information
hansott committed Dec 30, 2024
1 parent fe36020 commit b56b9be
Showing 1 changed file with 64 additions and 8 deletions.
72 changes: 64 additions & 8 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1855,22 +1855,30 @@ impl<'a> Tokenizer<'a> {
) -> Result<Option<Token>, TokenizerError> {
let mut s = String::new();
let mut nested = 1;
let mut last_ch = ' ';

loop {
match chars.next() {
Some(ch) => {
if last_ch == '/' && ch == '*' {
if ch == '/' && matches!(chars.peek(), Some('*')) {
s.push(ch);
s.push(chars.next().unwrap()); // consume the '*'
nested += 1;
} else if last_ch == '*' && ch == '/' {
continue;
}

if ch == '*' && matches!(chars.peek(), Some('/')) {
s.push(ch);
let slash = chars.next();
nested -= 1;
if nested == 0 {
s.pop();
s.pop(); // remove the last '/'
break Ok(Some(Token::Whitespace(Whitespace::MultiLineComment(s))));
}
s.push(slash.unwrap());
continue;
}

s.push(ch);
last_ch = ch;
}
None => {
break self.tokenizer_error(
Expand Down Expand Up @@ -2718,17 +2726,65 @@ mod tests {

#[test]
fn tokenize_nested_multiline_comment() {
let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");

let dialect = GenericDialect {};

let sql = String::from("0/*multi-line\n* \n/* comment \n /*comment*/*/ */ /comment*/1");
let tokens = Tokenizer::new(&dialect, &sql).tokenize().unwrap();
let expected = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment*/*/ */ /comment".to_string(),
"multi-line\n* \n/* comment \n /*comment*/*/ ".into(),
)),
Token::Whitespace(Whitespace::Space),
Token::Div,
Token::Word(Word {
value: "comment".to_string(),
quote_style: None,
keyword: Keyword::COMMENT,
}),
Token::Mul,
Token::Div,
Token::Number("1".to_string(), false),
];
compare(expected, tokens);

let sql2 = String::from("0/*multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/*/1");
let tokens2 = Tokenizer::new(&dialect, &sql2).tokenize().unwrap();
let expected2 = vec![
Token::Number("0".to_string(), false),
Token::Whitespace(Whitespace::MultiLineComment(
"multi-line\n* \n/* comment \n /*comment/**/ */ /comment*/".into(),
)),
Token::Number("1".to_string(), false),
];
compare(expected2, tokens2);

let sql3 = String::from("SELECT 1 /* a /* b */ c */");
let tokens3 = Tokenizer::new(&dialect, &sql3).tokenize().unwrap();
let expected3 = vec![
Token::make_keyword("SELECT"),
Token::Whitespace(Whitespace::Space),
Token::Number("1".to_string(), false),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::MultiLineComment(" a /* b */ c ".to_string())),
];
compare(expected3, tokens3);
}

#[test]
fn tokenize_nested_multiline_comment_empty() {
let sql = "select 'foo' /*/**/*/";

let dialect = GenericDialect {};
let tokens = Tokenizer::new(&dialect, sql).tokenize().unwrap();
let expected = vec![
Token::make_keyword("select"),
Token::Whitespace(Whitespace::Space),
Token::SingleQuotedString("foo".to_string()),
Token::Whitespace(Whitespace::Space),
Token::Whitespace(Whitespace::MultiLineComment("/**/".to_string())),
];

compare(expected, tokens);
}

Expand Down

0 comments on commit b56b9be

Please sign in to comment.