Skip to content

Commit

Permalink
Add support for backslash escape
Browse files Browse the repository at this point in the history
This adds support for parsing string literals in
dialects that treat the backslash character as an escape
character. As an example, the following statement previously failed
to parse under dialects like BigQuery, where the syntax is valid.
```sql
SELECT 'a\'b';
```

Moves the SQL `like` and `similar_to` tests from individual
dialects to common since the tests were identical.
  • Loading branch information
iffyio committed Mar 23, 2024
1 parent 6b03a25 commit 7273ded
Show file tree
Hide file tree
Showing 16 changed files with 214 additions and 893 deletions.
6 changes: 3 additions & 3 deletions src/ast/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -441,21 +441,21 @@ pub enum Expr {
negated: bool,
expr: Box<Expr>,
pattern: Box<Expr>,
escape_char: Option<char>,
escape_char: Option<String>,
},
/// `ILIKE` (case-insensitive `LIKE`)
ILike {
negated: bool,
expr: Box<Expr>,
pattern: Box<Expr>,
escape_char: Option<char>,
escape_char: Option<String>,
},
/// SIMILAR TO regex
SimilarTo {
negated: bool,
expr: Box<Expr>,
pattern: Box<Expr>,
escape_char: Option<char>,
escape_char: Option<String>,
},
/// MySQL: RLIKE regex or REGEXP regex
RLike {
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/bigquery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -29,4 +29,8 @@ impl Dialect for BigQueryDialect {
/// An unquoted identifier may continue with any ASCII letter, any ASCII digit,
/// or an underscore.
fn is_identifier_part(&self, ch: char) -> bool {
    ch.is_ascii_alphanumeric() || ch == '_'
}

/// BigQuery treats `\` inside a string literal as an escape character,
/// so a literal such as `'a\'b'` is a single string.
fn supports_string_literal_backslash_escape(&self) -> bool {
true
}
}
4 changes: 4 additions & 0 deletions src/dialect/clickhouse.rs
Original file line number Diff line number Diff line change
Expand Up @@ -25,4 +25,8 @@ impl Dialect for ClickHouseDialect {
/// A character may continue an unquoted identifier if it could also start one
/// (per `is_identifier_start`) or if it is an ASCII digit.
fn is_identifier_part(&self, ch: char) -> bool {
self.is_identifier_start(ch) || ch.is_ascii_digit()
}

/// Opts the ClickHouse dialect in to backslash escapes inside string literals.
fn supports_string_literal_backslash_escape(&self) -> bool {
true
}
}
17 changes: 17 additions & 0 deletions src/dialect/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -120,6 +120,23 @@ pub trait Dialect: Debug + Any {
fn is_identifier_start(&self, ch: char) -> bool;
/// Determine if a character is a valid unquoted identifier character
fn is_identifier_part(&self, ch: char) -> bool;
/// Determine if the dialect supports escaping characters via '\' in string literals.
///
/// Some dialects like BigQuery and Snowflake support this while others like
/// Postgres do not. As a result, the following statement is accepted by the
/// former but rejected by the latter:
/// ```sql
/// SELECT 'ab\'cd';
/// ```
///
/// Conversely, dialects that support backslash escapes reject the following
/// statement, which is otherwise valid in the other dialects:
/// ```sql
/// SELECT '\';
/// ```
///
/// The default implementation returns `false`; a dialect opts in by
/// overriding this method to return `true`.
fn supports_string_literal_backslash_escape(&self) -> bool {
false
}
/// Does the dialect support `FILTER (WHERE expr)` for aggregate queries?
fn supports_filter_during_aggregation(&self) -> bool {
false
Expand Down
4 changes: 4 additions & 0 deletions src/dialect/snowflake.rs
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ impl Dialect for SnowflakeDialect {
|| ch == '_'
}

/// Snowflake treats `\` inside a string literal as an escape character,
/// so a literal such as `'ab\'cd'` is a single string.
fn supports_string_literal_backslash_escape(&self) -> bool {
true
}

/// Turns on the `supports_within_after_array_aggregation` dialect flag for Snowflake.
// NOTE(review): presumably this lets the parser accept a `WITHIN GROUP (...)` clause
// after an array aggregate call — confirm against the trait documentation.
fn supports_within_after_array_aggregation(&self) -> bool {
true
}
Expand Down
4 changes: 2 additions & 2 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2449,9 +2449,9 @@ impl<'a> Parser<'a> {
}

/// parse the ESCAPE CHAR portion of LIKE, ILIKE, and SIMILAR TO
pub fn parse_escape_char(&mut self) -> Result<Option<char>, ParserError> {
pub fn parse_escape_char(&mut self) -> Result<Option<String>, ParserError> {
if self.parse_keyword(Keyword::ESCAPE) {
Ok(Some(self.parse_literal_char()?))
Ok(Some(self.parse_literal_string()?))
} else {
Ok(None)
}
Expand Down
75 changes: 65 additions & 10 deletions src/tokenizer.rs
Original file line number Diff line number Diff line change
Expand Up @@ -627,11 +627,11 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::SingleQuotedByteStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"')?;
let s = self.tokenize_quoted_string(chars, '\"', true)?;
Ok(Some(Token::DoubleQuotedByteStringLiteral(s)))
}
_ => {
Expand All @@ -646,11 +646,11 @@ impl<'a> Tokenizer<'a> {
chars.next(); // consume
match chars.peek() {
Some('\'') => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::RawStringLiteral(s)))
}
Some('\"') => {
let s = self.tokenize_quoted_string(chars, '\"')?;
let s = self.tokenize_quoted_string(chars, '\"', true)?;
Ok(Some(Token::RawStringLiteral(s)))
}
_ => {
Expand All @@ -666,7 +666,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// N'...' - a <national character string literal>
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::NationalStringLiteral(s)))
}
_ => {
Expand Down Expand Up @@ -700,7 +700,7 @@ impl<'a> Tokenizer<'a> {
match chars.peek() {
Some('\'') => {
// X'...' - a <binary string literal>
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(chars, '\'', true)?;
Ok(Some(Token::HexStringLiteral(s)))
}
_ => {
Expand All @@ -712,15 +712,23 @@ impl<'a> Tokenizer<'a> {
}
// single quoted string
'\'' => {
let s = self.tokenize_quoted_string(chars, '\'')?;
let s = self.tokenize_quoted_string(
chars,
'\'',
self.dialect.supports_string_literal_backslash_escape(),
)?;

Ok(Some(Token::SingleQuotedString(s)))
}
// double quoted string
'\"' if !self.dialect.is_delimited_identifier_start(ch)
&& !self.dialect.is_identifier_start(ch) =>
{
let s = self.tokenize_quoted_string(chars, '"')?;
let s = self.tokenize_quoted_string(
chars,
'"',
self.dialect.supports_string_literal_backslash_escape(),
)?;

Ok(Some(Token::DoubleQuotedString(s)))
}
Expand Down Expand Up @@ -1211,15 +1219,17 @@ impl<'a> Tokenizer<'a> {
&self,
chars: &mut State,
quote_style: char,
allow_escape: bool,
) -> Result<String, TokenizerError> {
let mut s = String::new();
let error_loc = chars.location();

chars.next(); // consume the opening quote

let mut escape_on = false;
while let Some(&ch) = chars.peek() {
match ch {
char if char == quote_style => {
char if !escape_on && char == quote_style => {
chars.next(); // consume
if chars.peek().map(|c| *c == quote_style).unwrap_or(false) {
s.push(ch);
Expand All @@ -1235,6 +1245,11 @@ impl<'a> Tokenizer<'a> {
'\\' => {
// consume
chars.next();

if allow_escape {
escape_on = !escape_on;
}

// slash escaping is specific to MySQL dialect.
if dialect_of!(self is MySqlDialect) {
if let Some(next) = chars.peek() {
Expand All @@ -1258,6 +1273,7 @@ impl<'a> Tokenizer<'a> {
s.push(n);
chars.next(); // consume next
}
escape_on = false;
}
} else {
s.push(ch);
Expand All @@ -1266,6 +1282,7 @@ impl<'a> Tokenizer<'a> {
_ => {
chars.next(); // consume
s.push(ch);
escape_on = false;
}
}
}
Expand Down Expand Up @@ -1506,7 +1523,7 @@ impl<'a: 'b, 'b> Unescape<'a, 'b> {
#[cfg(test)]
mod tests {
use super::*;
use crate::dialect::{ClickHouseDialect, MsSqlDialect};
use crate::dialect::{BigQueryDialect, ClickHouseDialect, MsSqlDialect};

Check warning on line 1526 in src/tokenizer.rs

View workflow job for this annotation

GitHub Actions / test (beta)

the item `BigQueryDialect` is imported redundantly

Check warning on line 1526 in src/tokenizer.rs

View workflow job for this annotation

GitHub Actions / test (nightly)

the item `BigQueryDialect` is imported redundantly

#[test]
fn tokenizer_error_impl() {
Expand Down Expand Up @@ -2306,4 +2323,42 @@ mod tests {
check_unescape(r"Hello\0", None);
check_unescape(r"Hello\xCADRust", None);
}

/// Checks how single-quoted strings containing backslashes are tokenized, both
/// for a dialect that treats `\` as an escape character (BigQuery) and for one
/// that does not (Generic).
#[test]
fn tokenize_quoted_string_escape() {
    // With backslash escaping enabled, an escaped quote does not terminate the
    // literal and the backslashes are kept verbatim in the token; a doubled
    // quote still collapses to a single quote.
    let escaped_cases = [
        (r#"'%a\'%b'"#, r#"%a\'%b"#),
        (r#"'a\'\'b\'c\'d'"#, r#"a\'\'b\'c\'d"#),
        (r#"'\\'"#, r#"\\"#),
        (r#"'\\a\\b\'c'"#, r#"\\a\\b\'c"#),
        (r#"'\'abcd'"#, r#"\'abcd"#),
        (r#"'''a''b'"#, r#"'a'b"#),
    ];
    for (input, want) in escaped_cases {
        let bigquery = BigQueryDialect {};
        let got = Tokenizer::new(&bigquery, input).tokenize().unwrap();
        compare(vec![Token::SingleQuotedString(want.to_string())], got);
    }

    // A trailing backslash escapes the closing quote, so the literal never ends.
    let unterminated_cases = [r#"'\'"#, r#"'ab\'"#];
    for input in unterminated_cases {
        let bigquery = BigQueryDialect {};
        let mut lexer = Tokenizer::new(&bigquery, input);
        let failure = lexer.tokenize().unwrap_err();
        assert_eq!("Unterminated string literal", failure.message.as_str());
    }

    // Non-escape dialect: the same inputs are complete literals ending in `\`.
    let literal_cases = [(r#"'\'"#, r#"\"#), (r#"'ab\'"#, r#"ab\"#)];
    for (input, want) in literal_cases {
        let generic = GenericDialect {};
        let got = Tokenizer::new(&generic, input).tokenize().unwrap();
        compare(vec![Token::SingleQuotedString(want.to_string())], got);
    }
}
}
109 changes: 0 additions & 109 deletions tests/sqlparser_bigquery.rs
Original file line number Diff line number Diff line change
Expand Up @@ -1083,115 +1083,6 @@ fn parse_cast_bytes_to_string_format() {
bigquery_and_generic().verified_only_select(sql);
}

/// Verifies that BigQuery parses `LIKE` and `NOT LIKE` predicates, with and
/// without an `ESCAPE` clause, and that both bind tighter than a trailing
/// `IS NULL`.
#[test]
fn parse_like() {
    for negated in [false, true] {
        let not_kw = if negated { "NOT " } else { "" };

        // Expected `Expr::Like` node; only the escape character varies per query.
        let like_node = |escape_char: Option<char>| Expr::Like {
            expr: Box::new(Expr::Identifier(Ident::new("name"))),
            negated,
            pattern: Box::new(Expr::Value(Value::SingleQuotedString("%a".to_string()))),
            escape_char,
        };

        // Plain pattern without an escape clause.
        let query = format!("SELECT * FROM customers WHERE name {}LIKE '%a'", not_kw);
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(like_node(None), parsed.selection.unwrap());

        // Test with escape char
        let query = format!(
            "SELECT * FROM customers WHERE name {}LIKE '%a' ESCAPE '\\'",
            not_kw
        );
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(like_node(Some('\\')), parsed.selection.unwrap());

        // This statement tests that LIKE and NOT LIKE have the same precedence.
        // This was previously mishandled (#81).
        let query = format!(
            "SELECT * FROM customers WHERE name {}LIKE '%a' IS NULL",
            not_kw
        );
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(
            Expr::IsNull(Box::new(like_node(None))),
            parsed.selection.unwrap()
        );
    }
}

/// Verifies that BigQuery parses `SIMILAR TO` and `NOT SIMILAR TO` predicates,
/// with and without an `ESCAPE` clause, and that both bind tighter than a
/// trailing `IS NULL`.
#[test]
fn parse_similar_to() {
    for negated in [false, true] {
        let not_kw = if negated { "NOT " } else { "" };

        // Expected `Expr::SimilarTo` node; only the escape character varies per query.
        let similar_node = |escape_char: Option<char>| Expr::SimilarTo {
            expr: Box::new(Expr::Identifier(Ident::new("name"))),
            negated,
            pattern: Box::new(Expr::Value(Value::SingleQuotedString("%a".to_string()))),
            escape_char,
        };

        // Plain pattern without an escape clause.
        let query = format!(
            "SELECT * FROM customers WHERE name {}SIMILAR TO '%a'",
            not_kw
        );
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(similar_node(None), parsed.selection.unwrap());

        // Test with escape char
        let query = format!(
            "SELECT * FROM customers WHERE name {}SIMILAR TO '%a' ESCAPE '\\'",
            not_kw
        );
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(similar_node(Some('\\')), parsed.selection.unwrap());

        // This statement tests that SIMILAR TO and NOT SIMILAR TO have the same precedence.
        let query = format!(
            "SELECT * FROM customers WHERE name {}SIMILAR TO '%a' ESCAPE '\\' IS NULL",
            not_kw
        );
        let parsed = bigquery().verified_only_select(&query);
        assert_eq!(
            Expr::IsNull(Box::new(similar_node(Some('\\')))),
            parsed.selection.unwrap()
        );
    }
}

#[test]
fn parse_array_agg_func() {
for sql in [
Expand Down
Loading

0 comments on commit 7273ded

Please sign in to comment.