Skip to content

Commit

Permalink
Support simplifying expressions such as ~ ^(ba_r|foo)$ , where the … (
Browse files Browse the repository at this point in the history
  • Loading branch information
tanruixiang authored Aug 3, 2023
1 parent a6dcd94 commit 9c3a537
Show file tree
Hide file tree
Showing 2 changed files with 45 additions and 4 deletions.
33 changes: 33 additions & 0 deletions datafusion/optimizer/src/simplify_expressions/expr_simplifier.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2499,10 +2499,43 @@ mod tests {
col("c1")
.in_list(vec![lit("foo"), lit("bar"), lit("baz"), lit("qux")], false),
);
assert_change(
regex_match(col("c1"), lit("^(fo_o)$")),
col("c1").eq(lit("fo_o")),
);
assert_change(
regex_match(col("c1"), lit("^(fo_o)$")),
col("c1").eq(lit("fo_o")),
);
assert_change(
regex_match(col("c1"), lit("^(fo_o|ba_r)$")),
col("c1").eq(lit("fo_o")).or(col("c1").eq(lit("ba_r"))),
);
assert_change(
regex_not_match(col("c1"), lit("^(fo_o|ba_r)$")),
col("c1")
.not_eq(lit("fo_o"))
.and(col("c1").not_eq(lit("ba_r"))),
);
assert_change(
regex_match(col("c1"), lit("^(fo_o|ba_r|ba_z)$")),
((col("c1").eq(lit("fo_o"))).or(col("c1").eq(lit("ba_r"))))
.or(col("c1").eq(lit("ba_z"))),
);
assert_change(
regex_match(col("c1"), lit("^(fo_o|ba_r|baz|qu_x)$")),
col("c1").in_list(
vec![lit("fo_o"), lit("ba_r"), lit("baz"), lit("qu_x")],
false,
),
);

// regular expressions that mismatch captured literals
assert_no_change(regex_match(col("c1"), lit("(foo|bar)")));
assert_no_change(regex_match(col("c1"), lit("(foo|bar)*")));
assert_no_change(regex_match(col("c1"), lit("(fo_o|b_ar)")));
assert_no_change(regex_match(col("c1"), lit("(foo|ba_r)*")));
assert_no_change(regex_match(col("c1"), lit("(fo_o|ba_r)*")));
assert_no_change(regex_match(col("c1"), lit("^(foo|bar)*")));
assert_no_change(regex_match(col("c1"), lit("^foo|bar$")));
assert_no_change(regex_match(col("c1"), lit("^(foo)(bar)$")));
Expand Down
16 changes: 12 additions & 4 deletions datafusion/optimizer/src/simplify_expressions/regex.rs
Original file line number Diff line number Diff line change
Expand Up @@ -108,7 +108,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {

for sub in parts {
if let HirKind::Literal(l) = sub.kind() {
s.push_str(str_from_literal(l)?);
s.push_str(like_str_from_literal(l)?);
} else {
return None;
}
Expand All @@ -120,7 +120,7 @@ fn collect_concat_to_like_string(parts: &[Hir]) -> Option<String> {

/// Returns the `&str` represented by a `Literal` if its bytes are valid UTF-8
/// and the text is safe for LIKE (contains neither '%' nor '_').
fn str_from_literal(l: &Literal) -> Option<&str> {
fn like_str_from_literal(l: &Literal) -> Option<&str> {
// if not utf8, no good
let s = std::str::from_utf8(&l.0).ok()?;

Expand All @@ -131,6 +131,14 @@ fn str_from_literal(l: &Literal) -> Option<&str> {
}
}

/// Returns the text held by `l` as a `&str`, or `None` when its bytes are
/// not a valid UTF-8 sequence. No other check is applied to the contents.
fn str_from_literal(l: &Literal) -> Option<&str> {
    // `Result::ok` drops the decoding error, turning a failure into `None`.
    std::str::from_utf8(&l.0).ok()
}

/// Reports whether `c` may appear verbatim in a LIKE pattern string:
/// `true` for every character except '%' and '_'.
fn is_safe_for_like(c: char) -> bool {
    !matches!(c, '%' | '_')
}
Expand Down Expand Up @@ -196,7 +204,7 @@ fn anchored_literal_to_expr(v: &[Hir]) -> Option<Expr> {
2 => Some(lit("")),
3 => {
let HirKind::Literal(l) = v[1].kind() else { return None };
str_from_literal(l).map(lit)
like_str_from_literal(l).map(lit)
}
_ => None,
}
Expand Down Expand Up @@ -242,7 +250,7 @@ fn lower_simple(mode: &OperatorMode, left: &Expr, hir: &Hir) -> Option<Expr> {
return Some(mode.expr(Box::new(left.clone()), "%".to_owned()));
}
HirKind::Literal(l) => {
let s = str_from_literal(l)?;
let s = like_str_from_literal(l)?;
return Some(mode.expr(Box::new(left.clone()), format!("%{s}%")));
}
HirKind::Concat(inner) if is_anchored_literal(inner) => {
Expand Down

0 comments on commit 9c3a537

Please sign in to comment.