-
Notifications
You must be signed in to change notification settings - Fork 1.3k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Encode all join conditions in a single expression field #7612
Changes from 2 commits
27e8ab3
301414c
cf651fd
590e7a9
b48cc0f
3f2d442
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -129,6 +129,17 @@ fn scalar_function_type_from_str(name: &str) -> Result<ScalarFunctionType> { | |
} | ||
} | ||
|
||
#[inline(always)] | ||
fn make_mixed_join_condition( | ||
join_filter: Option<Expr>, | ||
predicate: &Expr, | ||
) -> Option<Expr> { | ||
match &join_filter { | ||
Some(filter) => Some(filter.clone().and(predicate.clone())), | ||
None => Some(predicate.clone()), | ||
} | ||
} | ||
|
||
/// Convert Substrait Plan to DataFusion DataFrame | ||
pub async fn from_substrait_plan( | ||
ctx: &mut SessionContext, | ||
|
@@ -331,6 +342,12 @@ pub async fn from_substrait_rel( | |
} | ||
} | ||
Some(RelType::Join(join)) => { | ||
if join.post_join_filter.is_some() { | ||
return not_impl_err!( | ||
"JoinRel with post_join_filter is not yet supported" | ||
); | ||
} | ||
|
||
let left = LogicalPlanBuilder::from( | ||
from_substrait_rel(ctx, join.left.as_ref().unwrap(), extensions).await?, | ||
); | ||
|
@@ -341,65 +358,68 @@ pub async fn from_substrait_rel( | |
// The join condition expression needs full input schema and not the output schema from join since we lose columns from | ||
// certain join types such as semi and anti joins | ||
let in_join_schema = left.schema().join(right.schema())?; | ||
// Parse post join filter if exists | ||
let join_filter = match &join.post_join_filter { | ||
Some(filter) => { | ||
let parsed_filter = | ||
from_substrait_rex(filter, &in_join_schema, extensions).await?; | ||
Some(parsed_filter.as_ref().clone()) | ||
} | ||
None => None, | ||
}; | ||
let mut join_filter: Option<Expr> = None; | ||
// If join expression exists, parse the `on` condition expression, build join and return | ||
// Otherwise, build join with koin filter, without join keys | ||
// Otherwise, build join with only the filter, without join keys | ||
match &join.expression.as_ref() { | ||
Some(expr) => { | ||
let on = | ||
from_substrait_rex(expr, &in_join_schema, extensions).await?; | ||
let predicates = split_conjunction(&on); | ||
// TODO: collect only one null_eq_null | ||
let join_exprs: Vec<(Column, Column, bool)> = predicates | ||
.iter() | ||
.map(|p| match p { | ||
// The predicates can contain both equal and non-equal ops. | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not sure if this is a good idea, it seems to make things more complex. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @Dandandan Thank you for pointing this out. I agree that this makes things more complicated, and my apologies for the inaccurate explanation of the issue. I added a comment to correct my description of the issue. The high-level idea is that join I'll modify the consumer to throw an error for now if there's a There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I looked into the substrait spec, and it doesn't really talk about what semantics of the There is a subtle distinction between non equality filters applied during the join (in the
This makes sense to me For the consumer, there is already code in DataFusion that breaks up an arbitrary I think we could do the same here in the subtrait consumer which would be much simpler, and would let the normal DataFusion optimization machinery work. There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Thanks @alamb for the pointer! I'll look into this and update the consumer. |
||
// Since as of datafusion 31.0.0, the equal and non equal join conditions are in separate fields, | ||
// we extract each part as follows: | ||
// - If an equal op is encountered, add the left column, right column and is_null_equal_nulls to `join_on` vector | ||
// - Otherwise we add the expression to join_filter (use conjunction if filter already exists) | ||
let mut join_on: Vec<(Column, Column, bool)> = vec![]; | ||
for p in predicates { | ||
match p { | ||
Expr::BinaryExpr(BinaryExpr { left, op, right }) => { | ||
match (left.as_ref(), right.as_ref()) { | ||
(Expr::Column(l), Expr::Column(r)) => match op { | ||
Operator::Eq => Ok((l.clone(), r.clone(), false)), | ||
Operator::Eq => { | ||
join_on.push((l.clone(), r.clone(), false)) | ||
} | ||
Operator::IsNotDistinctFrom => { | ||
Ok((l.clone(), r.clone(), true)) | ||
join_on.push((l.clone(), r.clone(), true)) | ||
} | ||
_ => { | ||
// If the operator is not a form of an equal operator, add this expression to filter | ||
join_filter = | ||
make_mixed_join_condition(join_filter, p); | ||
} | ||
_ => plan_err!("invalid join condition op"), | ||
}, | ||
_ => plan_err!("invalid join condition expression"), | ||
_ => { | ||
// If this is not a `col op col` expression, then `and` the expression to filter | ||
join_filter = | ||
make_mixed_join_condition(join_filter, p); | ||
} | ||
} | ||
} | ||
_ => plan_err!( | ||
"Non-binary expression is not supported in join condition" | ||
), | ||
}) | ||
.collect::<Result<Vec<_>>>()?; | ||
_ => { | ||
// If this is not a binary expression, then `and` the expression to filter | ||
join_filter = make_mixed_join_condition(join_filter, p); | ||
} | ||
} | ||
} | ||
|
||
let (left_cols, right_cols, null_eq_nulls): (Vec<_>, Vec<_>, Vec<_>) = | ||
itertools::multiunzip(join_exprs); | ||
itertools::multiunzip(join_on); | ||
left.join_detailed( | ||
right.build()?, | ||
join_type, | ||
(left_cols, right_cols), | ||
join_filter, | ||
null_eq_nulls[0], | ||
if null_eq_nulls.is_empty() { | ||
false // if no equal condition, default null_eq_nulls to false | ||
} else { | ||
null_eq_nulls[0] | ||
}, | ||
)? | ||
.build() | ||
} | ||
None => match &join_filter { | ||
Some(_) => left | ||
.join( | ||
right.build()?, | ||
join_type, | ||
(Vec::<Column>::new(), Vec::<Column>::new()), | ||
join_filter, | ||
)? | ||
.build(), | ||
None => plan_err!("Join without join keys require a valid filter"), | ||
}, | ||
None => plan_err!("JoinRel without join condition is not allowed"), | ||
} | ||
} | ||
Some(RelType::Read(read)) => match &read.as_ref().read_type { | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -277,30 +277,47 @@ pub fn to_substrait_rel( | |
// parse filter if exists | ||
let in_join_schema = join.left.schema().join(join.right.schema())?; | ||
let join_filter = match &join.filter { | ||
Some(filter) => Some(Box::new(to_substrait_rex( | ||
Some(filter) => Some(to_substrait_rex( | ||
filter, | ||
&Arc::new(in_join_schema), | ||
0, | ||
extension_info, | ||
)?)), | ||
)?), | ||
None => None, | ||
}; | ||
|
||
// map the left and right columns to binary expressions in the form `l = r` | ||
// build a single expression for the ON condition, such as `l.a = r.a AND l.b = r.b` | ||
let eq_op = if join.null_equals_null { | ||
Operator::IsNotDistinctFrom | ||
} else { | ||
Operator::Eq | ||
}; | ||
|
||
let join_expr = to_substrait_join_expr( | ||
let join_on = to_substrait_join_expr( | ||
&join.on, | ||
eq_op, | ||
join.left.schema(), | ||
join.right.schema(), | ||
extension_info, | ||
)? | ||
.map(Box::new); | ||
)?; | ||
|
||
// create conjunction between `join_on` and `join_filter` to embed all join conditions, | ||
// whether equal or non-equal in a single expression | ||
let join_expr = match &join_on { | ||
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Perhaps you could use |
||
Some(on_expr) => match &join_filter { | ||
Some(filter) => Some(Box::new(make_binary_op_scalar_func( | ||
on_expr, | ||
filter, | ||
Operator::And, | ||
extension_info, | ||
))), | ||
None => join_on.map(Box::new), // the join expression will only contain `join_on` if filter doesn't exist | ||
}, | ||
None => match &join_filter { | ||
Some(_) => join_filter.map(Box::new), // the join expression will only contain `join_file` if the `on` condition doesn't exist | ||
None => None, | ||
}, | ||
}; | ||
|
||
Ok(Box::new(Rel { | ||
rel_type: Some(RelType::Join(Box::new(JoinRel { | ||
|
@@ -309,7 +326,7 @@ pub fn to_substrait_rel( | |
right: Some(right), | ||
r#type: join_type as i32, | ||
expression: join_expr, | ||
post_join_filter: join_filter, | ||
post_join_filter: None, | ||
advanced_extension: None, | ||
}))), | ||
})) | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
👍