diff --git a/datafusion/core/src/dataframe.rs b/datafusion/core/src/dataframe.rs index d704c7f30443..0c7548fc75c6 100644 --- a/datafusion/core/src/dataframe.rs +++ b/datafusion/core/src/dataframe.rs @@ -582,12 +582,21 @@ impl DataFrame { Ok(DataFrame::new(self.session_state, plan)) } - /// Join this DataFrame with another DataFrame using the specified columns as join keys. + /// Join this `DataFrame` with another `DataFrame` using explicitly specified + /// columns and an optional filter expression. /// - /// Filter expression expected to contain non-equality predicates that can not be pushed - /// down to any of join inputs. - /// In case of outer join, filter applied to only matched rows. + /// See [`join_on`](Self::join_on) for a more concise way to specify the + /// join condition. Since DataFusion will automatically identify and + /// optimize equality predicates there is no performance difference between + /// this function and `join_on` /// + /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see + /// example below), which are then combined with the optional `filter` + /// expression. + /// + /// Note that in case of outer join, the `filter` is applied to only matched rows. + /// + /// # Example /// ``` /// # use datafusion::prelude::*; /// # use datafusion::error::Result; @@ -600,11 +609,14 @@ impl DataFrame { /// col("a").alias("a2"), /// col("b").alias("b2"), /// col("c").alias("c2")])?; + /// // Perform the equivalent of `left INNER JOIN right ON (a = a2 AND b = b2)` + /// // finding all pairs of rows from `left` and `right` where `a = a2` and `b = b2`. /// let join = left.join(right, JoinType::Inner, &["a", "b"], &["a2", "b2"], None)?; /// let batches = join.collect().await?; /// # Ok(()) /// # } /// ``` + /// pub fn join( self, right: DataFrame, @@ -624,10 +636,13 @@ impl DataFrame { Ok(DataFrame::new(self.session_state, plan)) } - /// Join this DataFrame with another DataFrame using the specified expressions. + /// Join this `DataFrame` with another `DataFrame` using the specified + /// expressions. /// - /// Simply a thin wrapper over [`join`](Self::join) where the join keys are not provided, - /// and the provided expressions are AND'ed together to form the filter expression. + /// Note that DataFusion automatically optimizes joins, including + /// identifying and optimizing equality predicates. + /// + /// # Example /// /// ``` /// # use datafusion::prelude::*; @@ -646,6 +661,10 @@ impl DataFrame { /// col("b").alias("b2"), /// col("c").alias("c2"), /// ])?; + /// + /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)` + /// // finding all pairs of rows from `left` and `right` where + /// // where `a != a2` and `b != b2`. /// let join_on = left.join_on( /// right, /// JoinType::Inner, diff --git a/datafusion/expr/src/logical_plan/builder.rs b/datafusion/expr/src/logical_plan/builder.rs index 539f6382adf3..8397dd3aac34 100644 --- a/datafusion/expr/src/logical_plan/builder.rs +++ b/datafusion/expr/src/logical_plan/builder.rs @@ -611,11 +611,19 @@ impl LogicalPlanBuilder { }))) } - /// Apply a join with on constraint. + /// Apply a join to `right` using explicitly specified columns and an + /// optional filter expression. /// - /// Filter expression expected to contain non-equality predicates that can not be pushed - /// down to any of join inputs. - /// In case of outer join, filter applied to only matched rows. + /// See [`join_on`](Self::join_on) for a more concise way to specify the + /// join condition. Since DataFusion will automatically identify and + /// optimize equality predicates there is no performance difference between + /// this function and `join_on` + /// + /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see + /// example below), which are then combined with the optional `filter` + /// expression. + /// + /// Note that in case of outer join, the `filter` is applied to only matched rows. pub fn join( self, right: LogicalPlan, @@ -626,11 +634,12 @@ impl LogicalPlanBuilder { self.join_detailed(right, join_type, join_keys, filter, false) } - /// Apply a join with on constraint. + /// Apply a join with using the specified expressions. + /// + /// Note that DataFusion automatically optimizes joins, including + /// identifying and optimizing equality predicates. /// - /// The `ExtractEquijoinPredicate` optimizer pass has the ability to split join predicates into - /// equijoin predicates and (other) filter predicates. Therefore, if you prefer not to manually split the - /// join predicates, it is recommended to use the `join_on` method instead of the `join` method. + /// # Example /// /// ``` /// # use datafusion_expr::{Expr, col, LogicalPlanBuilder, @@ -650,9 +659,17 @@ impl LogicalPlanBuilder { /// /// let right_plan = LogicalPlanBuilder::scan("right", right_table, None)?.build()?; /// - /// let exprs = vec![col("left.a").eq(col("right.a")), col("left.b").not_eq(col("right.b"))] + /// // Form the expression `(left.a != right.a AND left.b != right.b)` + /// let exprs = vec![ + /// col("left.a").eq(col("right.a")), + /// col("left.b").not_eq(col("right.b")) + /// ] /// .into_iter() /// .reduce(Expr::and); + /// + /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)` + /// // finding all pairs of rows from `left` and `right` where + /// // where `a != a2` and `b != b2`. /// let plan = LogicalPlanBuilder::scan("left", left_table, None)? /// .join_on(right_plan, JoinType::Inner, exprs)? /// .build()?; @@ -687,8 +704,14 @@ impl LogicalPlanBuilder { ) } - /// Apply a join with on constraint and specified null equality - /// If null_equals_null is true then null == null, else null != null + /// Apply a join with on constraint and specified null equality. + /// + /// The behavior is the same as [`join`](Self::join) except that it allows + /// specifying the null equality behavior. + /// + /// If `null_equals_null=true`, rows where both join keys are `null` will be + /// emitted. Otherwise rows where either or both join keys are `null` will be + /// omitted. pub fn join_detailed( self, right: LogicalPlan,