diff --git a/datafusion/core/tests/parquet/arrow_statistics.rs b/datafusion/core/tests/parquet/arrow_statistics.rs index e48c11fcbaab..ddb39fce4076 100644 --- a/datafusion/core/tests/parquet/arrow_statistics.rs +++ b/datafusion/core/tests/parquet/arrow_statistics.rs @@ -15,7 +15,7 @@ // specific language governing permissions and limitations // under the License. -//! This file contains an end to end test of extracting statitics from parquet files. +//! This file contains an end to end test of extracting statistics from parquet files. //! It writes data into a parquet file, reads statistics and verifies they are correct use std::default::Default; @@ -716,8 +716,8 @@ async fn test_timestamp() { // "seconds_timezoned" --> TimestampSecondArray // "names" --> StringArray // - // The file is created by 4 record batches, each has 5 rowws. - // Since the row group isze is set to 5, those 4 batches will go into 4 row groups + // The file is created by 4 record batches, each has 5 rows. + // Since the row group size is set to 5, those 4 batches will go into 4 row groups // This creates a parquet files of 4 columns named "nanos", "nanos_timezoned", "micros", "micros_timezoned", "millis", "millis_timezoned", "seconds", "seconds_timezoned" let reader = TestReader { scenario: Scenario::Timestamps, @@ -2039,7 +2039,7 @@ async fn test_missing_statistics() { expected_min: Arc::new(Int64Array::from(vec![None])), expected_max: Arc::new(Int64Array::from(vec![None])), expected_null_counts: UInt64Array::from(vec![None]), - expected_row_counts: Some(UInt64Array::from(vec![3])), // stil has row count statistics + expected_row_counts: Some(UInt64Array::from(vec![3])), // still has row count statistics column_name: "i64", check: Check::RowGroup, } diff --git a/datafusion/optimizer/src/eliminate_one_union.rs b/datafusion/optimizer/src/eliminate_one_union.rs index 68d0ddba8b20..7763e7d3b796 100644 --- a/datafusion/optimizer/src/eliminate_one_union.rs +++ b/datafusion/optimizer/src/eliminate_one_union.rs @@ -88,10 +88,11 @@ mod tests { } fn assert_optimized_plan_equal(plan: LogicalPlan, expected: &str) -> Result<()> { - assert_optimized_plan_eq_with_rules( + assert_optimized_plan_with_rules( vec![Arc::new(EliminateOneUnion::new())], plan, expected, + true, ) } diff --git a/datafusion/optimizer/src/propagate_empty_relation.rs b/datafusion/optimizer/src/propagate_empty_relation.rs index d08820c58a05..dfcfc313efcc 100644 --- a/datafusion/optimizer/src/propagate_empty_relation.rs +++ b/datafusion/optimizer/src/propagate_empty_relation.rs @@ -20,7 +20,7 @@ use std::sync::Arc; use datafusion_common::tree_node::Transformed; -use datafusion_common::JoinType::Inner; +use datafusion_common::JoinType; use datafusion_common::{internal_err, plan_err, Result}; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::LogicalPlan; @@ -94,13 +94,9 @@ impl OptimizerRule for PropagateEmptyRelation { Ok(Transformed::no(LogicalPlan::CrossJoin(join.clone()))) } - LogicalPlan::Join(ref join) if join.join_type == Inner => { + LogicalPlan::Join(ref join) => { // TODO: For Join, more join type need to be careful: - // For LeftOuter/LeftSemi/LeftAnti Join, only the left side is empty, the Join result is empty. - // For LeftSemi Join, if the right side is empty, the Join result is empty. // For LeftAnti Join, if the right side is empty, the Join result is left side(should exclude null ??). - // For RightOuter/RightSemi/RightAnti Join, only the right side is empty, the Join result is empty. - // For RightSemi Join, if the left side is empty, the Join result is empty. // For RightAnti Join, if the left side is empty, the Join result is right side(should exclude null ??). // For Full Join, only both sides are empty, the Join result is empty. // For LeftOut/Full Join, if the right side is empty, the Join can be eliminated with a Projection with left side @@ -108,15 +104,52 @@ impl OptimizerRule for PropagateEmptyRelation { // For RightOut/Full Join, if the left side is empty, the Join can be eliminated with a Projection with right side // columns + left side columns replaced with null values. let (left_empty, right_empty) = binary_plan_children_is_empty(&plan)?; - if left_empty || right_empty { - return Ok(Transformed::yes(LogicalPlan::EmptyRelation( - EmptyRelation { + + match join.join_type { + JoinType::Inner if left_empty || right_empty => Ok(Transformed::yes( + LogicalPlan::EmptyRelation(EmptyRelation { produce_one_row: false, schema: join.schema.clone(), - }, - ))); + }), + )), + JoinType::Left if left_empty => Ok(Transformed::yes( + LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + }), + )), + JoinType::Right if right_empty => Ok(Transformed::yes( + LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + }), + )), + JoinType::LeftSemi if left_empty || right_empty => Ok( + Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + })), + ), + JoinType::RightSemi if left_empty || right_empty => Ok( + Transformed::yes(LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + })), + ), + JoinType::LeftAnti if left_empty => Ok(Transformed::yes( + LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + }), + )), + JoinType::RightAnti if right_empty => Ok(Transformed::yes( + LogicalPlan::EmptyRelation(EmptyRelation { + produce_one_row: false, + schema: join.schema.clone(), + }), + )), + _ => Ok(Transformed::no(LogicalPlan::Join(join.clone()))), } - Ok(Transformed::no(LogicalPlan::Join(join.clone()))) } LogicalPlan::Aggregate(ref agg) => { if !agg.group_expr.is_empty() { @@ -222,7 +255,7 @@ mod tests { use crate::eliminate_filter::EliminateFilter; use crate::eliminate_nested_union::EliminateNestedUnion; use crate::test::{ - assert_optimized_plan_eq, assert_optimized_plan_eq_with_rules, test_table_scan, + assert_optimized_plan_eq, assert_optimized_plan_with_rules, test_table_scan, test_table_scan_fields, test_table_scan_with_name, }; @@ -232,11 +265,12 @@ mod tests { assert_optimized_plan_eq(Arc::new(PropagateEmptyRelation::new()), plan, expected) } - fn assert_together_optimized_plan_eq( + fn assert_together_optimized_plan( plan: LogicalPlan, expected: &str, + eq: bool, ) -> Result<()> { - assert_optimized_plan_eq_with_rules( + assert_optimized_plan_with_rules( vec![ Arc::new(EliminateFilter::new()), Arc::new(EliminateNestedUnion::new()), @@ -244,6 +278,7 @@ mod tests { ], plan, expected, + eq, ) } @@ -279,7 +314,7 @@ mod tests { .build()?; let expected = "EmptyRelation"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -292,7 +327,7 @@ mod tests { let plan = LogicalPlanBuilder::from(left).union(right)?.build()?; let expected = "TableScan: test"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -317,7 +352,7 @@ mod tests { let expected = "Union\ \n TableScan: test1\ \n TableScan: test4"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -342,7 +377,7 @@ mod tests { .build()?; let expected = "EmptyRelation"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -369,7 +404,7 @@ mod tests { let expected = "Union\ \n TableScan: test2\ \n TableScan: test3"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -382,7 +417,7 @@ mod tests { let plan = LogicalPlanBuilder::from(left).union(right)?.build()?; let expected = "TableScan: test"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } #[test] @@ -397,7 +432,103 @@ mod tests { .build()?; let expected = "EmptyRelation"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) + } + + fn assert_empty_left_empty_right_lp( + left_empty: bool, + right_empty: bool, + join_type: JoinType, + eq: bool, + ) -> Result<()> { + let left_lp = if left_empty { + let left_table_scan = test_table_scan()?; + + LogicalPlanBuilder::from(left_table_scan) + .filter(Expr::Literal(ScalarValue::Boolean(Some(false))))? + .build() + } else { + let scan = test_table_scan_with_name("left").unwrap(); + LogicalPlanBuilder::from(scan).build() + }?; + + let right_lp = if right_empty { + let right_table_scan = test_table_scan_with_name("right")?; + + LogicalPlanBuilder::from(right_table_scan) + .filter(Expr::Literal(ScalarValue::Boolean(Some(false))))? + .build() + } else { + let scan = test_table_scan_with_name("right").unwrap(); + LogicalPlanBuilder::from(scan).build() + }?; + + let plan = LogicalPlanBuilder::from(left_lp) + .join_using( + right_lp, + join_type, + vec![Column::from_name("a".to_string())], + )? + .build()?; + + let expected = "EmptyRelation"; + assert_together_optimized_plan(plan, expected, eq) + } + + #[test] + fn test_join_empty_propagation_rules() -> Result<()> { + // test left join with empty left + assert_empty_left_empty_right_lp(true, false, JoinType::Left, true)?; + + // test right join with empty right + assert_empty_left_empty_right_lp(false, true, JoinType::Right, true)?; + + // test left semi join with empty left + assert_empty_left_empty_right_lp(true, false, JoinType::LeftSemi, true)?; + + // test left semi join with empty right + assert_empty_left_empty_right_lp(false, true, JoinType::LeftSemi, true)?; + + // test right semi join with empty left + assert_empty_left_empty_right_lp(true, false, JoinType::RightSemi, true)?; + + // test right semi join with empty right + assert_empty_left_empty_right_lp(false, true, JoinType::RightSemi, true)?; + + // test left anti join empty left + assert_empty_left_empty_right_lp(true, false, JoinType::LeftAnti, true)?; + + // test right anti join empty right + assert_empty_left_empty_right_lp(false, true, JoinType::RightAnti, true) + } + + #[test] + fn test_join_empty_propagation_rules_noop() -> Result<()> { + // these cases should not result in an empty relation + + // test left join with empty right + assert_empty_left_empty_right_lp(false, true, JoinType::Left, false)?; + + // test right join with empty left + assert_empty_left_empty_right_lp(true, false, JoinType::Right, false)?; + + // test left semi with non-empty left and right + assert_empty_left_empty_right_lp(false, false, JoinType::LeftSemi, false)?; + + // test right semi with non-empty left and right + assert_empty_left_empty_right_lp(false, false, JoinType::RightSemi, false)?; + + // test left anti join with non-empty left and right + assert_empty_left_empty_right_lp(false, false, JoinType::LeftAnti, false)?; + + // test left anti with non-empty left and empty right + assert_empty_left_empty_right_lp(false, true, JoinType::LeftAnti, false)?; + + // test right anti join with non-empty left and right + assert_empty_left_empty_right_lp(false, false, JoinType::RightAnti, false)?; + + // test right anti with empty left and non-empty right + assert_empty_left_empty_right_lp(true, false, JoinType::RightAnti, false) } #[test] @@ -430,6 +561,6 @@ mod tests { let expected = "Projection: a, b, c\ \n TableScan: test"; - assert_together_optimized_plan_eq(plan, expected) + assert_together_optimized_plan(plan, expected, true) } } diff --git a/datafusion/optimizer/src/test/mod.rs b/datafusion/optimizer/src/test/mod.rs index 98d19956df3c..2c7e8644026e 100644 --- a/datafusion/optimizer/src/test/mod.rs +++ b/datafusion/optimizer/src/test/mod.rs @@ -121,6 +121,21 @@ pub fn assert_analyzed_plan_eq( Ok(()) } + +pub fn assert_analyzed_plan_ne( + rule: Arc, + plan: LogicalPlan, + expected: &str, +) -> Result<()> { + let options = ConfigOptions::default(); + let analyzed_plan = + Analyzer::with_rules(vec![rule]).execute_and_check(plan, &options, |_, _| {})?; + let formatted_plan = format!("{analyzed_plan:?}"); + assert_ne!(formatted_plan, expected); + + Ok(()) +} + pub fn assert_analyzed_plan_eq_display_indent( rule: Arc, plan: LogicalPlan, @@ -169,21 +184,33 @@ pub fn assert_optimized_plan_eq( Ok(()) } -pub fn assert_optimized_plan_eq_with_rules( +fn generate_optimized_plan_with_rules( rules: Vec>, plan: LogicalPlan, - expected: &str, -) -> Result<()> { +) -> LogicalPlan { fn observe(_plan: &LogicalPlan, _rule: &dyn OptimizerRule) {} let config = &mut OptimizerContext::new() .with_max_passes(1) .with_skip_failing_rules(false); let optimizer = Optimizer::with_rules(rules); - let optimized_plan = optimizer + optimizer .optimize(plan, config, observe) - .expect("failed to optimize plan"); + .expect("failed to optimize plan") +} + +pub fn assert_optimized_plan_with_rules( + rules: Vec>, + plan: LogicalPlan, + expected: &str, + eq: bool, +) -> Result<()> { + let optimized_plan = generate_optimized_plan_with_rules(rules, plan); let formatted_plan = format!("{optimized_plan:?}"); - assert_eq!(formatted_plan, expected); + if eq { + assert_eq!(formatted_plan, expected); + } else { + assert_ne!(formatted_plan, expected); + } Ok(()) } diff --git a/datafusion/sqllogictest/test_files/joins.slt b/datafusion/sqllogictest/test_files/joins.slt index ea7103255697..0cecd7935a74 100644 --- a/datafusion/sqllogictest/test_files/joins.slt +++ b/datafusion/sqllogictest/test_files/joins.slt @@ -3686,3 +3686,99 @@ set datafusion.explain.logical_plan_only = true; statement ok set datafusion.execution.target_partitions = 2; + +# Inner join with empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 as a WHERE 1=0 +) AS a INNER JOIN (SELECT 1 as a) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Inner join with empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a +) AS a INNER JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Left join with empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 as a WHERE 1=0 +) AS a LEFT JOIN (SELECT 1 as a) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Left join with empty left and empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 as a WHERE 1=0 +) AS a LEFT JOIN (SELECT 1 as a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Right join with empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a +) AS a RIGHT JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Right join with empty right and empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 as a WHERE 1=0 +) AS a RIGHT JOIN (SELECT 1 as a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Left SEMI join with empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a +) AS a LEFT SEMI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Left SEMI join with empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a WHERE 1=0 +) AS a LEFT SEMI JOIN (SELECT 1 AS a) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Right SEMI join with empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a WHERE 1=0 +) AS a RIGHT SEMI JOIN (SELECT 1 AS a) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Right SEMI join with empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a +) AS a RIGHT SEMI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Left ANTI join with empty left table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a WHERE 1=0 +) AS a LEFT ANTI JOIN (SELECT 1 AS a) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation + +# Right ANTI join with empty right table +query TT +EXPLAIN SELECT * FROM ( + SELECT 1 AS a +) AS a RIGHT ANTI JOIN (SELECT 1 AS a WHERE 1=0) AS b ON a.a=b.a; +---- +logical_plan EmptyRelation diff --git a/datafusion/sqllogictest/test_files/subquery.slt b/datafusion/sqllogictest/test_files/subquery.slt index f325d5567603..dbdb7fc76b8b 100644 --- a/datafusion/sqllogictest/test_files/subquery.slt +++ b/datafusion/sqllogictest/test_files/subquery.slt @@ -626,10 +626,7 @@ SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id query TT explain SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0) ---- -logical_plan -01)LeftSemi Join: t1.t1_id = __correlated_sq_1.t2_id -02)--TableScan: t1 projection=[t1_id, t1_name] -03)--EmptyRelation +logical_plan EmptyRelation query IT rowsort SELECT t1_id, t1_name FROM t1 WHERE EXISTS (SELECT * FROM t2 WHERE t2_id = t1_id limit 0)