From d542cbda8f17ba004de18bb107ecf1c8ec3266f6 Mon Sep 17 00:00:00 2001 From: Peter Toth Date: Fri, 12 Jul 2024 12:53:05 +0200 Subject: [PATCH 01/19] Improve `CommonSubexprEliminate` rule with surely and conditionally evaluated stats (#11357) * Improve `CommonSubexprEliminate` rule with surely and conditionally evaluated stats * remove expression tree hashing as no longer needed * address review comments * add negative tests --- datafusion/expr/src/expr.rs | 39 ++- .../optimizer/src/common_subexpr_eliminate.rs | 256 +++++++++++------- .../optimizer/src/optimize_projections/mod.rs | 10 +- datafusion/sqllogictest/test_files/cse.slt | 88 +++++- datafusion/sqllogictest/test_files/select.slt | 20 +- .../sqllogictest/test_files/tpch/q14.slt.part | 33 +-- 6 files changed, 298 insertions(+), 148 deletions(-) diff --git a/datafusion/expr/src/expr.rs b/datafusion/expr/src/expr.rs index ecece6dbfce7..a344e621ddb1 100644 --- a/datafusion/expr/src/expr.rs +++ b/datafusion/expr/src/expr.rs @@ -17,7 +17,7 @@ //! Logical Expressions: [`Expr`] -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::fmt::{self, Display, Formatter, Write}; use std::hash::{Hash, Hasher}; use std::mem; @@ -1380,7 +1380,7 @@ impl Expr { /// // refs contains "a" and "b" /// assert_eq!(refs.len(), 2); /// assert!(refs.contains(&Column::new_unqualified("a"))); - /// assert!(refs.contains(&Column::new_unqualified("b"))); + /// assert!(refs.contains(&Column::new_unqualified("b"))); /// ``` pub fn column_refs(&self) -> HashSet<&Column> { let mut using_columns = HashSet::new(); @@ -1401,6 +1401,41 @@ impl Expr { .expect("traversal is infallable"); } + /// Return all references to columns and their occurrence counts in the expression. 
+ /// + /// # Example + /// ``` + /// # use std::collections::HashMap; + /// # use datafusion_common::Column; + /// # use datafusion_expr::col; + /// // For an expression `a + (b * a)` + /// let expr = col("a") + (col("b") * col("a")); + /// let mut refs = expr.column_refs_counts(); + /// // refs contains "a" and "b" + /// assert_eq!(refs.len(), 2); + /// assert_eq!(*refs.get(&Column::new_unqualified("a")).unwrap(), 2); + /// assert_eq!(*refs.get(&Column::new_unqualified("b")).unwrap(), 1); + /// ``` + pub fn column_refs_counts(&self) -> HashMap<&Column, usize> { + let mut map = HashMap::new(); + self.add_column_ref_counts(&mut map); + map + } + + /// Adds references to all columns and their occurrence counts in the expression to + /// the map. + /// + /// See [`Self::column_refs_counts`] for details + pub fn add_column_ref_counts<'a>(&'a self, map: &mut HashMap<&'a Column, usize>) { + self.apply(|expr| { + if let Expr::Column(col) = expr { + *map.entry(col).or_default() += 1; + } + Ok(TreeNodeRecursion::Continue) + }) + .expect("traversal is infallable"); + } + /// Returns true if there are any column references in this Expr pub fn any_column_refs(&self) -> bool { self.exists(|expr| Ok(matches!(expr, Expr::Column(_)))) diff --git a/datafusion/optimizer/src/common_subexpr_eliminate.rs b/datafusion/optimizer/src/common_subexpr_eliminate.rs index 721987b917d4..e4b36652974d 100644 --- a/datafusion/optimizer/src/common_subexpr_eliminate.rs +++ b/datafusion/optimizer/src/common_subexpr_eliminate.rs @@ -33,12 +33,12 @@ use datafusion_common::tree_node::{ use datafusion_common::{ internal_datafusion_err, qualified_name, Column, DFSchema, DFSchemaRef, Result, }; -use datafusion_expr::expr::Alias; +use datafusion_expr::expr::{Alias, ScalarFunction}; use datafusion_expr::logical_plan::tree_node::unwrap_arc; use datafusion_expr::logical_plan::{ Aggregate, Filter, LogicalPlan, Projection, Sort, Window, }; -use datafusion_expr::{col, Expr, ExprSchemable}; +use 
datafusion_expr::{col, BinaryExpr, Case, Expr, ExprSchemable, Operator}; use indexmap::IndexMap; const CSE_PREFIX: &str = "__common_expr"; @@ -56,13 +56,9 @@ struct Identifier<'n> { } impl<'n> Identifier<'n> { - fn new(expr: &'n Expr, is_tree: bool, random_state: &RandomState) -> Self { + fn new(expr: &'n Expr, random_state: &RandomState) -> Self { let mut hasher = random_state.build_hasher(); - if is_tree { - expr.hash(&mut hasher); - } else { - expr.hash_node(&mut hasher); - } + expr.hash_node(&mut hasher); let hash = hasher.finish(); Self { hash, expr } } @@ -110,8 +106,9 @@ impl Hash for Identifier<'_> { /// ``` type IdArray<'n> = Vec<(usize, Option>)>; -/// A map that contains the number of occurrences of expressions by their identifiers. -type ExprStats<'n> = HashMap, usize>; +/// A map that contains the number of normal and conditional occurrences of expressions by +/// their identifiers. +type ExprStats<'n> = HashMap, (usize, usize)>; /// A map that contains the common expressions and their alias extracted during the /// second, rewriting traversal. @@ -200,6 +197,7 @@ impl CommonSubexprEliminate { expr_mask, random_state: &self.random_state, found_common: false, + conditional: false, }; expr.visit(&mut visitor)?; @@ -901,15 +899,17 @@ struct ExprIdentifierVisitor<'a, 'n> { random_state: &'a RandomState, // a flag to indicate that common expression found found_common: bool, + // if we are in a conditional branch. A conditional branch means that the expression + // might not be executed depending on the runtime values of other expressions, and + // thus can not be extracted as a common expression. + conditional: bool, } /// Record item that used when traversing an expression tree. enum VisitRecord<'n> { /// Marks the beginning of expression. It contains: /// - The post-order index assigned during the first, visiting traversal. - /// - A boolean flag if the record marks an expression subtree (not just a single - /// node). 
- EnterMark(usize, bool), + EnterMark(usize), /// Marks an accumulated subexpression tree. It contains: /// - The accumulated identifier of a subexpression. @@ -924,10 +924,6 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { /// Find the first `EnterMark` in the stack, and accumulates every `ExprItem` before /// it. Returns a tuple that contains: /// - The pre-order index of the expression we marked. - /// - A boolean flag if we marked an expression subtree (not just a single node). - /// If true we didn't recurse into the node's children, so we need to calculate the - /// hash of the marked expression tree (not just the node) and we need to validate - /// the expression tree (not just the node). /// - The accumulated identifier of the children of the marked expression. /// - An accumulated boolean flag from the children of the marked expression if all /// children are valid for subexpression elimination (i.e. it is safe to extract the @@ -937,14 +933,14 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { /// information up from children to parents via `visit_stack` during the first, /// visiting traversal and no need to test the expression's validity beforehand with /// an extra traversal). - fn pop_enter_mark(&mut self) -> (usize, bool, Option>, bool) { + fn pop_enter_mark(&mut self) -> (usize, Option>, bool) { let mut expr_id = None; let mut is_valid = true; while let Some(item) = self.visit_stack.pop() { match item { - VisitRecord::EnterMark(down_index, is_tree) => { - return (down_index, is_tree, expr_id, is_valid); + VisitRecord::EnterMark(down_index) => { + return (down_index, expr_id, is_valid); } VisitRecord::ExprItem(sub_expr_id, sub_expr_is_valid) => { expr_id = Some(sub_expr_id.combine(expr_id)); @@ -954,53 +950,112 @@ impl<'n> ExprIdentifierVisitor<'_, 'n> { } unreachable!("Enter mark should paired with node number"); } + + /// Save the current `conditional` status and run `f` with `conditional` set to true. 
+ fn conditionally Result<()>>( + &mut self, + mut f: F, + ) -> Result<()> { + let conditional = self.conditional; + self.conditional = true; + f(self)?; + self.conditional = conditional; + + Ok(()) + } } impl<'n> TreeNodeVisitor<'n> for ExprIdentifierVisitor<'_, 'n> { type Node = Expr; fn f_down(&mut self, expr: &'n Expr) -> Result { - // If an expression can short circuit its children then don't consider its - // children for CSE (https://github.com/apache/arrow-datafusion/issues/8814). - // This means that we don't recurse into its children, but handle the expression - // as a subtree when we calculate its identifier. - // TODO: consider surely executed children of "short circuited"s for CSE - let is_tree = expr.short_circuits(); - let tnr = if is_tree { - TreeNodeRecursion::Jump - } else { - TreeNodeRecursion::Continue - }; - self.id_array.push((0, None)); self.visit_stack - .push(VisitRecord::EnterMark(self.down_index, is_tree)); + .push(VisitRecord::EnterMark(self.down_index)); self.down_index += 1; - Ok(tnr) + // If an expression can short-circuit then some of its children might not be + // executed so count the occurrence of subexpressions as conditional in all + // children. + Ok(match expr { + // If we are already in a conditionally evaluated subtree then continue + // traversal. + _ if self.conditional => TreeNodeRecursion::Continue, + + // In case of `ScalarFunction`s we don't know which children are surely + // executed so start visiting all children conditionally and stop the + // recursion with `TreeNodeRecursion::Jump`. + Expr::ScalarFunction(ScalarFunction { func, args }) + if func.short_circuits() => + { + self.conditionally(|visitor| { + args.iter().try_for_each(|e| e.visit(visitor).map(|_| ())) + })?; + + TreeNodeRecursion::Jump + } + + // In case of `And` and `Or` the first child is surely executed, but we + // account subexpressions as conditional in the second. 
+ Expr::BinaryExpr(BinaryExpr { + left, + op: Operator::And | Operator::Or, + right, + }) => { + left.visit(self)?; + self.conditionally(|visitor| right.visit(visitor).map(|_| ()))?; + + TreeNodeRecursion::Jump + } + + // In case of `Case` the optional base expression and the first when + // expressions are surely executed, but we account subexpressions as + // conditional in the others. + Expr::Case(Case { + expr, + when_then_expr, + else_expr, + }) => { + expr.iter().try_for_each(|e| e.visit(self).map(|_| ()))?; + when_then_expr.iter().take(1).try_for_each(|(when, then)| { + when.visit(self)?; + self.conditionally(|visitor| then.visit(visitor).map(|_| ())) + })?; + self.conditionally(|visitor| { + when_then_expr.iter().skip(1).try_for_each(|(when, then)| { + when.visit(visitor)?; + then.visit(visitor).map(|_| ()) + })?; + else_expr + .iter() + .try_for_each(|e| e.visit(visitor).map(|_| ())) + })?; + + TreeNodeRecursion::Jump + } + + // In case of non-short-circuit expressions continue the traversal. 
+ _ => TreeNodeRecursion::Continue, + }) } fn f_up(&mut self, expr: &'n Expr) -> Result { - let (down_index, is_tree, sub_expr_id, sub_expr_is_valid) = self.pop_enter_mark(); + let (down_index, sub_expr_id, sub_expr_is_valid) = self.pop_enter_mark(); - let (expr_id, is_valid) = if is_tree { - ( - Identifier::new(expr, true, self.random_state), - !expr.is_volatile()?, - ) - } else { - ( - Identifier::new(expr, false, self.random_state).combine(sub_expr_id), - !expr.is_volatile_node() && sub_expr_is_valid, - ) - }; + let expr_id = Identifier::new(expr, self.random_state).combine(sub_expr_id); + let is_valid = !expr.is_volatile_node() && sub_expr_is_valid; self.id_array[down_index].0 = self.up_index; if is_valid && !self.expr_mask.ignores(expr) { self.id_array[down_index].1 = Some(expr_id); - let count = self.expr_stats.entry(expr_id).or_insert(0); - *count += 1; - if *count > 1 { + let (count, conditional_count) = + self.expr_stats.entry(expr_id).or_insert((0, 0)); + if self.conditional { + *conditional_count += 1; + } else { + *count += 1; + } + if *count > 1 || (*count == 1 && *conditional_count > 0) { self.found_common = true; } } @@ -1039,51 +1094,40 @@ impl TreeNodeRewriter for CommonSubexprRewriter<'_, '_> { self.alias_counter += 1; } - // The `CommonSubexprRewriter` relies on `ExprIdentifierVisitor` to generate the - // `id_array`, which records the expr's identifier used to rewrite expr. So if we - // skip an expr in `ExprIdentifierVisitor`, we should skip it here, too. - let is_tree = expr.short_circuits(); - let tnr = if is_tree { - TreeNodeRecursion::Jump - } else { - TreeNodeRecursion::Continue - }; - let (up_index, expr_id) = self.id_array[self.down_index]; self.down_index += 1; - // skip `Expr`s without identifier (empty identifier). 
- let Some(expr_id) = expr_id else { - return Ok(Transformed::new(expr, false, tnr)); - }; - - let count = self.expr_stats.get(&expr_id).unwrap(); - if *count > 1 { - // step index to skip all sub-node (which has smaller series number). - while self.down_index < self.id_array.len() - && self.id_array[self.down_index].0 < up_index - { - self.down_index += 1; - } + // Handle `Expr`s with identifiers only + if let Some(expr_id) = expr_id { + let (count, conditional_count) = self.expr_stats.get(&expr_id).unwrap(); + if *count > 1 || *count == 1 && *conditional_count > 0 { + // step index to skip all sub-node (which has smaller series number). + while self.down_index < self.id_array.len() + && self.id_array[self.down_index].0 < up_index + { + self.down_index += 1; + } - let expr_name = expr.display_name()?; - let (_, expr_alias) = self.common_exprs.entry(expr_id).or_insert_with(|| { - let expr_alias = self.alias_generator.next(CSE_PREFIX); - (expr, expr_alias) - }); + let expr_name = expr.display_name()?; + let (_, expr_alias) = + self.common_exprs.entry(expr_id).or_insert_with(|| { + let expr_alias = self.alias_generator.next(CSE_PREFIX); + (expr, expr_alias) + }); - // alias the expressions without an `Alias` ancestor node - let rewritten = if self.alias_counter > 0 { - col(expr_alias.clone()) - } else { - self.alias_counter += 1; - col(expr_alias.clone()).alias(expr_name) - }; + // alias the expressions without an `Alias` ancestor node + let rewritten = if self.alias_counter > 0 { + col(expr_alias.clone()) + } else { + self.alias_counter += 1; + col(expr_alias.clone()).alias(expr_name) + }; - Ok(Transformed::new(rewritten, true, TreeNodeRecursion::Jump)) - } else { - Ok(Transformed::new(expr, false, tnr)) + return Ok(Transformed::new(rewritten, true, TreeNodeRecursion::Jump)); + } } + + Ok(Transformed::no(expr)) } fn f_up(&mut self, expr: Expr) -> Result> { @@ -1685,7 +1729,7 @@ mod test { .unwrap(); let rule = CommonSubexprEliminate::new(); let optimized_plan = 
rule.rewrite(plan, &OptimizerContext::new()).unwrap(); - assert!(!optimized_plan.transformed); + assert!(optimized_plan.transformed); let optimized_plan = optimized_plan.data; let schema = optimized_plan.schema(); @@ -1837,22 +1881,29 @@ mod test { let table_scan = test_table_scan()?; let extracted_short_circuit = col("a").eq(lit(0)).or(col("b").eq(lit(0))); - let not_extracted_short_circuit_leg_1 = (col("a") + col("b")).eq(lit(0)); + let extracted_short_circuit_leg_1 = (col("a") + col("b")).eq(lit(0)); let not_extracted_short_circuit_leg_2 = (col("a") - col("b")).eq(lit(0)); + let extracted_short_circuit_leg_3 = (col("a") * col("b")).eq(lit(0)); let plan = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![ extracted_short_circuit.clone().alias("c1"), extracted_short_circuit.alias("c2"), - not_extracted_short_circuit_leg_1.clone().alias("c3"), - not_extracted_short_circuit_leg_2.clone().alias("c4"), - not_extracted_short_circuit_leg_1 - .or(not_extracted_short_circuit_leg_2) + extracted_short_circuit_leg_1 + .clone() + .or(not_extracted_short_circuit_leg_2.clone()) + .alias("c3"), + extracted_short_circuit_leg_1 + .and(not_extracted_short_circuit_leg_2) + .alias("c4"), + extracted_short_circuit_leg_3 + .clone() + .or(extracted_short_circuit_leg_3.clone()) .alias("c5"), ])? 
.build()?; - let expected = "Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, test.a + test.b = Int32(0) AS c3, test.a - test.b = Int32(0) AS c4, test.a + test.b = Int32(0) OR test.a - test.b = Int32(0) AS c5\ - \n Projection: test.a = Int32(0) OR test.b = Int32(0) AS __common_expr_1, test.a, test.b, test.c\ + let expected = "Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, __common_expr_2 OR test.a - test.b = Int32(0) AS c3, __common_expr_2 AND test.a - test.b = Int32(0) AS c4, __common_expr_3 OR __common_expr_3 AS c5\ + \n Projection: test.a = Int32(0) OR test.b = Int32(0) AS __common_expr_1, test.a + test.b = Int32(0) AS __common_expr_2, test.a * test.b = Int32(0) AS __common_expr_3, test.a, test.b, test.c\ \n TableScan: test"; assert_optimized_plan_eq(expected, plan, None); @@ -1888,10 +1939,12 @@ mod test { let table_scan = test_table_scan()?; let rand = rand_func().call(vec![]); - let not_extracted_volatile_short_circuit_2 = - rand.clone().eq(lit(0)).or(col("b").eq(lit(0))); + let extracted_short_circuit_leg_1 = col("a").eq(lit(0)); let not_extracted_volatile_short_circuit_1 = - col("a").eq(lit(0)).or(rand.eq(lit(0))); + extracted_short_circuit_leg_1.or(rand.clone().eq(lit(0))); + let not_extracted_short_circuit_leg_2 = col("b").eq(lit(0)); + let not_extracted_volatile_short_circuit_2 = + rand.eq(lit(0)).or(not_extracted_short_circuit_leg_2); let plan = LogicalPlanBuilder::from(table_scan.clone()) .project(vec![ not_extracted_volatile_short_circuit_1.clone().alias("c1"), @@ -1901,10 +1954,11 @@ mod test { ])? 
.build()?; - let expected = "Projection: test.a = Int32(0) OR random() = Int32(0) AS c1, test.a = Int32(0) OR random() = Int32(0) AS c2, random() = Int32(0) OR test.b = Int32(0) AS c3, random() = Int32(0) OR test.b = Int32(0) AS c4\ - \n TableScan: test"; + let expected = "Projection: __common_expr_1 OR random() = Int32(0) AS c1, __common_expr_1 OR random() = Int32(0) AS c2, random() = Int32(0) OR test.b = Int32(0) AS c3, random() = Int32(0) OR test.b = Int32(0) AS c4\ + \n Projection: test.a = Int32(0) AS __common_expr_1, test.a, test.b, test.c\ + \n TableScan: test"; - assert_non_optimized_plan_eq(expected, plan, None); + assert_optimized_plan_eq(expected, plan, None); Ok(()) } diff --git a/datafusion/optimizer/src/optimize_projections/mod.rs b/datafusion/optimizer/src/optimize_projections/mod.rs index cae2a7b2cad2..58c1ae297b02 100644 --- a/datafusion/optimizer/src/optimize_projections/mod.rs +++ b/datafusion/optimizer/src/optimize_projections/mod.rs @@ -19,7 +19,7 @@ mod required_indices; -use std::collections::HashSet; +use std::collections::{HashMap, HashSet}; use std::sync::Arc; use crate::optimizer::ApplyOrder; @@ -42,7 +42,6 @@ use datafusion_common::tree_node::{ Transformed, TreeNode, TreeNodeIterator, TreeNodeRecursion, }; use datafusion_expr::logical_plan::tree_node::unwrap_arc; -use hashbrown::HashMap; /// Optimizer rule to prune unnecessary columns from intermediate schemas /// inside the [`LogicalPlan`]. This rule: @@ -472,11 +471,8 @@ fn merge_consecutive_projections(proj: Projection) -> Result::new(); - for columns in expr.iter().map(|expr| expr.column_refs()) { - for col in columns.into_iter() { - *column_referral_map.entry(col).or_default() += 1; - } - } + expr.iter() + .for_each(|expr| expr.add_column_ref_counts(&mut column_referral_map)); // If an expression is non-trivial and appears more than once, do not merge // them as consecutive projections will benefit from a compute-once approach. 
diff --git a/datafusion/sqllogictest/test_files/cse.slt b/datafusion/sqllogictest/test_files/cse.slt index 3579c1c1635c..19b47fa50e41 100644 --- a/datafusion/sqllogictest/test_files/cse.slt +++ b/datafusion/sqllogictest/test_files/cse.slt @@ -93,15 +93,16 @@ FROM t1 ---- logical_plan 01)Projection: __common_expr_1 AS c1, __common_expr_1 AS c2, __common_expr_2 AS c3, __common_expr_2 AS c4, __common_expr_3 AS c5, __common_expr_3 AS c6 -02)--Projection: t1.a = Float64(0) AND t1.b = Float64(0) AS __common_expr_1, t1.a = Float64(0) OR t1.b = Float64(0) AS __common_expr_2, CASE WHEN t1.a = Float64(0) THEN Int64(0) ELSE Int64(1) END AS __common_expr_3 -03)----TableScan: t1 projection=[a, b] +02)--Projection: __common_expr_4 AND t1.b = Float64(0) AS __common_expr_1, __common_expr_4 OR t1.b = Float64(0) AS __common_expr_2, CASE WHEN __common_expr_4 THEN Int64(0) ELSE Int64(1) END AS __common_expr_3 +03)----Projection: t1.a = Float64(0) AS __common_expr_4, t1.b +04)------TableScan: t1 projection=[a, b] physical_plan 01)ProjectionExec: expr=[__common_expr_1@0 as c1, __common_expr_1@0 as c2, __common_expr_2@1 as c3, __common_expr_2@1 as c4, __common_expr_3@2 as c5, __common_expr_3@2 as c6] -02)--ProjectionExec: expr=[a@0 = 0 AND b@1 = 0 as __common_expr_1, a@0 = 0 OR b@1 = 0 as __common_expr_2, CASE WHEN a@0 = 0 THEN 0 ELSE 1 END as __common_expr_3] -03)----MemoryExec: partitions=1, partition_sizes=[0] +02)--ProjectionExec: expr=[__common_expr_4@0 AND b@1 = 0 as __common_expr_1, __common_expr_4@0 OR b@1 = 0 as __common_expr_2, CASE WHEN __common_expr_4@0 THEN 0 ELSE 1 END as __common_expr_3] +03)----ProjectionExec: expr=[a@0 = 0 as __common_expr_4, b@1 as b] +04)------MemoryExec: partitions=1, partition_sizes=[0] # Common children of short-circuit expression -# TODO: consider surely executed children of "short circuited"s for CSE. i.e. 
`a = 0`, `a = 2`, `a = 4` should be extracted query TT EXPLAIN SELECT a = 0 AND b = 0 AS c1, @@ -121,14 +122,15 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: t1.a = Float64(0) AND t1.b = Float64(0) AS c1, t1.a = Float64(0) AND t1.b = Float64(1) AS c2, t1.b = Float64(2) AND t1.a = Float64(1) AS c3, t1.b = Float64(3) AND t1.a = Float64(1) AS c4, t1.a = Float64(2) OR t1.b = Float64(4) AS c5, t1.a = Float64(2) OR t1.b = Float64(5) AS c6, t1.b = Float64(6) OR t1.a = Float64(3) AS c7, t1.b = Float64(7) OR t1.a = Float64(3) AS c8, CASE WHEN t1.a = Float64(4) THEN Int64(0) ELSE Int64(1) END AS c9, CASE WHEN t1.a = Float64(4) THEN Int64(0) ELSE Int64(2) END AS c10, CASE WHEN t1.b = Float64(8) THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 -02)--TableScan: t1 projection=[a, b] +01)Projection: __common_expr_1 AND t1.b = Float64(0) AS c1, __common_expr_1 AND t1.b = Float64(1) AS c2, t1.b = Float64(2) AND t1.a = Float64(1) AS c3, t1.b = Float64(3) AND t1.a = Float64(1) AS c4, __common_expr_2 OR t1.b = Float64(4) AS c5, __common_expr_2 OR t1.b = Float64(5) AS c6, t1.b = Float64(6) OR t1.a = Float64(3) AS c7, t1.b = Float64(7) OR t1.a = Float64(3) AS c8, CASE WHEN __common_expr_3 THEN Int64(0) ELSE Int64(1) END AS c9, CASE WHEN __common_expr_3 THEN Int64(0) ELSE Int64(2) END AS c10, CASE WHEN t1.b = Float64(8) THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 +02)--Projection: t1.a = Float64(0) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a = Float64(4) AS __common_expr_3, t1.a, 
t1.b +03)----TableScan: t1 projection=[a, b] physical_plan -01)ProjectionExec: expr=[a@0 = 0 AND b@1 = 0 as c1, a@0 = 0 AND b@1 = 1 as c2, b@1 = 2 AND a@0 = 1 as c3, b@1 = 3 AND a@0 = 1 as c4, a@0 = 2 OR b@1 = 4 as c5, a@0 = 2 OR b@1 = 5 as c6, b@1 = 6 OR a@0 = 3 as c7, b@1 = 7 OR a@0 = 3 as c8, CASE WHEN a@0 = 4 THEN 0 ELSE 1 END as c9, CASE WHEN a@0 = 4 THEN 0 ELSE 2 END as c10, CASE WHEN b@1 = 8 THEN a@0 + 1 ELSE 0 END as c11, CASE WHEN b@1 = 9 THEN a@0 + 1 ELSE 0 END as c12, CASE WHEN b@1 = 10 THEN 0 ELSE a@0 + 2 END as c13, CASE WHEN b@1 = 11 THEN 0 ELSE a@0 + 2 END as c14] -02)--MemoryExec: partitions=1, partition_sizes=[0] +01)ProjectionExec: expr=[__common_expr_1@0 AND b@4 = 0 as c1, __common_expr_1@0 AND b@4 = 1 as c2, b@4 = 2 AND a@3 = 1 as c3, b@4 = 3 AND a@3 = 1 as c4, __common_expr_2@1 OR b@4 = 4 as c5, __common_expr_2@1 OR b@4 = 5 as c6, b@4 = 6 OR a@3 = 3 as c7, b@4 = 7 OR a@3 = 3 as c8, CASE WHEN __common_expr_3@2 THEN 0 ELSE 1 END as c9, CASE WHEN __common_expr_3@2 THEN 0 ELSE 2 END as c10, CASE WHEN b@4 = 8 THEN a@3 + 1 ELSE 0 END as c11, CASE WHEN b@4 = 9 THEN a@3 + 1 ELSE 0 END as c12, CASE WHEN b@4 = 10 THEN 0 ELSE a@3 + 2 END as c13, CASE WHEN b@4 = 11 THEN 0 ELSE a@3 + 2 END as c14] +02)--ProjectionExec: expr=[a@0 = 0 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 = 4 as __common_expr_3, a@0 as a, b@1 as b] +03)----MemoryExec: partitions=1, partition_sizes=[0] # Common children of volatile, short-circuit expression -# TODO: consider surely executed children of "short circuited"s for CSE. i.e. 
`a = 0`, `a = 2`, `a = 4` should be extracted query TT EXPLAIN SELECT a = 0 AND b = random() AS c1, @@ -148,11 +150,13 @@ EXPLAIN SELECT FROM t1 ---- logical_plan -01)Projection: t1.a = Float64(0) AND t1.b = random() AS c1, t1.a = Float64(0) AND t1.b = Float64(1) + random() AS c2, t1.b = Float64(2) + random() AND t1.a = Float64(1) AS c3, t1.b = Float64(3) + random() AND t1.a = Float64(1) AS c4, t1.a = Float64(2) OR t1.b = Float64(4) + random() AS c5, t1.a = Float64(2) OR t1.b = Float64(5) + random() AS c6, t1.b = Float64(6) + random() OR t1.a = Float64(3) AS c7, t1.b = Float64(7) + random() OR t1.a = Float64(3) AS c8, CASE WHEN t1.a = Float64(4) THEN random() ELSE Float64(1) END AS c9, CASE WHEN t1.a = Float64(4) THEN random() ELSE Float64(2) END AS c10, CASE WHEN t1.b = Float64(8) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 -02)--TableScan: t1 projection=[a, b] +01)Projection: __common_expr_1 AND t1.b = random() AS c1, __common_expr_1 AND t1.b = Float64(1) + random() AS c2, t1.b = Float64(2) + random() AND t1.a = Float64(1) AS c3, t1.b = Float64(3) + random() AND t1.a = Float64(1) AS c4, __common_expr_2 OR t1.b = Float64(4) + random() AS c5, __common_expr_2 OR t1.b = Float64(5) + random() AS c6, t1.b = Float64(6) + random() OR t1.a = Float64(3) AS c7, t1.b = Float64(7) + random() OR t1.a = Float64(3) AS c8, CASE WHEN __common_expr_3 THEN random() ELSE Float64(1) END AS c9, CASE WHEN __common_expr_3 THEN random() ELSE Float64(2) END AS c10, CASE WHEN t1.b = Float64(8) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c11, CASE WHEN t1.b = Float64(9) + random() THEN t1.a + Float64(1) ELSE Float64(0) END AS c12, CASE WHEN t1.b = Float64(10) + random() THEN Float64(0) ELSE t1.a 
+ Float64(2) END AS c13, CASE WHEN t1.b = Float64(11) + random() THEN Float64(0) ELSE t1.a + Float64(2) END AS c14 +02)--Projection: t1.a = Float64(0) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a = Float64(4) AS __common_expr_3, t1.a, t1.b +03)----TableScan: t1 projection=[a, b] physical_plan -01)ProjectionExec: expr=[a@0 = 0 AND b@1 = random() as c1, a@0 = 0 AND b@1 = 1 + random() as c2, b@1 = 2 + random() AND a@0 = 1 as c3, b@1 = 3 + random() AND a@0 = 1 as c4, a@0 = 2 OR b@1 = 4 + random() as c5, a@0 = 2 OR b@1 = 5 + random() as c6, b@1 = 6 + random() OR a@0 = 3 as c7, b@1 = 7 + random() OR a@0 = 3 as c8, CASE WHEN a@0 = 4 THEN random() ELSE 1 END as c9, CASE WHEN a@0 = 4 THEN random() ELSE 2 END as c10, CASE WHEN b@1 = 8 + random() THEN a@0 + 1 ELSE 0 END as c11, CASE WHEN b@1 = 9 + random() THEN a@0 + 1 ELSE 0 END as c12, CASE WHEN b@1 = 10 + random() THEN 0 ELSE a@0 + 2 END as c13, CASE WHEN b@1 = 11 + random() THEN 0 ELSE a@0 + 2 END as c14] -02)--MemoryExec: partitions=1, partition_sizes=[0] +01)ProjectionExec: expr=[__common_expr_1@0 AND b@4 = random() as c1, __common_expr_1@0 AND b@4 = 1 + random() as c2, b@4 = 2 + random() AND a@3 = 1 as c3, b@4 = 3 + random() AND a@3 = 1 as c4, __common_expr_2@1 OR b@4 = 4 + random() as c5, __common_expr_2@1 OR b@4 = 5 + random() as c6, b@4 = 6 + random() OR a@3 = 3 as c7, b@4 = 7 + random() OR a@3 = 3 as c8, CASE WHEN __common_expr_3@2 THEN random() ELSE 1 END as c9, CASE WHEN __common_expr_3@2 THEN random() ELSE 2 END as c10, CASE WHEN b@4 = 8 + random() THEN a@3 + 1 ELSE 0 END as c11, CASE WHEN b@4 = 9 + random() THEN a@3 + 1 ELSE 0 END as c12, CASE WHEN b@4 = 10 + random() THEN 0 ELSE a@3 + 2 END as c13, CASE WHEN b@4 = 11 + random() THEN 0 ELSE a@3 + 2 END as c14] +02)--ProjectionExec: expr=[a@0 = 0 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 = 4 as __common_expr_3, a@0 as a, b@1 as b] +03)----MemoryExec: partitions=1, partition_sizes=[0] # Common volatile children of short-circuit 
expression query TT @@ -171,3 +175,59 @@ logical_plan physical_plan 01)ProjectionExec: expr=[a@0 = random() AND b@1 = 0 as c1, a@0 = random() AND b@1 = 1 as c2, a@0 = 2 + random() OR b@1 = 4 as c3, a@0 = 2 + random() OR b@1 = 5 as c4, CASE WHEN a@0 = 4 + random() THEN 0 ELSE 1 END as c5, CASE WHEN a@0 = 4 + random() THEN 0 ELSE 2 END as c6] 02)--MemoryExec: partitions=1, partition_sizes=[0] + +# Surely only once but also conditionally evaluated expressions +query TT +EXPLAIN SELECT + (a = 1 OR random() = 0) AND a = 1 AS c1, + (a = 2 AND random() = 0) OR a = 2 AS c2, + CASE WHEN a + 3 = 0 THEN a + 3 ELSE 0 END AS c3, + CASE WHEN a + 4 = 0 THEN 0 WHEN a + 4 THEN 0 ELSE 0 END AS c4, + CASE WHEN a + 5 = 0 THEN 0 WHEN random() = 0 THEN a + 5 ELSE 0 END AS c5, + CASE WHEN a + 6 = 0 THEN 0 ELSE a + 6 END AS c6 +FROM t1 +---- +logical_plan +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND __common_expr_1 AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Int64(0) WHEN CAST(__common_expr_4 AS Boolean) THEN Int64(0) ELSE Int64(0) END AS c4, CASE WHEN __common_expr_5 = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN __common_expr_5 ELSE Float64(0) END AS c5, CASE WHEN __common_expr_6 = Float64(0) THEN Float64(0) ELSE __common_expr_6 END AS c6 +02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4, t1.a + Float64(5) AS __common_expr_5, t1.a + Float64(6) AS __common_expr_6 +03)----TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND __common_expr_1@0 as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 WHEN 
CAST(__common_expr_4@3 AS Boolean) THEN 0 ELSE 0 END as c4, CASE WHEN __common_expr_5@4 = 0 THEN 0 WHEN random() = 0 THEN __common_expr_5@4 ELSE 0 END as c5, CASE WHEN __common_expr_6@5 = 0 THEN 0 ELSE __common_expr_6@5 END as c6] +02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4, a@0 + 5 as __common_expr_5, a@0 + 6 as __common_expr_6] +03)----MemoryExec: partitions=1, partition_sizes=[0] + +# Surely only once but also conditionally evaluated subexpressions +query TT +EXPLAIN SELECT + (a = 1 OR random() = 0) AND (a = 1 OR random() = 1) AS c1, + (a = 2 AND random() = 0) OR (a = 2 AND random() = 1) AS c2, + CASE WHEN a + 3 = 0 THEN a + 3 + random() ELSE 0 END AS c3, + CASE WHEN a + 4 = 0 THEN 0 ELSE a + 4 + random() END AS c4 +FROM t1 +---- +logical_plan +01)Projection: (__common_expr_1 OR random() = Float64(0)) AND (__common_expr_1 OR random() = Float64(1)) AS c1, __common_expr_2 AND random() = Float64(0) OR __common_expr_2 AND random() = Float64(1) AS c2, CASE WHEN __common_expr_3 = Float64(0) THEN __common_expr_3 + random() ELSE Float64(0) END AS c3, CASE WHEN __common_expr_4 = Float64(0) THEN Float64(0) ELSE __common_expr_4 + random() END AS c4 +02)--Projection: t1.a = Float64(1) AS __common_expr_1, t1.a = Float64(2) AS __common_expr_2, t1.a + Float64(3) AS __common_expr_3, t1.a + Float64(4) AS __common_expr_4 +03)----TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(__common_expr_1@0 OR random() = 0) AND (__common_expr_1@0 OR random() = 1) as c1, __common_expr_2@1 AND random() = 0 OR __common_expr_2@1 AND random() = 1 as c2, CASE WHEN __common_expr_3@2 = 0 THEN __common_expr_3@2 + random() ELSE 0 END as c3, CASE WHEN __common_expr_4@3 = 0 THEN 0 ELSE __common_expr_4@3 + random() END as c4] +02)--ProjectionExec: expr=[a@0 = 1 as __common_expr_1, a@0 = 2 as __common_expr_2, a@0 + 3 as __common_expr_3, a@0 + 4 as __common_expr_4] +03)----MemoryExec: 
partitions=1, partition_sizes=[0] + +# Only conditionally evaluated expressions +query TT +EXPLAIN SELECT + (random() = 0 OR a = 1) AND a = 1 AS c1, + (random() = 0 AND a = 2) OR a = 2 AS c2, + CASE WHEN random() = 0 THEN a + 3 ELSE a + 3 END AS c3, + CASE WHEN random() = 0 THEN 0 WHEN a + 4 = 0 THEN a + 4 ELSE 0 END AS c4, + CASE WHEN random() = 0 THEN 0 WHEN a + 5 = 0 THEN 0 ELSE a + 5 END AS c5, + CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a + 6 ELSE a + 6 END AS c6 +FROM t1 +---- +logical_plan +01)Projection: (random() = Float64(0) OR t1.a = Float64(1)) AND t1.a = Float64(1) AS c1, random() = Float64(0) AND t1.a = Float64(2) OR t1.a = Float64(2) AS c2, CASE WHEN random() = Float64(0) THEN t1.a + Float64(3) ELSE t1.a + Float64(3) END AS c3, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(4) = Float64(0) THEN t1.a + Float64(4) ELSE Float64(0) END AS c4, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN t1.a + Float64(5) = Float64(0) THEN Float64(0) ELSE t1.a + Float64(5) END AS c5, CASE WHEN random() = Float64(0) THEN Float64(0) WHEN random() = Float64(0) THEN t1.a + Float64(6) ELSE t1.a + Float64(6) END AS c6 +02)--TableScan: t1 projection=[a] +physical_plan +01)ProjectionExec: expr=[(random() = 0 OR a@0 = 1) AND a@0 = 1 as c1, random() = 0 AND a@0 = 2 OR a@0 = 2 as c2, CASE WHEN random() = 0 THEN a@0 + 3 ELSE a@0 + 3 END as c3, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 4 = 0 THEN a@0 + 4 ELSE 0 END as c4, CASE WHEN random() = 0 THEN 0 WHEN a@0 + 5 = 0 THEN 0 ELSE a@0 + 5 END as c5, CASE WHEN random() = 0 THEN 0 WHEN random() = 0 THEN a@0 + 6 ELSE a@0 + 6 END as c6] +02)--MemoryExec: partitions=1, partition_sizes=[0] diff --git a/datafusion/sqllogictest/test_files/select.slt b/datafusion/sqllogictest/test_files/select.slt index f9baf8db69d5..95f67245a981 100644 --- a/datafusion/sqllogictest/test_files/select.slt +++ b/datafusion/sqllogictest/test_files/select.slt @@ -1504,21 +1504,25 @@ query TT EXPLAIN SELECT y > 0 and 1 / y < 1, 
x > 0 and y > 0 and 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND t.y > Int32(0) AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x -02)--TableScan: t projection=[x, y] +01)Projection: __common_expr_1 AND Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y > Int64(0) AND Int64(1) / t.y < Int64(1), t.x > Int32(0) AND __common_expr_1 AND Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x +02)--Projection: t.y > Int32(0) AS __common_expr_1, t.x, t.y +03)----TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@0 > 0 AND y@1 > 0 AND 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] -02)--MemoryExec: partitions=1, partition_sizes=[1] +01)ProjectionExec: expr=[__common_expr_1@0 AND 1 / CAST(y@2 AS Int64) < 1 as t.y > Int64(0) AND Int64(1) / t.y < Int64(1), x@1 > 0 AND __common_expr_1@0 AND 1 / CAST(y@2 AS Int64) < 1 / CAST(x@1 AS Int64) as t.x > Int64(0) AND t.y > Int64(0) AND Int64(1) / t.y < Int64(1) / t.x] +02)--ProjectionExec: expr=[y@1 > 0 as __common_expr_1, x@0 as x, y@1 as y] +03)----MemoryExec: partitions=1, partition_sizes=[1] query TT EXPLAIN SELECT y = 0 or 1 / y < 1, x = 0 or y = 0 or 1 / y < 1 / x from t; ---- logical_plan -01)Projection: t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR t.y = Int32(0) OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x -02)--TableScan: t projection=[x, y] +01)Projection: __common_expr_1 OR 
Int64(1) / CAST(t.y AS Int64) < Int64(1) AS t.y = Int64(0) OR Int64(1) / t.y < Int64(1), t.x = Int32(0) OR __common_expr_1 OR Int64(1) / CAST(t.y AS Int64) < Int64(1) / CAST(t.x AS Int64) AS t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x +02)--Projection: t.y = Int32(0) AS __common_expr_1, t.x, t.y +03)----TableScan: t projection=[x, y] physical_plan -01)ProjectionExec: expr=[y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@0 = 0 OR y@1 = 0 OR 1 / CAST(y@1 AS Int64) < 1 / CAST(x@0 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] -02)--MemoryExec: partitions=1, partition_sizes=[1] +01)ProjectionExec: expr=[__common_expr_1@0 OR 1 / CAST(y@2 AS Int64) < 1 as t.y = Int64(0) OR Int64(1) / t.y < Int64(1), x@1 = 0 OR __common_expr_1@0 OR 1 / CAST(y@2 AS Int64) < 1 / CAST(x@1 AS Int64) as t.x = Int64(0) OR t.y = Int64(0) OR Int64(1) / t.y < Int64(1) / t.x] +02)--ProjectionExec: expr=[y@1 = 0 as __common_expr_1, x@0 as x, y@1 as y] +03)----MemoryExec: partitions=1, partition_sizes=[1] # due to the reason describe in https://github.com/apache/datafusion/issues/8927, # the following queries will fail diff --git a/datafusion/sqllogictest/test_files/tpch/q14.slt.part b/datafusion/sqllogictest/test_files/tpch/q14.slt.part index e56e463a617d..3743c201ff2e 100644 --- a/datafusion/sqllogictest/test_files/tpch/q14.slt.part +++ b/datafusion/sqllogictest/test_files/tpch/q14.slt.part @@ -32,9 +32,9 @@ where and l_shipdate < date '1995-10-01'; ---- logical_plan -01)Projection: Float64(100) * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Float64) AS promo_revenue -02)--Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) 
ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount)) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] -03)----Projection: lineitem.l_extendedprice, lineitem.l_discount, part.p_type +01)Projection: Float64(100) * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END) AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount) AS Float64) AS promo_revenue +02)--Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN __common_expr_1 ELSE Decimal128(Some(0),38,4) END) AS sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(__common_expr_1) AS sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)]] +03)----Projection: lineitem.l_extendedprice * (Decimal128(Some(1),20,0) - lineitem.l_discount) AS __common_expr_1, part.p_type 04)------Inner Join: lineitem.l_partkey = part.p_partkey 05)--------Projection: lineitem.l_partkey, lineitem.l_extendedprice, lineitem.l_discount 06)----------Filter: lineitem.l_shipdate >= Date32("1995-09-01") AND lineitem.l_shipdate < Date32("1995-10-01") @@ -44,19 +44,20 @@ physical_plan 01)ProjectionExec: expr=[100 * CAST(sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END)@0 AS Float64) / CAST(sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)@1 AS Float64) as promo_revenue] 02)--AggregateExec: mode=Final, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] 03)----CoalescePartitionsExec 
-04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] -05)--------CoalesceBatchesExec: target_batch_size=8192 -06)----------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4] -07)------------CoalesceBatchesExec: target_batch_size=8192 -08)--------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 -09)----------------ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] -10)------------------CoalesceBatchesExec: target_batch_size=8192 -11)--------------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01 -12)----------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], has_header=false -13)------------CoalesceBatchesExec: target_batch_size=8192 -14)--------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 -15)----------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 -16)------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], has_header=false +04)------AggregateExec: mode=Partial, gby=[], aggr=[sum(CASE WHEN part.p_type LIKE Utf8("PROMO%") THEN lineitem.l_extendedprice * Int64(1) - lineitem.l_discount ELSE 
Int64(0) END), sum(lineitem.l_extendedprice * Int64(1) - lineitem.l_discount)] +05)--------ProjectionExec: expr=[l_extendedprice@0 * (Some(1),20,0 - l_discount@1) as __common_expr_1, p_type@2 as p_type] +06)----------CoalesceBatchesExec: target_batch_size=8192 +07)------------HashJoinExec: mode=Partitioned, join_type=Inner, on=[(l_partkey@0, p_partkey@0)], projection=[l_extendedprice@1, l_discount@2, p_type@4] +08)--------------CoalesceBatchesExec: target_batch_size=8192 +09)----------------RepartitionExec: partitioning=Hash([l_partkey@0], 4), input_partitions=4 +10)------------------ProjectionExec: expr=[l_partkey@0 as l_partkey, l_extendedprice@1 as l_extendedprice, l_discount@2 as l_discount] +11)--------------------CoalesceBatchesExec: target_batch_size=8192 +12)----------------------FilterExec: l_shipdate@3 >= 1995-09-01 AND l_shipdate@3 < 1995-10-01 +13)------------------------CsvExec: file_groups={4 groups: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:0..18561749], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:18561749..37123498], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:37123498..55685247], [WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/lineitem.tbl:55685247..74246996]]}, projection=[l_partkey, l_extendedprice, l_discount, l_shipdate], has_header=false +14)--------------CoalesceBatchesExec: target_batch_size=8192 +15)----------------RepartitionExec: partitioning=Hash([p_partkey@0], 4), input_partitions=4 +16)------------------RepartitionExec: partitioning=RoundRobinBatch(4), input_partitions=1 +17)--------------------CsvExec: file_groups={1 group: [[WORKSPACE_ROOT/datafusion/sqllogictest/test_files/tpch/data/part.tbl]]}, projection=[p_partkey, p_type], has_header=false From 1dfac86a89750193491cf3e04917e37b92c64ffa Mon Sep 17 00:00:00 2001 From: wiedld Date: Fri, 12 Jul 2024 04:04:42 -0700 Subject: [PATCH 02/19] fix(11397): surface proper errors in 
ParquetSink (#11399) * fix(11397): do not surface errors for closed channels, and instead let the task join errors be surfaced * fix(11397): terminate early on channel send failure --- .../src/datasource/file_format/parquet.rs | 32 +++++++++---------- datafusion/core/tests/memory_limit/mod.rs | 4 +-- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/parquet.rs b/datafusion/core/src/datasource/file_format/parquet.rs index 694c94928537..6271d8af3786 100644 --- a/datafusion/core/src/datasource/file_format/parquet.rs +++ b/datafusion/core/src/datasource/file_format/parquet.rs @@ -893,12 +893,12 @@ async fn send_arrays_to_col_writers( let mut next_channel = 0; for (array, field) in rb.columns().iter().zip(schema.fields()) { for c in compute_leaves(field, array)? { - col_array_channels[next_channel] - .send(c) - .await - .map_err(|_| { - DataFusionError::Internal("Unable to send array to writer!".into()) - })?; + // Do not surface error from closed channel (means something + // else hit an error, and the plan is shutting down). + if col_array_channels[next_channel].send(c).await.is_err() { + return Ok(()); + } + next_channel += 1; } } @@ -984,11 +984,11 @@ fn spawn_parquet_parallel_serialization_task( &pool, ); - serialize_tx.send(finalize_rg_task).await.map_err(|_| { - DataFusionError::Internal( - "Unable to send closed RG to concat task!".into(), - ) - })?; + // Do not surface error from closed channel (means something + // else hit an error, and the plan is shutting down). 
+ if serialize_tx.send(finalize_rg_task).await.is_err() { + return Ok(()); + } current_rg_rows = 0; rb = rb.slice(rows_left, rb.num_rows() - rows_left); @@ -1013,11 +1013,11 @@ fn spawn_parquet_parallel_serialization_task( &pool, ); - serialize_tx.send(finalize_rg_task).await.map_err(|_| { - DataFusionError::Internal( - "Unable to send closed RG to concat task!".into(), - ) - })?; + // Do not surface error from closed channel (means something + // else hit an error, and the plan is shutting down). + if serialize_tx.send(finalize_rg_task).await.is_err() { + return Ok(()); + } } Ok(()) diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index f7402357d1c7..7ef24609e238 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -340,8 +340,8 @@ async fn oom_parquet_sink() { path.to_string_lossy() )) .with_expected_errors(vec![ - // TODO: update error handling in ParquetSink - "Unable to send array to writer!", + "Failed to allocate additional", + "for ParquetSink(ArrowColumnWriter)", ]) .with_memory_limit(200_000) .run() From 13ddbaf2f7220c26f443d097697d1380e63f6206 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 10:53:58 -0400 Subject: [PATCH 03/19] Minor: Add note about SQLLancer fuzz testing to docs (#11430) * Minor: Add note about SQLLancer fuzz testing to docs * prettier --- docs/source/contributor-guide/testing.md | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/docs/source/contributor-guide/testing.md b/docs/source/contributor-guide/testing.md index 018cc6233c46..0f4461ab2c2c 100644 --- a/docs/source/contributor-guide/testing.md +++ b/docs/source/contributor-guide/testing.md @@ -39,7 +39,7 @@ DataFusion's SQL implementation is tested using [sqllogictest](https://github.co Like similar systems such as [DuckDB](https://duckdb.org/dev/testing), DataFusion has chosen to trade off a slightly higher barrier to contribution 
for longer term maintainability. -### Rust Integration Tests +## Rust Integration Tests There are several tests of the public interface of the DataFusion library in the [tests](https://github.com/apache/datafusion/tree/main/datafusion/core/tests) directory. @@ -49,6 +49,18 @@ You can run these tests individually using `cargo` as normal command such as cargo test -p datafusion --test parquet_exec ``` +## SQL "Fuzz" testing + +DataFusion uses [SQLancer] for "fuzz" testing: it generates random SQL +queries and executes them against DataFusion to find bugs. + +The code is in the [datafusion-sqllancer] repository, and we welcome further +contributions. Kudos to [@2010YOUY01] for the initial implementation. + +[sqlancer]: https://github.com/sqlancer/sqlancer +[datafusion-sqllancer]: https://github.com/datafusion-contrib/datafusion-sqllancer +[@2010youy01]: https://github.com/2010YOUY01 + ## Documentation Examples We use Rust [doctest] to verify examples from the documentation are correct and From c769a70dc1c746460b4c1369d4e42c4a78da9571 Mon Sep 17 00:00:00 2001 From: tmi Date: Fri, 12 Jul 2024 17:52:24 +0200 Subject: [PATCH 04/19] Trivial: use arrow csv writer's timestamp_tz_format (#11407) --- datafusion/common/src/file_options/csv_writer.rs | 3 +++ 1 file changed, 3 insertions(+) diff --git a/datafusion/common/src/file_options/csv_writer.rs b/datafusion/common/src/file_options/csv_writer.rs index 5792cfdba9e0..ae069079a68f 100644 --- a/datafusion/common/src/file_options/csv_writer.rs +++ b/datafusion/common/src/file_options/csv_writer.rs @@ -63,6 +63,9 @@ impl TryFrom<&CsvOptions> for CsvWriterOptions { if let Some(v) = &value.timestamp_format { builder = builder.with_timestamp_format(v.into()) } + if let Some(v) = &value.timestamp_tz_format { + builder = builder.with_timestamp_tz_format(v.into()) + } if let Some(v) = &value.time_format { builder = builder.with_time_format(v.into()) } From a2a6458e420209c7125b08966c5726b5fd104195 Mon Sep 17 00:00:00 2001 From:
Andrew Lamb Date: Fri, 12 Jul 2024 11:53:03 -0400 Subject: [PATCH 05/19] Minor: improve documentation for sql unparsing (#11395) --- datafusion/sql/src/lib.rs | 6 ++- datafusion/sql/src/unparser/expr.rs | 29 +++++++++---- datafusion/sql/src/unparser/mod.rs | 64 +++++++++++++++++++++++++++-- datafusion/sql/src/unparser/plan.rs | 24 ++++++++--- 4 files changed, 105 insertions(+), 18 deletions(-) diff --git a/datafusion/sql/src/lib.rs b/datafusion/sql/src/lib.rs index eb5fec7a3c8b..f53cab5df848 100644 --- a/datafusion/sql/src/lib.rs +++ b/datafusion/sql/src/lib.rs @@ -17,7 +17,7 @@ // Make cheap clones clear: https://github.com/apache/datafusion/issues/11143 #![deny(clippy::clone_on_ref_ptr)] -//! This module provides: +//! This crate provides: //! //! 1. A SQL parser, [`DFParser`], that translates SQL query text into //! an abstract syntax tree (AST), [`Statement`]. @@ -25,10 +25,14 @@ //! 2. A SQL query planner [`SqlToRel`] that creates [`LogicalPlan`]s //! from [`Statement`]s. //! +//! 3. A SQL [`unparser`] that converts [`Expr`]s and [`LogicalPlan`]s +//! into SQL query text. +//! //! [`DFParser`]: parser::DFParser //! [`Statement`]: parser::Statement //! [`SqlToRel`]: planner::SqlToRel //! [`LogicalPlan`]: datafusion_expr::logical_plan::LogicalPlan +//! [`Expr`]: datafusion_expr::expr::Expr mod cte; mod expr; diff --git a/datafusion/sql/src/unparser/expr.rs b/datafusion/sql/src/unparser/expr.rs index e0d05c400cb0..eb149c819c8b 100644 --- a/datafusion/sql/src/unparser/expr.rs +++ b/datafusion/sql/src/unparser/expr.rs @@ -72,21 +72,34 @@ impl Display for Unparsed { } } -/// Convert a DataFusion [`Expr`] to `sqlparser::ast::Expr` +/// Convert a DataFusion [`Expr`] to [`ast::Expr`] /// -/// This function is the opposite of `SqlToRel::sql_to_expr` and can -/// be used to, among other things, convert [`Expr`]s to strings. 
-/// Throws an error if [`Expr`] can not be represented by an `sqlparser::ast::Expr` +/// This function is the opposite of [`SqlToRel::sql_to_expr`] and can be used +/// to, among other things, convert [`Expr`]s to SQL strings. Such strings could +/// be used to pass filters or other expressions to another SQL engine. +/// +/// # Errors +/// +/// Throws an error if [`Expr`] can not be represented by an [`ast::Expr`] +/// +/// # See Also +/// +/// * [`Unparser`] for more control over the conversion to SQL +/// * [`plan_to_sql`] for converting a [`LogicalPlan`] to SQL /// /// # Example /// ``` /// use datafusion_expr::{col, lit}; /// use datafusion_sql::unparser::expr_to_sql; -/// let expr = col("a").gt(lit(4)); -/// let sql = expr_to_sql(&expr).unwrap(); -/// -/// assert_eq!(format!("{}", sql), "(a > 4)") +/// let expr = col("a").gt(lit(4)); // form an expression `a > 4` +/// let sql = expr_to_sql(&expr).unwrap(); // convert to ast::Expr +/// // use the Display impl to convert to SQL text +/// assert_eq!(sql.to_string(), "(a > 4)") /// ``` +/// +/// [`SqlToRel::sql_to_expr`]: crate::planner::SqlToRel::sql_to_expr +/// [`plan_to_sql`]: crate::unparser::plan_to_sql +/// [`LogicalPlan`]: datafusion_expr::logical_plan::LogicalPlan pub fn expr_to_sql(expr: &Expr) -> Result { let unparser = Unparser::default(); unparser.expr_to_sql(expr) diff --git a/datafusion/sql/src/unparser/mod.rs b/datafusion/sql/src/unparser/mod.rs index e5ffbc8a212a..83ae64ba238b 100644 --- a/datafusion/sql/src/unparser/mod.rs +++ b/datafusion/sql/src/unparser/mod.rs @@ -15,6 +15,8 @@ // specific language governing permissions and limitations // under the License. +//! [`Unparser`] for converting `Expr` to SQL text + mod ast; mod expr; mod plan; @@ -27,6 +29,29 @@ pub use plan::plan_to_sql; use self::dialect::{DefaultDialect, Dialect}; pub mod dialect; +/// Convert a DataFusion [`Expr`] to [`sqlparser::ast::Expr`] +/// +/// See [`expr_to_sql`] for background. 
`Unparser` allows greater control of +/// the conversion, but with a more complicated API. +/// +/// To get more human-readable output, see [`Self::with_pretty`] +/// +/// # Example +/// ``` +/// use datafusion_expr::{col, lit}; +/// use datafusion_sql::unparser::Unparser; +/// let expr = col("a").gt(lit(4)); // form an expression `a > 4` +/// let unparser = Unparser::default(); +/// let sql = unparser.expr_to_sql(&expr).unwrap();// convert to AST +/// // use the Display impl to convert to SQL text +/// assert_eq!(sql.to_string(), "(a > 4)"); +/// // now convert to pretty sql +/// let unparser = unparser.with_pretty(true); +/// let sql = unparser.expr_to_sql(&expr).unwrap(); +/// assert_eq!(sql.to_string(), "a > 4"); // note lack of parenthesis +/// ``` +/// +/// [`Expr`]: datafusion_expr::Expr pub struct Unparser<'a> { dialect: &'a dyn Dialect, pretty: bool, @@ -40,9 +65,42 @@ impl<'a> Unparser<'a> { } } - /// Allow unparser to remove parenthesis according to the precedence rules of DataFusion. - /// This might make it invalid SQL for other SQL query engines with different precedence - /// rules, even if its valid for DataFusion. + /// Create pretty SQL output, better suited for human consumption + /// + /// See example on the struct level documentation + /// + /// # Pretty Output + /// + /// By default, `Unparser` generates SQL text that will parse back to the + /// same parsed [`Expr`], which is useful for creating machine readable + /// expressions to send to other systems. However, the resulting expressions are + /// not always nice to read for humans. + /// + /// For example + /// + /// ```sql + /// ((a + 4) > 5) + /// ``` + /// + /// This method removes parentheses according to the precedence rules of + /// DataFusion. If the output is reparsed, the resulting [`Expr`] produces the + /// same value as the original in DataFusion, but with a potentially + /// different order of operations.
+ /// + /// Note that this setting may create invalid SQL for other SQL query + /// engines with different precedence rules + /// + /// # Example + /// ``` + /// use datafusion_expr::{col, lit}; + /// use datafusion_sql::unparser::Unparser; + /// let expr = col("a").gt(lit(4)).and(col("b").lt(lit(5))); // form an expression `a > 4 AND b < 5` + /// let unparser = Unparser::default().with_pretty(true); + /// let sql = unparser.expr_to_sql(&expr).unwrap(); + /// assert_eq!(sql.to_string(), "a > 4 AND b < 5"); // note lack of parenthesis + /// ``` + /// + /// [`Expr`]: datafusion_expr::Expr pub fn with_pretty(mut self, pretty: bool) -> Self { self.pretty = pretty; self diff --git a/datafusion/sql/src/unparser/plan.rs b/datafusion/sql/src/unparser/plan.rs index 15137403c582..41a8c968841b 100644 --- a/datafusion/sql/src/unparser/plan.rs +++ b/datafusion/sql/src/unparser/plan.rs @@ -33,10 +33,18 @@ use super::{ Unparser, }; -/// Convert a DataFusion [`LogicalPlan`] to `sqlparser::ast::Statement` +/// Convert a DataFusion [`LogicalPlan`] to [`ast::Statement`] /// -/// This function is the opposite of `SqlToRel::sql_statement_to_plan` and can -/// be used to, among other things, convert `LogicalPlan`s to strings. +/// This function is the opposite of [`SqlToRel::sql_statement_to_plan`] and can +/// be used to, among other things, convert `LogicalPlan`s to SQL strings. +/// +/// # Errors +/// +/// This function returns an error if the plan cannot be converted to SQL.
+/// +/// # See Also +/// +/// * [`expr_to_sql`] for converting [`Expr`], a single expression to SQL /// /// # Example /// ``` @@ -47,16 +55,20 @@ use super::{ /// Field::new("id", DataType::Utf8, false), /// Field::new("value", DataType::Utf8, false), /// ]); +/// // Scan 'table' and select columns 'id' and 'value' /// let plan = table_scan(Some("table"), &schema, None) /// .unwrap() /// .project(vec![col("id"), col("value")]) /// .unwrap() /// .build() /// .unwrap(); -/// let sql = plan_to_sql(&plan).unwrap(); -/// -/// assert_eq!(format!("{}", sql), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"") +/// let sql = plan_to_sql(&plan).unwrap(); // convert to AST +/// // use the Display impl to convert to SQL text +/// assert_eq!(sql.to_string(), "SELECT \"table\".id, \"table\".\"value\" FROM \"table\"") /// ``` +/// +/// [`SqlToRel::sql_statement_to_plan`]: crate::planner::SqlToRel::sql_statement_to_plan +/// [`expr_to_sql`]: crate::unparser::expr_to_sql pub fn plan_to_sql(plan: &LogicalPlan) -> Result { let unparser = Unparser::default(); unparser.plan_to_sql(plan) From dc21a6c25893e7906da588debf18a8e5918b3b32 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 11:53:44 -0400 Subject: [PATCH 06/19] Minor: Consolidate specificataion doc sections (#11427) --- docs/source/contributor-guide/index.md | 16 ---------------- .../contributor-guide/specification/index.rst | 10 ++++++++++ 2 files changed, 10 insertions(+), 16 deletions(-) diff --git a/docs/source/contributor-guide/index.md b/docs/source/contributor-guide/index.md index 891277f64757..ad49b614c334 100644 --- a/docs/source/contributor-guide/index.md +++ b/docs/source/contributor-guide/index.md @@ -134,19 +134,3 @@ The good thing about open code and open development is that any issues in one ch Pull requests will be marked with a `stale` label after 60 days of inactivity and then closed 7 days after that. Commenting on the PR will remove the `stale` label. 
- -## Specifications - -We formalize some DataFusion semantics and behaviors through specification -documents. These specifications are useful to be used as references to help -resolve ambiguities during development or code reviews. - -You are also welcome to propose changes to existing specifications or create -new specifications as you see fit. - -Here is the list current active specifications: - -- [Output field name semantic](https://datafusion.apache.org/contributor-guide/specification/output-field-name-semantic.html) -- [Invariants](https://datafusion.apache.org/contributor-guide/specification/invariants.html) - -All specifications are stored in the `docs/source/specification` folder. diff --git a/docs/source/contributor-guide/specification/index.rst b/docs/source/contributor-guide/specification/index.rst index bcd5a895c4d2..a34f0b19e4de 100644 --- a/docs/source/contributor-guide/specification/index.rst +++ b/docs/source/contributor-guide/specification/index.rst @@ -18,6 +18,16 @@ Specifications ============== +We formalize some DataFusion semantics and behaviors through specification +documents. These specifications are useful as references to help +resolve ambiguities during development or code reviews. + +You are also welcome to propose changes to existing specifications or create +new specifications as you see fit. All specifications are stored in the +`docs/source/specification` folder. Here is the list of currently active +specifications: + + ..
toctree:: :maxdepth: 1 From b075ac471e6d27dfe40b6586a72070a9ec4751a9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 15:27:16 -0400 Subject: [PATCH 07/19] Minor: consolidate doc roadmap pages (#11426) --- .../contributor-guide/quarterly_roadmap.md | 96 ------------------- docs/source/contributor-guide/roadmap.md | 81 ++++++++++++++++ docs/source/index.rst | 1 - 3 files changed, 81 insertions(+), 97 deletions(-) delete mode 100644 docs/source/contributor-guide/quarterly_roadmap.md diff --git a/docs/source/contributor-guide/quarterly_roadmap.md b/docs/source/contributor-guide/quarterly_roadmap.md deleted file mode 100644 index ee82617225aa..000000000000 --- a/docs/source/contributor-guide/quarterly_roadmap.md +++ /dev/null @@ -1,96 +0,0 @@ - - -# Quarterly Roadmap - -A quarterly roadmap will be published to give the DataFusion community visibility into the priorities of the projects contributors. This roadmap is not binding. - -## 2023 Q4 - -- Improve data output (`COPY`, `INSERT` and DataFrame) output capability [#6569](https://github.com/apache/datafusion/issues/6569) -- Implementation of `ARRAY` types and related functions [#6980](https://github.com/apache/datafusion/issues/6980) -- Write an industrial paper about DataFusion for SIGMOD [#6782](https://github.com/apache/datafusion/issues/6782) - -## 2022 Q2 - -### DataFusion Core - -- IO Improvements - - Reading, registering, and writing more file formats from both DataFrame API and SQL - - Additional options for IO including partitioning and metadata support -- Work Scheduling - - Improve predictability, observability and performance of IO and CPU-bound work - - Develop a more explicit story for managing parallelism during plan execution -- Memory Management - - Add more operators for memory limited execution -- Performance - - Incorporate row-format into operators such as aggregate - - Add row-format benchmarks - - Explore JIT-compiling complex expressions - - Explore LLVM for JIT, with inline 
Rust functions as the primary goal - - Improve performance of Sort and Merge using Row Format / JIT expressions -- Documentation - - General improvements to DataFusion website - - Publish design documents -- Streaming - - Create `StreamProvider` trait - -### Ballista - -- Make production ready - - Shuffle file cleanup - - Fill functional gaps between DataFusion and Ballista - - Improve task scheduling and data exchange efficiency - - Better error handling - - Task failure - - Executor lost - - Schedule restart - - Improve monitoring and logging - - Auto scaling support -- Support for multi-scheduler deployments. Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching. -- Executor deployment grouping based on resource allocation - -### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib)) - -#### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python) - -- Add missing functionality to DataFrame and SessionContext -- Improve documentation - -#### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3) - -- Create Python bindings to use with datafusion-python - -#### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui) - -- Create multiple SQL editors -- Expose more Context and query metadata -- Support new data sources - - BigTable, HDFS, HTTP APIs - -#### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable) - -- Python binding to use with datafusion-python -- Timestamp range predicate pushdown -- Multi-threaded partition aware execution -- Production ready Rust SDK - -#### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams) - -- Create experimental implementation of `StreamProvider` trait diff --git a/docs/source/contributor-guide/roadmap.md b/docs/source/contributor-guide/roadmap.md index a6d78d9311aa..3d9c1ee371fe 100644 --- 
a/docs/source/contributor-guide/roadmap.md +++ b/docs/source/contributor-guide/roadmap.md @@ -43,3 +43,84 @@ start a conversation using a github issue or the make review efficient and avoid surprises. [The current list of `EPIC`s can be found here](https://github.com/apache/datafusion/issues?q=is%3Aissue+is%3Aopen+epic). + +# Quarterly Roadmap + +A quarterly roadmap will be published to give the DataFusion community +visibility into the priorities of the project's contributors. This roadmap is not +binding and we would welcome any/all contributions to help keep this list up to +date. + +## 2023 Q4 + +- Improve data output (`COPY`, `INSERT` and DataFrame) output capability [#6569](https://github.com/apache/datafusion/issues/6569) +- Implementation of `ARRAY` types and related functions [#6980](https://github.com/apache/datafusion/issues/6980) +- Write an industrial paper about DataFusion for SIGMOD [#6782](https://github.com/apache/datafusion/issues/6782) + +## 2022 Q2 + +### DataFusion Core + +- IO Improvements + - Reading, registering, and writing more file formats from both DataFrame API and SQL + - Additional options for IO including partitioning and metadata support +- Work Scheduling + - Improve predictability, observability and performance of IO and CPU-bound work + - Develop a more explicit story for managing parallelism during plan execution +- Memory Management + - Add more operators for memory limited execution +- Performance + - Incorporate row-format into operators such as aggregate + - Add row-format benchmarks + - Explore JIT-compiling complex expressions + - Explore LLVM for JIT, with inline Rust functions as the primary goal + - Improve performance of Sort and Merge using Row Format / JIT expressions +- Documentation + - General improvements to DataFusion website + - Publish design documents +- Streaming + - Create `StreamProvider` trait + +### Ballista + +- Make production ready + - Shuffle file cleanup + - Fill functional gaps between DataFusion
and Ballista + - Improve task scheduling and data exchange efficiency + - Better error handling + - Task failure + - Executor lost + - Schedule restart + - Improve monitoring and logging + - Auto scaling support +- Support for multi-scheduler deployments. Initially for resiliency and fault tolerance but ultimately to support sharding for scalability and more efficient caching. +- Executor deployment grouping based on resource allocation + +### Extensions ([datafusion-contrib](https://github.com/datafusion-contrib)) + +### [DataFusion-Python](https://github.com/datafusion-contrib/datafusion-python) + +- Add missing functionality to DataFrame and SessionContext +- Improve documentation + +### [DataFusion-S3](https://github.com/datafusion-contrib/datafusion-objectstore-s3) + +- Create Python bindings to use with datafusion-python + +### [DataFusion-Tui](https://github.com/datafusion-contrib/datafusion-tui) + +- Create multiple SQL editors +- Expose more Context and query metadata +- Support new data sources + - BigTable, HDFS, HTTP APIs + +### [DataFusion-BigTable](https://github.com/datafusion-contrib/datafusion-bigtable) + +- Python binding to use with datafusion-python +- Timestamp range predicate pushdown +- Multi-threaded partition aware execution +- Production ready Rust SDK + +### [DataFusion-Streams](https://github.com/datafusion-contrib/datafusion-streams) + +- Create experimental implementation of `StreamProvider` trait diff --git a/docs/source/index.rst b/docs/source/index.rst index 8fbff208f561..ca6905c434f3 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -121,7 +121,6 @@ To get started, see contributor-guide/testing contributor-guide/howtos contributor-guide/roadmap - contributor-guide/quarterly_roadmap contributor-guide/governance contributor-guide/inviting contributor-guide/specification/index From d5367f3ff5ed506e824a04c68120194deb68a908 Mon Sep 17 00:00:00 2001 From: Georgi Krastev Date: Fri, 12 Jul 2024 22:34:35 +0300 Subject: 
[PATCH 08/19] Avoid calling shutdown after failed write of AsyncWrite (#249) (#250) (#11415) in `serialize_rb_stream_to_object_store` --- .../file_format/write/orchestration.rs | 27 +++++++++++-------- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/datafusion/core/src/datasource/file_format/write/orchestration.rs b/datafusion/core/src/datasource/file_format/write/orchestration.rs index a62b5715aeb3..8bd0dae9f5a4 100644 --- a/datafusion/core/src/datasource/file_format/write/orchestration.rs +++ b/datafusion/core/src/datasource/file_format/write/orchestration.rs @@ -42,15 +42,20 @@ use tokio::task::JoinSet; type WriterType = Box; type SerializerType = Arc; -/// Serializes a single data stream in parallel and writes to an ObjectStore -/// concurrently. Data order is preserved. In the event of an error, -/// the ObjectStore writer is returned to the caller in addition to an error, -/// so that the caller may handle aborting failed writes. +/// Serializes a single data stream in parallel and writes to an ObjectStore concurrently. +/// Data order is preserved. +/// +/// In the event of a non-IO error which does not involve the ObjectStore writer, +/// the writer is returned to the caller in addition to the error, +/// so that failed writes may be aborted. +/// +/// In the event of an IO error involving the ObjectStore writer, +/// the writer is dropped to avoid calling further methods on it which might panic.
pub(crate) async fn serialize_rb_stream_to_object_store( mut data_rx: Receiver, serializer: Arc, mut writer: WriterType, -) -> std::result::Result<(WriterType, u64), (WriterType, DataFusionError)> { +) -> std::result::Result<(WriterType, u64), (Option, DataFusionError)> { let (tx, mut rx) = mpsc::channel::>>(100); let serialize_task = SpawnedTask::spawn(async move { @@ -82,7 +87,7 @@ pub(crate) async fn serialize_rb_stream_to_object_store( Ok(_) => (), Err(e) => { return Err(( - writer, + None, DataFusionError::Execution(format!( "Error writing to object store: {e}" )), @@ -93,12 +98,12 @@ pub(crate) async fn serialize_rb_stream_to_object_store( } Ok(Err(e)) => { // Return the writer along with the error - return Err((writer, e)); + return Err((Some(writer), e)); } Err(e) => { // Handle task panic or cancellation return Err(( - writer, + Some(writer), DataFusionError::Execution(format!( "Serialization task panicked or was cancelled: {e}" )), @@ -109,10 +114,10 @@ pub(crate) async fn serialize_rb_stream_to_object_store( match serialize_task.join().await { Ok(Ok(_)) => (), - Ok(Err(e)) => return Err((writer, e)), + Ok(Err(e)) => return Err((Some(writer), e)), Err(_) => { return Err(( - writer, + Some(writer), internal_datafusion_err!("Unknown error writing to object store"), )) } @@ -153,7 +158,7 @@ pub(crate) async fn stateless_serialize_and_write_files( row_count += cnt; } Err((writer, e)) => { - finished_writers.push(writer); + finished_writers.extend(writer); any_errors = true; triggering_error = Some(e); } From 02335ebe2dd36081e22ed2d8ab46287c6d950a5c Mon Sep 17 00:00:00 2001 From: kamille Date: Sat, 13 Jul 2024 03:50:22 +0800 Subject: [PATCH 09/19] Short term way to make `AggregateStatistics` still work when min/max is converted to udaf (#11261) * impl the short term solution. * add todos. 
--- .../aggregate_statistics.rs | 136 +++++++++++------- 1 file changed, 85 insertions(+), 51 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs index 7e9aec9e5e4c..66067d8cb5c4 100644 --- a/datafusion/core/src/physical_optimizer/aggregate_statistics.rs +++ b/datafusion/core/src/physical_optimizer/aggregate_statistics.rs @@ -140,31 +140,29 @@ fn take_optimizable_column_and_table_count( stats: &Statistics, ) -> Option<(ScalarValue, String)> { let col_stats = &stats.column_statistics; - if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { - if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() { - if let Precision::Exact(num_rows) = stats.num_rows { - let exprs = agg_expr.expressions(); - if exprs.len() == 1 { - // TODO optimize with exprs other than Column - if let Some(col_expr) = - exprs[0].as_any().downcast_ref::() - { - let current_val = &col_stats[col_expr.index()].null_count; - if let &Precision::Exact(val) = current_val { - return Some(( - ScalarValue::Int64(Some((num_rows - val) as i64)), - agg_expr.name().to_string(), - )); - } - } else if let Some(lit_expr) = - exprs[0].as_any().downcast_ref::() - { - if lit_expr.value() == &COUNT_STAR_EXPANSION { - return Some(( - ScalarValue::Int64(Some(num_rows as i64)), - agg_expr.name().to_string(), - )); - } + if is_non_distinct_count(agg_expr) { + if let Precision::Exact(num_rows) = stats.num_rows { + let exprs = agg_expr.expressions(); + if exprs.len() == 1 { + // TODO optimize with exprs other than Column + if let Some(col_expr) = + exprs[0].as_any().downcast_ref::() + { + let current_val = &col_stats[col_expr.index()].null_count; + if let &Precision::Exact(val) = current_val { + return Some(( + ScalarValue::Int64(Some((num_rows - val) as i64)), + agg_expr.name().to_string(), + )); + } + } else if let Some(lit_expr) = + exprs[0].as_any().downcast_ref::() + { + if lit_expr.value() == 
&COUNT_STAR_EXPANSION { + return Some(( + ScalarValue::Int64(Some(num_rows as i64)), + agg_expr.name().to_string(), + )); } } } @@ -182,26 +180,22 @@ fn take_optimizable_min( match *num_rows { 0 => { // MIN/MAX with 0 rows is always null - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { + if is_min(agg_expr) { if let Ok(min_data_type) = - ScalarValue::try_from(casted_expr.field().unwrap().data_type()) + ScalarValue::try_from(agg_expr.field().unwrap().data_type()) { - return Some((min_data_type, casted_expr.name().to_string())); + return Some((min_data_type, agg_expr.name().to_string())); } } } value if value > 0 => { let col_stats = &stats.column_statistics; - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { - if casted_expr.expressions().len() == 1 { + if is_min(agg_expr) { + let exprs = agg_expr.expressions(); + if exprs.len() == 1 { // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + if let Some(col_expr) = + exprs[0].as_any().downcast_ref::() { if let Precision::Exact(val) = &col_stats[col_expr.index()].min_value @@ -209,7 +203,7 @@ fn take_optimizable_min( if !val.is_null() { return Some(( val.clone(), - casted_expr.name().to_string(), + agg_expr.name().to_string(), )); } } @@ -232,26 +226,22 @@ fn take_optimizable_max( match *num_rows { 0 => { // MIN/MAX with 0 rows is always null - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { + if is_max(agg_expr) { if let Ok(max_data_type) = - ScalarValue::try_from(casted_expr.field().unwrap().data_type()) + ScalarValue::try_from(agg_expr.field().unwrap().data_type()) { - return Some((max_data_type, casted_expr.name().to_string())); + return Some((max_data_type, agg_expr.name().to_string())); } } } value if value > 0 => { let col_stats = &stats.column_statistics; - if let Some(casted_expr) = - agg_expr.as_any().downcast_ref::() - { - if casted_expr.expressions().len() == 1 { + if 
is_max(agg_expr) { + let exprs = agg_expr.expressions(); + if exprs.len() == 1 { // TODO optimize with exprs other than Column - if let Some(col_expr) = casted_expr.expressions()[0] - .as_any() - .downcast_ref::() + if let Some(col_expr) = + exprs[0].as_any().downcast_ref::() { if let Precision::Exact(val) = &col_stats[col_expr.index()].max_value @@ -259,7 +249,7 @@ fn take_optimizable_max( if !val.is_null() { return Some(( val.clone(), - casted_expr.name().to_string(), + agg_expr.name().to_string(), )); } } @@ -273,6 +263,50 @@ fn take_optimizable_max( None } +// TODO: Move this check into AggregateUDFImpl +// https://github.com/apache/datafusion/issues/11153 +fn is_non_distinct_count(agg_expr: &dyn AggregateExpr) -> bool { + if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { + if agg_expr.fun().name() == "count" && !agg_expr.is_distinct() { + return true; + } + } + + false +} + +// TODO: Move this check into AggregateUDFImpl +// https://github.com/apache/datafusion/issues/11153 +fn is_min(agg_expr: &dyn AggregateExpr) -> bool { + if agg_expr.as_any().is::() { + return true; + } + + if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { + if agg_expr.fun().name() == "min" { + return true; + } + } + + false +} + +// TODO: Move this check into AggregateUDFImpl +// https://github.com/apache/datafusion/issues/11153 +fn is_max(agg_expr: &dyn AggregateExpr) -> bool { + if agg_expr.as_any().is::() { + return true; + } + + if let Some(agg_expr) = agg_expr.as_any().downcast_ref::() { + if agg_expr.fun().name() == "max" { + return true; + } + } + + false +} + #[cfg(test)] pub(crate) mod tests { use super::*; From bd25e26747a271752b7f46aa0970022525eff05b Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Fri, 12 Jul 2024 12:51:01 -0700 Subject: [PATCH 10/19] Implement TPCH substrait integration test, support tpch_13, tpch_14, tpch_16 (#11405) optimize code --- .../tests/cases/consumer_integration.rs | 86 +- 
.../tpch_substrait_plans/query_13.json | 624 +++++++++ .../tpch_substrait_plans/query_14.json | 924 +++++++++++++ .../tpch_substrait_plans/query_16.json | 1175 +++++++++++++++++ 4 files changed, 2808 insertions(+), 1 deletion(-) create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index 10c1319b903b..c8130220ef4a 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -40,7 +40,6 @@ mod tests { } Ok(ctx) } - #[tokio::test] async fn tpch_test_1() -> Result<()> { let ctx = create_context(vec![( @@ -314,4 +313,89 @@ mod tests { \n TableScan: FILENAME_PLACEHOLDER_2 projection=[n_nationkey, n_name, n_regionkey, n_comment]"); Ok(()) } + + // missing query 12 + #[tokio::test] + async fn tpch_test_13() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/orders.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_13.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey) AS C_COUNT, count(Int64(1)) AS CUSTDIST\ + \n Sort: count(Int64(1)) DESC NULLS FIRST, count(FILENAME_PLACEHOLDER_1.o_orderkey) DESC NULLS FIRST\ + \n Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey), count(Int64(1))\ + \n Aggregate: 
groupBy=[[count(FILENAME_PLACEHOLDER_1.o_orderkey)]], aggr=[[count(Int64(1))]]\ + \n Projection: count(FILENAME_PLACEHOLDER_1.o_orderkey)\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.c_custkey]], aggr=[[count(FILENAME_PLACEHOLDER_1.o_orderkey)]]\ + \n Projection: FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_1.o_orderkey\ + \n Left Join: FILENAME_PLACEHOLDER_0.c_custkey = FILENAME_PLACEHOLDER_1.o_custkey Filter: NOT FILENAME_PLACEHOLDER_1.o_comment LIKE CAST(Utf8(\"%special%requests%\") AS Utf8)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_14() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_14.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: Decimal128(Some(10000),5,2) * sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE Decimal128(Some(0),19,0) END) / sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS PROMO_REVENUE\ + \n Aggregate: groupBy=[[]], aggr=[[sum(CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount ELSE 
Decimal128(Some(0),19,0) END), sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount)]]\ + \n Projection: CASE WHEN FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"PROMO%\") AS Utf8) THEN FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount) ELSE Decimal128(Some(0),19,0) END, FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ + \n Filter: FILENAME_PLACEHOLDER_0.l_partkey = FILENAME_PLACEHOLDER_1.p_partkey AND FILENAME_PLACEHOLDER_0.l_shipdate >= Date32(\"1995-09-01\") AND FILENAME_PLACEHOLDER_0.l_shipdate < CAST(Utf8(\"1995-10-01\") AS Date32)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); + Ok(()) + } + // query 15 is missing + #[tokio::test] + async fn tpch_test_16() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/partsupp.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/supplier.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_16.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_1.p_brand AS P_BRAND, FILENAME_PLACEHOLDER_1.p_type AS P_TYPE, FILENAME_PLACEHOLDER_1.p_size AS P_SIZE, count(DISTINCT 
FILENAME_PLACEHOLDER_0.ps_suppkey) AS SUPPLIER_CNT\ + \n Sort: count(DISTINCT FILENAME_PLACEHOLDER_0.ps_suppkey) DESC NULLS FIRST, FILENAME_PLACEHOLDER_1.p_brand ASC NULLS LAST, FILENAME_PLACEHOLDER_1.p_type ASC NULLS LAST, FILENAME_PLACEHOLDER_1.p_size ASC NULLS LAST\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_1.p_brand, FILENAME_PLACEHOLDER_1.p_type, FILENAME_PLACEHOLDER_1.p_size]], aggr=[[count(DISTINCT FILENAME_PLACEHOLDER_0.ps_suppkey)]]\ + \n Projection: FILENAME_PLACEHOLDER_1.p_brand, FILENAME_PLACEHOLDER_1.p_type, FILENAME_PLACEHOLDER_1.p_size, FILENAME_PLACEHOLDER_0.ps_suppkey\ + \n Filter: FILENAME_PLACEHOLDER_1.p_partkey = FILENAME_PLACEHOLDER_0.ps_partkey AND FILENAME_PLACEHOLDER_1.p_brand != CAST(Utf8(\"Brand#45\") AS Utf8) AND NOT FILENAME_PLACEHOLDER_1.p_type LIKE CAST(Utf8(\"MEDIUM POLISHED%\") AS Utf8) AND (FILENAME_PLACEHOLDER_1.p_size = Int32(49) OR FILENAME_PLACEHOLDER_1.p_size = Int32(14) OR FILENAME_PLACEHOLDER_1.p_size = Int32(23) OR FILENAME_PLACEHOLDER_1.p_size = Int32(45) OR FILENAME_PLACEHOLDER_1.p_size = Int32(19) OR FILENAME_PLACEHOLDER_1.p_size = Int32(3) OR FILENAME_PLACEHOLDER_1.p_size = Int32(36) OR FILENAME_PLACEHOLDER_1.p_size = Int32(9)) AND NOT CAST(FILENAME_PLACEHOLDER_0.ps_suppkey IN () AS Boolean)\ + \n Subquery:\ + \n Projection: FILENAME_PLACEHOLDER_2.s_suppkey\ + \n Filter: FILENAME_PLACEHOLDER_2.s_comment LIKE CAST(Utf8(\"%Customer%Complaints%\") AS Utf8)\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); + Ok(()) + } } diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json 
b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json new file mode 100644 index 000000000000..c88e61e78304 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_13.json @@ -0,0 +1,624 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 4, + "uri": "/functions_aggregate_generic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 2, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "count:opt_any" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "count:opt" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2, + 3 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 17, + 18 + ] + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + 
"C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + "O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 
79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "%special%requests%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + ] + } + }, + "type": "JOIN_TYPE_LEFT" + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 4, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [] + } + } + ] + } + }, + "expressions": [ + { + "selection": { + "directReference": { 
+ "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + } + ] + } + }, + "names": [ + "C_COUNT", + "CUSTDIST" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json new file mode 100644 index 000000000000..380b71df8aac --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_14.json @@ -0,0 +1,924 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 5, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gte:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "lt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "like:vchar_vchar" + } 
+ }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 5, + "name": "multiply:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 6, + "name": "subtract:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 7, + "name": "sum:opt_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 8, + "name": "divide:opt_decimal_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 25, + 26 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + 
"scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + 
}, + { + "value": { + "literal": { + "date": 9374, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1995-10-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "ifThen": { + "ifs": [ + { + "if": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 20 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "PROMO%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + }, + "then": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 6, + 
"args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + } + ], + "else": { + "literal": { + "decimal": { + "value": "AAAAAAAAAAAAAAAAAAAAAA==", + "precision": 19, + "scale": 0 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + } + }, + { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + ] + } + }, + "groupings": 
[ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + }, + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 8, + "args": [], + "outputType": { + "decimal": { + "scale": 2, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "decimal": { + "scale": 2, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "literal": { + "decimal": { + "value": "ECcAAAAAAAAAAAAAAAAAAA==", + "precision": 5, + "scale": 2 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] 
+ } + } + ] + } + }, + "names": [ + "PROMO_REVENUE" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json new file mode 100644 index 000000000000..f988aa7a76a2 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_16.json @@ -0,0 +1,1175 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 4, + "uri": "/functions_aggregate_generic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "not_equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 3, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 4, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 5, + "name": "or:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 6, + "name": "count:opt_any" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 14, + 15, + 16, + 17 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": 
{ + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "PS_PARTKEY", + "PS_SUPPKEY", + "PS_AVAILQTY", + "PS_SUPPLYCOST", + "PS_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 199, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "Brand#45", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 
3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "MEDIUM POLISHED%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 49, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 14, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, 
+ { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 23, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 45, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 19, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 3, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 36, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 9, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "subquery": { + "inPredicate": { + "needles": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ], + "haystack": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 7 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "S_SUPPKEY", + "S_NAME", + "S_ADDRESS", + "S_NATIONKEY", + "S_PHONE", + "S_ACCTBAL", + "S_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "%Customer%Complaints%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + } + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + 
"directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_DISTINCT", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "names": [ + "P_BRAND", + "P_TYPE", + "P_SIZE", + "SUPPLIER_CNT" + ] + } + } + ], + "expectedTypeUrls": [] +} From 
4d04a6ebbb0458495d2282df34e8b22001f3971d Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 15:51:44 -0400 Subject: [PATCH 11/19] Minor: fix labeler rules (#11428) --- .github/workflows/dev_pr/labeler.yml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/dev_pr/labeler.yml b/.github/workflows/dev_pr/labeler.yml index 34a37948785b..308abd1688a6 100644 --- a/.github/workflows/dev_pr/labeler.yml +++ b/.github/workflows/dev_pr/labeler.yml @@ -17,11 +17,11 @@ development-process: - changed-files: - - any-glob-to-any-file: ['dev/**.*', '.github/**.*', 'ci/**.*', '.asf.yaml'] + - any-glob-to-any-file: ['dev/**/*', '.github/**/*', 'ci/**/*', '.asf.yaml'] documentation: - changed-files: - - any-glob-to-any-file: ['docs/**.*', 'README.md', './**/README.md', 'DEVELOPERS.md', 'datafusion/docs/**.*'] + - any-glob-to-any-file: ['docs/**/*', 'README.md', './**/README.md', 'DEVELOPERS.md', 'datafusion/docs/**/*'] sql: - changed-files: From 8f8df07c80aa66bb94d57c9619be93f9c3be92a9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 12 Jul 2024 23:14:17 -0400 Subject: [PATCH 12/19] Minor: change internal error to not supported error for nested field access (#11446) --- datafusion/sql/src/expr/identifier.rs | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/datafusion/sql/src/expr/identifier.rs b/datafusion/sql/src/expr/identifier.rs index d297b2e4df5b..39736b1fbba5 100644 --- a/datafusion/sql/src/expr/identifier.rs +++ b/datafusion/sql/src/expr/identifier.rs @@ -18,8 +18,8 @@ use crate::planner::{ContextProvider, PlannerContext, SqlToRel}; use arrow_schema::Field; use datafusion_common::{ - internal_err, plan_datafusion_err, Column, DFSchema, DataFusionError, Result, - ScalarValue, TableReference, + internal_err, not_impl_err, plan_datafusion_err, Column, DFSchema, DataFusionError, + Result, ScalarValue, TableReference, }; use datafusion_expr::{expr::ScalarFunction, lit, Case, Expr}; use 
sqlparser::ast::{Expr as SQLExpr, Ident}; @@ -118,7 +118,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // Though ideally once that support is in place, this code should work with it // TODO: remove when can support multiple nested identifiers if ids.len() > 5 { - return internal_err!("Unsupported compound identifier: {ids:?}"); + return not_impl_err!("Compound identifier: {ids:?}"); } let search_result = search_dfschema(&ids, schema); @@ -127,7 +127,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { Some((field, qualifier, nested_names)) if !nested_names.is_empty() => { // TODO: remove when can support multiple nested identifiers if nested_names.len() > 1 { - return internal_err!( + return not_impl_err!( "Nested identifiers not yet supported for column {}", Column::from((qualifier, field)).quoted_flat_name() ); @@ -154,7 +154,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { // return default where use all identifiers to not have a nested field // this len check is because at 5 identifiers will have to have a nested field if ids.len() == 5 { - internal_err!("Unsupported compound identifier: {ids:?}") + not_impl_err!("compound identifier: {ids:?}") } else { // check the outer_query_schema and try to find a match if let Some(outer) = planner_context.outer_query_schema() { @@ -165,7 +165,7 @@ impl<'a, S: ContextProvider> SqlToRel<'a, S> { if !nested_names.is_empty() => { // TODO: remove when can support nested identifiers for OuterReferenceColumn - internal_err!( + not_impl_err!( "Nested identifiers are not yet supported for OuterReferenceColumn {}", Column::from((qualifier, field)).quoted_flat_name() ) From 9e4a4a1599b9def33f27a6f82dd32045038de296 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sat, 13 Jul 2024 05:33:51 -0400 Subject: [PATCH 13/19] Minor: change Datafusion --> DataFusion in docs (#11439) * Minor: change Datafusion --> DataFusion in docs * update expected --- datafusion-examples/README.md | 4 ++-- datafusion-examples/examples/expr_api.rs | 2 
+- datafusion/common/src/config.rs | 2 +- datafusion/core/src/dataframe/mod.rs | 2 +- datafusion/expr/src/signature.rs | 2 +- datafusion/optimizer/src/unwrap_cast_in_comparison.rs | 2 +- datafusion/physical-expr/src/intervals/cp_solver.rs | 2 +- datafusion/physical-plan/src/aggregates/mod.rs | 2 +- datafusion/sql/src/parser.rs | 2 +- datafusion/sqllogictest/README.md | 2 +- datafusion/sqllogictest/test_files/information_schema.slt | 2 +- datafusion/sqllogictest/test_files/window.slt | 6 +++--- docs/source/contributor-guide/inviting.md | 2 +- docs/source/user-guide/configs.md | 2 +- 14 files changed, 17 insertions(+), 17 deletions(-) diff --git a/datafusion-examples/README.md b/datafusion-examples/README.md index 2696f74775cf..da01f60b527d 100644 --- a/datafusion-examples/README.md +++ b/datafusion-examples/README.md @@ -71,8 +71,8 @@ cargo run --example dataframe - [`parquet_index.rs`](examples/parquet_index.rs): Create an secondary index over several parquet files and use it to speed up queries - [`parquet_sql_multiple_files.rs`](examples/parquet_sql_multiple_files.rs): Build and run a query plan from a SQL statement against multiple local Parquet files - [`parquet_exec_visitor.rs`](examples/parquet_exec_visitor.rs): Extract statistics by visiting an ExecutionPlan after execution -- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into Datafusion `Expr`. -- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from Datafusion `Expr` and `LogicalPlan` +- [`parse_sql_expr.rs`](examples/parse_sql_expr.rs): Parse SQL text into DataFusion `Expr`. 
+- [`plan_to_sql.rs`](examples/plan_to_sql.rs): Generate SQL from DataFusion `Expr` and `LogicalPlan` - [`pruning.rs`](examples/pruning.rs): Use pruning to rule out files based on statistics - [`query-aws-s3.rs`](examples/external_dependency/query-aws-s3.rs): Configure `object_store` and run a query against files stored in AWS S3 - [`query-http-csv.rs`](examples/query-http-csv.rs): Configure `object_store` and run a query against files vi HTTP diff --git a/datafusion-examples/examples/expr_api.rs b/datafusion-examples/examples/expr_api.rs index 43729a913e5d..a5cf7011f811 100644 --- a/datafusion-examples/examples/expr_api.rs +++ b/datafusion-examples/examples/expr_api.rs @@ -83,7 +83,7 @@ async fn main() -> Result<()> { Ok(()) } -/// Datafusion's `expr_fn` API makes it easy to create [`Expr`]s for the +/// DataFusion's `expr_fn` API makes it easy to create [`Expr`]s for the /// full range of expression types such as aggregates and window functions. fn expr_fn_demo() -> Result<()> { // Let's say you want to call the "first_value" aggregate function diff --git a/datafusion/common/src/config.rs b/datafusion/common/src/config.rs index 1d2a9589adfc..880f0119ce0d 100644 --- a/datafusion/common/src/config.rs +++ b/datafusion/common/src/config.rs @@ -309,7 +309,7 @@ config_namespace! 
{ /// Currently experimental pub split_file_groups_by_statistics: bool, default = false - /// Should Datafusion keep the columns used for partition_by in the output RecordBatches + /// Should DataFusion keep the columns used for partition_by in the output RecordBatches pub keep_partition_by_columns: bool, default = false } } diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index d0f2852a6e53..05a08a637893 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -1472,7 +1472,7 @@ impl DataFrame { /// /// The method supports case sensitive rename with wrapping column name into one of following symbols ( " or ' or ` ) /// - /// Alternatively setting Datafusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable + /// Alternatively setting DataFusion param `datafusion.sql_parser.enable_ident_normalization` to `false` will enable /// case sensitive rename without need to wrap column name into special symbols /// /// # Example diff --git a/datafusion/expr/src/signature.rs b/datafusion/expr/src/signature.rs index 33f643eb2dc2..fba793dd229d 100644 --- a/datafusion/expr/src/signature.rs +++ b/datafusion/expr/src/signature.rs @@ -93,7 +93,7 @@ pub enum TypeSignature { Variadic(Vec), /// The acceptable signature and coercions rules to coerce arguments to this /// signature are special for this function. If this signature is specified, - /// Datafusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types. + /// DataFusion will call [`ScalarUDFImpl::coerce_types`] to prepare argument types. 
/// /// [`ScalarUDFImpl::coerce_types`]: crate::udf::ScalarUDFImpl::coerce_types UserDefined, diff --git a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs index 344708252559..9941da9dd65e 100644 --- a/datafusion/optimizer/src/unwrap_cast_in_comparison.rs +++ b/datafusion/optimizer/src/unwrap_cast_in_comparison.rs @@ -1080,7 +1080,7 @@ mod tests { ), }; - // Datafusion ignores timezones for comparisons of ScalarValue + // DataFusion ignores timezones for comparisons of ScalarValue // so double check it here assert_eq!(lit_tz_none, lit_tz_utc); diff --git a/datafusion/physical-expr/src/intervals/cp_solver.rs b/datafusion/physical-expr/src/intervals/cp_solver.rs index fc4950ae4e7c..f05ac3624b8e 100644 --- a/datafusion/physical-expr/src/intervals/cp_solver.rs +++ b/datafusion/physical-expr/src/intervals/cp_solver.rs @@ -176,7 +176,7 @@ impl ExprIntervalGraphNode { &self.interval } - /// This function creates a DAEG node from Datafusion's [`ExprTreeNode`] + /// This function creates a DAEG node from DataFusion's [`ExprTreeNode`] /// object. Literals are created with definite, singleton intervals while /// any other expression starts with an indefinite interval ([-∞, ∞]). pub fn make_node(node: &ExprTreeNode, schema: &Schema) -> Result { diff --git a/datafusion/physical-plan/src/aggregates/mod.rs b/datafusion/physical-plan/src/aggregates/mod.rs index 8caf10acf09b..8bf808af3b5b 100644 --- a/datafusion/physical-plan/src/aggregates/mod.rs +++ b/datafusion/physical-plan/src/aggregates/mod.rs @@ -324,7 +324,7 @@ impl AggregateExec { /// Create a new hash aggregate execution plan with the given schema. /// This constructor isn't part of the public API, it is used internally - /// by Datafusion to enforce schema consistency during when re-creating + /// by DataFusion to enforce schema consistency during when re-creating /// `AggregateExec`s inside optimization rules. 
Schema field names of an /// `AggregateExec` depends on the names of aggregate expressions. Since /// a rule may re-write aggregate expressions (e.g. reverse them) during diff --git a/datafusion/sql/src/parser.rs b/datafusion/sql/src/parser.rs index 5da7f7176509..8147092c34ab 100644 --- a/datafusion/sql/src/parser.rs +++ b/datafusion/sql/src/parser.rs @@ -253,7 +253,7 @@ fn ensure_not_set(field: &Option, name: &str) -> Result<(), ParserError> { Ok(()) } -/// Datafusion SQL Parser based on [`sqlparser`] +/// DataFusion SQL Parser based on [`sqlparser`] /// /// Parses DataFusion's SQL dialect, often delegating to [`sqlparser`]'s [`Parser`]. /// diff --git a/datafusion/sqllogictest/README.md b/datafusion/sqllogictest/README.md index 930df4796776..c7f04c0d762c 100644 --- a/datafusion/sqllogictest/README.md +++ b/datafusion/sqllogictest/README.md @@ -225,7 +225,7 @@ query ``` -- `test_name`: Uniquely identify the test name (Datafusion only) +- `test_name`: Uniquely identify the test name (DataFusion only) - `type_string`: A short string that specifies the number of result columns and the expected datatype of each result column. There is one character in the for each result column. The characters codes are: - 'B' - **B**oolean, diff --git a/datafusion/sqllogictest/test_files/information_schema.slt b/datafusion/sqllogictest/test_files/information_schema.slt index acd465a0c021..95bea1223a9c 100644 --- a/datafusion/sqllogictest/test_files/information_schema.slt +++ b/datafusion/sqllogictest/test_files/information_schema.slt @@ -257,7 +257,7 @@ datafusion.execution.batch_size 8192 Default batch size while creating new batch datafusion.execution.coalesce_batches true When set to true, record batches will be examined between each operator and small batches will be coalesced into larger batches. This is helpful when there are highly selective filters or joins that could produce tiny output batches. 
The target batch size is determined by the configuration setting datafusion.execution.collect_statistics false Should DataFusion collect statistics after listing files datafusion.execution.enable_recursive_ctes true Should DataFusion support recursive CTEs -datafusion.execution.keep_partition_by_columns false Should Datafusion keep the columns used for partition_by in the output RecordBatches +datafusion.execution.keep_partition_by_columns false Should DataFusion keep the columns used for partition_by in the output RecordBatches datafusion.execution.listing_table_ignore_subdirectory true Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). datafusion.execution.max_buffered_batches_per_output_file 2 This is the maximum number of RecordBatches buffered for each output file being worked. Higher values can potentially give faster write performance at the cost of higher peak memory consumption datafusion.execution.meta_fetch_concurrency 32 Number of files to read in parallel when inferring schema and statistics diff --git a/datafusion/sqllogictest/test_files/window.slt b/datafusion/sqllogictest/test_files/window.slt index 7f2e766aab91..a865a7ccbd8f 100644 --- a/datafusion/sqllogictest/test_files/window.slt +++ b/datafusion/sqllogictest/test_files/window.slt @@ -2236,7 +2236,7 @@ SELECT SUM(c12) OVER(ORDER BY c1, c2 GROUPS BETWEEN 1 PRECEDING AND 1 FOLLOWING) 7.728066219895 NULL # test_c9_rn_ordering_alias -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. +# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. 
# Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, @@ -2275,7 +2275,7 @@ SELECT c9, rn1 FROM (SELECT c9, 145294611 5 # test_c9_rn_ordering_alias_opposite_direction -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. +# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. # Physical plan shouldn't have a SortExec after the BoundedWindowAggExec since the table after BoundedWindowAggExec is already ordered by rn1 ASC and c9 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, @@ -2314,7 +2314,7 @@ SELECT c9, rn1 FROM (SELECT c9, 4076864659 5 # test_c9_rn_ordering_alias_opposite_direction2 -# These tests check whether Datafusion is aware of the ordering generated by the ROW_NUMBER() window function. +# These tests check whether DataFusion is aware of the ordering generated by the ROW_NUMBER() window function. # Physical plan _should_ have a SortExec after BoundedWindowAggExec since the table after BoundedWindowAggExec is ordered by rn1 ASC and c9 DESC, which is conflicting with the requirement rn1 DESC. query TT EXPLAIN SELECT c9, rn1 FROM (SELECT c9, diff --git a/docs/source/contributor-guide/inviting.md b/docs/source/contributor-guide/inviting.md index 967f417e6e9a..4066dd9699ee 100644 --- a/docs/source/contributor-guide/inviting.md +++ b/docs/source/contributor-guide/inviting.md @@ -59,7 +59,7 @@ the person. Here is an example: To: private@datafusion.apache.org Subject: [DISCUSS] $PERSONS_NAME for Committer -$PERSONS_NAME has been an active contributor to the Datafusion community for the +$PERSONS_NAME has been an active contributor to the DataFusion community for the last 6 months[1][2], helping others, answering questions, and improving the project's code. 
diff --git a/docs/source/user-guide/configs.md b/docs/source/user-guide/configs.md index 579088f991ef..5130b0a56d0e 100644 --- a/docs/source/user-guide/configs.md +++ b/docs/source/user-guide/configs.md @@ -86,7 +86,7 @@ Environment variables are read during `SessionConfig` initialisation so they mus | datafusion.execution.listing_table_ignore_subdirectory | true | Should sub directories be ignored when scanning directories for data files. Defaults to true (ignores subdirectories), consistent with Hive. Note that this setting does not affect reading partitioned tables (e.g. `/table/year=2021/month=01/data.parquet`). | | datafusion.execution.enable_recursive_ctes | true | Should DataFusion support recursive CTEs | | datafusion.execution.split_file_groups_by_statistics | false | Attempt to eliminate sorts by packing & sorting files with non-overlapping statistics into the same file groups. Currently experimental | -| datafusion.execution.keep_partition_by_columns | false | Should Datafusion keep the columns used for partition_by in the output RecordBatches | +| datafusion.execution.keep_partition_by_columns | false | Should DataFusion keep the columns used for partition_by in the output RecordBatches | | datafusion.optimizer.enable_distinct_aggregation_soft_limit | true | When set to true, the optimizer will push a limit operation into grouped aggregations which have no aggregate expressions, as a soft limit, emitting groups once the limit is reached, before all rows in the group are read. 
| | datafusion.optimizer.enable_round_robin_repartition | true | When set to true, the physical plan optimizer will try to add round robin repartitioning to increase parallelism to leverage more CPU cores | | datafusion.optimizer.enable_topk_aggregation | true | When set to true, the optimizer will attempt to perform limit operations during aggregations, if possible | From 08fa444aaa8513a60ede5c57d92f29e6156b91a8 Mon Sep 17 00:00:00 2001 From: Jonah Gao Date: Sat, 13 Jul 2024 17:34:45 +0800 Subject: [PATCH 14/19] fix: make sure JOIN ON expression is boolean type (#11423) * fix: make sure JOIN ON expression is boolean type * Applied to DataFrame * Update datafusion/optimizer/src/analyzer/type_coercion.rs Co-authored-by: Andrew Lamb --------- Co-authored-by: Andrew Lamb --- datafusion/core/src/dataframe/mod.rs | 31 +++++++++++++++++-- .../optimizer/src/analyzer/type_coercion.rs | 17 +++++++++- datafusion/sqllogictest/test_files/join.slt | 12 ++++++- 3 files changed, 55 insertions(+), 5 deletions(-) diff --git a/datafusion/core/src/dataframe/mod.rs b/datafusion/core/src/dataframe/mod.rs index 05a08a637893..c55b7c752765 100644 --- a/datafusion/core/src/dataframe/mod.rs +++ b/datafusion/core/src/dataframe/mod.rs @@ -896,9 +896,8 @@ impl DataFrame { join_type: JoinType, on_exprs: impl IntoIterator, ) -> Result { - let expr = on_exprs.into_iter().reduce(Expr::and); let plan = LogicalPlanBuilder::from(self.plan) - .join_on(right.plan, join_type, expr)? + .join_on(right.plan, join_type, on_exprs)? 
.build()?; Ok(DataFrame { session_state: self.session_state, @@ -1694,7 +1693,7 @@ mod tests { use crate::test_util::{register_aggregate_csv, test_table, test_table_with_name}; use arrow::array::{self, Int32Array}; - use datafusion_common::{Constraint, Constraints}; + use datafusion_common::{Constraint, Constraints, ScalarValue}; use datafusion_common_runtime::SpawnedTask; use datafusion_expr::{ array_agg, cast, create_udf, expr, lit, BuiltInWindowFunction, @@ -2555,6 +2554,32 @@ mod tests { Ok(()) } + #[tokio::test] + async fn join_on_filter_datatype() -> Result<()> { + let left = test_table_with_name("a").await?.select_columns(&["c1"])?; + let right = test_table_with_name("b").await?.select_columns(&["c1"])?; + + // JOIN ON untyped NULL + let join = left.clone().join_on( + right.clone(), + JoinType::Inner, + Some(Expr::Literal(ScalarValue::Null)), + )?; + let expected_plan = "CrossJoin:\ + \n TableScan: a projection=[c1], full_filters=[Boolean(NULL)]\ + \n TableScan: b projection=[c1]"; + assert_eq!(expected_plan, format!("{:?}", join.into_optimized_plan()?)); + + // JOIN ON expression must be boolean type + let join = left.join_on(right, JoinType::Inner, Some(lit("TRUE")))?; + let expected = join.into_optimized_plan().unwrap_err(); + assert_eq!( + expected.strip_backtrace(), + "type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8" + ); + Ok(()) + } + #[tokio::test] async fn join_ambiguous_filter() -> Result<()> { let left = test_table_with_name("a") diff --git a/datafusion/optimizer/src/analyzer/type_coercion.rs b/datafusion/optimizer/src/analyzer/type_coercion.rs index 3cab474df84e..80a8c864e431 100644 --- a/datafusion/optimizer/src/analyzer/type_coercion.rs +++ b/datafusion/optimizer/src/analyzer/type_coercion.rs @@ -127,7 +127,7 @@ impl<'a> TypeCoercionRewriter<'a> { Self { schema } } - /// Coerce join equality expressions + /// Coerce join equality expressions and join filter /// /// Joins must be treated 
specially as their equality expressions are stored /// as a parallel list of left and right expressions, rather than a single @@ -151,9 +151,24 @@ impl<'a> TypeCoercionRewriter<'a> { }) .collect::>>()?; + // Join filter must be boolean + join.filter = join + .filter + .map(|expr| self.coerce_join_filter(expr)) + .transpose()?; + Ok(LogicalPlan::Join(join)) } + fn coerce_join_filter(&self, expr: Expr) -> Result { + let expr_type = expr.get_type(self.schema)?; + match expr_type { + DataType::Boolean => Ok(expr), + DataType::Null => expr.cast_to(&DataType::Boolean, self.schema), + other => plan_err!("Join condition must be boolean type, but got {other:?}"), + } + } + fn coerce_binary_op( &self, left: Expr, diff --git a/datafusion/sqllogictest/test_files/join.slt b/datafusion/sqllogictest/test_files/join.slt index 12cb8b3985c7..efebba1779cf 100644 --- a/datafusion/sqllogictest/test_files/join.slt +++ b/datafusion/sqllogictest/test_files/join.slt @@ -988,7 +988,6 @@ statement ok DROP TABLE department -# Test issue: https://github.com/apache/datafusion/issues/11269 statement ok CREATE TABLE t1 (v0 BIGINT) AS VALUES (-503661263); @@ -998,11 +997,22 @@ CREATE TABLE t2 (v0 DOUBLE) AS VALUES (-1.663563947387); statement ok CREATE TABLE t3 (v0 DOUBLE) AS VALUES (0.05112015193508901); +# Test issue: https://github.com/apache/datafusion/issues/11269 query RR SELECT t3.v0, t2.v0 FROM t1,t2,t3 WHERE t3.v0 >= t1.v0; ---- 0.051120151935 -1.663563947387 +# Test issue: https://github.com/apache/datafusion/issues/11414 +query IRR +SELECT * FROM t1 INNER JOIN t2 ON NULL RIGHT JOIN t3 ON TRUE; +---- +NULL NULL 0.051120151935 + +# ON expression must be boolean type +query error DataFusion error: type_coercion\ncaused by\nError during planning: Join condition must be boolean type, but got Utf8 +SELECT * FROM t1 INNER JOIN t2 ON 'TRUE' + statement ok DROP TABLE t1; From f5d88d1790eea85910ae5590a353ae17318f8401 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=BC=A0=E6=9E=97=E4=BC=9F?= Date: 
Sun, 14 Jul 2024 05:44:32 +0800 Subject: [PATCH 15/19] Support serialization/deserialization for custom physical exprs in proto (#11387) * Add PhysicalExtensionExprNode * regen proto * Add ser/de extension expr logic * Add test and fix clippy lint --- datafusion/proto/proto/datafusion.proto | 7 + datafusion/proto/src/generated/pbjson.rs | 124 +++++++++++++++ datafusion/proto/src/generated/prost.rs | 12 +- .../proto/src/physical_plan/from_proto.rs | 8 + datafusion/proto/src/physical_plan/mod.rs | 16 ++ .../proto/src/physical_plan/to_proto.rs | 19 ++- .../tests/cases/roundtrip_physical_plan.rs | 147 +++++++++++++++++- 7 files changed, 330 insertions(+), 3 deletions(-) diff --git a/datafusion/proto/proto/datafusion.proto b/datafusion/proto/proto/datafusion.proto index 345765b08be3..9ef884531e32 100644 --- a/datafusion/proto/proto/datafusion.proto +++ b/datafusion/proto/proto/datafusion.proto @@ -836,6 +836,8 @@ message PhysicalExprNode { // was PhysicalDateTimeIntervalExprNode date_time_interval_expr = 17; PhysicalLikeExprNode like_expr = 18; + + PhysicalExtensionExprNode extension = 19; } } @@ -942,6 +944,11 @@ message PhysicalNegativeNode { PhysicalExprNode expr = 1; } +message PhysicalExtensionExprNode { + bytes expr = 1; + repeated PhysicalExprNode inputs = 2; +} + message FilterExecNode { PhysicalPlanNode input = 1; PhysicalExprNode expr = 2; diff --git a/datafusion/proto/src/generated/pbjson.rs b/datafusion/proto/src/generated/pbjson.rs index 905f0d984955..fa989480fad9 100644 --- a/datafusion/proto/src/generated/pbjson.rs +++ b/datafusion/proto/src/generated/pbjson.rs @@ -13543,6 +13543,9 @@ impl serde::Serialize for PhysicalExprNode { physical_expr_node::ExprType::LikeExpr(v) => { struct_ser.serialize_field("likeExpr", v)?; } + physical_expr_node::ExprType::Extension(v) => { + struct_ser.serialize_field("extension", v)?; + } } } struct_ser.end() @@ -13582,6 +13585,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { "scalarUdf", "like_expr", 
"likeExpr", + "extension", ]; #[allow(clippy::enum_variant_names)] @@ -13602,6 +13606,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { WindowExpr, ScalarUdf, LikeExpr, + Extension, } impl<'de> serde::Deserialize<'de> for GeneratedField { fn deserialize(deserializer: D) -> std::result::Result @@ -13639,6 +13644,7 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { "windowExpr" | "window_expr" => Ok(GeneratedField::WindowExpr), "scalarUdf" | "scalar_udf" => Ok(GeneratedField::ScalarUdf), "likeExpr" | "like_expr" => Ok(GeneratedField::LikeExpr), + "extension" => Ok(GeneratedField::Extension), _ => Err(serde::de::Error::unknown_field(value, FIELDS)), } } @@ -13771,6 +13777,13 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { return Err(serde::de::Error::duplicate_field("likeExpr")); } expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::LikeExpr) +; + } + GeneratedField::Extension => { + if expr_type__.is_some() { + return Err(serde::de::Error::duplicate_field("extension")); + } + expr_type__ = map_.next_value::<::std::option::Option<_>>()?.map(physical_expr_node::ExprType::Extension) ; } } @@ -13783,6 +13796,117 @@ impl<'de> serde::Deserialize<'de> for PhysicalExprNode { deserializer.deserialize_struct("datafusion.PhysicalExprNode", FIELDS, GeneratedVisitor) } } +impl serde::Serialize for PhysicalExtensionExprNode { + #[allow(deprecated)] + fn serialize(&self, serializer: S) -> std::result::Result + where + S: serde::Serializer, + { + use serde::ser::SerializeStruct; + let mut len = 0; + if !self.expr.is_empty() { + len += 1; + } + if !self.inputs.is_empty() { + len += 1; + } + let mut struct_ser = serializer.serialize_struct("datafusion.PhysicalExtensionExprNode", len)?; + if !self.expr.is_empty() { + #[allow(clippy::needless_borrow)] + struct_ser.serialize_field("expr", pbjson::private::base64::encode(&self.expr).as_str())?; + } + if !self.inputs.is_empty() { + 
struct_ser.serialize_field("inputs", &self.inputs)?; + } + struct_ser.end() + } +} +impl<'de> serde::Deserialize<'de> for PhysicalExtensionExprNode { + #[allow(deprecated)] + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + const FIELDS: &[&str] = &[ + "expr", + "inputs", + ]; + + #[allow(clippy::enum_variant_names)] + enum GeneratedField { + Expr, + Inputs, + } + impl<'de> serde::Deserialize<'de> for GeneratedField { + fn deserialize(deserializer: D) -> std::result::Result + where + D: serde::Deserializer<'de>, + { + struct GeneratedVisitor; + + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = GeneratedField; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(formatter, "expected one of: {:?}", &FIELDS) + } + + #[allow(unused_variables)] + fn visit_str(self, value: &str) -> std::result::Result + where + E: serde::de::Error, + { + match value { + "expr" => Ok(GeneratedField::Expr), + "inputs" => Ok(GeneratedField::Inputs), + _ => Err(serde::de::Error::unknown_field(value, FIELDS)), + } + } + } + deserializer.deserialize_identifier(GeneratedVisitor) + } + } + struct GeneratedVisitor; + impl<'de> serde::de::Visitor<'de> for GeneratedVisitor { + type Value = PhysicalExtensionExprNode; + + fn expecting(&self, formatter: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + formatter.write_str("struct datafusion.PhysicalExtensionExprNode") + } + + fn visit_map(self, mut map_: V) -> std::result::Result + where + V: serde::de::MapAccess<'de>, + { + let mut expr__ = None; + let mut inputs__ = None; + while let Some(k) = map_.next_key()? 
{ + match k { + GeneratedField::Expr => { + if expr__.is_some() { + return Err(serde::de::Error::duplicate_field("expr")); + } + expr__ = + Some(map_.next_value::<::pbjson::private::BytesDeserialize<_>>()?.0) + ; + } + GeneratedField::Inputs => { + if inputs__.is_some() { + return Err(serde::de::Error::duplicate_field("inputs")); + } + inputs__ = Some(map_.next_value()?); + } + } + } + Ok(PhysicalExtensionExprNode { + expr: expr__.unwrap_or_default(), + inputs: inputs__.unwrap_or_default(), + }) + } + } + deserializer.deserialize_struct("datafusion.PhysicalExtensionExprNode", FIELDS, GeneratedVisitor) + } +} impl serde::Serialize for PhysicalExtensionNode { #[allow(deprecated)] fn serialize(&self, serializer: S) -> std::result::Result diff --git a/datafusion/proto/src/generated/prost.rs b/datafusion/proto/src/generated/prost.rs index b16d26ee6e1e..8407e545fe65 100644 --- a/datafusion/proto/src/generated/prost.rs +++ b/datafusion/proto/src/generated/prost.rs @@ -1218,7 +1218,7 @@ pub struct PhysicalExtensionNode { pub struct PhysicalExprNode { #[prost( oneof = "physical_expr_node::ExprType", - tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18" + tags = "1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19" )] pub expr_type: ::core::option::Option, } @@ -1266,6 +1266,8 @@ pub mod physical_expr_node { ScalarUdf(super::PhysicalScalarUdfNode), #[prost(message, tag = "18")] LikeExpr(::prost::alloc::boxed::Box), + #[prost(message, tag = "19")] + Extension(super::PhysicalExtensionExprNode), } } #[allow(clippy::derive_partial_eq_without_eq)] @@ -1456,6 +1458,14 @@ pub struct PhysicalNegativeNode { } #[allow(clippy::derive_partial_eq_without_eq)] #[derive(Clone, PartialEq, ::prost::Message)] +pub struct PhysicalExtensionExprNode { + #[prost(bytes = "vec", tag = "1")] + pub expr: ::prost::alloc::vec::Vec, + #[prost(message, repeated, tag = "2")] + pub inputs: ::prost::alloc::vec::Vec, +} +#[allow(clippy::derive_partial_eq_without_eq)] +#[derive(Clone, 
PartialEq, ::prost::Message)] pub struct FilterExecNode { #[prost(message, optional, boxed, tag = "1")] pub input: ::core::option::Option<::prost::alloc::boxed::Box>, diff --git a/datafusion/proto/src/physical_plan/from_proto.rs b/datafusion/proto/src/physical_plan/from_proto.rs index e94bb3b8efcb..52fbd5cbdcf6 100644 --- a/datafusion/proto/src/physical_plan/from_proto.rs +++ b/datafusion/proto/src/physical_plan/from_proto.rs @@ -394,6 +394,14 @@ pub fn parse_physical_expr( codec, )?, )), + ExprType::Extension(extension) => { + let inputs: Vec> = extension + .inputs + .iter() + .map(|e| parse_physical_expr(e, registry, input_schema, codec)) + .collect::>()?; + (codec.try_decode_expr(extension.expr.as_slice(), &inputs)?) as _ + } }; Ok(pexpr) diff --git a/datafusion/proto/src/physical_plan/mod.rs b/datafusion/proto/src/physical_plan/mod.rs index 56e702704798..e5429945e97e 100644 --- a/datafusion/proto/src/physical_plan/mod.rs +++ b/datafusion/proto/src/physical_plan/mod.rs @@ -2018,6 +2018,22 @@ pub trait PhysicalExtensionCodec: Debug + Send + Sync { fn try_encode_udf(&self, _node: &ScalarUDF, _buf: &mut Vec) -> Result<()> { Ok(()) } + + fn try_decode_expr( + &self, + _buf: &[u8], + _inputs: &[Arc], + ) -> Result> { + not_impl_err!("PhysicalExtensionCodec is not provided") + } + + fn try_encode_expr( + &self, + _node: Arc, + _buf: &mut Vec, + ) -> Result<()> { + not_impl_err!("PhysicalExtensionCodec is not provided") + } } #[derive(Debug)] diff --git a/datafusion/proto/src/physical_plan/to_proto.rs b/datafusion/proto/src/physical_plan/to_proto.rs index 5e982ad2afde..9c95acc1dcf4 100644 --- a/datafusion/proto/src/physical_plan/to_proto.rs +++ b/datafusion/proto/src/physical_plan/to_proto.rs @@ -495,7 +495,24 @@ pub fn serialize_physical_expr( ))), }) } else { - internal_err!("physical_plan::to_proto() unsupported expression {value:?}") + let mut buf: Vec = vec![]; + match codec.try_encode_expr(Arc::clone(&value), &mut buf) { + Ok(_) => { + let inputs: Vec = value + 
.children() + .into_iter() + .map(|e| serialize_physical_expr(Arc::clone(e), codec)) + .collect::>()?; + Ok(protobuf::PhysicalExprNode { + expr_type: Some(protobuf::physical_expr_node::ExprType::Extension( + protobuf::PhysicalExtensionExprNode { expr: buf, inputs }, + )), + }) + } + Err(e) => internal_err!( + "Unsupported physical expr and extension codec failed with [{e}]. Expr: {value:?}" + ), + } } } diff --git a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs index d8d85ace1a29..2fcc65008fd8 100644 --- a/datafusion/proto/tests/cases/roundtrip_physical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_physical_plan.rs @@ -15,7 +15,10 @@ // specific language governing permissions and limitations // under the License. +use arrow::array::RecordBatch; use std::any::Any; +use std::fmt::Display; +use std::hash::Hasher; use std::ops::Deref; use std::sync::Arc; use std::vec; @@ -38,6 +41,7 @@ use datafusion::datasource::physical_plan::{ }; use datafusion::execution::FunctionRegistry; use datafusion::logical_expr::{create_udf, JoinType, Operator, Volatility}; +use datafusion::physical_expr::aggregate::utils::down_cast_any_ref; use datafusion::physical_expr::expressions::Max; use datafusion::physical_expr::window::SlidingAggregateWindowExpr; use datafusion::physical_expr::{PhysicalSortRequirement, ScalarFunctionExpr}; @@ -75,7 +79,7 @@ use datafusion_common::file_options::csv_writer::CsvWriterOptions; use datafusion_common::file_options::json_writer::JsonWriterOptions; use datafusion_common::parsers::CompressionTypeVariant; use datafusion_common::stats::Precision; -use datafusion_common::{not_impl_err, plan_err, DataFusionError, Result}; +use datafusion_common::{internal_err, not_impl_err, plan_err, DataFusionError, Result}; use datafusion_expr::{ Accumulator, AccumulatorFactoryFunction, AggregateUDF, ColumnarValue, ScalarUDF, ScalarUDFImpl, Signature, SimpleAggregateUDF, WindowFrame, WindowFrameBound, 
@@ -658,6 +662,147 @@ async fn roundtrip_parquet_exec_with_table_partition_cols() -> Result<()> { roundtrip_test(ParquetExec::builder(scan_config).build_arc()) } +#[test] +fn roundtrip_parquet_exec_with_custom_predicate_expr() -> Result<()> { + let scan_config = FileScanConfig { + object_store_url: ObjectStoreUrl::local_filesystem(), + file_schema: Arc::new(Schema::new(vec![Field::new( + "col", + DataType::Utf8, + false, + )])), + file_groups: vec![vec![PartitionedFile::new( + "/path/to/file.parquet".to_string(), + 1024, + )]], + statistics: Statistics { + num_rows: Precision::Inexact(100), + total_byte_size: Precision::Inexact(1024), + column_statistics: Statistics::unknown_column(&Arc::new(Schema::new(vec![ + Field::new("col", DataType::Utf8, false), + ]))), + }, + projection: None, + limit: None, + table_partition_cols: vec![], + output_ordering: vec![], + }; + + #[derive(Debug, Hash, Clone)] + struct CustomPredicateExpr { + inner: Arc, + } + impl Display for CustomPredicateExpr { + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "CustomPredicateExpr") + } + } + impl PartialEq for CustomPredicateExpr { + fn eq(&self, other: &dyn Any) -> bool { + down_cast_any_ref(other) + .downcast_ref::() + .map(|x| self.inner.eq(&x.inner)) + .unwrap_or(false) + } + } + impl PhysicalExpr for CustomPredicateExpr { + fn as_any(&self) -> &dyn Any { + self + } + + fn data_type(&self, _input_schema: &Schema) -> Result { + unreachable!() + } + + fn nullable(&self, _input_schema: &Schema) -> Result { + unreachable!() + } + + fn evaluate(&self, _batch: &RecordBatch) -> Result { + unreachable!() + } + + fn children(&self) -> Vec<&Arc> { + vec![&self.inner] + } + + fn with_new_children( + self: Arc, + _children: Vec>, + ) -> Result> { + todo!() + } + + fn dyn_hash(&self, _state: &mut dyn Hasher) { + unreachable!() + } + } + + #[derive(Debug)] + struct CustomPhysicalExtensionCodec; + impl PhysicalExtensionCodec for CustomPhysicalExtensionCodec { + fn 
try_decode( + &self, + _buf: &[u8], + _inputs: &[Arc], + _registry: &dyn FunctionRegistry, + ) -> Result> { + unreachable!() + } + + fn try_encode( + &self, + _node: Arc, + _buf: &mut Vec, + ) -> Result<()> { + unreachable!() + } + + fn try_decode_expr( + &self, + buf: &[u8], + inputs: &[Arc], + ) -> Result> { + if buf == "CustomPredicateExpr".as_bytes() { + Ok(Arc::new(CustomPredicateExpr { + inner: inputs[0].clone(), + })) + } else { + internal_err!("Not supported") + } + } + + fn try_encode_expr( + &self, + node: Arc, + buf: &mut Vec, + ) -> Result<()> { + if node + .as_ref() + .as_any() + .downcast_ref::() + .is_some() + { + buf.extend_from_slice("CustomPredicateExpr".as_bytes()); + Ok(()) + } else { + internal_err!("Not supported") + } + } + } + + let custom_predicate_expr = Arc::new(CustomPredicateExpr { + inner: Arc::new(Column::new("col", 1)), + }); + let exec_plan = ParquetExec::builder(scan_config) + .with_predicate(custom_predicate_expr) + .build_arc(); + + let ctx = SessionContext::new(); + roundtrip_test_and_return(exec_plan, &ctx, &CustomPhysicalExtensionCodec {})?; + Ok(()) +} + #[test] fn roundtrip_scalar_udf() -> Result<()> { let field_a = Field::new("a", DataType::Int64, false); From a43cf79bf0b133379ee6f2a236c025e59a5ef822 Mon Sep 17 00:00:00 2001 From: kf zheng <100595273+Kev1n8@users.noreply.github.com> Date: Sun, 14 Jul 2024 05:45:03 +0800 Subject: [PATCH 16/19] remove termtree dependency (#11416) * remove termtree dependency * impl Display for TopKHeap, replace uses of tree_print in tests * use to_string instead of format! 
--- datafusion/physical-plan/Cargo.toml | 1 - .../physical-plan/src/aggregates/topk/heap.rs | 86 ++++++++++++------- 2 files changed, 55 insertions(+), 32 deletions(-) diff --git a/datafusion/physical-plan/Cargo.toml b/datafusion/physical-plan/Cargo.toml index f5f756417ebf..00fc81ebde97 100644 --- a/datafusion/physical-plan/Cargo.toml +++ b/datafusion/physical-plan/Cargo.toml @@ -66,7 +66,6 @@ tokio = { workspace = true } [dev-dependencies] rstest = { workspace = true } rstest_reuse = "0.7.0" -termtree = "0.5.0" tokio = { workspace = true, features = [ "rt-multi-thread", "fs", diff --git a/datafusion/physical-plan/src/aggregates/topk/heap.rs b/datafusion/physical-plan/src/aggregates/topk/heap.rs index 51593f5c28ce..81eadbc018b3 100644 --- a/datafusion/physical-plan/src/aggregates/topk/heap.rs +++ b/datafusion/physical-plan/src/aggregates/topk/heap.rs @@ -27,7 +27,7 @@ use datafusion_common::Result; use datafusion_physical_expr::aggregate::utils::adjust_output_array; use half::f16; use std::cmp::Ordering; -use std::fmt::{Debug, Formatter}; +use std::fmt::{Debug, Display, Formatter}; use std::sync::Arc; /// A custom version of `Ord` that only exists to we can implement it for the Values in our heap @@ -323,29 +323,53 @@ impl TopKHeap { } } - #[cfg(test)] - fn _tree_print(&self, idx: usize) -> Option> { - let hi = self.heap.get(idx)?; - match hi { - None => None, - Some(hi) => { - let label = - format!("val={:?} idx={}, bucket={}", hi.val, idx, hi.map_idx); - let left = self._tree_print(idx * 2 + 1); - let right = self._tree_print(idx * 2 + 2); - let children = left.into_iter().chain(right); - let me = termtree::Tree::new(label).with_leaves(children); - Some(me) + fn _tree_print( + &self, + idx: usize, + prefix: String, + is_tail: bool, + output: &mut String, + ) { + if let Some(Some(hi)) = self.heap.get(idx) { + let connector = if idx != 0 { + if is_tail { + "└── " + } else { + "├── " + } + } else { + "" + }; + output.push_str(&format!( + "{}{}val={:?} idx={}, 
bucket={}\n", + prefix, connector, hi.val, idx, hi.map_idx + )); + let new_prefix = if is_tail { "" } else { "│ " }; + let child_prefix = format!("{}{}", prefix, new_prefix); + + let left_idx = idx * 2 + 1; + let right_idx = idx * 2 + 2; + + let left_exists = left_idx < self.len; + let right_exists = right_idx < self.len; + + if left_exists { + self._tree_print(left_idx, child_prefix.clone(), !right_exists, output); + } + if right_exists { + self._tree_print(right_idx, child_prefix, true, output); } } } +} - #[cfg(test)] - fn tree_print(&self) -> String { - match self._tree_print(0) { - None => "".to_string(), - Some(root) => format!("{}", root), +impl Display for TopKHeap { + fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { + let mut output = String::new(); + if self.heap.first().is_some() { + self._tree_print(0, String::new(), true, &mut output); } + write!(f, "{}", output) } } @@ -361,9 +385,9 @@ impl HeapItem { impl Debug for HeapItem { fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { f.write_str("bucket=")?; - self.map_idx.fmt(f)?; + Debug::fmt(&self.map_idx, f)?; f.write_str(" val=")?; - self.val.fmt(f)?; + Debug::fmt(&self.val, f)?; f.write_str("\n")?; Ok(()) } @@ -462,7 +486,7 @@ mod tests { let mut heap = TopKHeap::new(10, false); heap.append_or_replace(1, 1, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=1 idx=0, bucket=1 "#; @@ -482,7 +506,7 @@ val=1 idx=0, bucket=1 heap.append_or_replace(2, 2, &mut map); assert_eq!(map, vec![(2, 0), (1, 1)]); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=2 └── val=1 idx=1, bucket=1 @@ -500,7 +524,7 @@ val=2 idx=0, bucket=2 heap.append_or_replace(1, 1, &mut map); heap.append_or_replace(2, 2, &mut map); heap.append_or_replace(3, 3, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=3 idx=0, bucket=3 ├── val=1 idx=1, bucket=1 @@ -510,7 +534,7 
@@ val=3 idx=0, bucket=3 let mut map = vec![]; heap.append_or_replace(0, 0, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=2 ├── val=1 idx=1, bucket=1 @@ -531,7 +555,7 @@ val=2 idx=0, bucket=2 heap.append_or_replace(2, 2, &mut map); heap.append_or_replace(3, 3, &mut map); heap.append_or_replace(4, 4, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=4 idx=0, bucket=4 ├── val=3 idx=1, bucket=3 @@ -542,7 +566,7 @@ val=4 idx=0, bucket=4 let mut map = vec![]; heap.replace_if_better(1, 0, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=4 idx=0, bucket=4 ├── val=1 idx=1, bucket=1 @@ -563,7 +587,7 @@ val=4 idx=0, bucket=4 heap.append_or_replace(1, 1, &mut map); heap.append_or_replace(2, 2, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=2 └── val=1 idx=1, bucket=1 @@ -584,7 +608,7 @@ val=2 idx=0, bucket=2 heap.append_or_replace(1, 1, &mut map); heap.append_or_replace(2, 2, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=2 └── val=1 idx=1, bucket=1 @@ -607,7 +631,7 @@ val=2 idx=0, bucket=2 heap.append_or_replace(1, 1, &mut map); heap.append_or_replace(2, 2, &mut map); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=2 └── val=1 idx=1, bucket=1 @@ -616,7 +640,7 @@ val=2 idx=0, bucket=2 let numbers = vec![(0, 1), (1, 2)]; heap.renumber(numbers.as_slice()); - let actual = heap.tree_print(); + let actual = heap.to_string(); let expected = r#" val=2 idx=0, bucket=1 └── val=1 idx=1, bucket=2 From a7041feff32c2af09854c144a760d945e30fb38a Mon Sep 17 00:00:00 2001 From: Jax Liu Date: Sun, 14 Jul 2024 05:47:47 +0800 Subject: [PATCH 17/19] Minor: Add an example for backtrace pretty print (#11450) * add the example for 
printing backtrace pretty * add empty end line * fix prettier * sync the usage example * Update docs/source/user-guide/crate-configuration.md Co-authored-by: Oleks V --------- Co-authored-by: Oleks V --- docs/source/user-guide/crate-configuration.md | 44 ++++++++++++++++++- 1 file changed, 43 insertions(+), 1 deletion(-) diff --git a/docs/source/user-guide/crate-configuration.md b/docs/source/user-guide/crate-configuration.md index 0587d06a3919..9d22e3403097 100644 --- a/docs/source/user-guide/crate-configuration.md +++ b/docs/source/user-guide/crate-configuration.md @@ -121,7 +121,7 @@ backtrace: 0: std::backtrace_rs::backtrace::libunwind::trace The backtraces are useful when debugging code. If there is a test in `datafusion/core/src/physical_planner.rs` -``` +```rust #[tokio::test] async fn test_get_backtrace_for_failed_code() -> Result<()> { let ctx = SessionContext::new(); @@ -141,6 +141,48 @@ To obtain a backtrace: ```bash cargo build --features=backtrace RUST_BACKTRACE=1 cargo test --features=backtrace --package datafusion --lib -- physical_planner::tests::test_get_backtrace_for_failed_code --exact --nocapture + +running 1 test +Error: Plan("Invalid function 'row_numer'.\nDid you mean 'ROW_NUMBER'?\n\nbacktrace: 0: std::backtrace_rs::backtrace::libunwind::trace\n at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5\n 1: std::backtrace_rs::backtrace::trace_unsynchronized\n... ``` Note: The backtrace wrapped into systems calls, so some steps on top of the backtrace can be ignored + +To show the backtrace in a pretty-printed format use `eprintln!("{e}");`. 
+ +```rust +#[tokio::test] +async fn test_get_backtrace_for_failed_code() -> Result<()> { + let ctx = SessionContext::new(); + + let sql = "select row_numer() over (partition by a order by a) from (select 1 a);"; + + let _ = match ctx.sql(sql).await { + Ok(result) => result.show().await?, + Err(e) => { + eprintln!("{e}"); + } + }; + + Ok(()) +} +``` + +Then run the test: + +```bash +$ RUST_BACKTRACE=1 cargo test --features=backtrace --package datafusion --lib -- physical_planner::tests::test_get_backtrace_for_failed_code --exact --nocapture + +running 1 test +Error during planning: Invalid function 'row_numer'. +Did you mean 'ROW_NUMBER'? + +backtrace: 0: std::backtrace_rs::backtrace::libunwind::trace + at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/libunwind.rs:105:5 + 1: std::backtrace_rs::backtrace::trace_unsynchronized + at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/../../backtrace/src/backtrace/mod.rs:66:5 + 2: std::backtrace::Backtrace::create + at /rustc/129f3b9964af4d4a709d1383930ade12dfe7c081/library/std/src/backtrace.rs:331:13 + 3: std::backtrace::Backtrace::capture + ... +``` From 84758062f808f97ba3b7e9d8a9d3839df4c39d98 Mon Sep 17 00:00:00 2001 From: Bruce Ritchie Date: Sun, 14 Jul 2024 15:00:31 -0400 Subject: [PATCH 18/19] Add SessionStateBuilder and extract out the registration of defaults (#11403) * Create a SessionStateBuilder and use it for creating anything but a basic SessionState. * Updated new_from_existing to take a reference to the existing SessionState and clone it. * Minor documentation update. * SessionStateDefaults improvements. * Reworked how SessionStateBuilder works from PR feedback. * Bug fix for missing array_expressions cfg feature. * Review feedback updates + doc fixes for SessionStateDefaults * Cargo fmt update. 
--- datafusion-cli/src/catalog.rs | 11 +- .../examples/custom_file_format.rs | 9 +- .../core/src/datasource/file_format/csv.rs | 7 +- datafusion/core/src/execution/context/mod.rs | 25 +- .../core/src/execution/session_state.rs | 965 ++++++++++++++---- datafusion/core/src/physical_planner.rs | 7 +- datafusion/core/src/test/object_store.rs | 8 +- datafusion/core/tests/dataframe/mod.rs | 19 +- datafusion/core/tests/memory_limit/mod.rs | 14 +- .../core/tests/parquet/file_statistics.rs | 6 +- datafusion/core/tests/sql/create_drop.rs | 13 +- .../tests/user_defined/user_defined_plan.rs | 11 +- .../tests/cases/roundtrip_logical_plan.rs | 8 +- .../tests/cases/roundtrip_logical_plan.rs | 13 +- 14 files changed, 884 insertions(+), 232 deletions(-) diff --git a/datafusion-cli/src/catalog.rs b/datafusion-cli/src/catalog.rs index c11eb3280c20..b83f65975610 100644 --- a/datafusion-cli/src/catalog.rs +++ b/datafusion-cli/src/catalog.rs @@ -29,6 +29,7 @@ use datafusion::datasource::listing::{ use datafusion::datasource::TableProvider; use datafusion::error::Result; use datafusion::execution::context::SessionState; +use datafusion::execution::session_state::SessionStateBuilder; use async_trait::async_trait; use dirs::home_dir; @@ -162,6 +163,7 @@ impl SchemaProvider for DynamicFileSchemaProvider { .ok_or_else(|| plan_datafusion_err!("locking error"))? 
.read() .clone(); + let mut builder = SessionStateBuilder::from(state.clone()); let optimized_name = substitute_tilde(name.to_owned()); let table_url = ListingTableUrl::parse(optimized_name.as_str())?; let scheme = table_url.scheme(); @@ -178,13 +180,18 @@ impl SchemaProvider for DynamicFileSchemaProvider { // to any command options so the only choice is to use an empty collection match scheme { "s3" | "oss" | "cos" => { - state = state.add_table_options_extension(AwsOptions::default()); + if let Some(table_options) = builder.table_options() { + table_options.extensions.insert(AwsOptions::default()) + } } "gs" | "gcs" => { - state = state.add_table_options_extension(GcpOptions::default()) + if let Some(table_options) = builder.table_options() { + table_options.extensions.insert(GcpOptions::default()) + } } _ => {} }; + state = builder.build(); let store = get_object_store( &state, table_url.scheme(), diff --git a/datafusion-examples/examples/custom_file_format.rs b/datafusion-examples/examples/custom_file_format.rs index fe936418bce4..bdb702375c94 100644 --- a/datafusion-examples/examples/custom_file_format.rs +++ b/datafusion-examples/examples/custom_file_format.rs @@ -22,6 +22,7 @@ use arrow::{ datatypes::UInt64Type, }; use arrow_schema::{DataType, Field, Schema, SchemaRef}; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::{ datasource::{ file_format::{ @@ -32,9 +33,9 @@ use datafusion::{ MemTable, }, error::Result, - execution::{context::SessionState, runtime_env::RuntimeEnv}, + execution::context::SessionState, physical_plan::ExecutionPlan, - prelude::{SessionConfig, SessionContext}, + prelude::SessionContext, }; use datafusion_common::{GetExt, Statistics}; use datafusion_physical_expr::{PhysicalExpr, PhysicalSortRequirement}; @@ -176,9 +177,7 @@ impl GetExt for TSVFileFactory { #[tokio::main] async fn main() -> Result<()> { // Create a new context with the default configuration - let config = SessionConfig::new(); - let runtime = 
RuntimeEnv::default(); - let mut state = SessionState::new_with_config_rt(config, Arc::new(runtime)); + let mut state = SessionStateBuilder::new().with_default_features().build(); // Register the custom file format let file_format = Arc::new(TSVFileFactory::new()); diff --git a/datafusion/core/src/datasource/file_format/csv.rs b/datafusion/core/src/datasource/file_format/csv.rs index 92cb11e2b47a..baeaf51fb56d 100644 --- a/datafusion/core/src/datasource/file_format/csv.rs +++ b/datafusion/core/src/datasource/file_format/csv.rs @@ -632,6 +632,7 @@ mod tests { use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; use datafusion_expr::{col, lit}; + use crate::execution::session_state::SessionStateBuilder; use chrono::DateTime; use object_store::local::LocalFileSystem; use object_store::path::Path; @@ -814,7 +815,11 @@ mod tests { let runtime = Arc::new(RuntimeEnv::new(RuntimeConfig::new()).unwrap()); let mut cfg = SessionConfig::new(); cfg.options_mut().catalog.has_header = true; - let session_state = SessionState::new_with_config_rt(cfg, runtime); + let session_state = SessionStateBuilder::new() + .with_config(cfg) + .with_runtime_env(runtime) + .with_default_features() + .build(); let integration = LocalFileSystem::new_with_prefix(arrow_test_data()).unwrap(); let path = Path::from("csv/aggregate_test_100.csv"); let csv = CsvFormat::default().with_has_header(true); diff --git a/datafusion/core/src/execution/context/mod.rs b/datafusion/core/src/execution/context/mod.rs index 4b9e3e843341..640a9b14a65f 100644 --- a/datafusion/core/src/execution/context/mod.rs +++ b/datafusion/core/src/execution/context/mod.rs @@ -73,6 +73,7 @@ use object_store::ObjectStore; use parking_lot::RwLock; use url::Url; +use crate::execution::session_state::SessionStateBuilder; pub use datafusion_execution::config::SessionConfig; pub use datafusion_execution::TaskContext; pub use datafusion_expr::execution_props::ExecutionProps; @@ -294,7 +295,11 @@ impl SessionContext { /// all 
`SessionContext`'s should be configured with the /// same `RuntimeEnv`. pub fn new_with_config_rt(config: SessionConfig, runtime: Arc) -> Self { - let state = SessionState::new_with_config_rt(config, runtime); + let state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build(); Self::new_with_state(state) } @@ -315,7 +320,7 @@ impl SessionContext { } /// Creates a new `SessionContext` using the provided [`SessionState`] - #[deprecated(since = "32.0.0", note = "Use SessionState::new_with_state")] + #[deprecated(since = "32.0.0", note = "Use SessionContext::new_with_state")] pub fn with_state(state: SessionState) -> Self { Self::new_with_state(state) } @@ -1574,6 +1579,7 @@ mod tests { use datafusion_common_runtime::SpawnedTask; use crate::catalog::schema::SchemaProvider; + use crate::execution::session_state::SessionStateBuilder; use crate::physical_planner::PhysicalPlanner; use async_trait::async_trait; use tempfile::TempDir; @@ -1707,7 +1713,11 @@ mod tests { .set_str("datafusion.catalog.location", url.as_str()) .set_str("datafusion.catalog.format", "CSV") .set_str("datafusion.catalog.has_header", "true"); - let session_state = SessionState::new_with_config_rt(cfg, runtime); + let session_state = SessionStateBuilder::new() + .with_config(cfg) + .with_runtime_env(runtime) + .with_default_features() + .build(); let ctx = SessionContext::new_with_state(session_state); ctx.refresh_catalogs().await?; @@ -1733,9 +1743,12 @@ mod tests { #[tokio::test] async fn custom_query_planner() -> Result<()> { let runtime = Arc::new(RuntimeEnv::default()); - let session_state = - SessionState::new_with_config_rt(SessionConfig::new(), runtime) - .with_query_planner(Arc::new(MyQueryPlanner {})); + let session_state = SessionStateBuilder::new() + .with_config(SessionConfig::new()) + .with_runtime_env(runtime) + .with_default_features() + .with_query_planner(Arc::new(MyQueryPlanner {})) + .build(); let ctx = 
SessionContext::new_with_state(session_state); let df = ctx.sql("SELECT 1").await?; diff --git a/datafusion/core/src/execution/session_state.rs b/datafusion/core/src/execution/session_state.rs index dbfba9ea9352..75eef4345487 100644 --- a/datafusion/core/src/execution/session_state.rs +++ b/datafusion/core/src/execution/session_state.rs @@ -77,6 +77,8 @@ use datafusion_physical_expr_common::physical_expr::PhysicalExpr; use datafusion_physical_plan::ExecutionPlan; use datafusion_sql::parser::{DFParser, Statement}; use datafusion_sql::planner::{ContextProvider, ParserOptions, PlannerContext, SqlToRel}; +use itertools::Itertools; +use log::{debug, info}; use sqlparser::ast::Expr as SQLExpr; use sqlparser::dialect::dialect_from_str; use std::collections::hash_map::Entry; @@ -89,9 +91,29 @@ use uuid::Uuid; /// Execution context for registering data sources and executing queries. /// See [`SessionContext`] for a higher level API. /// +/// Use the [`SessionStateBuilder`] to build a SessionState object. +/// +/// ``` +/// use datafusion::prelude::*; +/// # use datafusion::{error::Result, assert_batches_eq}; +/// # use datafusion::execution::session_state::SessionStateBuilder; +/// # use datafusion_execution::runtime_env::RuntimeEnv; +/// # use std::sync::Arc; +/// # #[tokio::main] +/// # async fn main() -> Result<()> { +/// let state = SessionStateBuilder::new() +/// .with_config(SessionConfig::new()) +/// .with_runtime_env(Arc::new(RuntimeEnv::default())) +/// .with_default_features() +/// .build(); +/// Ok(()) +/// # } +/// ``` +/// /// Note that there is no `Default` or `new()` for SessionState, /// to avoid accidentally running queries or other operations without passing through -/// the [`SessionConfig`] or [`RuntimeEnv`]. See [`SessionContext`]. +/// the [`SessionConfig`] or [`RuntimeEnv`]. See [`SessionStateBuilder`] and +/// [`SessionContext`]. 
/// /// [`SessionContext`]: crate::execution::context::SessionContext #[derive(Clone)] @@ -140,7 +162,6 @@ pub struct SessionState { table_factories: HashMap>, /// Runtime environment runtime_env: Arc, - /// [FunctionFactory] to support pluggable user defined function handler. /// /// It will be invoked on `CREATE FUNCTION` statements. @@ -153,6 +174,7 @@ impl Debug for SessionState { f.debug_struct("SessionState") .field("session_id", &self.session_id) .field("analyzer", &"...") + .field("expr_planners", &"...") .field("optimizer", &"...") .field("physical_optimizers", &"...") .field("query_planner", &"...") @@ -175,193 +197,56 @@ impl Debug for SessionState { impl SessionState { /// Returns new [`SessionState`] using the provided /// [`SessionConfig`] and [`RuntimeEnv`]. + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] pub fn new_with_config_rt(config: SessionConfig, runtime: Arc) -> Self { - let catalog_list = - Arc::new(MemoryCatalogProviderList::new()) as Arc; - Self::new_with_config_rt_and_catalog_list(config, runtime, catalog_list) + SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build() } /// Returns new [`SessionState`] using the provided /// [`SessionConfig`] and [`RuntimeEnv`]. 
- #[deprecated(since = "32.0.0", note = "Use SessionState::new_with_config_rt")] + #[deprecated(since = "32.0.0", note = "Use SessionStateBuilder")] pub fn with_config_rt(config: SessionConfig, runtime: Arc) -> Self { - Self::new_with_config_rt(config, runtime) + SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build() } /// Returns new [`SessionState`] using the provided /// [`SessionConfig`], [`RuntimeEnv`], and [`CatalogProviderList`] + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] pub fn new_with_config_rt_and_catalog_list( config: SessionConfig, runtime: Arc, catalog_list: Arc, ) -> Self { - let session_id = Uuid::new_v4().to_string(); - - // Create table_factories for all default formats - let mut table_factories: HashMap> = - HashMap::new(); - #[cfg(feature = "parquet")] - table_factories.insert("PARQUET".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("CSV".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("JSON".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("NDJSON".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("AVRO".into(), Arc::new(DefaultTableFactory::new())); - table_factories.insert("ARROW".into(), Arc::new(DefaultTableFactory::new())); - - if config.create_default_catalog_and_schema() { - let default_catalog = MemoryCatalogProvider::new(); - - default_catalog - .register_schema( - &config.options().catalog.default_schema, - Arc::new(MemorySchemaProvider::new()), - ) - .expect("memory catalog provider can register schema"); - - Self::register_default_schema( - &config, - &table_factories, - &runtime, - &default_catalog, - ); - - catalog_list.register_catalog( - config.options().catalog.default_catalog.clone(), - Arc::new(default_catalog), - ); - } - - let expr_planners: Vec> = vec![ - Arc::new(functions::core::planner::CoreFunctionPlanner::default()), - // register crate of 
array expressions (if enabled) - #[cfg(feature = "array_expressions")] - Arc::new(functions_array::planner::ArrayFunctionPlanner), - #[cfg(feature = "array_expressions")] - Arc::new(functions_array::planner::FieldAccessPlanner), - #[cfg(any( - feature = "datetime_expressions", - feature = "unicode_expressions" - ))] - Arc::new(functions::planner::UserDefinedFunctionPlanner), - ]; - - let mut new_self = SessionState { - session_id, - analyzer: Analyzer::new(), - expr_planners, - optimizer: Optimizer::new(), - physical_optimizers: PhysicalOptimizer::new(), - query_planner: Arc::new(DefaultQueryPlanner {}), - catalog_list, - table_functions: HashMap::new(), - scalar_functions: HashMap::new(), - aggregate_functions: HashMap::new(), - window_functions: HashMap::new(), - serializer_registry: Arc::new(EmptySerializerRegistry), - file_formats: HashMap::new(), - table_options: TableOptions::default_from_session_config(config.options()), - config, - execution_props: ExecutionProps::new(), - runtime_env: runtime, - table_factories, - function_factory: None, - }; - - #[cfg(feature = "parquet")] - if let Err(e) = - new_self.register_file_format(Arc::new(ParquetFormatFactory::new()), false) - { - log::info!("Unable to register default ParquetFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(JsonFormatFactory::new()), false) - { - log::info!("Unable to register default JsonFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(CsvFormatFactory::new()), false) - { - log::info!("Unable to register default CsvFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(ArrowFormatFactory::new()), false) - { - log::info!("Unable to register default ArrowFormat: {e}") - }; - - if let Err(e) = - new_self.register_file_format(Arc::new(AvroFormatFactory::new()), false) - { - log::info!("Unable to register default AvroFormat: {e}") - }; - - // register built in functions - functions::register_all(&mut new_self) - 
.expect("can not register built in functions"); - - // register crate of array expressions (if enabled) - #[cfg(feature = "array_expressions")] - functions_array::register_all(&mut new_self) - .expect("can not register array expressions"); - - functions_aggregate::register_all(&mut new_self) - .expect("can not register aggregate functions"); - - new_self + SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_catalog_list(catalog_list) + .with_default_features() + .build() } + /// Returns new [`SessionState`] using the provided /// [`SessionConfig`] and [`RuntimeEnv`]. - #[deprecated( - since = "32.0.0", - note = "Use SessionState::new_with_config_rt_and_catalog_list" - )] + #[deprecated(since = "32.0.0", note = "Use SessionStateBuilder")] pub fn with_config_rt_and_catalog_list( config: SessionConfig, runtime: Arc, catalog_list: Arc, ) -> Self { - Self::new_with_config_rt_and_catalog_list(config, runtime, catalog_list) - } - fn register_default_schema( - config: &SessionConfig, - table_factories: &HashMap>, - runtime: &Arc, - default_catalog: &MemoryCatalogProvider, - ) { - let url = config.options().catalog.location.as_ref(); - let format = config.options().catalog.format.as_ref(); - let (url, format) = match (url, format) { - (Some(url), Some(format)) => (url, format), - _ => return, - }; - let url = url.to_string(); - let format = format.to_string(); - - let url = Url::parse(url.as_str()).expect("Invalid default catalog location!"); - let authority = match url.host_str() { - Some(host) => format!("{}://{}", url.scheme(), host), - None => format!("{}://", url.scheme()), - }; - let path = &url.as_str()[authority.len()..]; - let path = object_store::path::Path::parse(path).expect("Can't parse path"); - let store = ObjectStoreUrl::parse(authority.as_str()) - .expect("Invalid default catalog url"); - let store = match runtime.object_store(store) { - Ok(store) => store, - _ => return, - }; - let factory = match 
table_factories.get(format.as_str()) { - Some(factory) => factory, - _ => return, - }; - let schema = - ListingSchemaProvider::new(authority, path, factory.clone(), store, format); - let _ = default_catalog - .register_schema("default", Arc::new(schema)) - .expect("Failed to register default schema"); + SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_catalog_list(catalog_list) + .with_default_features() + .build() } pub(crate) fn resolve_table_ref( @@ -400,12 +285,14 @@ impl SessionState { }) } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Replace the random session id. pub fn with_session_id(mut self, session_id: String) -> Self { self.session_id = session_id; self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// override default query planner with `query_planner` pub fn with_query_planner( mut self, @@ -415,6 +302,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Override the [`AnalyzerRule`]s optimizer plan rules. pub fn with_analyzer_rules( mut self, @@ -424,6 +312,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Replace the entire list of [`OptimizerRule`]s used to optimize plans pub fn with_optimizer_rules( mut self, @@ -433,6 +322,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Replace the entire list of [`PhysicalOptimizerRule`]s used to optimize plans pub fn with_physical_optimizer_rules( mut self, @@ -452,6 +342,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Add `optimizer_rule` to the end of the list of /// [`OptimizerRule`]s used to rewrite queries. 
pub fn add_optimizer_rule( @@ -472,6 +363,7 @@ impl SessionState { self.optimizer.rules.push(optimizer_rule); } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Add `physical_optimizer_rule` to the end of the list of /// [`PhysicalOptimizerRule`]s used to rewrite queries. pub fn add_physical_optimizer_rule( @@ -482,6 +374,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Adds a new [`ConfigExtension`] to TableOptions pub fn add_table_options_extension( mut self, @@ -491,6 +384,7 @@ impl SessionState { self } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Registers a [`FunctionFactory`] to handle `CREATE FUNCTION` statements pub fn with_function_factory( mut self, @@ -505,6 +399,7 @@ impl SessionState { self.function_factory = Some(function_factory); } + #[deprecated(since = "40.0.0", note = "Use SessionStateBuilder")] /// Replace the extension [`SerializerRegistry`] pub fn with_serializer_registry( mut self, @@ -858,19 +753,20 @@ impl SessionState { &self.table_options } - /// Return mutable table opptions + /// Return mutable table options pub fn table_options_mut(&mut self) -> &mut TableOptions { &mut self.table_options } - /// Registers a [`ConfigExtension`] as a table option extention that can be + /// Registers a [`ConfigExtension`] as a table option extension that can be /// referenced from SQL statements executed against this context. pub fn register_table_options_extension(&mut self, extension: T) { self.table_options.extensions.insert(extension) } - /// Adds or updates a [FileFormatFactory] which can be used with COPY TO or CREATE EXTERNAL TABLE statements for reading - /// and writing files of custom formats. + /// Adds or updates a [FileFormatFactory] which can be used with COPY TO or + /// CREATE EXTERNAL TABLE statements for reading and writing files of custom + /// formats. 
pub fn register_file_format( &mut self, file_format: Arc, @@ -950,7 +846,7 @@ impl SessionState { ); } - /// Deregsiter a user defined table function + /// Deregister a user defined table function pub fn deregister_udtf( &mut self, name: &str, @@ -974,6 +870,733 @@ impl SessionState { } } +/// A builder to be used for building [`SessionState`]'s. Defaults will +/// be used for all values unless explicitly provided. +/// +/// See example on [`SessionState`] +pub struct SessionStateBuilder { + session_id: Option, + analyzer: Option, + expr_planners: Option>>, + optimizer: Option, + physical_optimizers: Option, + query_planner: Option>, + catalog_list: Option>, + table_functions: Option>>, + scalar_functions: Option>>, + aggregate_functions: Option>>, + window_functions: Option>>, + serializer_registry: Option>, + file_formats: Option>>, + config: Option, + table_options: Option, + execution_props: Option, + table_factories: Option>>, + runtime_env: Option>, + function_factory: Option>, + // fields to support convenience functions + analyzer_rules: Option>>, + optimizer_rules: Option>>, + physical_optimizer_rules: Option>>, +} + +impl SessionStateBuilder { + /// Returns a new [`SessionStateBuilder`] with no options set. 
+ pub fn new() -> Self { + Self { + session_id: None, + analyzer: None, + expr_planners: None, + optimizer: None, + physical_optimizers: None, + query_planner: None, + catalog_list: None, + table_functions: None, + scalar_functions: None, + aggregate_functions: None, + window_functions: None, + serializer_registry: None, + file_formats: None, + table_options: None, + config: None, + execution_props: None, + table_factories: None, + runtime_env: None, + function_factory: None, + // fields to support convenience functions + analyzer_rules: None, + optimizer_rules: None, + physical_optimizer_rules: None, + } + } + + /// Returns a new [SessionStateBuilder] based on an existing [SessionState] + /// The session id for the new builder will be unset; all other fields will + /// be cloned from what is set in the provided session state + pub fn new_from_existing(existing: SessionState) -> Self { + Self { + session_id: None, + analyzer: Some(existing.analyzer), + expr_planners: Some(existing.expr_planners), + optimizer: Some(existing.optimizer), + physical_optimizers: Some(existing.physical_optimizers), + query_planner: Some(existing.query_planner), + catalog_list: Some(existing.catalog_list), + table_functions: Some(existing.table_functions), + scalar_functions: Some(existing.scalar_functions.into_values().collect_vec()), + aggregate_functions: Some( + existing.aggregate_functions.into_values().collect_vec(), + ), + window_functions: Some(existing.window_functions.into_values().collect_vec()), + serializer_registry: Some(existing.serializer_registry), + file_formats: Some(existing.file_formats.into_values().collect_vec()), + config: Some(existing.config), + table_options: Some(existing.table_options), + execution_props: Some(existing.execution_props), + table_factories: Some(existing.table_factories), + runtime_env: Some(existing.runtime_env), + function_factory: existing.function_factory, + + // fields to support convenience functions + analyzer_rules: None, + 
optimizer_rules: None, + physical_optimizer_rules: None, + } + } + + /// Set defaults for table_factories, file formats, expr_planners and builtin + /// scalar and aggregate functions. + pub fn with_default_features(mut self) -> Self { + self.table_factories = Some(SessionStateDefaults::default_table_factories()); + self.file_formats = Some(SessionStateDefaults::default_file_formats()); + self.expr_planners = Some(SessionStateDefaults::default_expr_planners()); + self.scalar_functions = Some(SessionStateDefaults::default_scalar_functions()); + self.aggregate_functions = + Some(SessionStateDefaults::default_aggregate_functions()); + self + } + + /// Set the session id. + pub fn with_session_id(mut self, session_id: String) -> Self { + self.session_id = Some(session_id); + self + } + + /// Set the [`AnalyzerRule`]s optimizer plan rules. + pub fn with_analyzer_rules( + mut self, + rules: Vec>, + ) -> Self { + self.analyzer = Some(Analyzer::with_rules(rules)); + self + } + + /// Add `analyzer_rule` to the end of the list of + /// [`AnalyzerRule`]s used to rewrite queries. + pub fn with_analyzer_rule( + mut self, + analyzer_rule: Arc, + ) -> Self { + let mut rules = self.analyzer_rules.unwrap_or_default(); + rules.push(analyzer_rule); + self.analyzer_rules = Some(rules); + self + } + + /// Set the [`OptimizerRule`]s used to optimize plans. + pub fn with_optimizer_rules( + mut self, + rules: Vec>, + ) -> Self { + self.optimizer = Some(Optimizer::with_rules(rules)); + self + } + + /// Add `optimizer_rule` to the end of the list of + /// [`OptimizerRule`]s used to rewrite queries. + pub fn with_optimizer_rule( + mut self, + optimizer_rule: Arc, + ) -> Self { + let mut rules = self.optimizer_rules.unwrap_or_default(); + rules.push(optimizer_rule); + self.optimizer_rules = Some(rules); + self + } + + /// Set the [`ExprPlanner`]s used to customize the behavior of the SQL planner. 
+ pub fn with_expr_planners( + mut self, + expr_planners: Vec>, + ) -> Self { + self.expr_planners = Some(expr_planners); + self + } + + /// Set the [`PhysicalOptimizerRule`]s used to optimize plans. + pub fn with_physical_optimizer_rules( + mut self, + physical_optimizers: Vec>, + ) -> Self { + self.physical_optimizers = + Some(PhysicalOptimizer::with_rules(physical_optimizers)); + self + } + + /// Add `physical_optimizer_rule` to the end of the list of + /// [`PhysicalOptimizerRule`]s used to rewrite queries. + pub fn with_physical_optimizer_rule( + mut self, + physical_optimizer_rule: Arc, + ) -> Self { + let mut rules = self.physical_optimizer_rules.unwrap_or_default(); + rules.push(physical_optimizer_rule); + self.physical_optimizer_rules = Some(rules); + self + } + + /// Set the [`QueryPlanner`] + pub fn with_query_planner( + mut self, + query_planner: Arc, + ) -> Self { + self.query_planner = Some(query_planner); + self + } + + /// Set the [`CatalogProviderList`] + pub fn with_catalog_list( + mut self, + catalog_list: Arc, + ) -> Self { + self.catalog_list = Some(catalog_list); + self + } + + /// Set the map of [`TableFunction`]s + pub fn with_table_functions( + mut self, + table_functions: HashMap>, + ) -> Self { + self.table_functions = Some(table_functions); + self + } + + /// Set the map of [`ScalarUDF`]s + pub fn with_scalar_functions( + mut self, + scalar_functions: Vec>, + ) -> Self { + self.scalar_functions = Some(scalar_functions); + self + } + + /// Set the map of [`AggregateUDF`]s + pub fn with_aggregate_functions( + mut self, + aggregate_functions: Vec>, + ) -> Self { + self.aggregate_functions = Some(aggregate_functions); + self + } + + /// Set the map of [`WindowUDF`]s + pub fn with_window_functions( + mut self, + window_functions: Vec>, + ) -> Self { + self.window_functions = Some(window_functions); + self + } + + /// Set the [`SerializerRegistry`] + pub fn with_serializer_registry( + mut self, + serializer_registry: Arc, + ) -> Self { +
self.serializer_registry = Some(serializer_registry); + self + } + + /// Set the map of [`FileFormatFactory`]s + pub fn with_file_formats( + mut self, + file_formats: Vec>, + ) -> Self { + self.file_formats = Some(file_formats); + self + } + + /// Set the [`SessionConfig`] + pub fn with_config(mut self, config: SessionConfig) -> Self { + self.config = Some(config); + self + } + + /// Set the [`TableOptions`] + pub fn with_table_options(mut self, table_options: TableOptions) -> Self { + self.table_options = Some(table_options); + self + } + + /// Set the [`ExecutionProps`] + pub fn with_execution_props(mut self, execution_props: ExecutionProps) -> Self { + self.execution_props = Some(execution_props); + self + } + + /// Set the map of [`TableProviderFactory`]s + pub fn with_table_factories( + mut self, + table_factories: HashMap>, + ) -> Self { + self.table_factories = Some(table_factories); + self + } + + /// Set the [`RuntimeEnv`] + pub fn with_runtime_env(mut self, runtime_env: Arc) -> Self { + self.runtime_env = Some(runtime_env); + self + } + + /// Set a [`FunctionFactory`] to handle `CREATE FUNCTION` statements + pub fn with_function_factory( + mut self, + function_factory: Option>, + ) -> Self { + self.function_factory = function_factory; + self + } + + /// Builds a [`SessionState`] with the current configuration. + /// + /// Note that there is an explicit option for enabling catalog and schema defaults + /// in [SessionConfig::create_default_catalog_and_schema] which if enabled + /// will be built here. 
+ pub fn build(self) -> SessionState { + let Self { + session_id, + analyzer, + expr_planners, + optimizer, + physical_optimizers, + query_planner, + catalog_list, + table_functions, + scalar_functions, + aggregate_functions, + window_functions, + serializer_registry, + file_formats, + table_options, + config, + execution_props, + table_factories, + runtime_env, + function_factory, + analyzer_rules, + optimizer_rules, + physical_optimizer_rules, + } = self; + + let config = config.unwrap_or_default(); + let runtime_env = runtime_env.unwrap_or(Arc::new(RuntimeEnv::default())); + + let mut state = SessionState { + session_id: session_id.unwrap_or(Uuid::new_v4().to_string()), + analyzer: analyzer.unwrap_or_default(), + expr_planners: expr_planners.unwrap_or_default(), + optimizer: optimizer.unwrap_or_default(), + physical_optimizers: physical_optimizers.unwrap_or_default(), + query_planner: query_planner.unwrap_or(Arc::new(DefaultQueryPlanner {})), + catalog_list: catalog_list + .unwrap_or(Arc::new(MemoryCatalogProviderList::new()) + as Arc), + table_functions: table_functions.unwrap_or_default(), + scalar_functions: HashMap::new(), + aggregate_functions: HashMap::new(), + window_functions: HashMap::new(), + serializer_registry: serializer_registry + .unwrap_or(Arc::new(EmptySerializerRegistry)), + file_formats: HashMap::new(), + table_options: table_options + .unwrap_or(TableOptions::default_from_session_config(config.options())), + config, + execution_props: execution_props.unwrap_or_default(), + table_factories: table_factories.unwrap_or_default(), + runtime_env, + function_factory, + }; + + if let Some(file_formats) = file_formats { + for file_format in file_formats { + if let Err(e) = state.register_file_format(file_format, false) { + info!("Unable to register file format: {e}") + }; + } + } + + if let Some(scalar_functions) = scalar_functions { + scalar_functions.into_iter().for_each(|udf| { + let existing_udf = state.register_udf(udf); + if let 
Ok(Some(existing_udf)) = existing_udf { + debug!("Overwrote an existing UDF: {}", existing_udf.name()); + } + }); + } + + if let Some(aggregate_functions) = aggregate_functions { + aggregate_functions.into_iter().for_each(|udaf| { + let existing_udf = state.register_udaf(udaf); + if let Ok(Some(existing_udf)) = existing_udf { + debug!("Overwrote an existing UDF: {}", existing_udf.name()); + } + }); + } + + if let Some(window_functions) = window_functions { + window_functions.into_iter().for_each(|udwf| { + let existing_udf = state.register_udwf(udwf); + if let Ok(Some(existing_udf)) = existing_udf { + debug!("Overwrote an existing UDF: {}", existing_udf.name()); + } + }); + } + + if state.config.create_default_catalog_and_schema() { + let default_catalog = SessionStateDefaults::default_catalog( + &state.config, + &state.table_factories, + &state.runtime_env, + ); + + state.catalog_list.register_catalog( + state.config.options().catalog.default_catalog.clone(), + Arc::new(default_catalog), + ); + } + + if let Some(analyzer_rules) = analyzer_rules { + for analyzer_rule in analyzer_rules { + state.analyzer.rules.push(analyzer_rule); + } + } + + if let Some(optimizer_rules) = optimizer_rules { + for optimizer_rule in optimizer_rules { + state.optimizer.rules.push(optimizer_rule); + } + } + + if let Some(physical_optimizer_rules) = physical_optimizer_rules { + for physical_optimizer_rule in physical_optimizer_rules { + state + .physical_optimizers + .rules + .push(physical_optimizer_rule); + } + } + + state + } + + /// Returns the current session_id value + pub fn session_id(&self) -> &Option { + &self.session_id + } + + /// Returns the current analyzer value + pub fn analyzer(&mut self) -> &mut Option { + &mut self.analyzer + } + + /// Returns the current expr_planners value + pub fn expr_planners(&mut self) -> &mut Option>> { + &mut self.expr_planners + } + + /// Returns the current optimizer value + pub fn optimizer(&mut self) -> &mut Option { + &mut self.optimizer + 
} + + /// Returns the current physical_optimizers value + pub fn physical_optimizers(&mut self) -> &mut Option { + &mut self.physical_optimizers + } + + /// Returns the current query_planner value + pub fn query_planner(&mut self) -> &mut Option> { + &mut self.query_planner + } + + /// Returns the current catalog_list value + pub fn catalog_list(&mut self) -> &mut Option> { + &mut self.catalog_list + } + + /// Returns the current table_functions value + pub fn table_functions( + &mut self, + ) -> &mut Option>> { + &mut self.table_functions + } + + /// Returns the current scalar_functions value + pub fn scalar_functions(&mut self) -> &mut Option>> { + &mut self.scalar_functions + } + + /// Returns the current aggregate_functions value + pub fn aggregate_functions(&mut self) -> &mut Option>> { + &mut self.aggregate_functions + } + + /// Returns the current window_functions value + pub fn window_functions(&mut self) -> &mut Option>> { + &mut self.window_functions + } + + /// Returns the current serializer_registry value + pub fn serializer_registry(&mut self) -> &mut Option> { + &mut self.serializer_registry + } + + /// Returns the current file_formats value + pub fn file_formats(&mut self) -> &mut Option>> { + &mut self.file_formats + } + + /// Returns the current session_config value + pub fn config(&mut self) -> &mut Option { + &mut self.config + } + + /// Returns the current table_options value + pub fn table_options(&mut self) -> &mut Option { + &mut self.table_options + } + + /// Returns the current execution_props value + pub fn execution_props(&mut self) -> &mut Option { + &mut self.execution_props + } + + /// Returns the current table_factories value + pub fn table_factories( + &mut self, + ) -> &mut Option>> { + &mut self.table_factories + } + + /// Returns the current runtime_env value + pub fn runtime_env(&mut self) -> &mut Option> { + &mut self.runtime_env + } + + /// Returns the current function_factory value + pub fn function_factory(&mut self) -> &mut 
Option> { + &mut self.function_factory + } + + /// Returns the current analyzer_rules value + pub fn analyzer_rules( + &mut self, + ) -> &mut Option>> { + &mut self.analyzer_rules + } + + /// Returns the current optimizer_rules value + pub fn optimizer_rules( + &mut self, + ) -> &mut Option>> { + &mut self.optimizer_rules + } + + /// Returns the current physical_optimizer_rules value + pub fn physical_optimizer_rules( + &mut self, + ) -> &mut Option>> { + &mut self.physical_optimizer_rules + } +} + +impl Default for SessionStateBuilder { + fn default() -> Self { + Self::new() + } +} + +impl From for SessionStateBuilder { + fn from(state: SessionState) -> Self { + SessionStateBuilder::new_from_existing(state) + } +} + +/// Defaults that are used as part of creating a SessionState such as table providers, +/// file formats, registering of builtin functions, etc. +pub struct SessionStateDefaults {} + +impl SessionStateDefaults { + /// returns a map of the default [`TableProviderFactory`]s + pub fn default_table_factories() -> HashMap> { + let mut table_factories: HashMap> = + HashMap::new(); + #[cfg(feature = "parquet")] + table_factories.insert("PARQUET".into(), Arc::new(DefaultTableFactory::new())); + table_factories.insert("CSV".into(), Arc::new(DefaultTableFactory::new())); + table_factories.insert("JSON".into(), Arc::new(DefaultTableFactory::new())); + table_factories.insert("NDJSON".into(), Arc::new(DefaultTableFactory::new())); + table_factories.insert("AVRO".into(), Arc::new(DefaultTableFactory::new())); + table_factories.insert("ARROW".into(), Arc::new(DefaultTableFactory::new())); + + table_factories + } + + /// returns the default MemoryCatalogProvider + pub fn default_catalog( + config: &SessionConfig, + table_factories: &HashMap>, + runtime: &Arc, + ) -> MemoryCatalogProvider { + let default_catalog = MemoryCatalogProvider::new(); + + default_catalog + .register_schema( + &config.options().catalog.default_schema, + Arc::new(MemorySchemaProvider::new()), + 
) + .expect("memory catalog provider can register schema"); + + Self::register_default_schema(config, table_factories, runtime, &default_catalog); + + default_catalog + } + + /// returns the list of default [`ExprPlanner`]s + pub fn default_expr_planners() -> Vec> { + let expr_planners: Vec> = vec![ + Arc::new(functions::core::planner::CoreFunctionPlanner::default()), + // register crate of array expressions (if enabled) + #[cfg(feature = "array_expressions")] + Arc::new(functions_array::planner::ArrayFunctionPlanner), + #[cfg(feature = "array_expressions")] + Arc::new(functions_array::planner::FieldAccessPlanner), + #[cfg(any( + feature = "datetime_expressions", + feature = "unicode_expressions" + ))] + Arc::new(functions::planner::UserDefinedFunctionPlanner), + ]; + + expr_planners + } + + /// returns the list of default [`ScalarUDF`]s + pub fn default_scalar_functions() -> Vec> { + let mut functions: Vec> = functions::all_default_functions(); + #[cfg(feature = "array_expressions")] + functions.append(&mut functions_array::all_default_array_functions()); + + functions + } + + /// returns the list of default [`AggregateUDF`]s + pub fn default_aggregate_functions() -> Vec> { + functions_aggregate::all_default_aggregate_functions() + } + + /// returns the list of default [`FileFormatFactory`]s + pub fn default_file_formats() -> Vec> { + let file_formats: Vec> = vec![ + #[cfg(feature = "parquet")] + Arc::new(ParquetFormatFactory::new()), + Arc::new(JsonFormatFactory::new()), + Arc::new(CsvFormatFactory::new()), + Arc::new(ArrowFormatFactory::new()), + Arc::new(AvroFormatFactory::new()), + ]; + + file_formats + } + + /// registers all builtin functions - scalar, array and aggregate + pub fn register_builtin_functions(state: &mut SessionState) { + Self::register_scalar_functions(state); + Self::register_array_functions(state); + Self::register_aggregate_functions(state); + } + + /// registers all the builtin scalar functions + pub fn register_scalar_functions(state: 
&mut SessionState) { + functions::register_all(state).expect("can not register built in functions"); + } + + /// registers all the builtin array functions + pub fn register_array_functions(state: &mut SessionState) { + // register crate of array expressions (if enabled) + #[cfg(feature = "array_expressions")] + functions_array::register_all(state).expect("can not register array expressions"); + } + + /// registers all the builtin aggregate functions + pub fn register_aggregate_functions(state: &mut SessionState) { + functions_aggregate::register_all(state) + .expect("can not register aggregate functions"); + } + + /// registers the default schema + pub fn register_default_schema( + config: &SessionConfig, + table_factories: &HashMap>, + runtime: &Arc, + default_catalog: &MemoryCatalogProvider, + ) { + let url = config.options().catalog.location.as_ref(); + let format = config.options().catalog.format.as_ref(); + let (url, format) = match (url, format) { + (Some(url), Some(format)) => (url, format), + _ => return, + }; + let url = url.to_string(); + let format = format.to_string(); + + let url = Url::parse(url.as_str()).expect("Invalid default catalog location!"); + let authority = match url.host_str() { + Some(host) => format!("{}://{}", url.scheme(), host), + None => format!("{}://", url.scheme()), + }; + let path = &url.as_str()[authority.len()..]; + let path = object_store::path::Path::parse(path).expect("Can't parse path"); + let store = ObjectStoreUrl::parse(authority.as_str()) + .expect("Invalid default catalog url"); + let store = match runtime.object_store(store) { + Ok(store) => store, + _ => return, + }; + let factory = match table_factories.get(format.as_str()) { + Some(factory) => factory, + _ => return, + }; + let schema = + ListingSchemaProvider::new(authority, path, factory.clone(), store, format); + let _ = default_catalog + .register_schema("default", Arc::new(schema)) + .expect("Failed to register default schema"); + } + + /// registers the 
default [`FileFormatFactory`]s + pub fn register_default_file_formats(state: &mut SessionState) { + let formats = SessionStateDefaults::default_file_formats(); + for format in formats { + if let Err(e) = state.register_file_format(format, false) { + log::info!("Unable to register default file format: {e}") + }; + } + } +} + struct SessionContextProvider<'a> { state: &'a SessionState, tables: HashMap>, diff --git a/datafusion/core/src/physical_planner.rs b/datafusion/core/src/physical_planner.rs index d2bc334ec324..efc83d8f6b5c 100644 --- a/datafusion/core/src/physical_planner.rs +++ b/datafusion/core/src/physical_planner.rs @@ -2269,6 +2269,7 @@ mod tests { use crate::prelude::{SessionConfig, SessionContext}; use crate::test_util::{scan_empty, scan_empty_with_partitions}; + use crate::execution::session_state::SessionStateBuilder; use arrow::array::{ArrayRef, DictionaryArray, Int32Array}; use arrow::datatypes::{DataType, Field, Int32Type}; use datafusion_common::{assert_contains, DFSchemaRef, TableReference}; @@ -2282,7 +2283,11 @@ mod tests { let runtime = Arc::new(RuntimeEnv::default()); let config = SessionConfig::new().with_target_partitions(4); let config = config.set_bool("datafusion.optimizer.skip_failed_rules", false); - SessionState::new_with_config_rt(config, runtime) + SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build() } async fn plan(logical_plan: &LogicalPlan) -> Result> { diff --git a/datafusion/core/src/test/object_store.rs b/datafusion/core/src/test/object_store.rs index bea6f7b9ceb7..6c0a2fc7bec4 100644 --- a/datafusion/core/src/test/object_store.rs +++ b/datafusion/core/src/test/object_store.rs @@ -16,9 +16,8 @@ // under the License. //! 
Object store implementation used for testing use crate::execution::context::SessionState; +use crate::execution::session_state::SessionStateBuilder; use crate::prelude::SessionContext; -use datafusion_execution::config::SessionConfig; -use datafusion_execution::runtime_env::RuntimeEnv; use futures::FutureExt; use object_store::{memory::InMemory, path::Path, ObjectMeta, ObjectStore}; use std::sync::Arc; @@ -44,10 +43,7 @@ pub fn make_test_store_and_state(files: &[(&str, u64)]) -> (Arc, Sessi ( Arc::new(memory), - SessionState::new_with_config_rt( - SessionConfig::default(), - Arc::new(RuntimeEnv::default()), - ), + SessionStateBuilder::new().with_default_features().build(), ) } diff --git a/datafusion/core/tests/dataframe/mod.rs b/datafusion/core/tests/dataframe/mod.rs index f1d57c44293b..1b2a6770cf01 100644 --- a/datafusion/core/tests/dataframe/mod.rs +++ b/datafusion/core/tests/dataframe/mod.rs @@ -42,7 +42,8 @@ use url::Url; use datafusion::dataframe::{DataFrame, DataFrameWriteOptions}; use datafusion::datasource::MemTable; use datafusion::error::Result; -use datafusion::execution::context::{SessionContext, SessionState}; +use datafusion::execution::context::SessionContext; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::prelude::JoinType; use datafusion::prelude::{CsvReadOptions, ParquetReadOptions}; use datafusion::test_util::{parquet_test_data, populate_csv_partitions}; @@ -1544,7 +1545,11 @@ async fn unnest_non_nullable_list() -> Result<()> { async fn test_read_batches() -> Result<()> { let config = SessionConfig::new(); let runtime = Arc::new(RuntimeEnv::default()); - let state = SessionState::new_with_config_rt(config, runtime); + let state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build(); let ctx = SessionContext::new_with_state(state); let schema = Arc::new(Schema::new(vec![ @@ -1594,7 +1599,11 @@ async fn test_read_batches() -> Result<()> { async fn 
test_read_batches_empty() -> Result<()> { let config = SessionConfig::new(); let runtime = Arc::new(RuntimeEnv::default()); - let state = SessionState::new_with_config_rt(config, runtime); + let state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() + .build(); let ctx = SessionContext::new_with_state(state); let batches = vec![]; @@ -1608,9 +1617,7 @@ async fn test_read_batches_empty() -> Result<()> { #[tokio::test] async fn consecutive_projection_same_schema() -> Result<()> { - let config = SessionConfig::new(); - let runtime = Arc::new(RuntimeEnv::default()); - let state = SessionState::new_with_config_rt(config, runtime); + let state = SessionStateBuilder::new().with_default_features().build(); let ctx = SessionContext::new_with_state(state); let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)])); diff --git a/datafusion/core/tests/memory_limit/mod.rs b/datafusion/core/tests/memory_limit/mod.rs index 7ef24609e238..1d151f9fd368 100644 --- a/datafusion/core/tests/memory_limit/mod.rs +++ b/datafusion/core/tests/memory_limit/mod.rs @@ -38,6 +38,7 @@ use datafusion::datasource::{MemTable, TableProvider}; use datafusion::execution::context::SessionState; use datafusion::execution::disk_manager::DiskManagerConfig; use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::physical_optimizer::join_selection::JoinSelection; use datafusion::physical_plan::stream::RecordBatchStreamAdapter; use datafusion::physical_plan::{ExecutionPlan, SendableRecordBatchStream}; @@ -459,13 +460,16 @@ impl TestCase { let runtime = RuntimeEnv::new(rt_config).unwrap(); // Configure execution - let state = SessionState::new_with_config_rt(config, Arc::new(runtime)); - let state = match scenario.rules() { - Some(rules) => state.with_physical_optimizer_rules(rules), - None => state, + let builder = 
SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(Arc::new(runtime)) + .with_default_features(); + let builder = match scenario.rules() { + Some(rules) => builder.with_physical_optimizer_rules(rules), + None => builder, }; - let ctx = SessionContext::new_with_state(state); + let ctx = SessionContext::new_with_state(builder.build()); ctx.register_table("t", table).expect("registering table"); let query = query.expect("Test error: query not specified"); diff --git a/datafusion/core/tests/parquet/file_statistics.rs b/datafusion/core/tests/parquet/file_statistics.rs index 9f94a59a3e59..bf25b36f48e8 100644 --- a/datafusion/core/tests/parquet/file_statistics.rs +++ b/datafusion/core/tests/parquet/file_statistics.rs @@ -35,6 +35,7 @@ use datafusion_execution::cache::cache_unit::{ use datafusion_execution::config::SessionConfig; use datafusion_execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::session_state::SessionStateBuilder; use tempfile::tempdir; #[tokio::test] @@ -167,10 +168,7 @@ async fn get_listing_table( ) -> ListingTable { let schema = opt .infer_schema( - &SessionState::new_with_config_rt( - SessionConfig::default(), - Arc::new(RuntimeEnv::default()), - ), + &SessionStateBuilder::new().with_default_features().build(), table_path, ) .await diff --git a/datafusion/core/tests/sql/create_drop.rs b/datafusion/core/tests/sql/create_drop.rs index 2174009b8557..83712053b954 100644 --- a/datafusion/core/tests/sql/create_drop.rs +++ b/datafusion/core/tests/sql/create_drop.rs @@ -15,18 +15,14 @@ // specific language governing permissions and limitations // under the License. 
-use datafusion::execution::context::SessionState; -use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::test_util::TestTableFactory; use super::*; #[tokio::test] async fn create_custom_table() -> Result<()> { - let cfg = RuntimeConfig::new(); - let env = RuntimeEnv::new(cfg).unwrap(); - let ses = SessionConfig::new(); - let mut state = SessionState::new_with_config_rt(ses, Arc::new(env)); + let mut state = SessionStateBuilder::new().with_default_features().build(); state .table_factories_mut() .insert("DELTATABLE".to_string(), Arc::new(TestTableFactory {})); @@ -45,10 +41,7 @@ async fn create_custom_table() -> Result<()> { #[tokio::test] async fn create_external_table_with_ddl() -> Result<()> { - let cfg = RuntimeConfig::new(); - let env = RuntimeEnv::new(cfg).unwrap(); - let ses = SessionConfig::new(); - let mut state = SessionState::new_with_config_rt(ses, Arc::new(env)); + let mut state = SessionStateBuilder::new().with_default_features().build(); state .table_factories_mut() .insert("MOCKTABLE".to_string(), Arc::new(TestTableFactory {})); diff --git a/datafusion/core/tests/user_defined/user_defined_plan.rs b/datafusion/core/tests/user_defined/user_defined_plan.rs index 38ed142cf922..a44f522ba95a 100644 --- a/datafusion/core/tests/user_defined/user_defined_plan.rs +++ b/datafusion/core/tests/user_defined/user_defined_plan.rs @@ -92,6 +92,7 @@ use datafusion::{ }; use async_trait::async_trait; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion_common::config::ConfigOptions; use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; use datafusion_common::ScalarValue; @@ -290,10 +291,14 @@ async fn topk_plan() -> Result<()> { fn make_topk_context() -> SessionContext { let config = SessionConfig::new().with_target_partitions(48); let runtime = Arc::new(RuntimeEnv::default()); - let mut state = 
SessionState::new_with_config_rt(config, runtime) + let state = SessionStateBuilder::new() + .with_config(config) + .with_runtime_env(runtime) + .with_default_features() .with_query_planner(Arc::new(TopKQueryPlanner {})) - .add_optimizer_rule(Arc::new(TopKOptimizerRule {})); - state.add_analyzer_rule(Arc::new(MyAnalyzerRule {})); + .with_optimizer_rule(Arc::new(TopKOptimizerRule {})) + .with_analyzer_rule(Arc::new(MyAnalyzerRule {})) + .build(); SessionContext::new_with_state(state) } diff --git a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs index f764a050a6cd..d0209d811b7c 100644 --- a/datafusion/proto/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/proto/tests/cases/roundtrip_logical_plan.rs @@ -39,8 +39,7 @@ use prost::Message; use datafusion::datasource::provider::TableProviderFactory; use datafusion::datasource::TableProvider; -use datafusion::execution::context::SessionState; -use datafusion::execution::runtime_env::{RuntimeConfig, RuntimeEnv}; +use datafusion::execution::session_state::SessionStateBuilder; use datafusion::execution::FunctionRegistry; use datafusion::functions_aggregate::count::count_udaf; use datafusion::functions_aggregate::expr_fn::{ @@ -202,10 +201,7 @@ async fn roundtrip_custom_tables() -> Result<()> { let mut table_factories: HashMap> = HashMap::new(); table_factories.insert("TESTTABLE".to_string(), Arc::new(TestTableFactory {})); - let cfg = RuntimeConfig::new(); - let env = RuntimeEnv::new(cfg).unwrap(); - let ses = SessionConfig::new(); - let mut state = SessionState::new_with_config_rt(ses, Arc::new(env)); + let mut state = SessionStateBuilder::new().with_default_features().build(); // replace factories *state.table_factories_mut() = table_factories; let ctx = SessionContext::new_with_state(state); diff --git a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs index 
2893b1a31a26..5b2d0fbacaef 100644 --- a/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs +++ b/datafusion/substrait/tests/cases/roundtrip_logical_plan.rs @@ -28,7 +28,6 @@ use std::sync::Arc; use datafusion::arrow::datatypes::{DataType, Field, IntervalUnit, Schema, TimeUnit}; use datafusion::common::{not_impl_err, plan_err, DFSchema, DFSchemaRef}; use datafusion::error::Result; -use datafusion::execution::context::SessionState; use datafusion::execution::registry::SerializerRegistry; use datafusion::execution::runtime_env::RuntimeEnv; use datafusion::logical_expr::{ @@ -37,6 +36,7 @@ use datafusion::logical_expr::{ use datafusion::optimizer::simplify_expressions::expr_simplifier::THRESHOLD_INLINE_INLIST; use datafusion::prelude::*; +use datafusion::execution::session_state::SessionStateBuilder; use substrait::proto::extensions::simple_extension_declaration::MappingType; use substrait::proto::rel::RelType; use substrait::proto::{plan_rel, Plan, Rel}; @@ -1121,11 +1121,12 @@ async fn function_extension_info(sql: &str) -> Result<(Vec, Vec)> { } async fn create_context() -> Result { - let mut state = SessionState::new_with_config_rt( - SessionConfig::default(), - Arc::new(RuntimeEnv::default()), - ) - .with_serializer_registry(Arc::new(MockSerializerRegistry)); + let mut state = SessionStateBuilder::new() + .with_config(SessionConfig::default()) + .with_runtime_env(Arc::new(RuntimeEnv::default())) + .with_default_features() + .with_serializer_registry(Arc::new(MockSerializerRegistry)) + .build(); // register udaf for test, e.g. 
`sum()` datafusion_functions_aggregate::register_all(&mut state) From bfd815622f1fe2c84d6fab32596b83ffbe52a84a Mon Sep 17 00:00:00 2001 From: Lordworms <48054792+Lordworms@users.noreply.github.com> Date: Sun, 14 Jul 2024 12:06:14 -0700 Subject: [PATCH 19/19] integrate consumer tests, implement tpch query 18 to 22 (#11462) --- .../tests/cases/consumer_integration.rs | 191 ++ .../tpch_substrait_plans/query_18.json | 1128 ++++++++ .../tpch_substrait_plans/query_19.json | 2386 +++++++++++++++++ .../tpch_substrait_plans/query_20.json | 1273 +++++++++ .../tpch_substrait_plans/query_21.json | 1493 +++++++++++ .../tpch_substrait_plans/query_22.json | 2034 ++++++++++++++ 6 files changed, 8505 insertions(+) create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_18.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_19.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_20.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_21.json create mode 100644 datafusion/substrait/tests/testdata/tpch_substrait_plans/query_22.json diff --git a/datafusion/substrait/tests/cases/consumer_integration.rs b/datafusion/substrait/tests/cases/consumer_integration.rs index c8130220ef4a..8fbcd721166e 100644 --- a/datafusion/substrait/tests/cases/consumer_integration.rs +++ b/datafusion/substrait/tests/cases/consumer_integration.rs @@ -398,4 +398,195 @@ mod tests { \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); Ok(()) } + /// this test has some problem in json file internally, gonna fix it + #[ignore] + #[tokio::test] + async fn tpch_test_17() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ("FILENAME_PLACEHOLDER_2", 
"tests/testdata/tpch/lineitem.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_17.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let _plan = from_substrait_plan(&ctx, &proto).await?; + Ok(()) + } + + #[tokio::test] + async fn tpch_test_18() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/orders.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/lineitem.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_18.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.c_name AS C_NAME, FILENAME_PLACEHOLDER_0.c_custkey AS C_CUSTKEY, FILENAME_PLACEHOLDER_1.o_orderkey AS O_ORDERKEY, FILENAME_PLACEHOLDER_1.o_orderdate AS O_ORDERDATE, FILENAME_PLACEHOLDER_1.o_totalprice AS O_TOTALPRICE, sum(FILENAME_PLACEHOLDER_2.l_quantity) AS EXPR$5\ + \n Limit: skip=0, fetch=100\ + \n Sort: FILENAME_PLACEHOLDER_1.o_totalprice DESC NULLS FIRST, FILENAME_PLACEHOLDER_1.o_orderdate ASC NULLS LAST\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.c_name, FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_1.o_orderkey, FILENAME_PLACEHOLDER_1.o_orderdate, FILENAME_PLACEHOLDER_1.o_totalprice]], aggr=[[sum(FILENAME_PLACEHOLDER_2.l_quantity)]]\ + \n Projection: FILENAME_PLACEHOLDER_0.c_name, FILENAME_PLACEHOLDER_0.c_custkey, FILENAME_PLACEHOLDER_1.o_orderkey, FILENAME_PLACEHOLDER_1.o_orderdate, FILENAME_PLACEHOLDER_1.o_totalprice, FILENAME_PLACEHOLDER_2.l_quantity\ + \n Filter: 
CAST(FILENAME_PLACEHOLDER_1.o_orderkey IN () AS Boolean) AND FILENAME_PLACEHOLDER_0.c_custkey = FILENAME_PLACEHOLDER_1.o_custkey AND FILENAME_PLACEHOLDER_1.o_orderkey = FILENAME_PLACEHOLDER_2.l_orderkey\ + \n Subquery:\ + \n Projection: FILENAME_PLACEHOLDER_3.l_orderkey\ + \n Filter: sum(FILENAME_PLACEHOLDER_3.l_quantity) > CAST(Int32(300) AS Decimal128(19, 0))\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_3.l_orderkey]], aggr=[[sum(FILENAME_PLACEHOLDER_3.l_quantity)]]\ + \n Projection: FILENAME_PLACEHOLDER_3.l_orderkey, FILENAME_PLACEHOLDER_3.l_quantity\ + \n TableScan: FILENAME_PLACEHOLDER_3 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]"); + Ok(()) + } + #[tokio::test] + async fn tpch_test_19() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/part.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_19.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let 
plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Aggregate: groupBy=[[]], aggr=[[sum(FILENAME_PLACEHOLDER_0.l_extendedprice * Int32(1) - FILENAME_PLACEHOLDER_0.l_discount) AS REVENUE]]\n Projection: FILENAME_PLACEHOLDER_0.l_extendedprice * (CAST(Int32(1) AS Decimal128(19, 0)) - FILENAME_PLACEHOLDER_0.l_discount)\ + \n Filter: FILENAME_PLACEHOLDER_1.p_partkey = FILENAME_PLACEHOLDER_0.l_partkey AND FILENAME_PLACEHOLDER_1.p_brand = CAST(Utf8(\"Brand#12\") AS Utf8) AND (FILENAME_PLACEHOLDER_1.p_container = Utf8(\"SM CASE\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"SM BOX\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"SM PACK\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"SM PKG\")) AND FILENAME_PLACEHOLDER_0.l_quantity >= CAST(Int32(1) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_0.l_quantity <= CAST(Int32(1) + Int32(10) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_1.p_size >= Int32(1) AND FILENAME_PLACEHOLDER_1.p_size <= Int32(5) AND (FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR\") OR FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR REG\")) AND FILENAME_PLACEHOLDER_0.l_shipinstruct = CAST(Utf8(\"DELIVER IN PERSON\") AS Utf8) OR FILENAME_PLACEHOLDER_1.p_partkey = FILENAME_PLACEHOLDER_0.l_partkey AND FILENAME_PLACEHOLDER_1.p_brand = CAST(Utf8(\"Brand#23\") AS Utf8) AND (FILENAME_PLACEHOLDER_1.p_container = Utf8(\"MED BAG\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"MED BOX\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"MED PKG\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"MED PACK\")) AND FILENAME_PLACEHOLDER_0.l_quantity >= CAST(Int32(10) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_0.l_quantity <= CAST(Int32(10) + Int32(10) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_1.p_size >= Int32(1) AND FILENAME_PLACEHOLDER_1.p_size <= Int32(10) AND (FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR\") OR FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR REG\")) AND FILENAME_PLACEHOLDER_0.l_shipinstruct = CAST(Utf8(\"DELIVER IN PERSON\") AS 
Utf8) OR FILENAME_PLACEHOLDER_1.p_partkey = FILENAME_PLACEHOLDER_0.l_partkey AND FILENAME_PLACEHOLDER_1.p_brand = CAST(Utf8(\"Brand#34\") AS Utf8) AND (FILENAME_PLACEHOLDER_1.p_container = Utf8(\"LG CASE\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"LG BOX\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"LG PACK\") OR FILENAME_PLACEHOLDER_1.p_container = Utf8(\"LG PKG\")) AND FILENAME_PLACEHOLDER_0.l_quantity >= CAST(Int32(20) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_0.l_quantity <= CAST(Int32(20) + Int32(10) AS Decimal128(19, 0)) AND FILENAME_PLACEHOLDER_1.p_size >= Int32(1) AND FILENAME_PLACEHOLDER_1.p_size <= Int32(15) AND (FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR\") OR FILENAME_PLACEHOLDER_0.l_shipmode = Utf8(\"AIR REG\")) AND FILENAME_PLACEHOLDER_0.l_shipinstruct = CAST(Utf8(\"DELIVER IN PERSON\") AS Utf8)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_20() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/supplier.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/nation.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/partsupp.csv"), + ("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/part.csv"), + ("FILENAME_PLACEHOLDER_4", "tests/testdata/tpch/lineitem.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_20.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, 
&proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.s_name AS S_NAME, FILENAME_PLACEHOLDER_0.s_address AS S_ADDRESS\ + \n Sort: FILENAME_PLACEHOLDER_0.s_name ASC NULLS LAST\ + \n Projection: FILENAME_PLACEHOLDER_0.s_name, FILENAME_PLACEHOLDER_0.s_address\ + \n Filter: CAST(FILENAME_PLACEHOLDER_0.s_suppkey IN () AS Boolean) AND FILENAME_PLACEHOLDER_0.s_nationkey = FILENAME_PLACEHOLDER_1.n_nationkey AND FILENAME_PLACEHOLDER_1.n_name = CAST(Utf8(\"CANADA\") AS Utf8)\ + \n Subquery:\ + \n Projection: FILENAME_PLACEHOLDER_2.ps_suppkey\ + \n Filter: CAST(FILENAME_PLACEHOLDER_2.ps_partkey IN () AS Boolean) AND CAST(FILENAME_PLACEHOLDER_2.ps_availqty AS Decimal128(19, 1)) > ()\ + \n Subquery:\ + \n Projection: FILENAME_PLACEHOLDER_3.p_partkey\ + \n Filter: FILENAME_PLACEHOLDER_3.p_name LIKE CAST(Utf8(\"forest%\") AS Utf8)\ + \n TableScan: FILENAME_PLACEHOLDER_3 projection=[p_partkey, p_name, p_mfgr, p_brand, p_type, p_size, p_container, p_retailprice, p_comment]\ + \n Subquery:\ + \n Projection: Decimal128(Some(5),2,1) * sum(FILENAME_PLACEHOLDER_4.l_quantity)\ + \n Aggregate: groupBy=[[]], aggr=[[sum(FILENAME_PLACEHOLDER_4.l_quantity)]]\ + \n Projection: FILENAME_PLACEHOLDER_4.l_quantity\ + \n Filter: FILENAME_PLACEHOLDER_4.l_partkey = FILENAME_PLACEHOLDER_4.l_orderkey AND FILENAME_PLACEHOLDER_4.l_suppkey = FILENAME_PLACEHOLDER_4.l_partkey AND FILENAME_PLACEHOLDER_4.l_shipdate >= CAST(Utf8(\"1994-01-01\") AS Date32) AND FILENAME_PLACEHOLDER_4.l_shipdate < CAST(Utf8(\"1995-01-01\") AS Date32)\ + \n TableScan: FILENAME_PLACEHOLDER_4 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[ps_partkey, ps_suppkey, ps_availqty, ps_supplycost, ps_comment]\ + \n Inner Join: Filter: Boolean(true)\ + \n 
TableScan: FILENAME_PLACEHOLDER_0 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[n_nationkey, n_name, n_regionkey, n_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_21() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/supplier.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/orders.csv"), + ("FILENAME_PLACEHOLDER_3", "tests/testdata/tpch/nation.csv"), + ("FILENAME_PLACEHOLDER_4", "tests/testdata/tpch/lineitem.csv"), + ("FILENAME_PLACEHOLDER_5", "tests/testdata/tpch/lineitem.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_21.json"; + let proto = serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: FILENAME_PLACEHOLDER_0.s_name AS S_NAME, count(Int64(1)) AS NUMWAIT\ + \n Limit: skip=0, fetch=100\ + \n Sort: count(Int64(1)) DESC NULLS FIRST, FILENAME_PLACEHOLDER_0.s_name ASC NULLS LAST\ + \n Aggregate: groupBy=[[FILENAME_PLACEHOLDER_0.s_name]], aggr=[[count(Int64(1))]]\ + \n Projection: FILENAME_PLACEHOLDER_0.s_name\ + \n Filter: FILENAME_PLACEHOLDER_0.s_suppkey = FILENAME_PLACEHOLDER_1.l_suppkey AND FILENAME_PLACEHOLDER_2.o_orderkey = FILENAME_PLACEHOLDER_1.l_orderkey AND FILENAME_PLACEHOLDER_2.o_orderstatus = Utf8(\"F\") AND FILENAME_PLACEHOLDER_1.l_receiptdate > FILENAME_PLACEHOLDER_1.l_commitdate AND EXISTS () AND NOT EXISTS () AND FILENAME_PLACEHOLDER_0.s_nationkey = FILENAME_PLACEHOLDER_3.n_nationkey AND FILENAME_PLACEHOLDER_3.n_name = CAST(Utf8(\"SAUDI ARABIA\") AS Utf8)\ + \n Subquery:\ + \n Filter: FILENAME_PLACEHOLDER_4.l_orderkey = FILENAME_PLACEHOLDER_4.l_tax AND 
FILENAME_PLACEHOLDER_4.l_suppkey != FILENAME_PLACEHOLDER_4.l_linestatus\ + \n TableScan: FILENAME_PLACEHOLDER_4 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n Subquery:\ + \n Filter: FILENAME_PLACEHOLDER_5.l_orderkey = FILENAME_PLACEHOLDER_5.l_tax AND FILENAME_PLACEHOLDER_5.l_suppkey != FILENAME_PLACEHOLDER_5.l_linestatus AND FILENAME_PLACEHOLDER_5.l_receiptdate > FILENAME_PLACEHOLDER_5.l_commitdate\ + \n TableScan: FILENAME_PLACEHOLDER_5 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n Inner Join: Filter: Boolean(true)\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[s_suppkey, s_name, s_address, s_nationkey, s_phone, s_acctbal, s_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[l_orderkey, l_partkey, l_suppkey, l_linenumber, l_quantity, l_extendedprice, l_discount, l_tax, l_returnflag, l_linestatus, l_shipdate, l_commitdate, l_receiptdate, l_shipinstruct, l_shipmode, l_comment]\n TableScan: FILENAME_PLACEHOLDER_2 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_3 projection=[n_nationkey, n_name, n_regionkey, n_comment]"); + Ok(()) + } + + #[tokio::test] + async fn tpch_test_22() -> Result<()> { + let ctx = create_context(vec![ + ("FILENAME_PLACEHOLDER_0", "tests/testdata/tpch/customer.csv"), + ("FILENAME_PLACEHOLDER_1", "tests/testdata/tpch/customer.csv"), + ("FILENAME_PLACEHOLDER_2", "tests/testdata/tpch/orders.csv"), + ]) + .await?; + let path = "tests/testdata/tpch_substrait_plans/query_22.json"; + let proto 
= serde_json::from_reader::<_, Plan>(BufReader::new( + File::open(path).expect("file not found"), + )) + .expect("failed to parse json"); + + let plan = from_substrait_plan(&ctx, &proto).await?; + let plan_str = format!("{:?}", plan); + assert_eq!(plan_str, "Projection: substr(FILENAME_PLACEHOLDER_0.c_phone,Int32(1),Int32(2)) AS CNTRYCODE, count(Int64(1)) AS NUMCUST, sum(FILENAME_PLACEHOLDER_0.c_acctbal) AS TOTACCTBAL\n Sort: substr(FILENAME_PLACEHOLDER_0.c_phone,Int32(1),Int32(2)) ASC NULLS LAST\ + \n Aggregate: groupBy=[[substr(FILENAME_PLACEHOLDER_0.c_phone,Int32(1),Int32(2))]], aggr=[[count(Int64(1)), sum(FILENAME_PLACEHOLDER_0.c_acctbal)]]\ + \n Projection: substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)), FILENAME_PLACEHOLDER_0.c_acctbal\ + \n Filter: (substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"13\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"31\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"23\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"29\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"30\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"18\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_0.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"17\") AS Utf8)) AND FILENAME_PLACEHOLDER_0.c_acctbal > () AND NOT EXISTS ()\ + \n Subquery:\ + \n Aggregate: groupBy=[[]], aggr=[[avg(FILENAME_PLACEHOLDER_1.c_acctbal)]]\ + \n Projection: FILENAME_PLACEHOLDER_1.c_acctbal\ + \n Filter: FILENAME_PLACEHOLDER_1.c_acctbal > Decimal128(Some(0),3,2) AND (substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"13\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"31\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"23\") AS Utf8) OR 
substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"29\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"30\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"18\") AS Utf8) OR substr(FILENAME_PLACEHOLDER_1.c_phone, Int32(1), Int32(2)) = CAST(Utf8(\"17\") AS Utf8))\ + \n TableScan: FILENAME_PLACEHOLDER_1 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]\n Subquery:\ + \n Filter: FILENAME_PLACEHOLDER_2.o_custkey = FILENAME_PLACEHOLDER_2.o_orderkey\ + \n TableScan: FILENAME_PLACEHOLDER_2 projection=[o_orderkey, o_custkey, o_orderstatus, o_totalprice, o_orderdate, o_orderpriority, o_clerk, o_shippriority, o_comment]\ + \n TableScan: FILENAME_PLACEHOLDER_0 projection=[c_custkey, c_name, c_address, c_nationkey, c_phone, c_acctbal, c_mktsegment, c_comment]"); + Ok(()) + } } diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_18.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_18.json new file mode 100644 index 000000000000..a4f0b25db956 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_18.json @@ -0,0 +1,1128 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "sum:opt_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gt:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "equal:any1_any1" + } + } + ], + 
"relations": [ + { + "root": { + "input": { + "fetch": { + "common": { + "direct": { + } + }, + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 33, + 34, + 35, + 36, + 37, + 38 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + "C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + 
"O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { 
+ "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": 
"JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "subquery": { + "inPredicate": { + "needles": [ + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + ], + "haystack": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 2 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 16, + 17 + ] + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_3", + "parquet": {} + } + ] + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 1, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + 
"invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "condition": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 300, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + } + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { 
+ "selection": { + "directReference": { + "structField": { + "field": 17 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 11 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 1, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + 
} + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "offset": "0", + "count": "100" + } + }, + "names": [ + "C_NAME", + "C_CUSTKEY", + "O_ORDERKEY", + "O_ORDERDATE", + "O_TOTALPRICE", + "EXPR$5" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_19.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_19.json new file mode 100644 index 000000000000..356111a480f3 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_19.json @@ -0,0 +1,2386 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 3, + "uri": "/functions_arithmetic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "or:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 3, + "name": "gte:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 4, + "name": "lte:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 5, + "name": 
"add:opt_i32_i32" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 6, + "name": "multiply:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 7, + "name": "subtract:opt_decimal_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 8, + "name": "sum:opt_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 25 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 19 + } + }, + "rootReference": { + } + } + } + }, 
+ { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "Brand#12", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "SM CASE", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "SM BOX", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "SM PACK", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + 
{ + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "SM PKG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + 
} + }, + { + "value": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 5, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": 
"NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR REG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 13 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "DELIVER IN PERSON", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { 
+ "directReference": { + "structField": { + "field": 19 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "Brand#23", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "MED BAG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "MED BOX", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + 
"fixedChar": "MED PKG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "MED PACK", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + 
"arguments": [ + { + "value": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + 
"functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR REG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 13 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "DELIVER IN PERSON", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 16 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 19 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "Brand#34", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "LG CASE", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "LG BOX", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + 
"directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "LG PACK", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 22 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "LG PKG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 20, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "scalarFunction": { + "functionReference": 5, 
+ "args": [], + "outputType": { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "literal": { + "i32": 20, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 10, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 21 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 15, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR", + 
"nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 14 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "AIR REG", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 13 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "DELIVER IN PERSON", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 7, + "args": [], + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + 
"scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 6 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 8, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "names": [ + "REVENUE" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_20.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_20.json new file mode 100644 index 000000000000..54a71fa553f8 --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_20.json @@ -0,0 +1,1273 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 5, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + 
"functionAnchor": 1, + "name": "like:vchar_vchar" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gt:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 4, + "name": "gte:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "lt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 6, + "name": "sum:opt_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + "functionAnchor": 7, + "name": "multiply:opt_decimal_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": {} + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 11, + 12 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "join": { + "common": { + "direct": {} + }, + "left": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "S_SUPPKEY", + "S_NAME", + "S_ADDRESS", + "S_NATIONKEY", + "S_PHONE", + "S_ACCTBAL", + "S_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + 
"varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "N_NATIONKEY", + "N_NAME", + "N_REGIONKEY", + "N_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 152, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "subquery": { + "inPredicate": { + "needles": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + ], + "haystack": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 5 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + 
"direct": {} + }, + "baseSchema": { + "names": [ + "PS_PARTKEY", + "PS_SUPPKEY", + "PS_AVAILQTY", + "PS_SUPPLYCOST", + "PS_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 199, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "subquery": { + "inPredicate": { + "needles": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + ], + "haystack": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 9 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "P_PARTKEY", + "P_NAME", + "P_MFGR", + "P_BRAND", + "P_TYPE", + "P_SIZE", + "P_CONTAINER", + "P_RETAILPRICE", + "P_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 55, + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 23, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_3", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 55, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "forest%", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + ] + } + } + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + 
"scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "cast": { + "type": { + "decimal": { + "scale": 1, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": {} + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "subquery": { + "scalar": { + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 1 + ] + } + }, + "input": { + "aggregate": { + "common": { + "direct": {} + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 16 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": {} + }, + "input": { + "read": { + "common": { + "direct": {} + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_4", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + }, + 
{ + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1994-01-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 5, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 10 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "1995-01-01", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": {} + } + 
} + ] + } + }, + "groupings": [ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 6, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + } + ] + } + } + ] + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 7, + "args": [], + "outputType": { + "decimal": { + "scale": 1, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "literal": { + "decimal": { + "value": "BQAAAAAAAAAAAAAAAAAAAA==", + "precision": 2, + "scale": 1 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + } + } + ] + } + } + ] + } + } + } + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + } + ] + } + } + } + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": {} + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, 
+ "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 8 + } + }, + "rootReference": {} + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "CANADA", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": {} + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": {} + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": {} + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "names": [ + "S_NAME", + "S_ADDRESS" + ] + } + } + ], + "expectedTypeUrls": [] +} \ No newline at end of file diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_21.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_21.json new file mode 100644 index 000000000000..d35c1517228b --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_21.json @@ -0,0 +1,1493 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 4, + "uri": "/functions_aggregate_generic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_datetime.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, 
+ { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 1, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 2, + "name": "gt:date_date" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 3, + "name": "not_equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 4, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "count:opt" + } + } + ], + "relations": [ + { + "root": { + "input": { + "fetch": { + "common": { + "direct": { + } + }, + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 36 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "join": { + "common": { + "direct": { + } + }, + "left": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "S_SUPPKEY", + "S_NAME", + "S_ADDRESS", + "S_NATIONKEY", + "S_PHONE", + "S_ACCTBAL", + "S_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, 
+ "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 101, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + 
{ + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + "O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "right": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "N_NATIONKEY", + "N_NAME", + "N_REGIONKEY", + "N_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 152, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_3", + "parquet": {} + } + ] + } + } + }, + "expression": { + "literal": { + "boolean": true, + "nullable": false, + "typeVariationReference": 0 + } + }, + "type": "JOIN_TYPE_INNER" + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + 
"rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 23 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 25 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "fixedChar": "F", + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 19 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 18 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "subquery": { + "setPredicate": { + "predicateOp": "PREDICATE_OP_EXISTS", + "tuples": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", + 
"L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_4", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + } + ] + } + } + } + } + } + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "subquery": { + "setPredicate": { + "predicateOp": "PREDICATE_OP_EXISTS", + "tuples": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "L_ORDERKEY", + "L_PARTKEY", + "L_SUPPKEY", + "L_LINENUMBER", + "L_QUANTITY", 
+ "L_EXTENDEDPRICE", + "L_DISCOUNT", + "L_TAX", + "L_RETURNFLAG", + "L_LINESTATUS", + "L_SHIPDATE", + "L_COMMITDATE", + "L_RECEIPTDATE", + "L_SHIPINSTRUCT", + "L_SHIPMODE", + "L_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 44, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], 
+ "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_5", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 7 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 2 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 9 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 12 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 11 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + } + ] + } + } + } + } + } + } + } + } + ] + } + } + }, + { + "value": { + 
"scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 3 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 32 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 33 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "cast": { + "type": { + "fixedChar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "input": { + "literal": { + "fixedChar": "SAUDI ARABIA", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [] + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + }, 
+ "direction": "SORT_DIRECTION_DESC_NULLS_FIRST" + }, + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "offset": "0", + "count": "100" + } + }, + "names": [ + "S_NAME", + "NUMWAIT" + ] + } + } + ], + "expectedTypeUrls": [] +} diff --git a/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_22.json b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_22.json new file mode 100644 index 000000000000..9eb37da8e18e --- /dev/null +++ b/datafusion/substrait/tests/testdata/tpch_substrait_plans/query_22.json @@ -0,0 +1,2034 @@ +{ + "extensionUris": [ + { + "extensionUriAnchor": 5, + "uri": "/functions_aggregate_generic.yaml" + }, + { + "extensionUriAnchor": 1, + "uri": "/functions_boolean.yaml" + }, + { + "extensionUriAnchor": 3, + "uri": "/functions_string.yaml" + }, + { + "extensionUriAnchor": 4, + "uri": "/functions_arithmetic_decimal.yaml" + }, + { + "extensionUriAnchor": 2, + "uri": "/functions_comparison.yaml" + } + ], + "extensions": [ + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 0, + "name": "and:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 1, + "name": "or:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 2, + "name": "equal:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 3, + "functionAnchor": 3, + "name": "substring:fchar_i32_i32" + } + }, + { + "extensionFunction": { + "extensionUriReference": 2, + "functionAnchor": 4, + "name": "gt:any1_any1" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 5, + "name": "avg:opt_decimal" + } + }, + { + "extensionFunction": { + "extensionUriReference": 1, + "functionAnchor": 6, + "name": "not:bool" + } + }, + { + "extensionFunction": { + "extensionUriReference": 5, + 
"functionAnchor": 7, + "name": "count:opt" + } + }, + { + "extensionFunction": { + "extensionUriReference": 4, + "functionAnchor": 8, + "name": "sum:opt_decimal" + } + } + ], + "relations": [ + { + "root": { + "input": { + "sort": { + "common": { + "direct": { + } + }, + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 8, + 9 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + "C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_0", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "13", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + 
"literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "31", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "23", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { 
+ "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "29", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "30", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + 
"scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "18", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + 
"typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "17", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "subquery": { + "scalar": { + "input": { + "aggregate": { + "common": { + "direct": { + } + }, + "input": { + "project": { + "common": { + "emit": { + "outputMapping": [ + 8 + ] + } + }, + "input": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "C_CUSTKEY", + "C_NAME", + "C_ADDRESS", + "C_NATIONKEY", + "C_PHONE", + "C_ACCTBAL", + "C_MKTSEGMENT", + "C_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "varchar": { + "length": 25, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 40, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 10, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 117, + "typeVariationReference": 0, + 
"nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_1", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 0, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 4, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "decimal": { + "value": "AAAAAAAAAAAAAAAAAAAAAA==", + "precision": 3, + "scale": 2 + }, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 1, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + 
} + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "13", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "31", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + 
"field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "23", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "29", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + 
"scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "30", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "18", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": 
"FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + }, + { + "value": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + } + }, + { + "value": { + "cast": { + "type": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "input": { + "literal": { + "fixedChar": "17", + "nullable": false, + "typeVariationReference": 0 + } + }, + "failureBehavior": "FAILURE_BEHAVIOR_UNSPECIFIED" + } + } + } + ] + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [] + } + ], + "measures": [ + { + "measure": { + "functionReference": 5, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + } + } + } + } + } + ] + } + } + }, + { + 
"value": { + "scalarFunction": { + "functionReference": 6, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "arguments": [ + { + "value": { + "subquery": { + "setPredicate": { + "predicateOp": "PREDICATE_OP_EXISTS", + "tuples": { + "filter": { + "common": { + "direct": { + } + }, + "input": { + "read": { + "common": { + "direct": { + } + }, + "baseSchema": { + "names": [ + "O_ORDERKEY", + "O_CUSTKEY", + "O_ORDERSTATUS", + "O_TOTALPRICE", + "O_ORDERDATE", + "O_ORDERPRIORITY", + "O_CLERK", + "O_SHIPPRIORITY", + "O_COMMENT" + ], + "struct": { + "types": [ + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + { + "fixedChar": { + "length": 1, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "date": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "fixedChar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "i32": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + { + "varchar": { + "length": 79, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + } + ], + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "local_files": { + "items": [ + { + "uri_file": "file://FILENAME_PLACEHOLDER_2", + "parquet": {} + } + ] + } + } + }, + "condition": { + "scalarFunction": { + "functionReference": 2, + "args": [], + "outputType": { + "bool": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + 
} + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "outerReference": { + "stepsOut": 1 + } + } + } + } + ] + } + } + } + } + } + } + } + } + ] + } + } + } + ] + } + } + } + }, + "expressions": [ + { + "scalarFunction": { + "functionReference": 3, + "args": [], + "outputType": { + "varchar": { + "length": 15, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "arguments": [ + { + "value": { + "selection": { + "directReference": { + "structField": { + "field": 4 + } + }, + "rootReference": { + } + } + } + }, + { + "value": { + "literal": { + "i32": 1, + "nullable": false, + "typeVariationReference": 0 + } + } + }, + { + "value": { + "literal": { + "i32": 2, + "nullable": false, + "typeVariationReference": 0 + } + } + } + ] + } + }, + { + "selection": { + "directReference": { + "structField": { + "field": 5 + } + }, + "rootReference": { + } + } + } + ] + } + }, + "groupings": [ + { + "groupingExpressions": [ + { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + } + ] + } + ], + "measures": [ + { + "measure": { + "functionReference": 7, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "i64": { + "typeVariationReference": 0, + "nullability": "NULLABILITY_REQUIRED" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [] + } + }, + { + "measure": { + "functionReference": 8, + "args": [], + "sorts": [], + "phase": "AGGREGATION_PHASE_INITIAL_TO_RESULT", + "outputType": { + "decimal": { + "scale": 0, + "precision": 19, + "typeVariationReference": 0, + "nullability": "NULLABILITY_NULLABLE" + } + }, + "invocation": "AGGREGATION_INVOCATION_ALL", + "arguments": [ + { + "value": { + "selection": { + "directReference": 
{ + "structField": { + "field": 1 + } + }, + "rootReference": { + } + } + } + } + ] + } + } + ] + } + }, + "sorts": [ + { + "expr": { + "selection": { + "directReference": { + "structField": { + "field": 0 + } + }, + "rootReference": { + } + } + }, + "direction": "SORT_DIRECTION_ASC_NULLS_LAST" + } + ] + } + }, + "names": [ + "CNTRYCODE", + "NUMCUST", + "TOTACCTBAL" + ] + } + } + ], + "expectedTypeUrls": [] +}