/Users/andrewlamb/Software/datafusion/datafusion/expr/src/logical_plan/builder.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! This module provides a builder for creating LogicalPlans |
19 | | |
20 | | use std::any::Any; |
21 | | use std::cmp::Ordering; |
22 | | use std::collections::{HashMap, HashSet}; |
23 | | use std::sync::Arc; |
24 | | |
25 | | use crate::dml::CopyTo; |
26 | | use crate::expr::{Alias, Sort as SortExpr}; |
27 | | use crate::expr_rewriter::{ |
28 | | coerce_plan_expr_for_schema, normalize_col, |
29 | | normalize_col_with_schemas_and_ambiguity_check, normalize_cols, normalize_sorts, |
30 | | rewrite_sort_cols_by_aggs, |
31 | | }; |
32 | | use crate::logical_plan::{ |
33 | | Aggregate, Analyze, CrossJoin, Distinct, DistinctOn, EmptyRelation, Explain, Filter, |
34 | | Join, JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Prepare, |
35 | | Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values, |
36 | | Window, |
37 | | }; |
38 | | use crate::utils::{ |
39 | | can_hash, columnize_expr, compare_sort_expr, expr_to_columns, |
40 | | find_valid_equijoin_key_pair, group_window_expr_by_sort_keys, |
41 | | }; |
42 | | use crate::{ |
43 | | and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery, |
44 | | TableProviderFilterPushDown, TableSource, WriteOp, |
45 | | }; |
46 | | |
47 | | use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef}; |
48 | | use datafusion_common::display::ToStringifiedPlan; |
49 | | use datafusion_common::file_options::file_type::FileType; |
50 | | use datafusion_common::{ |
51 | | get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err, |
52 | | plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue, |
53 | | TableReference, ToDFSchema, UnnestOptions, |
54 | | }; |
55 | | use datafusion_expr_common::type_coercion::binary::type_union_resolution; |
56 | | |
57 | | use super::dml::InsertOp; |
58 | | use super::plan::{ColumnUnnestList, ColumnUnnestType}; |
59 | | |
/// Default table name for unnamed table
///
/// NOTE(review): presumably used as the placeholder relation name when a plan
/// node (e.g. an inline VALUES list) has no explicit name — confirm against
/// callers outside this file.
pub const UNNAMED_TABLE: &str = "?table?";
62 | | |
63 | | /// Builder for logical plans |
64 | | /// |
65 | | /// # Example building a simple plan |
66 | | /// ``` |
67 | | /// # use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan}; |
68 | | /// # use datafusion_common::Result; |
69 | | /// # use arrow::datatypes::{Schema, DataType, Field}; |
70 | | /// # |
71 | | /// # fn main() -> Result<()> { |
72 | | /// # |
73 | | /// # fn employee_schema() -> Schema { |
74 | | /// # Schema::new(vec![ |
75 | | /// # Field::new("id", DataType::Int32, false), |
76 | | /// # Field::new("first_name", DataType::Utf8, false), |
77 | | /// # Field::new("last_name", DataType::Utf8, false), |
78 | | /// # Field::new("state", DataType::Utf8, false), |
79 | | /// # Field::new("salary", DataType::Int32, false), |
80 | | /// # ]) |
81 | | /// # } |
82 | | /// # |
83 | | /// // Create a plan similar to |
84 | | /// // SELECT last_name |
85 | | /// // FROM employees |
86 | | /// // WHERE salary < 1000 |
87 | | /// let plan = table_scan(Some("employee"), &employee_schema(), None)? |
88 | | /// // Keep only rows where salary < 1000 |
89 | | /// .filter(col("salary").lt(lit(1000)))? |
90 | | /// // only show "last_name" in the final results |
91 | | /// .project(vec![col("last_name")])? |
92 | | /// .build()?; |
93 | | /// |
94 | | /// // Convert from plan back to builder |
95 | | /// let builder = LogicalPlanBuilder::from(plan); |
96 | | /// |
97 | | /// # Ok(()) |
98 | | /// # } |
99 | | /// ``` |
#[derive(Debug, Clone)]
pub struct LogicalPlanBuilder {
    // The plan built so far; kept behind an `Arc` so cloning the builder is cheap.
    plan: Arc<LogicalPlan>,
}
104 | | |
105 | | impl LogicalPlanBuilder { |
106 | | /// Create a builder from an existing plan |
107 | 0 | pub fn new(plan: LogicalPlan) -> Self { |
108 | 0 | Self { |
109 | 0 | plan: Arc::new(plan), |
110 | 0 | } |
111 | 0 | } |
112 | | |
113 | | /// Create a builder from an existing plan |
114 | 0 | pub fn new_from_arc(plan: Arc<LogicalPlan>) -> Self { |
115 | 0 | Self { plan } |
116 | 0 | } |
117 | | |
118 | | /// Return the output schema of the plan build so far |
119 | 0 | pub fn schema(&self) -> &DFSchemaRef { |
120 | 0 | self.plan.schema() |
121 | 0 | } |
122 | | |
123 | | /// Return the LogicalPlan of the plan build so far |
124 | 0 | pub fn plan(&self) -> &LogicalPlan { |
125 | 0 | &self.plan |
126 | 0 | } |
127 | | |
128 | | /// Create an empty relation. |
129 | | /// |
130 | | /// `produce_one_row` set to true means this empty node needs to produce a placeholder row. |
131 | 0 | pub fn empty(produce_one_row: bool) -> Self { |
132 | 0 | Self::new(LogicalPlan::EmptyRelation(EmptyRelation { |
133 | 0 | produce_one_row, |
134 | 0 | schema: DFSchemaRef::new(DFSchema::empty()), |
135 | 0 | })) |
136 | 0 | } |
137 | | |
138 | | /// Convert a regular plan into a recursive query. |
139 | | /// `is_distinct` indicates whether the recursive term should be de-duplicated (`UNION`) after each iteration or not (`UNION ALL`). |
140 | 0 | pub fn to_recursive_query( |
141 | 0 | self, |
142 | 0 | name: String, |
143 | 0 | recursive_term: LogicalPlan, |
144 | 0 | is_distinct: bool, |
145 | 0 | ) -> Result<Self> { |
146 | 0 | // TODO: we need to do a bunch of validation here. Maybe more. |
147 | 0 | if is_distinct { |
148 | 0 | return not_impl_err!( |
149 | 0 | "Recursive queries with a distinct 'UNION' (in which the previous iteration's results will be de-duplicated) is not supported" |
150 | 0 | ); |
151 | 0 | } |
152 | 0 | // Ensure that the static term and the recursive term have the same number of fields |
153 | 0 | let static_fields_len = self.plan.schema().fields().len(); |
154 | 0 | let recurive_fields_len = recursive_term.schema().fields().len(); |
155 | 0 | if static_fields_len != recurive_fields_len { |
156 | 0 | return plan_err!( |
157 | 0 | "Non-recursive term and recursive term must have the same number of columns ({} != {})", |
158 | 0 | static_fields_len, recurive_fields_len |
159 | 0 | ); |
160 | 0 | } |
161 | | // Ensure that the recursive term has the same field types as the static term |
162 | 0 | let coerced_recursive_term = |
163 | 0 | coerce_plan_expr_for_schema(recursive_term, self.plan.schema())?; |
164 | 0 | Ok(Self::from(LogicalPlan::RecursiveQuery(RecursiveQuery { |
165 | 0 | name, |
166 | 0 | static_term: self.plan, |
167 | 0 | recursive_term: Arc::new(coerced_recursive_term), |
168 | 0 | is_distinct, |
169 | 0 | }))) |
170 | 0 | } |
171 | | |
172 | | /// Create a values list based relation, and the schema is inferred from data, consuming |
173 | | /// `value`. See the [Postgres VALUES](https://www.postgresql.org/docs/current/queries-values.html) |
174 | | /// documentation for more details. |
175 | | /// |
176 | | /// By default, it assigns the names column1, column2, etc. to the columns of a VALUES table. |
177 | | /// The column names are not specified by the SQL standard and different database systems do it differently, |
178 | | /// so it's usually better to override the default names with a table alias list. |
179 | | /// |
180 | | /// If the values include params/binders such as $1, $2, $3, etc, then the `param_data_types` should be provided. |
181 | 0 | pub fn values(mut values: Vec<Vec<Expr>>) -> Result<Self> { |
182 | 0 | if values.is_empty() { |
183 | 0 | return plan_err!("Values list cannot be empty"); |
184 | 0 | } |
185 | 0 | let n_cols = values[0].len(); |
186 | 0 | if n_cols == 0 { |
187 | 0 | return plan_err!("Values list cannot be zero length"); |
188 | 0 | } |
189 | 0 | for (i, row) in values.iter().enumerate() { |
190 | 0 | if row.len() != n_cols { |
191 | 0 | return plan_err!( |
192 | 0 | "Inconsistent data length across values list: got {} values in row {} but expected {}", |
193 | 0 | row.len(), |
194 | 0 | i, |
195 | 0 | n_cols |
196 | 0 | ); |
197 | 0 | } |
198 | | } |
199 | | |
200 | 0 | let empty_schema = DFSchema::empty(); |
201 | 0 | let mut field_types: Vec<DataType> = Vec::with_capacity(n_cols); |
202 | 0 | for j in 0..n_cols { |
203 | 0 | let mut common_type: Option<DataType> = None; |
204 | 0 | for (i, row) in values.iter().enumerate() { |
205 | 0 | let value = &row[j]; |
206 | 0 | let data_type = value.get_type(&empty_schema)?; |
207 | 0 | if data_type == DataType::Null { |
208 | 0 | continue; |
209 | 0 | } |
210 | 0 | if let Some(prev_type) = common_type { |
211 | | // get common type of each column values. |
212 | 0 | let data_types = vec![prev_type.clone(), data_type.clone()]; |
213 | 0 | let Some(new_type) = type_union_resolution(&data_types) else { |
214 | 0 | return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}"); |
215 | | }; |
216 | 0 | common_type = Some(new_type); |
217 | 0 | } else { |
218 | 0 | common_type = Some(data_type); |
219 | 0 | } |
220 | | } |
221 | | // assuming common_type was not set, and no error, therefore the type should be NULL |
222 | | // since the code loop skips NULL |
223 | 0 | field_types.push(common_type.unwrap_or(DataType::Null)); |
224 | | } |
225 | | // wrap cast if data type is not same as common type. |
226 | 0 | for row in &mut values { |
227 | 0 | for (j, field_type) in field_types.iter().enumerate() { |
228 | 0 | if let Expr::Literal(ScalarValue::Null) = row[j] { |
229 | 0 | row[j] = Expr::Literal(ScalarValue::try_from(field_type)?); |
230 | | } else { |
231 | 0 | row[j] = |
232 | 0 | std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?; |
233 | | } |
234 | | } |
235 | | } |
236 | 0 | let fields = field_types |
237 | 0 | .iter() |
238 | 0 | .enumerate() |
239 | 0 | .map(|(j, data_type)| { |
240 | 0 | // naming is following convention https://www.postgresql.org/docs/current/queries-values.html |
241 | 0 | let name = &format!("column{}", j + 1); |
242 | 0 | Field::new(name, data_type.clone(), true) |
243 | 0 | }) |
244 | 0 | .collect::<Vec<_>>(); |
245 | 0 | let dfschema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new())?; |
246 | 0 | let schema = DFSchemaRef::new(dfschema); |
247 | 0 | Ok(Self::new(LogicalPlan::Values(Values { schema, values }))) |
248 | 0 | } |
249 | | |
250 | | /// Convert a table provider into a builder with a TableScan |
251 | | /// |
252 | | /// Note that if you pass a string as `table_name`, it is treated |
253 | | /// as a SQL identifier, as described on [`TableReference`] and |
254 | | /// thus is normalized |
255 | | /// |
256 | | /// # Example: |
257 | | /// ``` |
258 | | /// # use datafusion_expr::{lit, col, LogicalPlanBuilder, |
259 | | /// # logical_plan::builder::LogicalTableSource, logical_plan::table_scan |
260 | | /// # }; |
261 | | /// # use std::sync::Arc; |
262 | | /// # use arrow::datatypes::{Schema, DataType, Field}; |
263 | | /// # use datafusion_common::TableReference; |
264 | | /// # |
265 | | /// # let employee_schema = Arc::new(Schema::new(vec![ |
266 | | /// # Field::new("id", DataType::Int32, false), |
267 | | /// # ])) as _; |
268 | | /// # let table_source = Arc::new(LogicalTableSource::new(employee_schema)); |
269 | | /// // Scan table_source with the name "mytable" (after normalization) |
270 | | /// # let table = table_source.clone(); |
271 | | /// let scan = LogicalPlanBuilder::scan("MyTable", table, None); |
272 | | /// |
273 | | /// // Scan table_source with the name "MyTable" by enclosing in quotes |
274 | | /// # let table = table_source.clone(); |
275 | | /// let scan = LogicalPlanBuilder::scan(r#""MyTable""#, table, None); |
276 | | /// |
277 | | /// // Scan table_source with the name "MyTable" by forming the table reference |
278 | | /// # let table = table_source.clone(); |
279 | | /// let table_reference = TableReference::bare("MyTable"); |
280 | | /// let scan = LogicalPlanBuilder::scan(table_reference, table, None); |
281 | | /// ``` |
282 | 0 | pub fn scan( |
283 | 0 | table_name: impl Into<TableReference>, |
284 | 0 | table_source: Arc<dyn TableSource>, |
285 | 0 | projection: Option<Vec<usize>>, |
286 | 0 | ) -> Result<Self> { |
287 | 0 | Self::scan_with_filters(table_name, table_source, projection, vec![]) |
288 | 0 | } |
289 | | |
290 | | /// Create a [CopyTo] for copying the contents of this builder to the specified file(s) |
291 | 0 | pub fn copy_to( |
292 | 0 | input: LogicalPlan, |
293 | 0 | output_url: String, |
294 | 0 | file_type: Arc<dyn FileType>, |
295 | 0 | options: HashMap<String, String>, |
296 | 0 | partition_by: Vec<String>, |
297 | 0 | ) -> Result<Self> { |
298 | 0 | Ok(Self::new(LogicalPlan::Copy(CopyTo { |
299 | 0 | input: Arc::new(input), |
300 | 0 | output_url, |
301 | 0 | partition_by, |
302 | 0 | file_type, |
303 | 0 | options, |
304 | 0 | }))) |
305 | 0 | } |
306 | | |
307 | | /// Create a [DmlStatement] for inserting the contents of this builder into the named table |
308 | 0 | pub fn insert_into( |
309 | 0 | input: LogicalPlan, |
310 | 0 | table_name: impl Into<TableReference>, |
311 | 0 | table_schema: &Schema, |
312 | 0 | insert_op: InsertOp, |
313 | 0 | ) -> Result<Self> { |
314 | 0 | let table_schema = table_schema.clone().to_dfschema_ref()?; |
315 | | |
316 | 0 | Ok(Self::new(LogicalPlan::Dml(DmlStatement::new( |
317 | 0 | table_name.into(), |
318 | 0 | table_schema, |
319 | 0 | WriteOp::Insert(insert_op), |
320 | 0 | Arc::new(input), |
321 | 0 | )))) |
322 | 0 | } |
323 | | |
324 | | /// Convert a table provider into a builder with a TableScan |
325 | 0 | pub fn scan_with_filters( |
326 | 0 | table_name: impl Into<TableReference>, |
327 | 0 | table_source: Arc<dyn TableSource>, |
328 | 0 | projection: Option<Vec<usize>>, |
329 | 0 | filters: Vec<Expr>, |
330 | 0 | ) -> Result<Self> { |
331 | 0 | TableScan::try_new(table_name, table_source, projection, filters, None) |
332 | 0 | .map(LogicalPlan::TableScan) |
333 | 0 | .map(Self::new) |
334 | 0 | } |
335 | | |
336 | | /// Convert a table provider into a builder with a TableScan with filter and fetch |
337 | 0 | pub fn scan_with_filters_fetch( |
338 | 0 | table_name: impl Into<TableReference>, |
339 | 0 | table_source: Arc<dyn TableSource>, |
340 | 0 | projection: Option<Vec<usize>>, |
341 | 0 | filters: Vec<Expr>, |
342 | 0 | fetch: Option<usize>, |
343 | 0 | ) -> Result<Self> { |
344 | 0 | TableScan::try_new(table_name, table_source, projection, filters, fetch) |
345 | 0 | .map(LogicalPlan::TableScan) |
346 | 0 | .map(Self::new) |
347 | 0 | } |
348 | | |
349 | | /// Wrap a plan in a window |
350 | 0 | pub fn window_plan( |
351 | 0 | input: LogicalPlan, |
352 | 0 | window_exprs: Vec<Expr>, |
353 | 0 | ) -> Result<LogicalPlan> { |
354 | 0 | let mut plan = input; |
355 | 0 | let mut groups = group_window_expr_by_sort_keys(window_exprs)?; |
356 | | // To align with the behavior of PostgreSQL, we want the sort_keys sorted as same rule as PostgreSQL that first |
357 | | // we compare the sort key themselves and if one window's sort keys are a prefix of another |
358 | | // put the window with more sort keys first. so more deeply sorted plans gets nested further down as children. |
359 | | // The sort_by() implementation here is a stable sort. |
360 | | // Note that by this rule if there's an empty over, it'll be at the top level |
361 | 0 | groups.sort_by(|(key_a, _), (key_b, _)| { |
362 | 0 | for ((first, _), (second, _)) in key_a.iter().zip(key_b.iter()) { |
363 | 0 | let key_ordering = compare_sort_expr(first, second, plan.schema()); |
364 | 0 | match key_ordering { |
365 | | Ordering::Less => { |
366 | 0 | return Ordering::Less; |
367 | | } |
368 | | Ordering::Greater => { |
369 | 0 | return Ordering::Greater; |
370 | | } |
371 | 0 | Ordering::Equal => {} |
372 | | } |
373 | | } |
374 | 0 | key_b.len().cmp(&key_a.len()) |
375 | 0 | }); |
376 | 0 | for (_, exprs) in groups { |
377 | 0 | let window_exprs = exprs.into_iter().collect::<Vec<_>>(); |
378 | | // Partition and sorting is done at physical level, see the EnforceDistribution |
379 | | // and EnforceSorting rules. |
380 | 0 | plan = LogicalPlanBuilder::from(plan) |
381 | 0 | .window(window_exprs)? |
382 | 0 | .build()?; |
383 | | } |
384 | 0 | Ok(plan) |
385 | 0 | } |
386 | | /// Apply a projection without alias. |
387 | 0 | pub fn project( |
388 | 0 | self, |
389 | 0 | expr: impl IntoIterator<Item = impl Into<Expr>>, |
390 | 0 | ) -> Result<Self> { |
391 | 0 | project(Arc::unwrap_or_clone(self.plan), expr).map(Self::new) |
392 | 0 | } |
393 | | |
394 | | /// Select the given column indices |
395 | 0 | pub fn select(self, indices: impl IntoIterator<Item = usize>) -> Result<Self> { |
396 | 0 | let exprs: Vec<_> = indices |
397 | 0 | .into_iter() |
398 | 0 | .map(|x| Expr::Column(Column::from(self.plan.schema().qualified_field(x)))) |
399 | 0 | .collect(); |
400 | 0 | self.project(exprs) |
401 | 0 | } |
402 | | |
403 | | /// Apply a filter |
404 | 0 | pub fn filter(self, expr: impl Into<Expr>) -> Result<Self> { |
405 | 0 | let expr = normalize_col(expr.into(), &self.plan)?; |
406 | 0 | Filter::try_new(expr, self.plan) |
407 | 0 | .map(LogicalPlan::Filter) |
408 | 0 | .map(Self::new) |
409 | 0 | } |
410 | | |
411 | | /// Apply a filter which is used for a having clause |
412 | 0 | pub fn having(self, expr: impl Into<Expr>) -> Result<Self> { |
413 | 0 | let expr = normalize_col(expr.into(), &self.plan)?; |
414 | 0 | Filter::try_new_with_having(expr, self.plan) |
415 | 0 | .map(LogicalPlan::Filter) |
416 | 0 | .map(Self::from) |
417 | 0 | } |
418 | | |
419 | | /// Make a builder for a prepare logical plan from the builder's plan |
420 | 0 | pub fn prepare(self, name: String, data_types: Vec<DataType>) -> Result<Self> { |
421 | 0 | Ok(Self::new(LogicalPlan::Prepare(Prepare { |
422 | 0 | name, |
423 | 0 | data_types, |
424 | 0 | input: self.plan, |
425 | 0 | }))) |
426 | 0 | } |
427 | | |
428 | | /// Limit the number of rows returned |
429 | | /// |
430 | | /// `skip` - Number of rows to skip before fetch any row. |
431 | | /// |
432 | | /// `fetch` - Maximum number of rows to fetch, after skipping `skip` rows, |
433 | | /// if specified. |
434 | 0 | pub fn limit(self, skip: usize, fetch: Option<usize>) -> Result<Self> { |
435 | 0 | Ok(Self::new(LogicalPlan::Limit(Limit { |
436 | 0 | skip, |
437 | 0 | fetch, |
438 | 0 | input: self.plan, |
439 | 0 | }))) |
440 | 0 | } |
441 | | |
442 | | /// Apply an alias |
443 | 0 | pub fn alias(self, alias: impl Into<TableReference>) -> Result<Self> { |
444 | 0 | subquery_alias(Arc::unwrap_or_clone(self.plan), alias).map(Self::new) |
445 | 0 | } |
446 | | |
447 | | /// Add missing sort columns to all downstream projection |
448 | | /// |
449 | | /// Thus, if you have a LogicalPlan that selects A and B and have |
450 | | /// not requested a sort by C, this code will add C recursively to |
451 | | /// all input projections. |
452 | | /// |
453 | | /// Adding a new column is not correct if there is a `Distinct` |
454 | | /// node, which produces only distinct values of its |
455 | | /// inputs. Adding a new column to its input will result in |
456 | | /// potentially different results than with the original column. |
457 | | /// |
458 | | /// For example, if the input is like: |
459 | | /// |
460 | | /// Distinct(A, B) |
461 | | /// |
462 | | /// If the input looks like |
463 | | /// |
464 | | /// a | b | c |
465 | | /// --+---+--- |
466 | | /// 1 | 2 | 3 |
467 | | /// 1 | 2 | 4 |
468 | | /// |
469 | | /// Distinct (A, B) --> (1,2) |
470 | | /// |
471 | | /// But Distinct (A, B, C) --> (1, 2, 3), (1, 2, 4) |
472 | | /// (which will appear as a (1, 2), (1, 2) if a and b are projected |
473 | | /// |
474 | | /// See <https://github.com/apache/datafusion/issues/5065> for more details |
475 | 0 | fn add_missing_columns( |
476 | 0 | curr_plan: LogicalPlan, |
477 | 0 | missing_cols: &[Column], |
478 | 0 | is_distinct: bool, |
479 | 0 | ) -> Result<LogicalPlan> { |
480 | 0 | match curr_plan { |
481 | | LogicalPlan::Projection(Projection { |
482 | 0 | input, |
483 | 0 | mut expr, |
484 | | schema: _, |
485 | 0 | }) if missing_cols.iter().all(|c| input.schema().has_column(c)) => { |
486 | 0 | let mut missing_exprs = missing_cols |
487 | 0 | .iter() |
488 | 0 | .map(|c| normalize_col(Expr::Column(c.clone()), &input)) |
489 | 0 | .collect::<Result<Vec<_>>>()?; |
490 | | |
491 | | // Do not let duplicate columns to be added, some of the |
492 | | // missing_cols may be already present but without the new |
493 | | // projected alias. |
494 | 0 | missing_exprs.retain(|e| !expr.contains(e)); |
495 | 0 | if is_distinct { |
496 | 0 | Self::ambiguous_distinct_check(&missing_exprs, missing_cols, &expr)?; |
497 | 0 | } |
498 | 0 | expr.extend(missing_exprs); |
499 | 0 | project(Arc::unwrap_or_clone(input), expr) |
500 | | } |
501 | | _ => { |
502 | 0 | let is_distinct = |
503 | 0 | is_distinct || matches!(curr_plan, LogicalPlan::Distinct(_)); |
504 | 0 | let new_inputs = curr_plan |
505 | 0 | .inputs() |
506 | 0 | .into_iter() |
507 | 0 | .map(|input_plan| { |
508 | 0 | Self::add_missing_columns( |
509 | 0 | (*input_plan).clone(), |
510 | 0 | missing_cols, |
511 | 0 | is_distinct, |
512 | 0 | ) |
513 | 0 | }) |
514 | 0 | .collect::<Result<Vec<_>>>()?; |
515 | 0 | curr_plan.with_new_exprs(curr_plan.expressions(), new_inputs) |
516 | | } |
517 | | } |
518 | 0 | } |
519 | | |
520 | 0 | fn ambiguous_distinct_check( |
521 | 0 | missing_exprs: &[Expr], |
522 | 0 | missing_cols: &[Column], |
523 | 0 | projection_exprs: &[Expr], |
524 | 0 | ) -> Result<()> { |
525 | 0 | if missing_exprs.is_empty() { |
526 | 0 | return Ok(()); |
527 | 0 | } |
528 | 0 |
|
529 | 0 | // if the missing columns are all only aliases for things in |
530 | 0 | // the existing select list, it is ok |
531 | 0 | // |
532 | 0 | // This handles the special case for |
533 | 0 | // SELECT col as <alias> ORDER BY <alias> |
534 | 0 | // |
535 | 0 | // As described in https://github.com/apache/datafusion/issues/5293 |
536 | 0 | let all_aliases = missing_exprs.iter().all(|e| { |
537 | 0 | projection_exprs.iter().any(|proj_expr| { |
538 | 0 | if let Expr::Alias(Alias { expr, .. }) = proj_expr { |
539 | 0 | e == expr.as_ref() |
540 | | } else { |
541 | 0 | false |
542 | | } |
543 | 0 | }) |
544 | 0 | }); |
545 | 0 | if all_aliases { |
546 | 0 | return Ok(()); |
547 | 0 | } |
548 | 0 |
|
549 | 0 | let missing_col_names = missing_cols |
550 | 0 | .iter() |
551 | 0 | .map(|col| col.flat_name()) |
552 | 0 | .collect::<String>(); |
553 | 0 |
|
554 | 0 | plan_err!("For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list") |
555 | 0 | } |
556 | | |
557 | | /// Apply a sort by provided expressions with default direction |
558 | 0 | pub fn sort_by( |
559 | 0 | self, |
560 | 0 | expr: impl IntoIterator<Item = impl Into<Expr>> + Clone, |
561 | 0 | ) -> Result<Self> { |
562 | 0 | self.sort( |
563 | 0 | expr.into_iter() |
564 | 0 | .map(|e| e.into().sort(true, false)) |
565 | 0 | .collect::<Vec<SortExpr>>(), |
566 | 0 | ) |
567 | 0 | } |
568 | | |
569 | 0 | pub fn sort( |
570 | 0 | self, |
571 | 0 | sorts: impl IntoIterator<Item = impl Into<SortExpr>> + Clone, |
572 | 0 | ) -> Result<Self> { |
573 | 0 | self.sort_with_limit(sorts, None) |
574 | 0 | } |
575 | | |
576 | | /// Apply a sort |
577 | 0 | pub fn sort_with_limit( |
578 | 0 | self, |
579 | 0 | sorts: impl IntoIterator<Item = impl Into<SortExpr>> + Clone, |
580 | 0 | fetch: Option<usize>, |
581 | 0 | ) -> Result<Self> { |
582 | 0 | let sorts = rewrite_sort_cols_by_aggs(sorts, &self.plan)?; |
583 | | |
584 | 0 | let schema = self.plan.schema(); |
585 | 0 |
|
586 | 0 | // Collect sort columns that are missing in the input plan's schema |
587 | 0 | let mut missing_cols: Vec<Column> = vec![]; |
588 | 0 | sorts.iter().try_for_each::<_, Result<()>>(|sort| { |
589 | 0 | let columns = sort.expr.column_refs(); |
590 | 0 |
|
591 | 0 | columns.into_iter().for_each(|c| { |
592 | 0 | if !schema.has_column(c) { |
593 | 0 | missing_cols.push(c.clone()); |
594 | 0 | } |
595 | 0 | }); |
596 | 0 |
|
597 | 0 | Ok(()) |
598 | 0 | })?; |
599 | | |
600 | 0 | if missing_cols.is_empty() { |
601 | | return Ok(Self::new(LogicalPlan::Sort(Sort { |
602 | 0 | expr: normalize_sorts(sorts, &self.plan)?, |
603 | 0 | input: self.plan, |
604 | 0 | fetch, |
605 | | }))); |
606 | 0 | } |
607 | 0 |
|
608 | 0 | // remove pushed down sort columns |
609 | 0 | let new_expr = schema.columns().into_iter().map(Expr::Column).collect(); |
610 | 0 |
|
611 | 0 | let is_distinct = false; |
612 | 0 | let plan = Self::add_missing_columns( |
613 | 0 | Arc::unwrap_or_clone(self.plan), |
614 | 0 | &missing_cols, |
615 | 0 | is_distinct, |
616 | 0 | )?; |
617 | 0 | let sort_plan = LogicalPlan::Sort(Sort { |
618 | 0 | expr: normalize_sorts(sorts, &plan)?, |
619 | 0 | input: Arc::new(plan), |
620 | 0 | fetch, |
621 | 0 | }); |
622 | 0 |
|
623 | 0 | Projection::try_new(new_expr, Arc::new(sort_plan)) |
624 | 0 | .map(LogicalPlan::Projection) |
625 | 0 | .map(Self::new) |
626 | 0 | } |
627 | | |
628 | | /// Apply a union, preserving duplicate rows |
629 | 0 | pub fn union(self, plan: LogicalPlan) -> Result<Self> { |
630 | 0 | union(Arc::unwrap_or_clone(self.plan), plan).map(Self::new) |
631 | 0 | } |
632 | | |
633 | | /// Apply a union, removing duplicate rows |
634 | 0 | pub fn union_distinct(self, plan: LogicalPlan) -> Result<Self> { |
635 | 0 | let left_plan: LogicalPlan = Arc::unwrap_or_clone(self.plan); |
636 | 0 | let right_plan: LogicalPlan = plan; |
637 | 0 |
|
638 | 0 | Ok(Self::new(LogicalPlan::Distinct(Distinct::All(Arc::new( |
639 | 0 | union(left_plan, right_plan)?, |
640 | | ))))) |
641 | 0 | } |
642 | | |
643 | | /// Apply deduplication: Only distinct (different) values are returned) |
644 | 0 | pub fn distinct(self) -> Result<Self> { |
645 | 0 | Ok(Self::new(LogicalPlan::Distinct(Distinct::All(self.plan)))) |
646 | 0 | } |
647 | | |
648 | | /// Project first values of the specified expression list according to the provided |
649 | | /// sorting expressions grouped by the `DISTINCT ON` clause expressions. |
650 | 0 | pub fn distinct_on( |
651 | 0 | self, |
652 | 0 | on_expr: Vec<Expr>, |
653 | 0 | select_expr: Vec<Expr>, |
654 | 0 | sort_expr: Option<Vec<SortExpr>>, |
655 | 0 | ) -> Result<Self> { |
656 | 0 | Ok(Self::new(LogicalPlan::Distinct(Distinct::On( |
657 | 0 | DistinctOn::try_new(on_expr, select_expr, sort_expr, self.plan)?, |
658 | | )))) |
659 | 0 | } |
660 | | |
661 | | /// Apply a join to `right` using explicitly specified columns and an |
662 | | /// optional filter expression. |
663 | | /// |
664 | | /// See [`join_on`](Self::join_on) for a more concise way to specify the |
665 | | /// join condition. Since DataFusion will automatically identify and |
666 | | /// optimize equality predicates there is no performance difference between |
667 | | /// this function and `join_on` |
668 | | /// |
669 | | /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see |
670 | | /// example below), which are then combined with the optional `filter` |
671 | | /// expression. |
672 | | /// |
673 | | /// Note that in case of outer join, the `filter` is applied to only matched rows. |
674 | 0 | pub fn join( |
675 | 0 | self, |
676 | 0 | right: LogicalPlan, |
677 | 0 | join_type: JoinType, |
678 | 0 | join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>), |
679 | 0 | filter: Option<Expr>, |
680 | 0 | ) -> Result<Self> { |
681 | 0 | self.join_detailed(right, join_type, join_keys, filter, false) |
682 | 0 | } |
683 | | |
684 | | /// Apply a join with using the specified expressions. |
685 | | /// |
686 | | /// Note that DataFusion automatically optimizes joins, including |
687 | | /// identifying and optimizing equality predicates. |
688 | | /// |
689 | | /// # Example |
690 | | /// |
691 | | /// ``` |
692 | | /// # use datafusion_expr::{Expr, col, LogicalPlanBuilder, |
693 | | /// # logical_plan::builder::LogicalTableSource, logical_plan::JoinType,}; |
694 | | /// # use std::sync::Arc; |
695 | | /// # use arrow::datatypes::{Schema, DataType, Field}; |
696 | | /// # use datafusion_common::Result; |
697 | | /// # fn main() -> Result<()> { |
698 | | /// let example_schema = Arc::new(Schema::new(vec![ |
699 | | /// Field::new("a", DataType::Int32, false), |
700 | | /// Field::new("b", DataType::Int32, false), |
701 | | /// Field::new("c", DataType::Int32, false), |
702 | | /// ])); |
703 | | /// let table_source = Arc::new(LogicalTableSource::new(example_schema)); |
704 | | /// let left_table = table_source.clone(); |
705 | | /// let right_table = table_source.clone(); |
706 | | /// |
707 | | /// let right_plan = LogicalPlanBuilder::scan("right", right_table, None)?.build()?; |
708 | | /// |
709 | | /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)` |
710 | | /// let exprs = vec![ |
711 | | /// col("left.a").eq(col("right.a")), |
712 | | /// col("left.b").not_eq(col("right.b")) |
713 | | /// ]; |
714 | | /// |
715 | | /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)` |
716 | | /// // finding all pairs of rows from `left` and `right` where |
717 | | /// // where `a = a2` and `b != b2`. |
718 | | /// let plan = LogicalPlanBuilder::scan("left", left_table, None)? |
719 | | /// .join_on(right_plan, JoinType::Inner, exprs)? |
720 | | /// .build()?; |
721 | | /// # Ok(()) |
722 | | /// # } |
723 | | /// ``` |
724 | 0 | pub fn join_on( |
725 | 0 | self, |
726 | 0 | right: LogicalPlan, |
727 | 0 | join_type: JoinType, |
728 | 0 | on_exprs: impl IntoIterator<Item = Expr>, |
729 | 0 | ) -> Result<Self> { |
730 | 0 | let filter = on_exprs.into_iter().reduce(Expr::and); |
731 | 0 |
|
732 | 0 | self.join_detailed( |
733 | 0 | right, |
734 | 0 | join_type, |
735 | 0 | (Vec::<Column>::new(), Vec::<Column>::new()), |
736 | 0 | filter, |
737 | 0 | false, |
738 | 0 | ) |
739 | 0 | } |
740 | | |
741 | 0 | pub(crate) fn normalize( |
742 | 0 | plan: &LogicalPlan, |
743 | 0 | column: impl Into<Column>, |
744 | 0 | ) -> Result<Column> { |
745 | 0 | let schema = plan.schema(); |
746 | 0 | let fallback_schemas = plan.fallback_normalize_schemas(); |
747 | 0 | let using_columns = plan.using_columns()?; |
748 | 0 | column.into().normalize_with_schemas_and_ambiguity_check( |
749 | 0 | &[&[schema], &fallback_schemas], |
750 | 0 | &using_columns, |
751 | 0 | ) |
752 | 0 | } |
753 | | |
    /// Apply a join with on constraint and specified null equality.
    ///
    /// The behavior is the same as [`join`](Self::join) except that it allows
    /// specifying the null equality behavior.
    ///
    /// If `null_equals_null=true`, rows where both join keys are `null` will be
    /// emitted. Otherwise rows where either or both join keys are `null` will be
    /// omitted.
    pub fn join_detailed(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
        filter: Option<Expr>,
        null_equals_null: bool,
    ) -> Result<Self> {
        // Keys are paired positionally, so both sides must have equal length.
        if join_keys.0.len() != join_keys.1.len() {
            return plan_err!("left_keys and right_keys were not the same length");
        }

        // Normalize the optional join filter against the union of both input
        // schemas (the filter may reference columns from either side).
        let filter = if let Some(expr) = filter {
            let filter = normalize_col_with_schemas_and_ambiguity_check(
                expr,
                &[&[self.schema(), right.schema()]],
                &[],
            )?;
            Some(filter)
        } else {
            None
        };

        // Resolve each (l, r) key pair so that the first element refers to the
        // left input and the second to the right input, regardless of the
        // order the caller supplied them in. Resolution depends on whether
        // each key carries a table qualifier (`relation`).
        let (left_keys, right_keys): (Vec<Result<Column>>, Vec<Result<Column>>) =
            join_keys
                .0
                .into_iter()
                .zip(join_keys.1)
                .map(|(l, r)| {
                    let l = l.into();
                    let r = r.into();

                    match (&l.relation, &r.relation) {
                        // Both keys are qualified: probe each against both
                        // schemas to decide whether the pair must be swapped.
                        (Some(lr), Some(rr)) => {
                            let l_is_left =
                                self.plan.schema().field_with_qualified_name(lr, &l.name);
                            let l_is_right =
                                right.schema().field_with_qualified_name(lr, &l.name);
                            let r_is_left =
                                self.plan.schema().field_with_qualified_name(rr, &r.name);
                            let r_is_right =
                                right.schema().field_with_qualified_name(rr, &r.name);

                            match (l_is_left, l_is_right, r_is_left, r_is_right) {
                                // `l` resolved on the right and `r` on the left:
                                // swap so the pair is (left column, right column).
                                (_, Ok(_), Ok(_), _) => (Ok(r), Ok(l)),
                                // Already in (left, right) order.
                                (Ok(_), _, _, Ok(_)) => (Ok(l), Ok(r)),
                                // Ambiguous/unresolved: fall back to full
                                // normalization (which errors on ambiguity).
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        // Only `l` is qualified: use its resolution side to
                        // decide orientation; `r` is normalized on the other.
                        (Some(lr), None) => {
                            let l_is_left =
                                self.plan.schema().field_with_qualified_name(lr, &l.name);
                            let l_is_right =
                                right.schema().field_with_qualified_name(lr, &l.name);

                            match (l_is_left, l_is_right) {
                                (Ok(_), _) => (Ok(l), Self::normalize(&right, r)),
                                (_, Ok(_)) => (Self::normalize(&self.plan, r), Ok(l)),
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        // Only `r` is qualified: symmetric to the case above.
                        (None, Some(rr)) => {
                            let r_is_left =
                                self.plan.schema().field_with_qualified_name(rr, &r.name);
                            let r_is_right =
                                right.schema().field_with_qualified_name(rr, &r.name);

                            match (r_is_left, r_is_right) {
                                (Ok(_), _) => (Ok(r), Self::normalize(&right, l)),
                                (_, Ok(_)) => (Self::normalize(&self.plan, l), Ok(r)),
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        // Neither key is qualified: try `l` on the left first;
                        // if it only resolves on the right, swap the pair.
                        (None, None) => {
                            let mut swap = false;
                            let left_key = Self::normalize(&self.plan, l.clone())
                                .or_else(|_| {
                                    swap = true;
                                    Self::normalize(&right, l)
                                });
                            if swap {
                                (Self::normalize(&self.plan, r), left_key)
                            } else {
                                (left_key, Self::normalize(&right, r))
                            }
                        }
                    }
                })
                .unzip();

        // Surface the first resolution error, if any.
        let left_keys = left_keys.into_iter().collect::<Result<Vec<Column>>>()?;
        let right_keys = right_keys.into_iter().collect::<Result<Vec<Column>>>()?;

        let on = left_keys
            .into_iter()
            .zip(right_keys)
            .map(|(l, r)| (Expr::Column(l), Expr::Column(r)))
            .collect();
        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &join_type)?;

        Ok(Self::new(LogicalPlan::Join(Join {
            left: self.plan,
            right: Arc::new(right),
            on,
            filter,
            join_type,
            join_constraint: JoinConstraint::On,
            schema: DFSchemaRef::new(join_schema),
            null_equals_null,
        })))
    }
883 | | |
    /// Apply a join with using constraint, which duplicates all join columns in output schema.
    pub fn join_using(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        using_keys: Vec<impl Into<Column> + Clone>,
    ) -> Result<Self> {
        // Resolve each USING column once against each input's schema.
        let left_keys: Vec<Column> = using_keys
            .clone()
            .into_iter()
            .map(|c| Self::normalize(&self.plan, c))
            .collect::<Result<_>>()?;
        let right_keys: Vec<Column> = using_keys
            .into_iter()
            .map(|c| Self::normalize(&right, c))
            .collect::<Result<_>>()?;

        let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys).collect();
        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &join_type)?;
        // Hashable key pairs become equijoin keys; anything else is folded
        // into a post-join filter expression instead.
        let mut join_on: Vec<(Expr, Expr)> = vec![];
        let mut filters: Option<Expr> = None;
        for (l, r) in &on {
            if self.plan.schema().has_column(l)
                && right.schema().has_column(r)
                && can_hash(self.plan.schema().field_from_column(l)?.data_type())
            {
                join_on.push((Expr::Column(l.clone()), Expr::Column(r.clone())));
            } else if self.plan.schema().has_column(l)
                && right.schema().has_column(r)
                && can_hash(self.plan.schema().field_from_column(r)?.data_type())
            {
                // NOTE(review): this guard repeats the previous branch's
                // `has_column` checks but looks up `r` (a right-side key) in
                // the LEFT schema; if `r` is absent there, `field_from_column`
                // errors out instead of falling through to the filter branch
                // below. It also pushes the pair swapped as (r, l). Presumably
                // the intent was the mirrored checks (left has `r`, right has
                // `l`) — confirm against callers before changing.
                join_on.push((Expr::Column(r.clone()), Expr::Column(l.clone())));
            } else {
                // Non-hashable key: express the equality as a filter predicate.
                let expr = binary_expr(
                    Expr::Column(l.clone()),
                    Operator::Eq,
                    Expr::Column(r.clone()),
                );
                match filters {
                    None => filters = Some(expr),
                    Some(filter_expr) => filters = Some(and(expr, filter_expr)),
                }
            }
        }

        if join_on.is_empty() {
            // No equijoin keys at all: degrade to a cross join plus a filter
            // carrying the accumulated equality predicates.
            let join = Self::from(self.plan).cross_join(right)?;
            join.filter(filters.ok_or_else(|| {
                DataFusionError::Internal("filters should not be None here".to_string())
            })?)
        } else {
            Ok(Self::new(LogicalPlan::Join(Join {
                left: self.plan,
                right: Arc::new(right),
                on: join_on,
                filter: filters,
                join_type,
                join_constraint: JoinConstraint::Using,
                schema: DFSchemaRef::new(join_schema),
                null_equals_null: false,
            })))
        }
    }
948 | | |
949 | | /// Apply a cross join |
950 | 0 | pub fn cross_join(self, right: LogicalPlan) -> Result<Self> { |
951 | 0 | let join_schema = |
952 | 0 | build_join_schema(self.plan.schema(), right.schema(), &JoinType::Inner)?; |
953 | 0 | Ok(Self::new(LogicalPlan::CrossJoin(CrossJoin { |
954 | 0 | left: self.plan, |
955 | 0 | right: Arc::new(right), |
956 | 0 | schema: DFSchemaRef::new(join_schema), |
957 | 0 | }))) |
958 | 0 | } |
959 | | |
960 | | /// Repartition |
961 | 0 | pub fn repartition(self, partitioning_scheme: Partitioning) -> Result<Self> { |
962 | 0 | Ok(Self::new(LogicalPlan::Repartition(Repartition { |
963 | 0 | input: self.plan, |
964 | 0 | partitioning_scheme, |
965 | 0 | }))) |
966 | 0 | } |
967 | | |
968 | | /// Apply a window functions to extend the schema |
969 | 0 | pub fn window( |
970 | 0 | self, |
971 | 0 | window_expr: impl IntoIterator<Item = impl Into<Expr>>, |
972 | 0 | ) -> Result<Self> { |
973 | 0 | let window_expr = normalize_cols(window_expr, &self.plan)?; |
974 | 0 | validate_unique_names("Windows", &window_expr)?; |
975 | 0 | Ok(Self::new(LogicalPlan::Window(Window::try_new( |
976 | 0 | window_expr, |
977 | 0 | self.plan, |
978 | 0 | )?))) |
979 | 0 | } |
980 | | |
981 | | /// Apply an aggregate: grouping on the `group_expr` expressions |
982 | | /// and calculating `aggr_expr` aggregates for each distinct |
983 | | /// value of the `group_expr`; |
984 | 0 | pub fn aggregate( |
985 | 0 | self, |
986 | 0 | group_expr: impl IntoIterator<Item = impl Into<Expr>>, |
987 | 0 | aggr_expr: impl IntoIterator<Item = impl Into<Expr>>, |
988 | 0 | ) -> Result<Self> { |
989 | 0 | let group_expr = normalize_cols(group_expr, &self.plan)?; |
990 | 0 | let aggr_expr = normalize_cols(aggr_expr, &self.plan)?; |
991 | | |
992 | 0 | let group_expr = |
993 | 0 | add_group_by_exprs_from_dependencies(group_expr, self.plan.schema())?; |
994 | 0 | Aggregate::try_new(self.plan, group_expr, aggr_expr) |
995 | 0 | .map(LogicalPlan::Aggregate) |
996 | 0 | .map(Self::new) |
997 | 0 | } |
998 | | |
999 | | /// Create an expression to represent the explanation of the plan |
1000 | | /// |
1001 | | /// if `analyze` is true, runs the actual plan and produces |
1002 | | /// information about metrics during run. |
1003 | | /// |
1004 | | /// if `verbose` is true, prints out additional details. |
1005 | 0 | pub fn explain(self, verbose: bool, analyze: bool) -> Result<Self> { |
1006 | 0 | let schema = LogicalPlan::explain_schema(); |
1007 | 0 | let schema = schema.to_dfschema_ref()?; |
1008 | | |
1009 | 0 | if analyze { |
1010 | 0 | Ok(Self::new(LogicalPlan::Analyze(Analyze { |
1011 | 0 | verbose, |
1012 | 0 | input: self.plan, |
1013 | 0 | schema, |
1014 | 0 | }))) |
1015 | | } else { |
1016 | 0 | let stringified_plans = |
1017 | 0 | vec![self.plan.to_stringified(PlanType::InitialLogicalPlan)]; |
1018 | 0 |
|
1019 | 0 | Ok(Self::new(LogicalPlan::Explain(Explain { |
1020 | 0 | verbose, |
1021 | 0 | plan: self.plan, |
1022 | 0 | stringified_plans, |
1023 | 0 | schema, |
1024 | 0 | logical_optimization_succeeded: false, |
1025 | 0 | }))) |
1026 | | } |
1027 | 0 | } |
1028 | | |
1029 | | /// Process intersect set operator |
1030 | 0 | pub fn intersect( |
1031 | 0 | left_plan: LogicalPlan, |
1032 | 0 | right_plan: LogicalPlan, |
1033 | 0 | is_all: bool, |
1034 | 0 | ) -> Result<LogicalPlan> { |
1035 | 0 | LogicalPlanBuilder::intersect_or_except( |
1036 | 0 | left_plan, |
1037 | 0 | right_plan, |
1038 | 0 | JoinType::LeftSemi, |
1039 | 0 | is_all, |
1040 | 0 | ) |
1041 | 0 | } |
1042 | | |
1043 | | /// Process except set operator |
1044 | 0 | pub fn except( |
1045 | 0 | left_plan: LogicalPlan, |
1046 | 0 | right_plan: LogicalPlan, |
1047 | 0 | is_all: bool, |
1048 | 0 | ) -> Result<LogicalPlan> { |
1049 | 0 | LogicalPlanBuilder::intersect_or_except( |
1050 | 0 | left_plan, |
1051 | 0 | right_plan, |
1052 | 0 | JoinType::LeftAnti, |
1053 | 0 | is_all, |
1054 | 0 | ) |
1055 | 0 | } |
1056 | | |
1057 | | /// Process intersect or except |
1058 | 0 | fn intersect_or_except( |
1059 | 0 | left_plan: LogicalPlan, |
1060 | 0 | right_plan: LogicalPlan, |
1061 | 0 | join_type: JoinType, |
1062 | 0 | is_all: bool, |
1063 | 0 | ) -> Result<LogicalPlan> { |
1064 | 0 | let left_len = left_plan.schema().fields().len(); |
1065 | 0 | let right_len = right_plan.schema().fields().len(); |
1066 | 0 |
|
1067 | 0 | if left_len != right_len { |
1068 | 0 | return plan_err!( |
1069 | 0 | "INTERSECT/EXCEPT query must have the same number of columns. Left is {left_len} and right is {right_len}." |
1070 | 0 | ); |
1071 | 0 | } |
1072 | 0 |
|
1073 | 0 | let join_keys = left_plan |
1074 | 0 | .schema() |
1075 | 0 | .fields() |
1076 | 0 | .iter() |
1077 | 0 | .zip(right_plan.schema().fields().iter()) |
1078 | 0 | .map(|(left_field, right_field)| { |
1079 | 0 | ( |
1080 | 0 | (Column::from_name(left_field.name())), |
1081 | 0 | (Column::from_name(right_field.name())), |
1082 | 0 | ) |
1083 | 0 | }) |
1084 | 0 | .unzip(); |
1085 | 0 | if is_all { |
1086 | 0 | LogicalPlanBuilder::from(left_plan) |
1087 | 0 | .join_detailed(right_plan, join_type, join_keys, None, true)? |
1088 | 0 | .build() |
1089 | | } else { |
1090 | 0 | LogicalPlanBuilder::from(left_plan) |
1091 | 0 | .distinct()? |
1092 | 0 | .join_detailed(right_plan, join_type, join_keys, None, true)? |
1093 | 0 | .build() |
1094 | | } |
1095 | 0 | } |
1096 | | |
1097 | | /// Build the plan |
1098 | 0 | pub fn build(self) -> Result<LogicalPlan> { |
1099 | 0 | Ok(Arc::unwrap_or_clone(self.plan)) |
1100 | 0 | } |
1101 | | |
1102 | | /// Apply a join with the expression on constraint. |
1103 | | /// |
1104 | | /// equi_exprs are "equijoin" predicates expressions on the existing and right inputs, respectively. |
1105 | | /// |
1106 | | /// filter: any other filter expression to apply during the join. equi_exprs predicates are likely |
1107 | | /// to be evaluated more quickly than the filter expressions |
1108 | 0 | pub fn join_with_expr_keys( |
1109 | 0 | self, |
1110 | 0 | right: LogicalPlan, |
1111 | 0 | join_type: JoinType, |
1112 | 0 | equi_exprs: (Vec<impl Into<Expr>>, Vec<impl Into<Expr>>), |
1113 | 0 | filter: Option<Expr>, |
1114 | 0 | ) -> Result<Self> { |
1115 | 0 | if equi_exprs.0.len() != equi_exprs.1.len() { |
1116 | 0 | return plan_err!("left_keys and right_keys were not the same length"); |
1117 | 0 | } |
1118 | | |
1119 | 0 | let join_key_pairs = equi_exprs |
1120 | 0 | .0 |
1121 | 0 | .into_iter() |
1122 | 0 | .zip(equi_exprs.1.into_iter()) |
1123 | 0 | .map(|(l, r)| { |
1124 | 0 | let left_key = l.into(); |
1125 | 0 | let right_key = r.into(); |
1126 | 0 |
|
1127 | 0 | let mut left_using_columns = HashSet::new(); |
1128 | 0 | expr_to_columns(&left_key, &mut left_using_columns)?; |
1129 | 0 | let normalized_left_key = normalize_col_with_schemas_and_ambiguity_check( |
1130 | 0 | left_key, |
1131 | 0 | &[&[self.plan.schema(), right.schema()]], |
1132 | 0 | &[left_using_columns], |
1133 | 0 | )?; |
1134 | | |
1135 | 0 | let mut right_using_columns = HashSet::new(); |
1136 | 0 | expr_to_columns(&right_key, &mut right_using_columns)?; |
1137 | 0 | let normalized_right_key = normalize_col_with_schemas_and_ambiguity_check( |
1138 | 0 | right_key, |
1139 | 0 | &[&[self.plan.schema(), right.schema()]], |
1140 | 0 | &[right_using_columns], |
1141 | 0 | )?; |
1142 | | |
1143 | | // find valid equijoin |
1144 | 0 | find_valid_equijoin_key_pair( |
1145 | 0 | &normalized_left_key, |
1146 | 0 | &normalized_right_key, |
1147 | 0 | self.plan.schema(), |
1148 | 0 | right.schema(), |
1149 | 0 | )?.ok_or_else(|| |
1150 | 0 | plan_datafusion_err!( |
1151 | 0 | "can't create join plan, join key should belong to one input, error key: ({normalized_left_key},{normalized_right_key})" |
1152 | 0 | )) |
1153 | 0 | }) |
1154 | 0 | .collect::<Result<Vec<_>>>()?; |
1155 | | |
1156 | 0 | let join_schema = |
1157 | 0 | build_join_schema(self.plan.schema(), right.schema(), &join_type)?; |
1158 | | |
1159 | 0 | Ok(Self::new(LogicalPlan::Join(Join { |
1160 | 0 | left: self.plan, |
1161 | 0 | right: Arc::new(right), |
1162 | 0 | on: join_key_pairs, |
1163 | 0 | filter, |
1164 | 0 | join_type, |
1165 | 0 | join_constraint: JoinConstraint::On, |
1166 | 0 | schema: DFSchemaRef::new(join_schema), |
1167 | 0 | null_equals_null: false, |
1168 | 0 | }))) |
1169 | 0 | } |
1170 | | |
1171 | | /// Unnest the given column. |
1172 | 0 | pub fn unnest_column(self, column: impl Into<Column>) -> Result<Self> { |
1173 | 0 | unnest(Arc::unwrap_or_clone(self.plan), vec![column.into()]).map(Self::new) |
1174 | 0 | } |
1175 | | |
1176 | | /// Unnest the given column given [`UnnestOptions`] |
1177 | 0 | pub fn unnest_column_with_options( |
1178 | 0 | self, |
1179 | 0 | column: impl Into<Column>, |
1180 | 0 | options: UnnestOptions, |
1181 | 0 | ) -> Result<Self> { |
1182 | 0 | unnest_with_options( |
1183 | 0 | Arc::unwrap_or_clone(self.plan), |
1184 | 0 | vec![(column.into(), ColumnUnnestType::Inferred)], |
1185 | 0 | options, |
1186 | 0 | ) |
1187 | 0 | .map(Self::new) |
1188 | 0 | } |
1189 | | |
1190 | | /// Unnest the given columns with the given [`UnnestOptions`] |
1191 | 0 | pub fn unnest_columns_with_options( |
1192 | 0 | self, |
1193 | 0 | columns: Vec<Column>, |
1194 | 0 | options: UnnestOptions, |
1195 | 0 | ) -> Result<Self> { |
1196 | 0 | unnest_with_options( |
1197 | 0 | Arc::unwrap_or_clone(self.plan), |
1198 | 0 | columns |
1199 | 0 | .into_iter() |
1200 | 0 | .map(|c| (c, ColumnUnnestType::Inferred)) |
1201 | 0 | .collect(), |
1202 | 0 | options, |
1203 | 0 | ) |
1204 | 0 | .map(Self::new) |
1205 | 0 | } |
1206 | | |
1207 | | /// Unnest the given columns with the given [`UnnestOptions`] |
1208 | | /// if one column is a list type, it can be recursively and simultaneously |
1209 | | /// unnested into the desired recursion levels |
1210 | | /// e.g select unnest(list_col,depth=1), unnest(list_col,depth=2) |
1211 | 0 | pub fn unnest_columns_recursive_with_options( |
1212 | 0 | self, |
1213 | 0 | columns: Vec<(Column, ColumnUnnestType)>, |
1214 | 0 | options: UnnestOptions, |
1215 | 0 | ) -> Result<Self> { |
1216 | 0 | unnest_with_options(Arc::unwrap_or_clone(self.plan), columns, options) |
1217 | 0 | .map(Self::new) |
1218 | 0 | } |
1219 | | } |
1220 | | |
1221 | | impl From<LogicalPlan> for LogicalPlanBuilder { |
1222 | 0 | fn from(plan: LogicalPlan) -> Self { |
1223 | 0 | LogicalPlanBuilder::new(plan) |
1224 | 0 | } |
1225 | | } |
1226 | | |
1227 | | impl From<Arc<LogicalPlan>> for LogicalPlanBuilder { |
1228 | 0 | fn from(plan: Arc<LogicalPlan>) -> Self { |
1229 | 0 | LogicalPlanBuilder::new_from_arc(plan) |
1230 | 0 | } |
1231 | | } |
1232 | | |
1233 | 0 | pub fn change_redundant_column(fields: &Fields) -> Vec<Field> { |
1234 | 0 | let mut name_map = HashMap::new(); |
1235 | 0 | fields |
1236 | 0 | .into_iter() |
1237 | 0 | .map(|field| { |
1238 | 0 | let counter = name_map.entry(field.name().to_string()).or_insert(0); |
1239 | 0 | *counter += 1; |
1240 | 0 | if *counter > 1 { |
1241 | 0 | let new_name = format!("{}:{}", field.name(), *counter - 1); |
1242 | 0 | Field::new(new_name, field.data_type().clone(), field.is_nullable()) |
1243 | | } else { |
1244 | 0 | field.as_ref().clone() |
1245 | | } |
1246 | 0 | }) |
1247 | 0 | .collect() |
1248 | 0 | } |
1249 | | /// Creates a schema for a join operation. |
1250 | | /// The fields from the left side are first |
1251 | 0 | pub fn build_join_schema( |
1252 | 0 | left: &DFSchema, |
1253 | 0 | right: &DFSchema, |
1254 | 0 | join_type: &JoinType, |
1255 | 0 | ) -> Result<DFSchema> { |
1256 | 0 | fn nullify_fields<'a>( |
1257 | 0 | fields: impl Iterator<Item = (Option<&'a TableReference>, &'a Arc<Field>)>, |
1258 | 0 | ) -> Vec<(Option<TableReference>, Arc<Field>)> { |
1259 | 0 | fields |
1260 | 0 | .map(|(q, f)| { |
1261 | 0 | // TODO: find a good way to do that |
1262 | 0 | let field = f.as_ref().clone().with_nullable(true); |
1263 | 0 | (q.cloned(), Arc::new(field)) |
1264 | 0 | }) |
1265 | 0 | .collect() |
1266 | 0 | } |
1267 | | |
1268 | 0 | let right_fields = right.iter(); |
1269 | 0 | let left_fields = left.iter(); |
1270 | | |
1271 | 0 | let qualified_fields: Vec<(Option<TableReference>, Arc<Field>)> = match join_type { |
1272 | | JoinType::Inner => { |
1273 | | // left then right |
1274 | 0 | let left_fields = left_fields |
1275 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1276 | 0 | .collect::<Vec<_>>(); |
1277 | 0 | let right_fields = right_fields |
1278 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1279 | 0 | .collect::<Vec<_>>(); |
1280 | 0 | left_fields.into_iter().chain(right_fields).collect() |
1281 | | } |
1282 | | JoinType::Left => { |
1283 | | // left then right, right set to nullable in case of not matched scenario |
1284 | 0 | let left_fields = left_fields |
1285 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1286 | 0 | .collect::<Vec<_>>(); |
1287 | 0 | left_fields |
1288 | 0 | .into_iter() |
1289 | 0 | .chain(nullify_fields(right_fields)) |
1290 | 0 | .collect() |
1291 | | } |
1292 | | JoinType::Right => { |
1293 | | // left then right, left set to nullable in case of not matched scenario |
1294 | 0 | let right_fields = right_fields |
1295 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1296 | 0 | .collect::<Vec<_>>(); |
1297 | 0 | nullify_fields(left_fields) |
1298 | 0 | .into_iter() |
1299 | 0 | .chain(right_fields) |
1300 | 0 | .collect() |
1301 | | } |
1302 | | JoinType::Full => { |
1303 | | // left then right, all set to nullable in case of not matched scenario |
1304 | 0 | nullify_fields(left_fields) |
1305 | 0 | .into_iter() |
1306 | 0 | .chain(nullify_fields(right_fields)) |
1307 | 0 | .collect() |
1308 | | } |
1309 | | JoinType::LeftSemi | JoinType::LeftAnti => { |
1310 | | // Only use the left side for the schema |
1311 | 0 | left_fields |
1312 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1313 | 0 | .collect() |
1314 | | } |
1315 | | JoinType::RightSemi | JoinType::RightAnti => { |
1316 | | // Only use the right side for the schema |
1317 | 0 | right_fields |
1318 | 0 | .map(|(q, f)| (q.cloned(), Arc::clone(f))) |
1319 | 0 | .collect() |
1320 | | } |
1321 | | }; |
1322 | 0 | let func_dependencies = left.functional_dependencies().join( |
1323 | 0 | right.functional_dependencies(), |
1324 | 0 | join_type, |
1325 | 0 | left.fields().len(), |
1326 | 0 | ); |
1327 | 0 | let mut metadata = left.metadata().clone(); |
1328 | 0 | metadata.extend(right.metadata().clone()); |
1329 | 0 | let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?; |
1330 | 0 | dfschema.with_functional_dependencies(func_dependencies) |
1331 | 0 | } |
1332 | | |
1333 | | /// Add additional "synthetic" group by expressions based on functional |
1334 | | /// dependencies. |
1335 | | /// |
1336 | | /// For example, if we are grouping on `[c1]`, and we know from |
1337 | | /// functional dependencies that column `c1` determines `c2`, this function |
1338 | | /// adds `c2` to the group by list. |
1339 | | /// |
1340 | | /// This allows MySQL style selects like |
1341 | | /// `SELECT col FROM t WHERE pk = 5` if col is unique |
1342 | 0 | pub fn add_group_by_exprs_from_dependencies( |
1343 | 0 | mut group_expr: Vec<Expr>, |
1344 | 0 | schema: &DFSchemaRef, |
1345 | 0 | ) -> Result<Vec<Expr>> { |
1346 | 0 | // Names of the fields produced by the GROUP BY exprs for example, `GROUP BY |
1347 | 0 | // c1 + 1` produces an output field named `"c1 + 1"` |
1348 | 0 | let mut group_by_field_names = group_expr |
1349 | 0 | .iter() |
1350 | 0 | .map(|e| e.schema_name().to_string()) |
1351 | 0 | .collect::<Vec<_>>(); |
1352 | | |
1353 | 0 | if let Some(target_indices) = |
1354 | 0 | get_target_functional_dependencies(schema, &group_by_field_names) |
1355 | | { |
1356 | 0 | for idx in target_indices { |
1357 | 0 | let expr = Expr::Column(Column::from(schema.qualified_field(idx))); |
1358 | 0 | let expr_name = expr.schema_name().to_string(); |
1359 | 0 | if !group_by_field_names.contains(&expr_name) { |
1360 | 0 | group_by_field_names.push(expr_name); |
1361 | 0 | group_expr.push(expr); |
1362 | 0 | } |
1363 | | } |
1364 | 0 | } |
1365 | 0 | Ok(group_expr) |
1366 | 0 | } |
1367 | | /// Errors if one or more expressions have equal names. |
1368 | 0 | pub fn validate_unique_names<'a>( |
1369 | 0 | node_name: &str, |
1370 | 0 | expressions: impl IntoIterator<Item = &'a Expr>, |
1371 | 0 | ) -> Result<()> { |
1372 | 0 | let mut unique_names = HashMap::new(); |
1373 | 0 |
|
1374 | 0 | expressions.into_iter().enumerate().try_for_each(|(position, expr)| { |
1375 | 0 | let name = expr.schema_name().to_string(); |
1376 | 0 | match unique_names.get(&name) { |
1377 | | None => { |
1378 | 0 | unique_names.insert(name, (position, expr)); |
1379 | 0 | Ok(()) |
1380 | | }, |
1381 | 0 | Some((existing_position, existing_expr)) => { |
1382 | 0 | plan_err!("{node_name} require unique expression names \ |
1383 | 0 | but the expression \"{existing_expr}\" at position {existing_position} and \"{expr}\" \ |
1384 | 0 | at position {position} have the same name. Consider aliasing (\"AS\") one of them." |
1385 | 0 | ) |
1386 | | } |
1387 | | } |
1388 | 0 | }) |
1389 | 0 | } |
1390 | | |
1391 | | /// Union two [`LogicalPlan`]s. |
1392 | | /// |
1393 | | /// Constructs the UNION plan, but does not perform type-coercion. Therefore the |
1394 | | /// subtree expressions will not be properly typed until the optimizer pass. |
1395 | | /// |
1396 | | /// If a properly typed UNION plan is needed, refer to [`TypeCoercionRewriter::coerce_union`] |
1397 | | /// or alternatively, merge the union input schema using [`coerce_union_schema`] and |
1398 | | /// apply the expression rewrite with [`coerce_plan_expr_for_schema`]. |
1399 | | /// |
1400 | | /// [`TypeCoercionRewriter::coerce_union`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/struct.TypeCoercionRewriter.html#method.coerce_union |
1401 | | /// [`coerce_union_schema`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/fn.coerce_union_schema.html |
1402 | 0 | pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalPlan> { |
1403 | 0 | // Temporarily use the schema from the left input and later rely on the analyzer to |
1404 | 0 | // coerce the two schemas into a common one. |
1405 | 0 | let schema = Arc::clone(left_plan.schema()); |
1406 | 0 | Ok(LogicalPlan::Union(Union { |
1407 | 0 | inputs: vec![Arc::new(left_plan), Arc::new(right_plan)], |
1408 | 0 | schema, |
1409 | 0 | })) |
1410 | 0 | } |
1411 | | |
1412 | | /// Create Projection |
1413 | | /// # Errors |
1414 | | /// This function errors under any of the following conditions: |
1415 | | /// * Two or more expressions have the same name |
1416 | | /// * An invalid expression is used (e.g. a `sort` expression) |
1417 | 0 | pub fn project( |
1418 | 0 | plan: LogicalPlan, |
1419 | 0 | expr: impl IntoIterator<Item = impl Into<Expr>>, |
1420 | 0 | ) -> Result<LogicalPlan> { |
1421 | 0 | let mut projected_expr = vec![]; |
1422 | 0 | for e in expr { |
1423 | 0 | let e = e.into(); |
1424 | 0 | match e { |
1425 | 0 | Expr::Wildcard { .. } => projected_expr.push(e), |
1426 | 0 | _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?), |
1427 | | } |
1428 | | } |
1429 | 0 | validate_unique_names("Projections", projected_expr.iter())?; |
1430 | | |
1431 | 0 | Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection) |
1432 | 0 | } |
1433 | | |
1434 | | /// Create a SubqueryAlias to wrap a LogicalPlan. |
1435 | 0 | pub fn subquery_alias( |
1436 | 0 | plan: LogicalPlan, |
1437 | 0 | alias: impl Into<TableReference>, |
1438 | 0 | ) -> Result<LogicalPlan> { |
1439 | 0 | SubqueryAlias::try_new(Arc::new(plan), alias).map(LogicalPlan::SubqueryAlias) |
1440 | 0 | } |
1441 | | |
1442 | | /// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema. |
1443 | | /// This is mostly used for testing and documentation. |
1444 | 0 | pub fn table_scan( |
1445 | 0 | name: Option<impl Into<TableReference>>, |
1446 | 0 | table_schema: &Schema, |
1447 | 0 | projection: Option<Vec<usize>>, |
1448 | 0 | ) -> Result<LogicalPlanBuilder> { |
1449 | 0 | table_scan_with_filters(name, table_schema, projection, vec![]) |
1450 | 0 | } |
1451 | | |
1452 | | /// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema, |
1453 | | /// and inlined filters. |
1454 | | /// This is mostly used for testing and documentation. |
1455 | 0 | pub fn table_scan_with_filters( |
1456 | 0 | name: Option<impl Into<TableReference>>, |
1457 | 0 | table_schema: &Schema, |
1458 | 0 | projection: Option<Vec<usize>>, |
1459 | 0 | filters: Vec<Expr>, |
1460 | 0 | ) -> Result<LogicalPlanBuilder> { |
1461 | 0 | let table_source = table_source(table_schema); |
1462 | 0 | let name = name |
1463 | 0 | .map(|n| n.into()) |
1464 | 0 | .unwrap_or_else(|| TableReference::bare(UNNAMED_TABLE)); |
1465 | 0 | LogicalPlanBuilder::scan_with_filters(name, table_source, projection, filters) |
1466 | 0 | } |
1467 | | |
1468 | | /// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema, |
1469 | | /// filters, and inlined fetch. |
1470 | | /// This is mostly used for testing and documentation. |
1471 | 0 | pub fn table_scan_with_filter_and_fetch( |
1472 | 0 | name: Option<impl Into<TableReference>>, |
1473 | 0 | table_schema: &Schema, |
1474 | 0 | projection: Option<Vec<usize>>, |
1475 | 0 | filters: Vec<Expr>, |
1476 | 0 | fetch: Option<usize>, |
1477 | 0 | ) -> Result<LogicalPlanBuilder> { |
1478 | 0 | let table_source = table_source(table_schema); |
1479 | 0 | let name = name |
1480 | 0 | .map(|n| n.into()) |
1481 | 0 | .unwrap_or_else(|| TableReference::bare(UNNAMED_TABLE)); |
1482 | 0 | LogicalPlanBuilder::scan_with_filters_fetch( |
1483 | 0 | name, |
1484 | 0 | table_source, |
1485 | 0 | projection, |
1486 | 0 | filters, |
1487 | 0 | fetch, |
1488 | 0 | ) |
1489 | 0 | } |
1490 | | |
1491 | 0 | fn table_source(table_schema: &Schema) -> Arc<dyn TableSource> { |
1492 | 0 | let table_schema = Arc::new(table_schema.clone()); |
1493 | 0 | Arc::new(LogicalTableSource { table_schema }) |
1494 | 0 | } |
1495 | | |
/// Wrap the input in a projection, if any of the join keys is a non-column
/// expression.
///
/// Returns `(plan, join_on, need_project)`: the (possibly wrapped) plan, the
/// join keys rewritten as plain columns, and whether a projection was added.
pub fn wrap_projection_for_join_if_necessary(
    join_keys: &[Expr],
    input: LogicalPlan,
) -> Result<(LogicalPlan, Vec<Column>, bool)> {
    let input_schema = input.schema();
    let alias_join_keys: Vec<Expr> = join_keys
        .iter()
        .map(|key| {
            // The display_name() of cast expression will ignore the cast info, and show the inner expression name.
            // If we do not add an alias, it will throw a same-field-name error in the schema when adding the projection.
            // For example:
            // input scan : [a, b, c],
            // join keys: [cast(a as int)]
            //
            // then a and cast(a as int) will use the same field name - `a` in projection schema.
            // https://github.com/apache/datafusion/issues/4478
            if matches!(key, Expr::Cast(_)) || matches!(key, Expr::TryCast(_)) {
                let alias = format!("{key}");
                key.clone().alias(alias)
            } else {
                key.clone()
            }
        })
        .collect::<Vec<_>>();

    // A projection is only needed when at least one key is not already a
    // plain column reference.
    let need_project = join_keys.iter().any(|key| !matches!(key, Expr::Column(_)));
    let plan = if need_project {
        // Include all columns from the input and extend them with the join keys
        let mut projection = input_schema
            .columns()
            .into_iter()
            .map(Expr::Column)
            .collect::<Vec<_>>();
        // Deduplicate the non-column keys before appending them.
        // NOTE(review): HashSet iteration order is nondeterministic, so the
        // relative order of the appended key expressions can vary from run to
        // run — confirm downstream code does not depend on their position.
        let join_key_items = alias_join_keys
            .iter()
            .flat_map(|expr| expr.try_as_col().is_none().then_some(expr))
            .cloned()
            .collect::<HashSet<Expr>>();
        projection.extend(join_key_items);

        LogicalPlanBuilder::from(input)
            .project(projection)?
            .build()?
    } else {
        input
    };

    // Rewrite every key as a column: either the column it already was, or a
    // reference to the aliased projection output created above.
    let join_on = alias_join_keys
        .into_iter()
        .map(|key| {
            if let Some(col) = key.try_as_col() {
                Ok(col.clone())
            } else {
                let name = key.schema_name().to_string();
                Ok(Column::from_name(name))
            }
        })
        .collect::<Result<Vec<_>>>()?;

    Ok((plan, join_on, need_project))
}
1558 | | |
/// Basic TableSource implementation intended for use in tests and documentation. It is expected
/// that users will provide their own TableSource implementations or use DataFusion's
/// DefaultTableSource.
pub struct LogicalTableSource {
    // The Arrow schema reported by `TableSource::schema`.
    table_schema: SchemaRef,
}
1565 | | |
1566 | | impl LogicalTableSource { |
1567 | | /// Create a new LogicalTableSource |
1568 | 0 | pub fn new(table_schema: SchemaRef) -> Self { |
1569 | 0 | Self { table_schema } |
1570 | 0 | } |
1571 | | } |
1572 | | |
1573 | | impl TableSource for LogicalTableSource { |
1574 | 0 | fn as_any(&self) -> &dyn Any { |
1575 | 0 | self |
1576 | 0 | } |
1577 | | |
1578 | 0 | fn schema(&self) -> SchemaRef { |
1579 | 0 | Arc::clone(&self.table_schema) |
1580 | 0 | } |
1581 | | |
1582 | 0 | fn supports_filters_pushdown( |
1583 | 0 | &self, |
1584 | 0 | filters: &[&Expr], |
1585 | 0 | ) -> Result<Vec<crate::TableProviderFilterPushDown>> { |
1586 | 0 | Ok(vec![TableProviderFilterPushDown::Exact; filters.len()]) |
1587 | 0 | } |
1588 | | } |
1589 | | |
1590 | | /// Create a [`LogicalPlan::Unnest`] plan |
1591 | 0 | pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> { |
1592 | 0 | let unnestings = columns |
1593 | 0 | .into_iter() |
1594 | 0 | .map(|c| (c, ColumnUnnestType::Inferred)) |
1595 | 0 | .collect(); |
1596 | 0 | unnest_with_options(input, unnestings, UnnestOptions::default()) |
1597 | 0 | } |
1598 | | |
1599 | 0 | pub fn get_unnested_list_datatype_recursive( |
1600 | 0 | data_type: &DataType, |
1601 | 0 | depth: usize, |
1602 | 0 | ) -> Result<DataType> { |
1603 | 0 | match data_type { |
1604 | 0 | DataType::List(field) |
1605 | 0 | | DataType::FixedSizeList(field, _) |
1606 | 0 | | DataType::LargeList(field) => { |
1607 | 0 | if depth == 1 { |
1608 | 0 | return Ok(field.data_type().clone()); |
1609 | 0 | } |
1610 | 0 | return get_unnested_list_datatype_recursive(field.data_type(), depth - 1); |
1611 | | } |
1612 | 0 | _ => {} |
1613 | 0 | }; |
1614 | 0 |
|
1615 | 0 | internal_err!("trying to unnest on invalid data type {:?}", data_type) |
1616 | 0 | } |
1617 | | |
1618 | | /// Infer the unnest type based on the data type: |
1619 | | /// - list type: infer to unnest(list(col, depth=1)) |
1620 | | /// - struct type: infer to unnest(struct) |
1621 | 0 | fn infer_unnest_type( |
1622 | 0 | col_name: &String, |
1623 | 0 | data_type: &DataType, |
1624 | 0 | ) -> Result<ColumnUnnestType> { |
1625 | 0 | match data_type { |
1626 | | DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => { |
1627 | 0 | Ok(ColumnUnnestType::List(vec![ColumnUnnestList { |
1628 | 0 | output_column: Column::from_name(col_name), |
1629 | 0 | depth: 1, |
1630 | 0 | }])) |
1631 | | } |
1632 | 0 | DataType::Struct(_) => Ok(ColumnUnnestType::Struct), |
1633 | | _ => { |
1634 | 0 | internal_err!("trying to unnest on invalid data type {:?}", data_type) |
1635 | | } |
1636 | | } |
1637 | 0 | } |
1638 | | |
1639 | 0 | pub fn get_struct_unnested_columns( |
1640 | 0 | col_name: &String, |
1641 | 0 | inner_fields: &Fields, |
1642 | 0 | ) -> Vec<Column> { |
1643 | 0 | inner_fields |
1644 | 0 | .iter() |
1645 | 0 | .map(|f| Column::from_name(format!("{}.{}", col_name, f.name()))) |
1646 | 0 | .collect() |
1647 | 0 | } |
1648 | | |
1649 | | // Based on data type, either struct or a variant of list |
1650 | | // return a set of columns as the result of unnesting |
1651 | | // the input columns. |
1652 | | // For example, given a column with name "a", |
1653 | | // - List(Element) returns ["a"] with data type Element |
1654 | | // - Struct(field1, field2) returns ["a.field1","a.field2"] |
1655 | | // For list data type, an argument depth is used to specify |
1656 | | // the recursion level |
1657 | 0 | pub fn get_unnested_columns( |
1658 | 0 | col_name: &String, |
1659 | 0 | data_type: &DataType, |
1660 | 0 | depth: usize, |
1661 | 0 | ) -> Result<Vec<(Column, Arc<Field>)>> { |
1662 | 0 | let mut qualified_columns = Vec::with_capacity(1); |
1663 | 0 |
|
1664 | 0 | match data_type { |
1665 | | DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => { |
1666 | 0 | let data_type = get_unnested_list_datatype_recursive(data_type, depth)?; |
1667 | 0 | let new_field = Arc::new(Field::new( |
1668 | 0 | col_name, data_type, |
1669 | 0 | // Unnesting may produce NULLs even if the list is not null. |
1670 | 0 | // For example: unnset([1], []) -> 1, null |
1671 | 0 | true, |
1672 | 0 | )); |
1673 | 0 | let column = Column::from_name(col_name); |
1674 | 0 | // let column = Column::from((None, &new_field)); |
1675 | 0 | qualified_columns.push((column, new_field)); |
1676 | | } |
1677 | 0 | DataType::Struct(fields) => { |
1678 | 0 | qualified_columns.extend(fields.iter().map(|f| { |
1679 | 0 | let new_name = format!("{}.{}", col_name, f.name()); |
1680 | 0 | let column = Column::from_name(&new_name); |
1681 | 0 | let new_field = f.as_ref().clone().with_name(new_name); |
1682 | 0 | // let column = Column::from((None, &f)); |
1683 | 0 | (column, Arc::new(new_field)) |
1684 | 0 | })) |
1685 | | } |
1686 | | _ => { |
1687 | 0 | return internal_err!( |
1688 | 0 | "trying to unnest on invalid data type {:?}", |
1689 | 0 | data_type |
1690 | 0 | ); |
1691 | | } |
1692 | | }; |
1693 | 0 | Ok(qualified_columns) |
1694 | 0 | } |
1695 | | |
/// Create a [`LogicalPlan::Unnest`] plan with options
/// This function receive a list of columns to be unnested
/// because multiple unnest can be performed on the same column (e.g unnest with different depth)
/// The new schema will contains post-unnest fields replacing the original field
///
/// For example:
/// Input schema as
/// ```text
/// +---------------------+-------------------+
/// | col1                | col2              |
/// +---------------------+-------------------+
/// | Struct(INT64,INT32) | List(List(Int64)) |
/// +---------------------+-------------------+
/// ```
///
///
///
/// Then unnesting columns with:
/// - (col1,Struct)
/// - (col2,List(\[depth=1,depth=2\]))
///
/// will generate a new schema as
/// ```text
/// +---------+---------+---------------------+---------------------+
/// | col1.c0 | col1.c1 | unnest_col2_depth_1 | unnest_col2_depth_2 |
/// +---------+---------+---------------------+---------------------+
/// | Int64   | Int32   | List(Int64)         | Int64               |
/// +---------+---------+---------------------+---------------------+
/// ```
///
/// # Errors
/// Fails if a requested column is not present in the input schema, if an
/// unnest is requested on a non-list/non-struct column, or if the resulting
/// schema cannot be constructed.
pub fn unnest_with_options(
    input: LogicalPlan,
    columns_to_unnest: Vec<(Column, ColumnUnnestType)>,
    options: UnnestOptions,
) -> Result<LogicalPlan> {
    // Collected while walking the schema below:
    // list_columns:   (input index, unnest spec) for every list unnesting
    // struct_columns: input indices of struct columns being flattened
    let mut list_columns: Vec<(usize, ColumnUnnestList)> = vec![];
    let mut struct_columns = vec![];
    // Map input-schema index -> requested (column, unnest type); fails if a
    // requested column does not exist in the input schema.
    let indices_to_unnest = columns_to_unnest
        .iter()
        .map(|col_unnesting| {
            Ok((
                input.schema().index_of_column(&col_unnesting.0)?,
                col_unnesting,
            ))
        })
        .collect::<Result<HashMap<usize, &(Column, ColumnUnnestType)>>>()?;

    let input_schema = input.schema();

    // For each output column, the index of the input column it derives from.
    let mut dependency_indices = vec![];
    // Transform input schema into new schema
    // Given this comprehensive example
    //
    // input schema:
    // 1.col1_unnest_placeholder: list[list[int]],
    // 2.col1: list[list[int]]
    // 3.col2: list[int]
    // with unnest on unnest(col1,depth=2), unnest(col1,depth=1) and unnest(col2,depth=1)
    // output schema:
    // 1.unnest_col1_depth_2: int
    // 2.unnest_col1_depth_1: list[int]
    // 3.col1: list[list[int]]
    // 4.unnest_col2_depth_1: int
    // Meaning the placeholder column will be replaced by its unnested variation(s), note
    // the plural.
    let fields = input_schema
        .iter()
        .enumerate()
        .map(|(index, (original_qualifier, original_field))| {
            match indices_to_unnest.get(&index) {
                Some((column_to_unnest, unnest_type)) => {
                    // `Inferred` is resolved here from the column's data type
                    // (list -> single-depth list unnest, struct -> struct unnest).
                    let mut inferred_unnest_type = unnest_type.clone();
                    if let ColumnUnnestType::Inferred = unnest_type {
                        inferred_unnest_type = infer_unnest_type(
                            &column_to_unnest.name,
                            original_field.data_type(),
                        )?;
                    }
                    let transformed_columns: Vec<(Column, Arc<Field>)> =
                        match inferred_unnest_type {
                            ColumnUnnestType::Struct => {
                                struct_columns.push(index);
                                get_unnested_columns(
                                    &column_to_unnest.name,
                                    original_field.data_type(),
                                    1,
                                )?
                            }
                            ColumnUnnestType::List(unnest_lists) => {
                                list_columns.extend(
                                    unnest_lists
                                        .iter()
                                        .map(|ul| (index, ul.to_owned().clone())),
                                );
                                // One output column per requested depth on this
                                // same input column.
                                unnest_lists
                                    .iter()
                                    .map(
                                        |ColumnUnnestList {
                                             output_column,
                                             depth,
                                         }| {
                                            get_unnested_columns(
                                                &output_column.name,
                                                original_field.data_type(),
                                                *depth,
                                            )
                                        },
                                    )
                                    .collect::<Result<Vec<Vec<(Column, Arc<Field>)>>>>()?
                                    .into_iter()
                                    .flatten()
                                    .collect::<Vec<_>>()
                            }
                            // `Inferred` was resolved above, so it cannot reach here.
                            _ => return internal_err!("Invalid unnest type"),
                        };
                    // new columns dependent on the same original index
                    dependency_indices
                        .extend(std::iter::repeat(index).take(transformed_columns.len()));
                    // NOTE: the second tuple element here is the field, despite
                    // the closure parameter being named `data_type`.
                    Ok(transformed_columns
                        .iter()
                        .map(|(col, data_type)| {
                            (col.relation.to_owned(), data_type.to_owned())
                        })
                        .collect())
                }
                None => {
                    // Column not being unnested: carried through unchanged.
                    dependency_indices.push(index);
                    Ok(vec![(
                        original_qualifier.cloned(),
                        Arc::clone(original_field),
                    )])
                }
            }
        })
        .collect::<Result<Vec<_>>>()?
        .into_iter()
        .flatten()
        .collect::<Vec<_>>();

    let metadata = input_schema.metadata().clone();
    let df_schema = DFSchema::new_with_metadata(fields, metadata)?;
    // We can use the existing functional dependencies:
    let deps = input_schema.functional_dependencies().clone();
    let schema = Arc::new(df_schema.with_functional_dependencies(deps)?);

    Ok(LogicalPlan::Unnest(Unnest {
        input: Arc::new(input),
        exec_columns: columns_to_unnest,
        list_type_columns: list_columns,
        struct_type_columns: struct_columns,
        dependency_indices,
        schema,
        options,
    }))
}
1850 | | |
1851 | | #[cfg(test)] |
1852 | | mod tests { |
1853 | | |
1854 | | use super::*; |
1855 | | use crate::logical_plan::StringifiedPlan; |
1856 | | use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery}; |
1857 | | |
1858 | | use datafusion_common::SchemaError; |
1859 | | |
1860 | | #[test] |
1861 | | fn plan_builder_simple() -> Result<()> { |
1862 | | let plan = |
1863 | | table_scan(Some("employee_csv"), &employee_schema(), Some(vec![0, 3]))? |
1864 | | .filter(col("state").eq(lit("CO")))? |
1865 | | .project(vec![col("id")])? |
1866 | | .build()?; |
1867 | | |
1868 | | let expected = "Projection: employee_csv.id\ |
1869 | | \n Filter: employee_csv.state = Utf8(\"CO\")\ |
1870 | | \n TableScan: employee_csv projection=[id, state]"; |
1871 | | |
1872 | | assert_eq!(expected, format!("{plan}")); |
1873 | | |
1874 | | Ok(()) |
1875 | | } |
1876 | | |
    #[test]
    fn plan_builder_schema() {
        let schema = employee_schema();
        let projection = None;
        let plan =
            LogicalPlanBuilder::scan("employee_csv", table_source(&schema), projection)
                .unwrap();
        // The scan's schema should be the input schema qualified with the table name.
        let expected = DFSchema::try_from_qualified_schema(
            TableReference::bare("employee_csv"),
            &schema,
        )
        .unwrap();
        assert_eq!(&expected, plan.schema().as_ref());

        // Note scan of "EMPLOYEE_CSV" is treated as a SQL identifier
        // (and thus normalized to "employee_csv") as well
        let projection = None;
        let plan =
            LogicalPlanBuilder::scan("EMPLOYEE_CSV", table_source(&schema), projection)
                .unwrap();
        assert_eq!(&expected, plan.schema().as_ref());
    }
1899 | | |
    #[test]
    fn plan_builder_empty_name() {
        let schema = employee_schema();
        let projection = None;
        // An empty table name must be rejected when the scan is created.
        let err =
            LogicalPlanBuilder::scan("", table_source(&schema), projection).unwrap_err();
        assert_eq!(
            err.strip_backtrace(),
            "Error during planning: table_name cannot be empty"
        );
    }
1911 | | |
1912 | | #[test] |
1913 | | fn plan_builder_sort() -> Result<()> { |
1914 | | let plan = |
1915 | | table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))? |
1916 | | .sort(vec![ |
1917 | | expr::Sort::new(col("state"), true, true), |
1918 | | expr::Sort::new(col("salary"), false, false), |
1919 | | ])? |
1920 | | .build()?; |
1921 | | |
1922 | | let expected = "Sort: employee_csv.state ASC NULLS FIRST, employee_csv.salary DESC NULLS LAST\ |
1923 | | \n TableScan: employee_csv projection=[state, salary]"; |
1924 | | |
1925 | | assert_eq!(expected, format!("{plan}")); |
1926 | | |
1927 | | Ok(()) |
1928 | | } |
1929 | | |
1930 | | #[test] |
1931 | | fn plan_builder_union() -> Result<()> { |
1932 | | let plan = |
1933 | | table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?; |
1934 | | |
1935 | | let plan = plan |
1936 | | .clone() |
1937 | | .union(plan.clone().build()?)? |
1938 | | .union(plan.clone().build()?)? |
1939 | | .union(plan.build()?)? |
1940 | | .build()?; |
1941 | | |
1942 | | let expected = "Union\ |
1943 | | \n Union\ |
1944 | | \n Union\ |
1945 | | \n TableScan: employee_csv projection=[state, salary]\ |
1946 | | \n TableScan: employee_csv projection=[state, salary]\ |
1947 | | \n TableScan: employee_csv projection=[state, salary]\ |
1948 | | \n TableScan: employee_csv projection=[state, salary]"; |
1949 | | |
1950 | | assert_eq!(expected, format!("{plan}")); |
1951 | | |
1952 | | Ok(()) |
1953 | | } |
1954 | | |
1955 | | #[test] |
1956 | | fn plan_builder_union_distinct() -> Result<()> { |
1957 | | let plan = |
1958 | | table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?; |
1959 | | |
1960 | | let plan = plan |
1961 | | .clone() |
1962 | | .union_distinct(plan.clone().build()?)? |
1963 | | .union_distinct(plan.clone().build()?)? |
1964 | | .union_distinct(plan.build()?)? |
1965 | | .build()?; |
1966 | | |
1967 | | let expected = "\ |
1968 | | Distinct:\ |
1969 | | \n Union\ |
1970 | | \n Distinct:\ |
1971 | | \n Union\ |
1972 | | \n Distinct:\ |
1973 | | \n Union\ |
1974 | | \n TableScan: employee_csv projection=[state, salary]\ |
1975 | | \n TableScan: employee_csv projection=[state, salary]\ |
1976 | | \n TableScan: employee_csv projection=[state, salary]\ |
1977 | | \n TableScan: employee_csv projection=[state, salary]"; |
1978 | | |
1979 | | assert_eq!(expected, format!("{plan}")); |
1980 | | |
1981 | | Ok(()) |
1982 | | } |
1983 | | |
1984 | | #[test] |
1985 | | fn plan_builder_simple_distinct() -> Result<()> { |
1986 | | let plan = |
1987 | | table_scan(Some("employee_csv"), &employee_schema(), Some(vec![0, 3]))? |
1988 | | .filter(col("state").eq(lit("CO")))? |
1989 | | .project(vec![col("id")])? |
1990 | | .distinct()? |
1991 | | .build()?; |
1992 | | |
1993 | | let expected = "\ |
1994 | | Distinct:\ |
1995 | | \n Projection: employee_csv.id\ |
1996 | | \n Filter: employee_csv.state = Utf8(\"CO\")\ |
1997 | | \n TableScan: employee_csv projection=[id, state]"; |
1998 | | |
1999 | | assert_eq!(expected, format!("{plan}")); |
2000 | | |
2001 | | Ok(()) |
2002 | | } |
2003 | | |
2004 | | #[test] |
2005 | | fn exists_subquery() -> Result<()> { |
2006 | | let foo = test_table_scan_with_name("foo")?; |
2007 | | let bar = test_table_scan_with_name("bar")?; |
2008 | | |
2009 | | let subquery = LogicalPlanBuilder::from(foo) |
2010 | | .project(vec![col("a")])? |
2011 | | .filter(col("a").eq(col("bar.a")))? |
2012 | | .build()?; |
2013 | | |
2014 | | let outer_query = LogicalPlanBuilder::from(bar) |
2015 | | .project(vec![col("a")])? |
2016 | | .filter(exists(Arc::new(subquery)))? |
2017 | | .build()?; |
2018 | | |
2019 | | let expected = "Filter: EXISTS (<subquery>)\ |
2020 | | \n Subquery:\ |
2021 | | \n Filter: foo.a = bar.a\ |
2022 | | \n Projection: foo.a\ |
2023 | | \n TableScan: foo\ |
2024 | | \n Projection: bar.a\ |
2025 | | \n TableScan: bar"; |
2026 | | assert_eq!(expected, format!("{outer_query}")); |
2027 | | |
2028 | | Ok(()) |
2029 | | } |
2030 | | |
2031 | | #[test] |
2032 | | fn filter_in_subquery() -> Result<()> { |
2033 | | let foo = test_table_scan_with_name("foo")?; |
2034 | | let bar = test_table_scan_with_name("bar")?; |
2035 | | |
2036 | | let subquery = LogicalPlanBuilder::from(foo) |
2037 | | .project(vec![col("a")])? |
2038 | | .filter(col("a").eq(col("bar.a")))? |
2039 | | .build()?; |
2040 | | |
2041 | | // SELECT a FROM bar WHERE a IN (SELECT a FROM foo WHERE a = bar.a) |
2042 | | let outer_query = LogicalPlanBuilder::from(bar) |
2043 | | .project(vec![col("a")])? |
2044 | | .filter(in_subquery(col("a"), Arc::new(subquery)))? |
2045 | | .build()?; |
2046 | | |
2047 | | let expected = "Filter: bar.a IN (<subquery>)\ |
2048 | | \n Subquery:\ |
2049 | | \n Filter: foo.a = bar.a\ |
2050 | | \n Projection: foo.a\ |
2051 | | \n TableScan: foo\ |
2052 | | \n Projection: bar.a\ |
2053 | | \n TableScan: bar"; |
2054 | | assert_eq!(expected, format!("{outer_query}")); |
2055 | | |
2056 | | Ok(()) |
2057 | | } |
2058 | | |
2059 | | #[test] |
2060 | | fn select_scalar_subquery() -> Result<()> { |
2061 | | let foo = test_table_scan_with_name("foo")?; |
2062 | | let bar = test_table_scan_with_name("bar")?; |
2063 | | |
2064 | | let subquery = LogicalPlanBuilder::from(foo) |
2065 | | .project(vec![col("b")])? |
2066 | | .filter(col("a").eq(col("bar.a")))? |
2067 | | .build()?; |
2068 | | |
2069 | | // SELECT (SELECT a FROM foo WHERE a = bar.a) FROM bar |
2070 | | let outer_query = LogicalPlanBuilder::from(bar) |
2071 | | .project(vec![scalar_subquery(Arc::new(subquery))])? |
2072 | | .build()?; |
2073 | | |
2074 | | let expected = "Projection: (<subquery>)\ |
2075 | | \n Subquery:\ |
2076 | | \n Filter: foo.a = bar.a\ |
2077 | | \n Projection: foo.b\ |
2078 | | \n TableScan: foo\ |
2079 | | \n TableScan: bar"; |
2080 | | assert_eq!(expected, format!("{outer_query}")); |
2081 | | |
2082 | | Ok(()) |
2083 | | } |
2084 | | |
    #[test]
    fn projection_non_unique_names() -> Result<()> {
        let plan = table_scan(
            Some("employee_csv"),
            &employee_schema(),
            // project id and first_name by column index
            Some(vec![0, 1]),
        )?
        // two columns with the same name => error
        .project(vec![col("id"), col("first_name").alias("id")]);

        // Expect an ambiguous-reference schema error naming both the
        // duplicated column ("id") and its qualifier ("employee_csv").
        match plan {
            Err(DataFusionError::SchemaError(
                SchemaError::AmbiguousReference {
                    field:
                        Column {
                            relation: Some(TableReference::Bare { table }),
                            name,
                        },
                },
                _,
            )) => {
                assert_eq!(*"employee_csv", *table);
                assert_eq!("id", &name);
                Ok(())
            }
            _ => plan_err!("Plan should have returned an DataFusionError::SchemaError"),
        }
    }
2114 | | |
    // Five-column schema shared by the plan-builder tests.
    fn employee_schema() -> Schema {
        Schema::new(vec![
            Field::new("id", DataType::Int32, false),
            Field::new("first_name", DataType::Utf8, false),
            Field::new("last_name", DataType::Utf8, false),
            Field::new("state", DataType::Utf8, false),
            Field::new("salary", DataType::Int32, false),
        ])
    }
2124 | | |
    #[test]
    fn stringified_plan() {
        // Initial/intermediate plans are shown only in verbose EXPLAIN;
        // final plans are shown in both verbose and non-verbose mode.
        let stringified_plan =
            StringifiedPlan::new(PlanType::InitialLogicalPlan, "...the plan...");
        assert!(stringified_plan.should_display(true));
        assert!(!stringified_plan.should_display(false)); // not in non verbose mode

        let stringified_plan =
            StringifiedPlan::new(PlanType::FinalLogicalPlan, "...the plan...");
        assert!(stringified_plan.should_display(true));
        assert!(stringified_plan.should_display(false)); // display in non verbose mode too

        let stringified_plan =
            StringifiedPlan::new(PlanType::InitialPhysicalPlan, "...the plan...");
        assert!(stringified_plan.should_display(true));
        assert!(!stringified_plan.should_display(false)); // not in non verbose mode

        let stringified_plan =
            StringifiedPlan::new(PlanType::FinalPhysicalPlan, "...the plan...");
        assert!(stringified_plan.should_display(true));
        assert!(stringified_plan.should_display(false)); // display in non verbose mode

        let stringified_plan = StringifiedPlan::new(
            PlanType::OptimizedLogicalPlan {
                optimizer_name: "random opt pass".into(),
            },
            "...the plan...",
        );
        assert!(stringified_plan.should_display(true));
        assert!(!stringified_plan.should_display(false));
    }
2156 | | |
    // Build a simple scan of a three-column (a, b, c: UInt32) table with the given name.
    fn test_table_scan_with_name(name: &str) -> Result<LogicalPlan> {
        let schema = Schema::new(vec![
            Field::new("a", DataType::UInt32, false),
            Field::new("b", DataType::UInt32, false),
            Field::new("c", DataType::UInt32, false),
        ]);
        table_scan(Some(name), &schema, None)?.build()
    }
2165 | | |
    #[test]
    fn plan_builder_intersect_different_num_columns_error() -> Result<()> {
        // INTERSECT requires both inputs to expose the same number of columns.
        let plan1 =
            table_scan(TableReference::none(), &employee_schema(), Some(vec![3]))?;
        let plan2 =
            table_scan(TableReference::none(), &employee_schema(), Some(vec![3, 4]))?;

        let expected = "Error during planning: INTERSECT/EXCEPT query must have the same number of columns. \
        Left is 1 and right is 2.";
        let err_msg1 =
            LogicalPlanBuilder::intersect(plan1.build()?, plan2.build()?, true)
                .unwrap_err();

        assert_eq!(err_msg1.strip_backtrace(), expected);

        Ok(())
    }
2183 | | |
2184 | | #[test] |
2185 | | fn plan_builder_unnest() -> Result<()> { |
2186 | | // Cannot unnest on a scalar column |
2187 | | let err = nested_table_scan("test_table")? |
2188 | | .unnest_column("scalar") |
2189 | | .unwrap_err(); |
2190 | | assert!(err |
2191 | | .to_string() |
2192 | | .starts_with("Internal error: trying to unnest on invalid data type UInt32")); |
2193 | | |
2194 | | // Unnesting the strings list. |
2195 | | let plan = nested_table_scan("test_table")? |
2196 | | .unnest_column("strings")? |
2197 | | .build()?; |
2198 | | |
2199 | | let expected = "\ |
2200 | | Unnest: lists[test_table.strings|depth=1] structs[]\ |
2201 | | \n TableScan: test_table"; |
2202 | | assert_eq!(expected, format!("{plan}")); |
2203 | | |
2204 | | // Check unnested field is a scalar |
2205 | | let field = plan.schema().field_with_name(None, "strings").unwrap(); |
2206 | | assert_eq!(&DataType::Utf8, field.data_type()); |
2207 | | |
2208 | | // Unnesting the singular struct column result into 2 new columns for each subfield |
2209 | | let plan = nested_table_scan("test_table")? |
2210 | | .unnest_column("struct_singular")? |
2211 | | .build()?; |
2212 | | |
2213 | | let expected = "\ |
2214 | | Unnest: lists[] structs[test_table.struct_singular]\ |
2215 | | \n TableScan: test_table"; |
2216 | | assert_eq!(expected, format!("{plan}")); |
2217 | | |
2218 | | for field_name in &["a", "b"] { |
2219 | | // Check unnested struct field is a scalar |
2220 | | let field = plan |
2221 | | .schema() |
2222 | | .field_with_name(None, &format!("struct_singular.{}", field_name)) |
2223 | | .unwrap(); |
2224 | | assert_eq!(&DataType::UInt32, field.data_type()); |
2225 | | } |
2226 | | |
2227 | | // Unnesting multiple fields in separate plans |
2228 | | let plan = nested_table_scan("test_table")? |
2229 | | .unnest_column("strings")? |
2230 | | .unnest_column("structs")? |
2231 | | .unnest_column("struct_singular")? |
2232 | | .build()?; |
2233 | | |
2234 | | let expected = "\ |
2235 | | Unnest: lists[] structs[test_table.struct_singular]\ |
2236 | | \n Unnest: lists[test_table.structs|depth=1] structs[]\ |
2237 | | \n Unnest: lists[test_table.strings|depth=1] structs[]\ |
2238 | | \n TableScan: test_table"; |
2239 | | assert_eq!(expected, format!("{plan}")); |
2240 | | |
2241 | | // Check unnested struct list field should be a struct. |
2242 | | let field = plan.schema().field_with_name(None, "structs").unwrap(); |
2243 | | assert!(matches!(field.data_type(), DataType::Struct(_))); |
2244 | | |
2245 | | // Unnesting multiple fields at the same time, using infer syntax |
2246 | | let cols = vec!["strings", "structs", "struct_singular"] |
2247 | | .into_iter() |
2248 | | .map(|c| c.into()) |
2249 | | .collect(); |
2250 | | |
2251 | | let plan = nested_table_scan("test_table")? |
2252 | | .unnest_columns_with_options(cols, UnnestOptions::default())? |
2253 | | .build()?; |
2254 | | |
2255 | | let expected = "\ |
2256 | | Unnest: lists[test_table.strings|depth=1, test_table.structs|depth=1] structs[test_table.struct_singular]\ |
2257 | | \n TableScan: test_table"; |
2258 | | assert_eq!(expected, format!("{plan}")); |
2259 | | |
2260 | | // Unnesting missing column should fail. |
2261 | | let plan = nested_table_scan("test_table")?.unnest_column("missing"); |
2262 | | assert!(plan.is_err()); |
2263 | | |
2264 | | // Simultaneously unnesting a list (with different depth) and a struct column |
2265 | | let plan = nested_table_scan("test_table")? |
2266 | | .unnest_columns_recursive_with_options( |
2267 | | vec![ |
2268 | | ( |
2269 | | "stringss".into(), |
2270 | | ColumnUnnestType::List(vec![ |
2271 | | ColumnUnnestList { |
2272 | | output_column: Column::from_name("stringss_depth_1"), |
2273 | | depth: 1, |
2274 | | }, |
2275 | | ColumnUnnestList { |
2276 | | output_column: Column::from_name("stringss_depth_2"), |
2277 | | depth: 2, |
2278 | | }, |
2279 | | ]), |
2280 | | ), |
2281 | | ("struct_singular".into(), ColumnUnnestType::Inferred), |
2282 | | ], |
2283 | | UnnestOptions::default(), |
2284 | | )? |
2285 | | .build()?; |
2286 | | |
2287 | | let expected = "\ |
2288 | | Unnest: lists[test_table.stringss|depth=1, test_table.stringss|depth=2] structs[test_table.struct_singular]\ |
2289 | | \n TableScan: test_table"; |
2290 | | assert_eq!(expected, format!("{plan}")); |
2291 | | |
2292 | | // Check output columns has correct type |
2293 | | let field = plan |
2294 | | .schema() |
2295 | | .field_with_name(None, "stringss_depth_1") |
2296 | | .unwrap(); |
2297 | | assert_eq!( |
2298 | | &DataType::new_list(DataType::Utf8, false), |
2299 | | field.data_type() |
2300 | | ); |
2301 | | let field = plan |
2302 | | .schema() |
2303 | | .field_with_name(None, "stringss_depth_2") |
2304 | | .unwrap(); |
2305 | | assert_eq!(&DataType::Utf8, field.data_type()); |
2306 | | // unnesting struct is still correct |
2307 | | for field_name in &["a", "b"] { |
2308 | | let field = plan |
2309 | | .schema() |
2310 | | .field_with_name(None, &format!("struct_singular.{}", field_name)) |
2311 | | .unwrap(); |
2312 | | assert_eq!(&DataType::UInt32, field.data_type()); |
2313 | | } |
2314 | | |
2315 | | Ok(()) |
2316 | | } |
2317 | | |
    fn nested_table_scan(table_name: &str) -> Result<LogicalPlanBuilder> {
        // Create a schema with a scalar field, a list of strings, a list of structs
        // and a singular struct
        let struct_field_in_list = Field::new_struct(
            "item",
            vec![
                Field::new("a", DataType::UInt32, false),
                Field::new("b", DataType::UInt32, false),
            ],
            false,
        );
        let string_field = Field::new("item", DataType::Utf8, false);
        let strings_field = Field::new_list("item", string_field.clone(), false);
        let schema = Schema::new(vec![
            Field::new("scalar", DataType::UInt32, false),
            Field::new_list("strings", string_field, false),
            Field::new_list("structs", struct_field_in_list, false),
            Field::new(
                "struct_singular",
                DataType::Struct(Fields::from(vec![
                    Field::new("a", DataType::UInt32, false),
                    Field::new("b", DataType::UInt32, false),
                ])),
                false,
            ),
            // "stringss" is a doubly-nested list (List(List(Utf8))),
            // exercised by the recursive-unnest tests.
            Field::new_list("stringss", strings_field, false),
        ]);

        table_scan(Some(table_name), &schema, None)
    }
2348 | | |
    #[test]
    fn test_union_after_join() -> Result<()> {
        let values = vec![vec![lit(1)]];

        let left = LogicalPlanBuilder::values(values.clone())?
            .alias("left")?
            .build()?;
        let right = LogicalPlanBuilder::values(values)?
            .alias("right")?
            .build()?;

        let join = LogicalPlanBuilder::from(left).cross_join(right)?.build()?;

        // Regression check: unioning a cross-join with itself must build
        // without error.
        let _ = LogicalPlanBuilder::from(join.clone())
            .union(join)?
            .build()?;

        Ok(())
    }
2368 | | |
    #[test]
    fn test_change_redundant_column() -> Result<()> {
        let t1_field_1 = Field::new("a", DataType::Int32, false);
        let t2_field_1 = Field::new("a", DataType::Int32, false);
        let t2_field_3 = Field::new("a", DataType::Int32, false);
        let t1_field_2 = Field::new("b", DataType::Int32, false);
        let t2_field_2 = Field::new("b", DataType::Int32, false);

        let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3];
        let remove_redundant = change_redundant_column(&Fields::from(field_vec));

        // Duplicate names get a ":<n>" suffix, numbered in order of appearance.
        assert_eq!(
            remove_redundant,
            vec![
                Field::new("a", DataType::Int32, false),
                Field::new("a:1", DataType::Int32, false),
                Field::new("b", DataType::Int32, false),
                Field::new("b:1", DataType::Int32, false),
                Field::new("a:2", DataType::Int32, false),
            ]
        );
        Ok(())
    }
2392 | | |
    #[test]
    fn plan_builder_from_logical_plan() -> Result<()> {
        let plan =
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?
                .sort(vec![
                    expr::Sort::new(col("state"), true, true),
                    expr::Sort::new(col("salary"), false, false),
                ])?
                .build()?;

        // Round trip: converting an Arc'd plan into a builder should wrap
        // the identical plan.
        let plan_expected = format!("{plan}");
        let plan_builder: LogicalPlanBuilder = Arc::new(plan).into();
        assert_eq!(plan_expected, format!("{}", plan_builder.plan));

        Ok(())
    }
2409 | | } |