Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/expr/src/logical_plan/builder.rs
(All executable lines in this excerpt have an execution count of 0, i.e. are uncovered.)
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! This module provides a builder for creating LogicalPlans

use std::any::Any;
use std::cmp::Ordering;
use std::collections::{HashMap, HashSet};
use std::sync::Arc;

use crate::dml::CopyTo;
use crate::expr::{Alias, Sort as SortExpr};
use crate::expr_rewriter::{
    coerce_plan_expr_for_schema, normalize_col,
    normalize_col_with_schemas_and_ambiguity_check, normalize_cols, normalize_sorts,
    rewrite_sort_cols_by_aggs,
};
use crate::logical_plan::{
    Aggregate, Analyze, CrossJoin, Distinct, DistinctOn, EmptyRelation, Explain, Filter,
    Join, JoinConstraint, JoinType, Limit, LogicalPlan, Partitioning, PlanType, Prepare,
    Projection, Repartition, Sort, SubqueryAlias, TableScan, Union, Unnest, Values,
    Window,
};
use crate::utils::{
    can_hash, columnize_expr, compare_sort_expr, expr_to_columns,
    find_valid_equijoin_key_pair, group_window_expr_by_sort_keys,
};
use crate::{
    and, binary_expr, DmlStatement, Expr, ExprSchemable, Operator, RecursiveQuery,
    TableProviderFilterPushDown, TableSource, WriteOp,
};

use arrow::datatypes::{DataType, Field, Fields, Schema, SchemaRef};
use datafusion_common::display::ToStringifiedPlan;
use datafusion_common::file_options::file_type::FileType;
use datafusion_common::{
    get_target_functional_dependencies, internal_err, not_impl_err, plan_datafusion_err,
    plan_err, Column, DFSchema, DFSchemaRef, DataFusionError, Result, ScalarValue,
    TableReference, ToDFSchema, UnnestOptions,
};
use datafusion_expr_common::type_coercion::binary::type_union_resolution;

use super::dml::InsertOp;
use super::plan::{ColumnUnnestList, ColumnUnnestType};

/// Default table name for unnamed table
pub const UNNAMED_TABLE: &str = "?table?";

/// Builder for logical plans
///
/// # Example building a simple plan
/// ```
/// # use datafusion_expr::{lit, col, LogicalPlanBuilder, logical_plan::table_scan};
/// # use datafusion_common::Result;
/// # use arrow::datatypes::{Schema, DataType, Field};
/// #
/// # fn main() -> Result<()> {
/// #
/// # fn employee_schema() -> Schema {
/// #    Schema::new(vec![
/// #           Field::new("id", DataType::Int32, false),
/// #           Field::new("first_name", DataType::Utf8, false),
/// #           Field::new("last_name", DataType::Utf8, false),
/// #           Field::new("state", DataType::Utf8, false),
/// #           Field::new("salary", DataType::Int32, false),
/// #       ])
/// #   }
/// #
/// // Create a plan similar to
/// // SELECT last_name
/// // FROM employee
/// // WHERE salary < 1000
/// let plan = table_scan(Some("employee"), &employee_schema(), None)?
///  // Keep only rows where salary < 1000
///  .filter(col("salary").lt(lit(1000)))?
///  // Only show "last_name" in the final results
///  .project(vec![col("last_name")])?
///  .build()?;
///
/// // Convert from plan back to builder
/// let builder = LogicalPlanBuilder::from(plan);
///
/// # Ok(())
/// # }
/// ```
#[derive(Debug, Clone)]
pub struct LogicalPlanBuilder {
    plan: Arc<LogicalPlan>,
}

impl LogicalPlanBuilder {
    /// Create a builder from an existing plan
    pub fn new(plan: LogicalPlan) -> Self {
        Self {
            plan: Arc::new(plan),
        }
    }

    /// Create a builder from an existing plan wrapped in an `Arc`
    pub fn new_from_arc(plan: Arc<LogicalPlan>) -> Self {
        Self { plan }
    }

    /// Return the output schema of the plan built so far
    pub fn schema(&self) -> &DFSchemaRef {
        self.plan.schema()
    }

    /// Return the LogicalPlan of the plan built so far
    pub fn plan(&self) -> &LogicalPlan {
        &self.plan
    }

    /// Create an empty relation.
    ///
    /// `produce_one_row` set to true means this empty node needs to produce a placeholder row.
    pub fn empty(produce_one_row: bool) -> Self {
        Self::new(LogicalPlan::EmptyRelation(EmptyRelation {
            produce_one_row,
            schema: DFSchemaRef::new(DFSchema::empty()),
        }))
    }

    /// Convert a regular plan into a recursive query.
    /// `is_distinct` indicates whether the recursive term should be de-duplicated (`UNION`) after each iteration or not (`UNION ALL`).
    pub fn to_recursive_query(
        self,
        name: String,
        recursive_term: LogicalPlan,
        is_distinct: bool,
    ) -> Result<Self> {
        // TODO: we need to do a bunch of validation here. Maybe more.
        if is_distinct {
            return not_impl_err!(
                "Recursive queries with a distinct 'UNION' (in which the previous iteration's results will be de-duplicated) are not supported"
            );
        }
        // Ensure that the static term and the recursive term have the same number of fields
        let static_fields_len = self.plan.schema().fields().len();
        let recursive_fields_len = recursive_term.schema().fields().len();
        if static_fields_len != recursive_fields_len {
            return plan_err!(
                "Non-recursive term and recursive term must have the same number of columns ({} != {})",
                static_fields_len, recursive_fields_len
            );
        }
        // Ensure that the recursive term has the same field types as the static term
        let coerced_recursive_term =
            coerce_plan_expr_for_schema(recursive_term, self.plan.schema())?;
        Ok(Self::from(LogicalPlan::RecursiveQuery(RecursiveQuery {
            name,
            static_term: self.plan,
            recursive_term: Arc::new(coerced_recursive_term),
            is_distinct,
        })))
    }

    /// Create a values-list-based relation, inferring the schema from the data
    /// and consuming `values`. See the [Postgres VALUES](https://www.postgresql.org/docs/current/queries-values.html)
    /// documentation for more details.
    ///
    /// By default, it assigns the names column1, column2, etc. to the columns of a VALUES table.
    /// The column names are not specified by the SQL standard and different database systems do it differently,
    /// so it's usually better to override the default names with a table alias list.
    ///
    /// If the values include params/binders such as $1, $2, $3, etc., then the `param_data_types` should be provided.
0
    pub fn values(mut values: Vec<Vec<Expr>>) -> Result<Self> {
182
0
        if values.is_empty() {
183
0
            return plan_err!("Values list cannot be empty");
184
0
        }
185
0
        let n_cols = values[0].len();
186
0
        if n_cols == 0 {
187
0
            return plan_err!("Values list cannot be zero length");
188
0
        }
189
0
        for (i, row) in values.iter().enumerate() {
190
0
            if row.len() != n_cols {
191
0
                return plan_err!(
192
0
                    "Inconsistent data length across values list: got {} values in row {} but expected {}",
193
0
                    row.len(),
194
0
                    i,
195
0
                    n_cols
196
0
                );
197
0
            }
198
        }
199
200
0
        let empty_schema = DFSchema::empty();
201
0
        let mut field_types: Vec<DataType> = Vec::with_capacity(n_cols);
202
0
        for j in 0..n_cols {
203
0
            let mut common_type: Option<DataType> = None;
204
0
            for (i, row) in values.iter().enumerate() {
205
0
                let value = &row[j];
206
0
                let data_type = value.get_type(&empty_schema)?;
207
0
                if data_type == DataType::Null {
208
0
                    continue;
209
0
                }
210
0
                if let Some(prev_type) = common_type {
211
                    // get common type of each column values.
212
0
                    let data_types = vec![prev_type.clone(), data_type.clone()];
213
0
                    let Some(new_type) = type_union_resolution(&data_types) else {
214
0
                        return plan_err!("Inconsistent data type across values list at row {i} column {j}. Was {prev_type} but found {data_type}");
215
                    };
216
0
                    common_type = Some(new_type);
217
0
                } else {
218
0
                    common_type = Some(data_type);
219
0
                }
220
            }
221
            // assuming common_type was not set, and no error, therefore the type should be NULL
222
            // since the code loop skips NULL
223
0
            field_types.push(common_type.unwrap_or(DataType::Null));
224
        }
225
        // wrap cast if data type is not same as common type.
226
0
        for row in &mut values {
227
0
            for (j, field_type) in field_types.iter().enumerate() {
228
0
                if let Expr::Literal(ScalarValue::Null) = row[j] {
229
0
                    row[j] = Expr::Literal(ScalarValue::try_from(field_type)?);
230
                } else {
231
0
                    row[j] =
232
0
                        std::mem::take(&mut row[j]).cast_to(field_type, &empty_schema)?;
233
                }
234
            }
235
        }
236
0
        let fields = field_types
237
0
            .iter()
238
0
            .enumerate()
239
0
            .map(|(j, data_type)| {
240
0
                // naming is following convention https://www.postgresql.org/docs/current/queries-values.html
241
0
                let name = &format!("column{}", j + 1);
242
0
                Field::new(name, data_type.clone(), true)
243
0
            })
244
0
            .collect::<Vec<_>>();
245
0
        let dfschema = DFSchema::from_unqualified_fields(fields.into(), HashMap::new())?;
246
0
        let schema = DFSchemaRef::new(dfschema);
247
0
        Ok(Self::new(LogicalPlan::Values(Values { schema, values })))
248
0
    }

    /// Convert a table provider into a builder with a TableScan
    ///
    /// Note that if you pass a string as `table_name`, it is treated
    /// as a SQL identifier, as described in [`TableReference`], and
    /// thus is normalized.
    ///
    /// # Example:
    /// ```
    /// # use datafusion_expr::{lit, col, LogicalPlanBuilder,
    /// #  logical_plan::builder::LogicalTableSource, logical_plan::table_scan
    /// # };
    /// # use std::sync::Arc;
    /// # use arrow::datatypes::{Schema, DataType, Field};
    /// # use datafusion_common::TableReference;
    /// #
    /// # let employee_schema = Arc::new(Schema::new(vec![
    /// #           Field::new("id", DataType::Int32, false),
    /// # ])) as _;
    /// # let table_source = Arc::new(LogicalTableSource::new(employee_schema));
    /// // Scan table_source with the name "mytable" (after normalization)
    /// # let table = table_source.clone();
    /// let scan = LogicalPlanBuilder::scan("MyTable", table, None);
    ///
    /// // Scan table_source with the name "MyTable" by enclosing in quotes
    /// # let table = table_source.clone();
    /// let scan = LogicalPlanBuilder::scan(r#""MyTable""#, table, None);
    ///
    /// // Scan table_source with the name "MyTable" by forming the table reference
    /// # let table = table_source.clone();
    /// let table_reference = TableReference::bare("MyTable");
    /// let scan = LogicalPlanBuilder::scan(table_reference, table, None);
    /// ```
    pub fn scan(
        table_name: impl Into<TableReference>,
        table_source: Arc<dyn TableSource>,
        projection: Option<Vec<usize>>,
    ) -> Result<Self> {
        Self::scan_with_filters(table_name, table_source, projection, vec![])
    }

    /// Create a [CopyTo] for copying the contents of this builder to the specified file(s)
    pub fn copy_to(
        input: LogicalPlan,
        output_url: String,
        file_type: Arc<dyn FileType>,
        options: HashMap<String, String>,
        partition_by: Vec<String>,
    ) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Copy(CopyTo {
            input: Arc::new(input),
            output_url,
            partition_by,
            file_type,
            options,
        })))
    }

    /// Create a [DmlStatement] for inserting the contents of this builder into the named table
    pub fn insert_into(
        input: LogicalPlan,
        table_name: impl Into<TableReference>,
        table_schema: &Schema,
        insert_op: InsertOp,
    ) -> Result<Self> {
        let table_schema = table_schema.clone().to_dfschema_ref()?;

        Ok(Self::new(LogicalPlan::Dml(DmlStatement::new(
            table_name.into(),
            table_schema,
            WriteOp::Insert(insert_op),
            Arc::new(input),
        ))))
    }

    /// Convert a table provider into a builder with a TableScan
    pub fn scan_with_filters(
        table_name: impl Into<TableReference>,
        table_source: Arc<dyn TableSource>,
        projection: Option<Vec<usize>>,
        filters: Vec<Expr>,
    ) -> Result<Self> {
        TableScan::try_new(table_name, table_source, projection, filters, None)
            .map(LogicalPlan::TableScan)
            .map(Self::new)
    }

    /// Convert a table provider into a builder with a TableScan that applies
    /// the given filters and an optional `fetch` limit
    pub fn scan_with_filters_fetch(
        table_name: impl Into<TableReference>,
        table_source: Arc<dyn TableSource>,
        projection: Option<Vec<usize>>,
        filters: Vec<Expr>,
        fetch: Option<usize>,
    ) -> Result<Self> {
        TableScan::try_new(table_name, table_source, projection, filters, fetch)
            .map(LogicalPlan::TableScan)
            .map(Self::new)
    }

    /// Wrap a plan in a window
    pub fn window_plan(
        input: LogicalPlan,
        window_exprs: Vec<Expr>,
    ) -> Result<LogicalPlan> {
        let mut plan = input;
        let mut groups = group_window_expr_by_sort_keys(window_exprs)?;
        // To align with the behavior of PostgreSQL, order the sort keys using the same
        // rule as PostgreSQL: first compare the sort keys themselves, and if one
        // window's sort keys are a prefix of another's, put the window with more sort
        // keys first, so more deeply sorted plans get nested further down as children.
        // The sort_by() implementation here is a stable sort.
        // Note that by this rule, if there's an empty OVER, it'll be at the top level.
        groups.sort_by(|(key_a, _), (key_b, _)| {
            for ((first, _), (second, _)) in key_a.iter().zip(key_b.iter()) {
                let key_ordering = compare_sort_expr(first, second, plan.schema());
                match key_ordering {
                    Ordering::Less => {
                        return Ordering::Less;
                    }
                    Ordering::Greater => {
                        return Ordering::Greater;
                    }
                    Ordering::Equal => {}
                }
            }
            key_b.len().cmp(&key_a.len())
        });
        for (_, exprs) in groups {
            let window_exprs = exprs.into_iter().collect::<Vec<_>>();
            // Partitioning and sorting are done at the physical level; see the
            // EnforceDistribution and EnforceSorting rules.
            plan = LogicalPlanBuilder::from(plan)
                .window(window_exprs)?
                .build()?;
        }
        Ok(plan)
    }
    /// Apply a projection without alias.
    pub fn project(
        self,
        expr: impl IntoIterator<Item = impl Into<Expr>>,
    ) -> Result<Self> {
        project(Arc::unwrap_or_clone(self.plan), expr).map(Self::new)
    }

    /// Select the given column indices
    pub fn select(self, indices: impl IntoIterator<Item = usize>) -> Result<Self> {
        let exprs: Vec<_> = indices
            .into_iter()
            .map(|x| Expr::Column(Column::from(self.plan.schema().qualified_field(x))))
            .collect();
        self.project(exprs)
    }

    /// Apply a filter
    pub fn filter(self, expr: impl Into<Expr>) -> Result<Self> {
        let expr = normalize_col(expr.into(), &self.plan)?;
        Filter::try_new(expr, self.plan)
            .map(LogicalPlan::Filter)
            .map(Self::new)
    }

    /// Apply a filter which is used for a having clause
    pub fn having(self, expr: impl Into<Expr>) -> Result<Self> {
        let expr = normalize_col(expr.into(), &self.plan)?;
        Filter::try_new_with_having(expr, self.plan)
            .map(LogicalPlan::Filter)
            .map(Self::from)
    }

    /// Make a builder for a prepare logical plan from the builder's plan
    pub fn prepare(self, name: String, data_types: Vec<DataType>) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Prepare(Prepare {
            name,
            data_types,
            input: self.plan,
        })))
    }

    /// Limit the number of rows returned
    ///
    /// `skip` - Number of rows to skip before fetching any rows.
    ///
    /// `fetch` - Maximum number of rows to fetch, after skipping `skip` rows,
    ///          if specified.
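    ///
    /// # Example
    ///
    /// A minimal sketch (an empty relation stands in for a real input):
    ///
    /// ```
    /// # use datafusion_expr::LogicalPlanBuilder;
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// // Skip the first 3 rows, then return at most 10 rows
    /// let plan = LogicalPlanBuilder::empty(false)
    ///     .limit(3, Some(10))?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```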
    pub fn limit(self, skip: usize, fetch: Option<usize>) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Limit(Limit {
            skip,
            fetch,
            input: self.plan,
        })))
    }

    /// Apply an alias
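    ///
    /// A minimal sketch, wrapping the current plan in a subquery alias named `t1`:
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::logical_plan::table_scan;
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `SELECT ... FROM t AS t1`
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .alias("t1")?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```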
    pub fn alias(self, alias: impl Into<TableReference>) -> Result<Self> {
        subquery_alias(Arc::unwrap_or_clone(self.plan), alias).map(Self::new)
    }

    /// Add missing sort columns to all downstream projections
    ///
    /// Thus, if you have a LogicalPlan that selects A and B and have
    /// not requested a sort by C, this code will add C recursively to
    /// all input projections.
    ///
    /// Adding a new column is not correct if there is a `Distinct`
    /// node, which produces only distinct values of its
    /// inputs. Adding a new column to its input will result in
    /// potentially different results than with the original column.
    ///
    /// For example, if the plan is `Distinct(A, B)` and the input looks like
    ///
    /// a | b | c
    /// --+---+---
    /// 1 | 2 | 3
    /// 1 | 2 | 4
    ///
    /// then Distinct (A, B) --> (1, 2),
    ///
    /// but Distinct (A, B, C) --> (1, 2, 3), (1, 2, 4)
    /// (which will appear as (1, 2), (1, 2) if only a and b are projected).
    ///
    /// See <https://github.com/apache/datafusion/issues/5065> for more details
    fn add_missing_columns(
        curr_plan: LogicalPlan,
        missing_cols: &[Column],
        is_distinct: bool,
    ) -> Result<LogicalPlan> {
        match curr_plan {
            LogicalPlan::Projection(Projection {
                input,
                mut expr,
                schema: _,
            }) if missing_cols.iter().all(|c| input.schema().has_column(c)) => {
                let mut missing_exprs = missing_cols
                    .iter()
                    .map(|c| normalize_col(Expr::Column(c.clone()), &input))
                    .collect::<Result<Vec<_>>>()?;

                // Do not add duplicate columns; some of the missing_cols may
                // already be present, just without the new projected alias.
                missing_exprs.retain(|e| !expr.contains(e));
                if is_distinct {
                    Self::ambiguous_distinct_check(&missing_exprs, missing_cols, &expr)?;
                }
                expr.extend(missing_exprs);
                project(Arc::unwrap_or_clone(input), expr)
            }
            _ => {
                let is_distinct =
                    is_distinct || matches!(curr_plan, LogicalPlan::Distinct(_));
                let new_inputs = curr_plan
                    .inputs()
                    .into_iter()
                    .map(|input_plan| {
                        Self::add_missing_columns(
                            (*input_plan).clone(),
                            missing_cols,
                            is_distinct,
                        )
                    })
                    .collect::<Result<Vec<_>>>()?;
                curr_plan.with_new_exprs(curr_plan.expressions(), new_inputs)
            }
        }
    }

    fn ambiguous_distinct_check(
        missing_exprs: &[Expr],
        missing_cols: &[Column],
        projection_exprs: &[Expr],
    ) -> Result<()> {
        if missing_exprs.is_empty() {
            return Ok(());
        }

        // if the missing columns are all only aliases for things in
        // the existing select list, it is ok
        //
        // This handles the special case for
        // SELECT col as <alias> ORDER BY <alias>
        //
        // As described in https://github.com/apache/datafusion/issues/5293
        let all_aliases = missing_exprs.iter().all(|e| {
            projection_exprs.iter().any(|proj_expr| {
                if let Expr::Alias(Alias { expr, .. }) = proj_expr {
                    e == expr.as_ref()
                } else {
                    false
                }
            })
        });
        if all_aliases {
            return Ok(());
        }

        let missing_col_names = missing_cols
            .iter()
            .map(|col| col.flat_name())
            .collect::<String>();

        plan_err!("For SELECT DISTINCT, ORDER BY expressions {missing_col_names} must appear in select list")
    }

    /// Apply a sort by provided expressions with default direction
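    ///
    /// A minimal sketch; `sort_by` sorts ascending with nulls last by default:
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::{col, logical_plan::table_scan};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `SELECT ... FROM t ORDER BY a`
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .sort_by(vec![col("a")])?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```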
    pub fn sort_by(
        self,
        expr: impl IntoIterator<Item = impl Into<Expr>> + Clone,
    ) -> Result<Self> {
        self.sort(
            expr.into_iter()
                .map(|e| e.into().sort(true, false))
                .collect::<Vec<SortExpr>>(),
        )
    }

    /// Apply a sort by the provided sort expressions, without a limit
    pub fn sort(
        self,
        sorts: impl IntoIterator<Item = impl Into<SortExpr>> + Clone,
    ) -> Result<Self> {
        self.sort_with_limit(sorts, None)
    }

    /// Apply a sort, optionally limiting the number of rows with `fetch`
    pub fn sort_with_limit(
        self,
        sorts: impl IntoIterator<Item = impl Into<SortExpr>> + Clone,
        fetch: Option<usize>,
    ) -> Result<Self> {
        let sorts = rewrite_sort_cols_by_aggs(sorts, &self.plan)?;

        let schema = self.plan.schema();

        // Collect sort columns that are missing in the input plan's schema
        let mut missing_cols: Vec<Column> = vec![];
        sorts.iter().try_for_each::<_, Result<()>>(|sort| {
            let columns = sort.expr.column_refs();

            columns.into_iter().for_each(|c| {
                if !schema.has_column(c) {
                    missing_cols.push(c.clone());
                }
            });

            Ok(())
        })?;

        if missing_cols.is_empty() {
            return Ok(Self::new(LogicalPlan::Sort(Sort {
                expr: normalize_sorts(sorts, &self.plan)?,
                input: self.plan,
                fetch,
            })));
        }

        // Remove the pushed-down sort columns
        let new_expr = schema.columns().into_iter().map(Expr::Column).collect();

        let is_distinct = false;
        let plan = Self::add_missing_columns(
            Arc::unwrap_or_clone(self.plan),
            &missing_cols,
            is_distinct,
        )?;
        let sort_plan = LogicalPlan::Sort(Sort {
            expr: normalize_sorts(sorts, &plan)?,
            input: Arc::new(plan),
            fetch,
        });

        Projection::try_new(new_expr, Arc::new(sort_plan))
            .map(LogicalPlan::Projection)
            .map(Self::new)
    }

    /// Apply a union, preserving duplicate rows
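    ///
    /// A minimal sketch, unioning two scans with identical schemas:
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::logical_plan::table_scan;
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `SELECT a FROM t1 UNION ALL SELECT a FROM t2`
    /// let t2 = table_scan(Some("t2"), &schema, None)?.build()?;
    /// let plan = table_scan(Some("t1"), &schema, None)?
    ///     .union(t2)?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```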
    pub fn union(self, plan: LogicalPlan) -> Result<Self> {
        union(Arc::unwrap_or_clone(self.plan), plan).map(Self::new)
    }

    /// Apply a union, removing duplicate rows
    pub fn union_distinct(self, plan: LogicalPlan) -> Result<Self> {
        let left_plan: LogicalPlan = Arc::unwrap_or_clone(self.plan);
        let right_plan: LogicalPlan = plan;

        Ok(Self::new(LogicalPlan::Distinct(Distinct::All(Arc::new(
            union(left_plan, right_plan)?,
        )))))
    }

    /// Apply deduplication: only distinct (different) values are returned
    pub fn distinct(self) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Distinct(Distinct::All(self.plan))))
    }

    /// Project the first value of each expression in `select_expr`, according
    /// to the provided sort expressions, for each distinct combination of the
    /// `DISTINCT ON` clause expressions.
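    ///
    /// A minimal sketch (assuming columns `a` and `b` in the scanned schema):
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::{col, logical_plan::table_scan};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![
    /// #     Field::new("a", DataType::Int32, false),
    /// #     Field::new("b", DataType::Int32, false),
    /// # ]);
    /// // Equivalent to `SELECT DISTINCT ON (a) a, b FROM t ORDER BY a, b`
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .distinct_on(
    ///         vec![col("a")],
    ///         vec![col("a"), col("b")],
    ///         Some(vec![col("a").sort(true, false), col("b").sort(true, false)]),
    ///     )?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```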
    pub fn distinct_on(
        self,
        on_expr: Vec<Expr>,
        select_expr: Vec<Expr>,
        sort_expr: Option<Vec<SortExpr>>,
    ) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Distinct(Distinct::On(
            DistinctOn::try_new(on_expr, select_expr, sort_expr, self.plan)?,
        ))))
    }

    /// Apply a join to `right` using explicitly specified columns and an
    /// optional filter expression.
    ///
    /// See [`join_on`](Self::join_on) for a more concise way to specify the
    /// join condition. Since DataFusion will automatically identify and
    /// optimize equality predicates, there is no performance difference between
    /// this function and `join_on`.
    ///
    /// `left_cols` and `right_cols` are used to form "equijoin" predicates (see
    /// example below), which are then combined with the optional `filter`
    /// expression.
    ///
    /// Note that in the case of an outer join, the `filter` is applied only to matched rows.
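    ///
    /// # Example
    ///
    /// A minimal sketch of an equijoin on one key pair:
    ///
    /// ```
    /// # use std::sync::Arc;
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::{LogicalPlanBuilder,
    /// #  logical_plan::builder::LogicalTableSource, logical_plan::JoinType};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Arc::new(Schema::new(vec![Field::new("a", DataType::Int32, false)]));
    /// # let source = Arc::new(LogicalTableSource::new(schema));
    /// let right = LogicalPlanBuilder::scan("right", source.clone(), None)?.build()?;
    /// // Equivalent to `left INNER JOIN right ON left.a = right.a`
    /// let plan = LogicalPlanBuilder::scan("left", source, None)?
    ///     .join(right, JoinType::Inner, (vec!["left.a"], vec!["right.a"]), None)?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```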
    pub fn join(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
        filter: Option<Expr>,
    ) -> Result<Self> {
        self.join_detailed(right, join_type, join_keys, filter, false)
    }
683
684
    /// Apply a join with using the specified expressions.
685
    ///
686
    /// Note that DataFusion automatically optimizes joins, including
687
    /// identifying and optimizing equality predicates.
688
    ///
689
    /// # Example
690
    ///
691
    /// ```
692
    /// # use datafusion_expr::{Expr, col, LogicalPlanBuilder,
693
    /// #  logical_plan::builder::LogicalTableSource, logical_plan::JoinType,};
694
    /// # use std::sync::Arc;
695
    /// # use arrow::datatypes::{Schema, DataType, Field};
696
    /// # use datafusion_common::Result;
697
    /// # fn main() -> Result<()> {
698
    /// let example_schema = Arc::new(Schema::new(vec![
699
    ///     Field::new("a", DataType::Int32, false),
700
    ///     Field::new("b", DataType::Int32, false),
701
    ///     Field::new("c", DataType::Int32, false),
702
    /// ]));
703
    /// let table_source = Arc::new(LogicalTableSource::new(example_schema));
704
    /// let left_table = table_source.clone();
705
    /// let right_table = table_source.clone();
706
    ///
707
    /// let right_plan = LogicalPlanBuilder::scan("right", right_table, None)?.build()?;
708
    ///
709
    /// // Form the expression `(left.a != right.a)` AND `(left.b != right.b)`
710
    /// let exprs = vec![
711
    ///     col("left.a").eq(col("right.a")),
712
    ///     col("left.b").not_eq(col("right.b"))
713
    ///  ];
714
    ///
715
    /// // Perform the equivalent of `left INNER JOIN right ON (a != a2 AND b != b2)`
716
    /// // finding all pairs of rows from `left` and `right` where
717
    /// // where `a = a2` and `b != b2`.
718
    /// let plan = LogicalPlanBuilder::scan("left", left_table, None)?
719
    ///     .join_on(right_plan, JoinType::Inner, exprs)?
720
    ///     .build()?;
721
    /// # Ok(())
722
    /// # }
723
    /// ```
    pub fn join_on(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        on_exprs: impl IntoIterator<Item = Expr>,
    ) -> Result<Self> {
        let filter = on_exprs.into_iter().reduce(Expr::and);

        self.join_detailed(
            right,
            join_type,
            (Vec::<Column>::new(), Vec::<Column>::new()),
            filter,
            false,
        )
    }

    /// Normalize the given column against this plan's schemas, checking for ambiguity
    pub(crate) fn normalize(
        plan: &LogicalPlan,
        column: impl Into<Column>,
    ) -> Result<Column> {
        let schema = plan.schema();
        let fallback_schemas = plan.fallback_normalize_schemas();
        let using_columns = plan.using_columns()?;
        column.into().normalize_with_schemas_and_ambiguity_check(
            &[&[schema], &fallback_schemas],
            &using_columns,
        )
    }

    /// Apply a join with an ON constraint and the specified null-equality behavior.
    ///
    /// The behavior is the same as [`join`](Self::join) except that it allows
    /// specifying the null equality behavior.
    ///
    /// If `null_equals_null=true`, rows where both join keys are `null` will be
    /// emitted. Otherwise rows where either or both join keys are `null` will be
    /// omitted.
    pub fn join_detailed(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        join_keys: (Vec<impl Into<Column>>, Vec<impl Into<Column>>),
        filter: Option<Expr>,
        null_equals_null: bool,
    ) -> Result<Self> {
        if join_keys.0.len() != join_keys.1.len() {
            return plan_err!("left_keys and right_keys were not the same length");
        }

        let filter = if let Some(expr) = filter {
            let filter = normalize_col_with_schemas_and_ambiguity_check(
                expr,
                &[&[self.schema(), right.schema()]],
                &[],
            )?;
            Some(filter)
        } else {
            None
        };

        let (left_keys, right_keys): (Vec<Result<Column>>, Vec<Result<Column>>) =
            join_keys
                .0
                .into_iter()
                .zip(join_keys.1)
                .map(|(l, r)| {
                    let l = l.into();
                    let r = r.into();

                    match (&l.relation, &r.relation) {
                        (Some(lr), Some(rr)) => {
                            let l_is_left =
                                self.plan.schema().field_with_qualified_name(lr, &l.name);
                            let l_is_right =
                                right.schema().field_with_qualified_name(lr, &l.name);
                            let r_is_left =
                                self.plan.schema().field_with_qualified_name(rr, &r.name);
                            let r_is_right =
                                right.schema().field_with_qualified_name(rr, &r.name);

                            match (l_is_left, l_is_right, r_is_left, r_is_right) {
                                (_, Ok(_), Ok(_), _) => (Ok(r), Ok(l)),
                                (Ok(_), _, _, Ok(_)) => (Ok(l), Ok(r)),
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        (Some(lr), None) => {
                            let l_is_left =
                                self.plan.schema().field_with_qualified_name(lr, &l.name);
                            let l_is_right =
                                right.schema().field_with_qualified_name(lr, &l.name);

                            match (l_is_left, l_is_right) {
                                (Ok(_), _) => (Ok(l), Self::normalize(&right, r)),
                                (_, Ok(_)) => (Self::normalize(&self.plan, r), Ok(l)),
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        (None, Some(rr)) => {
                            let r_is_left =
                                self.plan.schema().field_with_qualified_name(rr, &r.name);
                            let r_is_right =
                                right.schema().field_with_qualified_name(rr, &r.name);

                            match (r_is_left, r_is_right) {
                                (Ok(_), _) => (Ok(r), Self::normalize(&right, l)),
                                (_, Ok(_)) => (Self::normalize(&self.plan, l), Ok(r)),
                                _ => (
                                    Self::normalize(&self.plan, l),
                                    Self::normalize(&right, r),
                                ),
                            }
                        }
                        (None, None) => {
                            let mut swap = false;
                            let left_key = Self::normalize(&self.plan, l.clone())
                                .or_else(|_| {
                                    swap = true;
                                    Self::normalize(&right, l)
                                });
                            if swap {
                                (Self::normalize(&self.plan, r), left_key)
                            } else {
                                (left_key, Self::normalize(&right, r))
                            }
                        }
                    }
                })
                .unzip();

        let left_keys = left_keys.into_iter().collect::<Result<Vec<Column>>>()?;
        let right_keys = right_keys.into_iter().collect::<Result<Vec<Column>>>()?;

        let on = left_keys
            .into_iter()
            .zip(right_keys)
            .map(|(l, r)| (Expr::Column(l), Expr::Column(r)))
            .collect();
        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &join_type)?;

        Ok(Self::new(LogicalPlan::Join(Join {
            left: self.plan,
            right: Arc::new(right),
            on,
            filter,
            join_type,
            join_constraint: JoinConstraint::On,
            schema: DFSchemaRef::new(join_schema),
            null_equals_null,
        })))
    }

    /// Apply a join with a USING constraint, which duplicates all join columns in the output schema.
    pub fn join_using(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        using_keys: Vec<impl Into<Column> + Clone>,
    ) -> Result<Self> {
        let left_keys: Vec<Column> = using_keys
            .clone()
            .into_iter()
            .map(|c| Self::normalize(&self.plan, c))
            .collect::<Result<_>>()?;
        let right_keys: Vec<Column> = using_keys
            .into_iter()
            .map(|c| Self::normalize(&right, c))
            .collect::<Result<_>>()?;

        let on: Vec<(_, _)> = left_keys.into_iter().zip(right_keys).collect();
        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &join_type)?;
        let mut join_on: Vec<(Expr, Expr)> = vec![];
        let mut filters: Option<Expr> = None;
        for (l, r) in &on {
            if self.plan.schema().has_column(l)
                && right.schema().has_column(r)
                && can_hash(self.plan.schema().field_from_column(l)?.data_type())
            {
                join_on.push((Expr::Column(l.clone()), Expr::Column(r.clone())));
            } else if self.plan.schema().has_column(l)
                && right.schema().has_column(r)
                && can_hash(self.plan.schema().field_from_column(r)?.data_type())
            {
                join_on.push((Expr::Column(r.clone()), Expr::Column(l.clone())));
            } else {
                let expr = binary_expr(
                    Expr::Column(l.clone()),
                    Operator::Eq,
                    Expr::Column(r.clone()),
                );
                match filters {
                    None => filters = Some(expr),
                    Some(filter_expr) => filters = Some(and(expr, filter_expr)),
                }
            }
        }

        if join_on.is_empty() {
            let join = Self::from(self.plan).cross_join(right)?;
            join.filter(filters.ok_or_else(|| {
                DataFusionError::Internal("filters should not be None here".to_string())
            })?)
        } else {
            Ok(Self::new(LogicalPlan::Join(Join {
                left: self.plan,
                right: Arc::new(right),
                on: join_on,
                filter: filters,
                join_type,
                join_constraint: JoinConstraint::Using,
                schema: DFSchemaRef::new(join_schema),
                null_equals_null: false,
            })))
        }
    }

    /// Apply a cross join
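    ///
    /// A minimal sketch, producing the Cartesian product of two scans:
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::logical_plan::table_scan;
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `SELECT ... FROM t1 CROSS JOIN t2`
    /// let t2 = table_scan(Some("t2"), &schema, None)?.build()?;
    /// let plan = table_scan(Some("t1"), &schema, None)?
    ///     .cross_join(t2)?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```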
    pub fn cross_join(self, right: LogicalPlan) -> Result<Self> {
        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &JoinType::Inner)?;
        Ok(Self::new(LogicalPlan::CrossJoin(CrossJoin {
            left: self.plan,
            right: Arc::new(right),
            schema: DFSchemaRef::new(join_schema),
        })))
    }

    /// Repartition the output using the given partitioning scheme
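    ///
    /// A minimal sketch, asking for 4 round-robin partitions:
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::logical_plan::{table_scan, Partitioning};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .repartition(Partitioning::RoundRobinBatch(4))?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```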
    pub fn repartition(self, partitioning_scheme: Partitioning) -> Result<Self> {
        Ok(Self::new(LogicalPlan::Repartition(Repartition {
            input: self.plan,
            partitioning_scheme,
        })))
    }

    /// Apply window functions to extend the schema
    pub fn window(
        self,
        window_expr: impl IntoIterator<Item = impl Into<Expr>>,
    ) -> Result<Self> {
        let window_expr = normalize_cols(window_expr, &self.plan)?;
        validate_unique_names("Windows", &window_expr)?;
        Ok(Self::new(LogicalPlan::Window(Window::try_new(
            window_expr,
            self.plan,
        )?)))
    }

    /// Apply an aggregate: grouping on the `group_expr` expressions
    /// and calculating `aggr_expr` aggregates for each distinct
    /// value of the `group_expr`.
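    ///
    /// A minimal sketch that groups by `state` (aggregate expressions are
    /// omitted here, since aggregate functions live in other crates):
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::{col, Expr, logical_plan::table_scan};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("state", DataType::Utf8, false)]);
    /// // Equivalent to `SELECT state FROM t GROUP BY state`
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .aggregate(vec![col("state")], Vec::<Expr>::new())?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```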
    pub fn aggregate(
        self,
        group_expr: impl IntoIterator<Item = impl Into<Expr>>,
        aggr_expr: impl IntoIterator<Item = impl Into<Expr>>,
    ) -> Result<Self> {
        let group_expr = normalize_cols(group_expr, &self.plan)?;
        let aggr_expr = normalize_cols(aggr_expr, &self.plan)?;

        let group_expr =
            add_group_by_exprs_from_dependencies(group_expr, self.plan.schema())?;
        Aggregate::try_new(self.plan, group_expr, aggr_expr)
            .map(LogicalPlan::Aggregate)
            .map(Self::new)
    }

    /// Create a plan that represents the explanation of this builder's plan
    ///
    /// If `analyze` is true, runs the actual plan and produces
    /// information about metrics during the run.
    ///
    /// If `verbose` is true, prints out additional details.
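    ///
    /// A minimal sketch of a verbose `EXPLAIN` (without `ANALYZE`):
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::logical_plan::table_scan;
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `EXPLAIN VERBOSE SELECT ... FROM t`
    /// let plan = table_scan(Some("t"), &schema, None)?
    ///     .explain(true, false)?
    ///     .build()?;
    /// # Ok(())
    /// # }
    /// ```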
    pub fn explain(self, verbose: bool, analyze: bool) -> Result<Self> {
        let schema = LogicalPlan::explain_schema();
        let schema = schema.to_dfschema_ref()?;

        if analyze {
            Ok(Self::new(LogicalPlan::Analyze(Analyze {
                verbose,
                input: self.plan,
                schema,
            })))
        } else {
            let stringified_plans =
                vec![self.plan.to_stringified(PlanType::InitialLogicalPlan)];

            Ok(Self::new(LogicalPlan::Explain(Explain {
                verbose,
                plan: self.plan,
                stringified_plans,
                schema,
                logical_optimization_succeeded: false,
            })))
        }
    }

    /// Process intersect set operator
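    ///
    /// A minimal sketch of `INTERSECT` (set `is_all` to true for `INTERSECT ALL`):
    ///
    /// ```
    /// # use arrow::datatypes::{DataType, Field, Schema};
    /// # use datafusion_expr::{LogicalPlanBuilder, logical_plan::table_scan};
    /// # use datafusion_common::Result;
    /// # fn main() -> Result<()> {
    /// # let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
    /// // Equivalent to `SELECT a FROM t1 INTERSECT SELECT a FROM t2`
    /// let left = table_scan(Some("t1"), &schema, None)?.build()?;
    /// let right = table_scan(Some("t2"), &schema, None)?.build()?;
    /// let plan = LogicalPlanBuilder::intersect(left, right, false)?;
    /// # Ok(())
    /// # }
    /// ```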
    pub fn intersect(
        left_plan: LogicalPlan,
        right_plan: LogicalPlan,
        is_all: bool,
    ) -> Result<LogicalPlan> {
        LogicalPlanBuilder::intersect_or_except(
            left_plan,
            right_plan,
            JoinType::LeftSemi,
            is_all,
        )
    }

    /// Process except set operator
    pub fn except(
        left_plan: LogicalPlan,
        right_plan: LogicalPlan,
        is_all: bool,
    ) -> Result<LogicalPlan> {
        LogicalPlanBuilder::intersect_or_except(
            left_plan,
            right_plan,
            JoinType::LeftAnti,
            is_all,
        )
    }

    /// Process intersect or except
    fn intersect_or_except(
        left_plan: LogicalPlan,
        right_plan: LogicalPlan,
        join_type: JoinType,
        is_all: bool,
    ) -> Result<LogicalPlan> {
        let left_len = left_plan.schema().fields().len();
        let right_len = right_plan.schema().fields().len();

        if left_len != right_len {
            return plan_err!(
                "INTERSECT/EXCEPT query must have the same number of columns. Left is {left_len} and right is {right_len}."
            );
        }

        let join_keys = left_plan
            .schema()
            .fields()
            .iter()
            .zip(right_plan.schema().fields().iter())
            .map(|(left_field, right_field)| {
                (
                    (Column::from_name(left_field.name())),
                    (Column::from_name(right_field.name())),
                )
            })
            .unzip();
        if is_all {
            LogicalPlanBuilder::from(left_plan)
                .join_detailed(right_plan, join_type, join_keys, None, true)?
                .build()
        } else {
            LogicalPlanBuilder::from(left_plan)
                .distinct()?
                .join_detailed(right_plan, join_type, join_keys, None, true)?
                .build()
        }
    }

    /// Build the plan
    pub fn build(self) -> Result<LogicalPlan> {
        Ok(Arc::unwrap_or_clone(self.plan))
    }
1101
1102
    /// Apply a join with the expression on constraint.
1103
    ///
1104
    /// equi_exprs are "equijoin" predicates expressions on the existing and right inputs, respectively.
1105
    ///
1106
    /// filter: any other filter expression to apply during the join. equi_exprs predicates are likely
1107
    /// to be evaluated more quickly than the filter expressions
    pub fn join_with_expr_keys(
        self,
        right: LogicalPlan,
        join_type: JoinType,
        equi_exprs: (Vec<impl Into<Expr>>, Vec<impl Into<Expr>>),
        filter: Option<Expr>,
    ) -> Result<Self> {
        if equi_exprs.0.len() != equi_exprs.1.len() {
            return plan_err!("left_keys and right_keys were not the same length");
        }

        let join_key_pairs = equi_exprs
            .0
            .into_iter()
            .zip(equi_exprs.1.into_iter())
            .map(|(l, r)| {
                let left_key = l.into();
                let right_key = r.into();

                let mut left_using_columns = HashSet::new();
                expr_to_columns(&left_key, &mut left_using_columns)?;
                let normalized_left_key = normalize_col_with_schemas_and_ambiguity_check(
                    left_key,
                    &[&[self.plan.schema(), right.schema()]],
                    &[left_using_columns],
                )?;

                let mut right_using_columns = HashSet::new();
                expr_to_columns(&right_key, &mut right_using_columns)?;
                let normalized_right_key = normalize_col_with_schemas_and_ambiguity_check(
                    right_key,
                    &[&[self.plan.schema(), right.schema()]],
                    &[right_using_columns],
                )?;

                // find valid equijoin
                find_valid_equijoin_key_pair(
                        &normalized_left_key,
                        &normalized_right_key,
                        self.plan.schema(),
                        right.schema(),
                    )?.ok_or_else(||
                        plan_datafusion_err!(
                            "can't create join plan, join key should belong to one input, error key: ({normalized_left_key},{normalized_right_key})"
                        ))
            })
            .collect::<Result<Vec<_>>>()?;

        let join_schema =
            build_join_schema(self.plan.schema(), right.schema(), &join_type)?;

        Ok(Self::new(LogicalPlan::Join(Join {
            left: self.plan,
            right: Arc::new(right),
            on: join_key_pairs,
            filter,
            join_type,
            join_constraint: JoinConstraint::On,
            schema: DFSchemaRef::new(join_schema),
            null_equals_null: false,
        })))
    }

    /// Unnest the given column.
    pub fn unnest_column(self, column: impl Into<Column>) -> Result<Self> {
        unnest(Arc::unwrap_or_clone(self.plan), vec![column.into()]).map(Self::new)
    }

    /// Unnest the given column using the given [`UnnestOptions`]
    pub fn unnest_column_with_options(
        self,
        column: impl Into<Column>,
        options: UnnestOptions,
    ) -> Result<Self> {
        unnest_with_options(
            Arc::unwrap_or_clone(self.plan),
            vec![(column.into(), ColumnUnnestType::Inferred)],
            options,
        )
        .map(Self::new)
    }

    /// Unnest the given columns with the given [`UnnestOptions`]
    pub fn unnest_columns_with_options(
        self,
        columns: Vec<Column>,
        options: UnnestOptions,
    ) -> Result<Self> {
        unnest_with_options(
            Arc::unwrap_or_clone(self.plan),
            columns
                .into_iter()
                .map(|c| (c, ColumnUnnestType::Inferred))
                .collect(),
            options,
        )
        .map(Self::new)
    }

    /// Unnest the given columns with the given [`UnnestOptions`].
    /// If a column is a list type, it can be recursively and simultaneously
    /// unnested to the desired recursion levels,
    /// e.g. `SELECT unnest(list_col, depth = 1), unnest(list_col, depth = 2)`.
    pub fn unnest_columns_recursive_with_options(
        self,
        columns: Vec<(Column, ColumnUnnestType)>,
        options: UnnestOptions,
    ) -> Result<Self> {
        unnest_with_options(Arc::unwrap_or_clone(self.plan), columns, options)
            .map(Self::new)
    }
}

impl From<LogicalPlan> for LogicalPlanBuilder {
    fn from(plan: LogicalPlan) -> Self {
        LogicalPlanBuilder::new(plan)
    }
}

impl From<Arc<LogicalPlan>> for LogicalPlanBuilder {
    fn from(plan: Arc<LogicalPlan>) -> Self {
        LogicalPlanBuilder::new_from_arc(plan)
    }
}

/// Rename fields that have duplicate names by appending a numeric suffix,
/// e.g. `a, a, a` becomes `a, a:1, a:2`.
pub fn change_redundant_column(fields: &Fields) -> Vec<Field> {
    let mut name_map = HashMap::new();
    fields
        .into_iter()
        .map(|field| {
            let counter = name_map.entry(field.name().to_string()).or_insert(0);
            *counter += 1;
            if *counter > 1 {
                let new_name = format!("{}:{}", field.name(), *counter - 1);
                Field::new(new_name, field.data_type().clone(), field.is_nullable())
            } else {
                field.as_ref().clone()
            }
        })
        .collect()
}
/// Creates a schema for a join operation.
/// The fields from the left side are listed first.
pub fn build_join_schema(
    left: &DFSchema,
    right: &DFSchema,
    join_type: &JoinType,
) -> Result<DFSchema> {
    fn nullify_fields<'a>(
        fields: impl Iterator<Item = (Option<&'a TableReference>, &'a Arc<Field>)>,
    ) -> Vec<(Option<TableReference>, Arc<Field>)> {
        fields
            .map(|(q, f)| {
                // TODO: find a good way to do that
                let field = f.as_ref().clone().with_nullable(true);
                (q.cloned(), Arc::new(field))
            })
            .collect()
    }

    let right_fields = right.iter();
    let left_fields = left.iter();

    let qualified_fields: Vec<(Option<TableReference>, Arc<Field>)> = match join_type {
        JoinType::Inner => {
            // left then right
            let left_fields = left_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect::<Vec<_>>();
            let right_fields = right_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect::<Vec<_>>();
            left_fields.into_iter().chain(right_fields).collect()
        }
        JoinType::Left => {
            // left then right, right set to nullable in case of not matched scenario
            let left_fields = left_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect::<Vec<_>>();
            left_fields
                .into_iter()
                .chain(nullify_fields(right_fields))
                .collect()
        }
        JoinType::Right => {
            // left then right, left set to nullable in case of not matched scenario
            let right_fields = right_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect::<Vec<_>>();
            nullify_fields(left_fields)
                .into_iter()
                .chain(right_fields)
                .collect()
        }
        JoinType::Full => {
            // left then right, all set to nullable in case of not matched scenario
            nullify_fields(left_fields)
                .into_iter()
                .chain(nullify_fields(right_fields))
                .collect()
        }
        JoinType::LeftSemi | JoinType::LeftAnti => {
            // Only use the left side for the schema
            left_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect()
        }
        JoinType::RightSemi | JoinType::RightAnti => {
            // Only use the right side for the schema
            right_fields
                .map(|(q, f)| (q.cloned(), Arc::clone(f)))
                .collect()
        }
    };
    let func_dependencies = left.functional_dependencies().join(
        right.functional_dependencies(),
        join_type,
        left.fields().len(),
    );
    let mut metadata = left.metadata().clone();
    metadata.extend(right.metadata().clone());
    let dfschema = DFSchema::new_with_metadata(qualified_fields, metadata)?;
    dfschema.with_functional_dependencies(func_dependencies)
}
1332
1333
/// Add additional "synthetic" group by expressions based on functional
1334
/// dependencies.
1335
///
1336
/// For example, if we are grouping on `[c1]`, and we know from
1337
/// functional dependencies that column `c1` determines `c2`, this function
1338
/// adds `c2` to the group by list.
1339
///
1340
/// This allows MySQL style selects like
1341
/// `SELECT col FROM t WHERE pk = 5` if col is unique
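///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed).
/// With no functional dependencies recorded in the schema, the group
/// expressions pass through unchanged:
/// ```
/// # use std::sync::Arc;
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::{DFSchema, Result, TableReference};
/// # use datafusion_expr::{col, logical_plan::builder::add_group_by_exprs_from_dependencies};
/// # fn main() -> Result<()> {
/// let schema = Arc::new(DFSchema::try_from_qualified_schema(
///     TableReference::bare("t"),
///     &Schema::new(vec![Field::new("c1", DataType::Int32, false)]),
/// )?);
/// let group_expr = add_group_by_exprs_from_dependencies(vec![col("c1")], &schema)?;
/// assert_eq!(group_expr.len(), 1);
/// # Ok(())
/// # }
/// ```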
1342
0
pub fn add_group_by_exprs_from_dependencies(
1343
0
    mut group_expr: Vec<Expr>,
1344
0
    schema: &DFSchemaRef,
1345
0
) -> Result<Vec<Expr>> {
1346
0
    // Names of the fields produced by the GROUP BY exprs for example, `GROUP BY
1347
0
    // c1 + 1` produces an output field named `"c1 + 1"`
1348
0
    let mut group_by_field_names = group_expr
1349
0
        .iter()
1350
0
        .map(|e| e.schema_name().to_string())
1351
0
        .collect::<Vec<_>>();
1352
1353
0
    if let Some(target_indices) =
1354
0
        get_target_functional_dependencies(schema, &group_by_field_names)
1355
    {
1356
0
        for idx in target_indices {
1357
0
            let expr = Expr::Column(Column::from(schema.qualified_field(idx)));
1358
0
            let expr_name = expr.schema_name().to_string();
1359
0
            if !group_by_field_names.contains(&expr_name) {
1360
0
                group_by_field_names.push(expr_name);
1361
0
                group_expr.push(expr);
1362
0
            }
1363
        }
1364
0
    }
1365
0
    Ok(group_expr)
1366
0
}
1367
/// Errors if one or more expressions have equal names.
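///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// two expressions that resolve to the same output name are rejected:
/// ```
/// # use datafusion_expr::{col, logical_plan::builder::validate_unique_names};
/// let exprs = vec![col("a"), col("b").alias("a")];
/// assert!(validate_unique_names("Projections", exprs.iter()).is_err());
/// ```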
1368
0
pub fn validate_unique_names<'a>(
1369
0
    node_name: &str,
1370
0
    expressions: impl IntoIterator<Item = &'a Expr>,
1371
0
) -> Result<()> {
1372
0
    let mut unique_names = HashMap::new();
1373
0
1374
0
    expressions.into_iter().enumerate().try_for_each(|(position, expr)| {
1375
0
        let name = expr.schema_name().to_string();
1376
0
        match unique_names.get(&name) {
1377
            None => {
1378
0
                unique_names.insert(name, (position, expr));
1379
0
                Ok(())
1380
            },
1381
0
            Some((existing_position, existing_expr)) => {
1382
0
                plan_err!("{node_name} requires unique expression names \
1383
0
                             but the expression \"{existing_expr}\" at position {existing_position} and \"{expr}\" \
1384
0
                             at position {position} have the same name. Consider aliasing (\"AS\") one of them."
1385
0
                            )
1386
            }
1387
        }
1388
0
    })
1389
0
}
1390
1391
/// Union two [`LogicalPlan`]s.
1392
///
1393
/// Constructs the UNION plan, but does not perform type-coercion. Therefore the
1394
/// subtree expressions will not be properly typed until the optimizer pass.
1395
///
1396
/// If a properly typed UNION plan is needed, refer to [`TypeCoercionRewriter::coerce_union`]
1397
/// or alternatively, merge the union input schema using [`coerce_union_schema`] and
1398
/// apply the expression rewrite with [`coerce_plan_expr_for_schema`].
1399
///
1400
/// [`TypeCoercionRewriter::coerce_union`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/struct.TypeCoercionRewriter.html#method.coerce_union
1401
/// [`coerce_union_schema`]: https://docs.rs/datafusion-optimizer/latest/datafusion_optimizer/analyzer/type_coercion/fn.coerce_union_schema.html
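///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::logical_plan::{builder::union, table_scan};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
/// let left = table_scan(Some("t"), &schema, None)?.build()?;
/// let right = table_scan(Some("t"), &schema, None)?.build()?;
/// let plan = union(left, right)?;
/// assert_eq!(format!("{plan}"), "Union\n  TableScan: t\n  TableScan: t");
/// # Ok(())
/// # }
/// ```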
1402
0
pub fn union(left_plan: LogicalPlan, right_plan: LogicalPlan) -> Result<LogicalPlan> {
1403
0
    // Temporarily use the schema from the left input and later rely on the analyzer to
1404
0
    // coerce the two schemas into a common one.
1405
0
    let schema = Arc::clone(left_plan.schema());
1406
0
    Ok(LogicalPlan::Union(Union {
1407
0
        inputs: vec![Arc::new(left_plan), Arc::new(right_plan)],
1408
0
        schema,
1409
0
    }))
1410
0
}
1411
1412
/// Create Projection
1413
/// # Errors
1414
/// This function errors under any of the following conditions:
1415
/// * Two or more expressions have the same name
1416
/// * An invalid expression is used (e.g. a `sort` expression)
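///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::{col, logical_plan::{builder::project, table_scan}};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![
///     Field::new("id", DataType::Int32, false),
///     Field::new("state", DataType::Utf8, false),
/// ]);
/// let plan = table_scan(Some("t"), &schema, None)?.build()?;
/// let projected = project(plan, vec![col("id")])?;
/// assert_eq!(format!("{projected}"), "Projection: t.id\n  TableScan: t");
/// # Ok(())
/// # }
/// ```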
1417
0
pub fn project(
1418
0
    plan: LogicalPlan,
1419
0
    expr: impl IntoIterator<Item = impl Into<Expr>>,
1420
0
) -> Result<LogicalPlan> {
1421
0
    let mut projected_expr = vec![];
1422
0
    for e in expr {
1423
0
        let e = e.into();
1424
0
        match e {
1425
0
            Expr::Wildcard { .. } => projected_expr.push(e),
1426
0
            _ => projected_expr.push(columnize_expr(normalize_col(e, &plan)?, &plan)?),
1427
        }
1428
    }
1429
0
    validate_unique_names("Projections", projected_expr.iter())?;
1430
1431
0
    Projection::try_new(projected_expr, Arc::new(plan)).map(LogicalPlan::Projection)
1432
0
}
1433
1434
/// Create a SubqueryAlias to wrap a LogicalPlan.
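///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::logical_plan::{builder::subquery_alias, table_scan};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
/// let plan = table_scan(Some("t"), &schema, None)?.build()?;
/// let aliased = subquery_alias(plan, "t2")?;
/// assert_eq!(format!("{aliased}"), "SubqueryAlias: t2\n  TableScan: t");
/// # Ok(())
/// # }
/// ```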
1435
0
pub fn subquery_alias(
1436
0
    plan: LogicalPlan,
1437
0
    alias: impl Into<TableReference>,
1438
0
) -> Result<LogicalPlan> {
1439
0
    SubqueryAlias::try_new(Arc::new(plan), alias).map(LogicalPlan::SubqueryAlias)
1440
0
}
1441
1442
/// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema.
1443
/// This is mostly used for testing and documentation.
1444
0
pub fn table_scan(
1445
0
    name: Option<impl Into<TableReference>>,
1446
0
    table_schema: &Schema,
1447
0
    projection: Option<Vec<usize>>,
1448
0
) -> Result<LogicalPlanBuilder> {
1449
0
    table_scan_with_filters(name, table_schema, projection, vec![])
1450
0
}
1451
1452
/// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema,
1453
/// and inlined filters.
1454
/// This is mostly used for testing and documentation.
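///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::{col, lit, logical_plan::builder::table_scan_with_filters};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![Field::new("id", DataType::Int32, false)]);
/// let plan = table_scan_with_filters(
///     Some("t"),
///     &schema,
///     None,
///     vec![col("id").gt(lit(1))],
/// )?
/// .build()?;
/// // The filters are recorded on the TableScan node itself.
/// assert!(format!("{plan}").starts_with("TableScan: t"));
/// # Ok(())
/// # }
/// ```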
1455
0
pub fn table_scan_with_filters(
1456
0
    name: Option<impl Into<TableReference>>,
1457
0
    table_schema: &Schema,
1458
0
    projection: Option<Vec<usize>>,
1459
0
    filters: Vec<Expr>,
1460
0
) -> Result<LogicalPlanBuilder> {
1461
0
    let table_source = table_source(table_schema);
1462
0
    let name = name
1463
0
        .map(|n| n.into())
1464
0
        .unwrap_or_else(|| TableReference::bare(UNNAMED_TABLE));
1465
0
    LogicalPlanBuilder::scan_with_filters(name, table_source, projection, filters)
1466
0
}
1467
1468
/// Create a LogicalPlanBuilder representing a scan of a table with the provided name and schema,
1469
/// filters, and inlined fetch.
1470
/// This is mostly used for testing and documentation.
1471
0
pub fn table_scan_with_filter_and_fetch(
1472
0
    name: Option<impl Into<TableReference>>,
1473
0
    table_schema: &Schema,
1474
0
    projection: Option<Vec<usize>>,
1475
0
    filters: Vec<Expr>,
1476
0
    fetch: Option<usize>,
1477
0
) -> Result<LogicalPlanBuilder> {
1478
0
    let table_source = table_source(table_schema);
1479
0
    let name = name
1480
0
        .map(|n| n.into())
1481
0
        .unwrap_or_else(|| TableReference::bare(UNNAMED_TABLE));
1482
0
    LogicalPlanBuilder::scan_with_filters_fetch(
1483
0
        name,
1484
0
        table_source,
1485
0
        projection,
1486
0
        filters,
1487
0
        fetch,
1488
0
    )
1489
0
}
1490
1491
0
fn table_source(table_schema: &Schema) -> Arc<dyn TableSource> {
1492
0
    let table_schema = Arc::new(table_schema.clone());
1493
0
    Arc::new(LogicalTableSource { table_schema })
1494
0
}
1495
1496
/// Wraps the plan in a projection if any join key is a non-column expression.
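///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// plain column keys need no extra projection:
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::{col, logical_plan::{builder::wrap_projection_for_join_if_necessary, table_scan}};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![Field::new("a", DataType::Int32, false)]);
/// let input = table_scan(Some("t"), &schema, None)?.build()?;
/// let (_plan, join_on, need_project) =
///     wrap_projection_for_join_if_necessary(&[col("a")], input)?;
/// assert!(!need_project);
/// assert_eq!(join_on.len(), 1);
/// # Ok(())
/// # }
/// ```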
1497
0
pub fn wrap_projection_for_join_if_necessary(
1498
0
    join_keys: &[Expr],
1499
0
    input: LogicalPlan,
1500
0
) -> Result<(LogicalPlan, Vec<Column>, bool)> {
1501
0
    let input_schema = input.schema();
1502
0
    let alias_join_keys: Vec<Expr> = join_keys
1503
0
        .iter()
1504
0
        .map(|key| {
1505
            // The display_name() of a cast expression ignores the cast info and shows only the inner expression's name.
1506
            // If we do not add an alias, adding the projection will fail with a duplicate field name error in the schema.
1507
            // For example:
1508
            //    input scan : [a, b, c],
1509
            //    join keys: [cast(a as int)]
1510
            //
1511
            //  then a and cast(a as int) would both use the field name `a` in the projection schema.
1512
            //  https://github.com/apache/datafusion/issues/4478
1513
0
            if matches!(key, Expr::Cast(_)) || matches!(key, Expr::TryCast(_)) {
1514
0
                let alias = format!("{key}");
1515
0
                key.clone().alias(alias)
1516
            } else {
1517
0
                key.clone()
1518
            }
1519
0
        })
1520
0
        .collect::<Vec<_>>();
1521
0
1522
0
    let need_project = join_keys.iter().any(|key| !matches!(key, Expr::Column(_)));
1523
0
    let plan = if need_project {
1524
        // Include all columns from the input and extend them with the join keys
1525
0
        let mut projection = input_schema
1526
0
            .columns()
1527
0
            .into_iter()
1528
0
            .map(Expr::Column)
1529
0
            .collect::<Vec<_>>();
1530
0
        let join_key_items = alias_join_keys
1531
0
            .iter()
1532
0
            .flat_map(|expr| expr.try_as_col().is_none().then_some(expr))
1533
0
            .cloned()
1534
0
            .collect::<HashSet<Expr>>();
1535
0
        projection.extend(join_key_items);
1536
0
1537
0
        LogicalPlanBuilder::from(input)
1538
0
            .project(projection)?
1539
0
            .build()?
1540
    } else {
1541
0
        input
1542
    };
1543
1544
0
    let join_on = alias_join_keys
1545
0
        .into_iter()
1546
0
        .map(|key| {
1547
0
            if let Some(col) = key.try_as_col() {
1548
0
                Ok(col.clone())
1549
            } else {
1550
0
                let name = key.schema_name().to_string();
1551
0
                Ok(Column::from_name(name))
1552
            }
1553
0
        })
1554
0
        .collect::<Result<Vec<_>>>()?;
1555
1556
0
    Ok((plan, join_on, need_project))
1557
0
}
1558
1559
/// Basic TableSource implementation intended for use in tests and documentation. It is expected
1560
/// that users will provide their own TableSource implementations or use DataFusion's
1561
/// DefaultTableSource.
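///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed):
/// ```
/// # use std::sync::Arc;
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::Result;
/// # use datafusion_expr::logical_plan::{builder::LogicalTableSource, LogicalPlanBuilder};
/// # fn main() -> Result<()> {
/// let schema = Arc::new(Schema::new(vec![Field::new("id", DataType::Int32, false)]));
/// let source = Arc::new(LogicalTableSource::new(schema));
/// let plan = LogicalPlanBuilder::scan("t", source, None)?.build()?;
/// assert_eq!(format!("{plan}"), "TableScan: t");
/// # Ok(())
/// # }
/// ```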
1562
pub struct LogicalTableSource {
1563
    table_schema: SchemaRef,
1564
}
1565
1566
impl LogicalTableSource {
1567
    /// Create a new LogicalTableSource
1568
0
    pub fn new(table_schema: SchemaRef) -> Self {
1569
0
        Self { table_schema }
1570
0
    }
1571
}
1572
1573
impl TableSource for LogicalTableSource {
1574
0
    fn as_any(&self) -> &dyn Any {
1575
0
        self
1576
0
    }
1577
1578
0
    fn schema(&self) -> SchemaRef {
1579
0
        Arc::clone(&self.table_schema)
1580
0
    }
1581
1582
0
    fn supports_filters_pushdown(
1583
0
        &self,
1584
0
        filters: &[&Expr],
1585
0
    ) -> Result<Vec<crate::TableProviderFilterPushDown>> {
1586
0
        Ok(vec![TableProviderFilterPushDown::Exact; filters.len()])
1587
0
    }
1588
}
1589
1590
/// Create a [`LogicalPlan::Unnest`] plan
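///
/// # Example
///
/// A minimal sketch (not part of the original source; import paths assumed)
/// that unnests a list column into a scalar column:
/// ```
/// # use arrow::datatypes::{DataType, Field, Schema};
/// # use datafusion_common::{Column, Result};
/// # use datafusion_expr::logical_plan::{builder::unnest, table_scan};
/// # fn main() -> Result<()> {
/// let schema = Schema::new(vec![Field::new_list(
///     "tags",
///     Field::new("item", DataType::Utf8, true),
///     true,
/// )]);
/// let plan = table_scan(Some("t"), &schema, None)?.build()?;
/// let unnested = unnest(plan, vec![Column::from_name("tags")])?;
/// // After unnesting, the field holds scalar strings.
/// let field = unnested.schema().field_with_name(None, "tags")?;
/// assert_eq!(&DataType::Utf8, field.data_type());
/// # Ok(())
/// # }
/// ```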
1591
0
pub fn unnest(input: LogicalPlan, columns: Vec<Column>) -> Result<LogicalPlan> {
1592
0
    let unnestings = columns
1593
0
        .into_iter()
1594
0
        .map(|c| (c, ColumnUnnestType::Inferred))
1595
0
        .collect();
1596
0
    unnest_with_options(input, unnestings, UnnestOptions::default())
1597
0
}
1598
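/// Recursively strips `depth` levels of list nesting and returns the
/// element type.
///
/// # Example
///
/// A minimal sketch (not part of the original source; import path assumed):
/// ```
/// # use arrow::datatypes::DataType;
/// # use datafusion_expr::logical_plan::builder::get_unnested_list_datatype_recursive;
/// let nested = DataType::new_list(DataType::new_list(DataType::Int64, true), true);
/// let unnested = get_unnested_list_datatype_recursive(&nested, 2).unwrap();
/// assert_eq!(unnested, DataType::Int64);
/// ```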
1599
0
pub fn get_unnested_list_datatype_recursive(
1600
0
    data_type: &DataType,
1601
0
    depth: usize,
1602
0
) -> Result<DataType> {
1603
0
    match data_type {
1604
0
        DataType::List(field)
1605
0
        | DataType::FixedSizeList(field, _)
1606
0
        | DataType::LargeList(field) => {
1607
0
            if depth == 1 {
1608
0
                return Ok(field.data_type().clone());
1609
0
            }
1610
0
            return get_unnested_list_datatype_recursive(field.data_type(), depth - 1);
1611
        }
1612
0
        _ => {}
1613
0
    };
1614
0
1615
0
    internal_err!("trying to unnest on invalid data type {:?}", data_type)
1616
0
}
1617
1618
/// Infer the unnest type based on the data type:
1619
/// - list type: infer to unnest(list(col, depth=1))
1620
/// - struct type: infer to unnest(struct)
1621
0
fn infer_unnest_type(
1622
0
    col_name: &String,
1623
0
    data_type: &DataType,
1624
0
) -> Result<ColumnUnnestType> {
1625
0
    match data_type {
1626
        DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => {
1627
0
            Ok(ColumnUnnestType::List(vec![ColumnUnnestList {
1628
0
                output_column: Column::from_name(col_name),
1629
0
                depth: 1,
1630
0
            }]))
1631
        }
1632
0
        DataType::Struct(_) => Ok(ColumnUnnestType::Struct),
1633
        _ => {
1634
0
            internal_err!("trying to unnest on invalid data type {:?}", data_type)
1635
        }
1636
    }
1637
0
}
1638
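/// Returns one output column per struct field, named `col_name.field_name`.
///
/// # Example
///
/// A minimal sketch (not part of the original source; import path assumed):
/// ```
/// # use arrow::datatypes::{DataType, Field, Fields};
/// # use datafusion_expr::logical_plan::builder::get_struct_unnested_columns;
/// let fields = Fields::from(vec![
///     Field::new("a", DataType::UInt32, false),
///     Field::new("b", DataType::UInt32, false),
/// ]);
/// let cols = get_struct_unnested_columns(&"s".to_string(), &fields);
/// assert_eq!(cols[0].name, "s.a");
/// assert_eq!(cols[1].name, "s.b");
/// ```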
1639
0
pub fn get_struct_unnested_columns(
1640
0
    col_name: &String,
1641
0
    inner_fields: &Fields,
1642
0
) -> Vec<Column> {
1643
0
    inner_fields
1644
0
        .iter()
1645
0
        .map(|f| Column::from_name(format!("{}.{}", col_name, f.name())))
1646
0
        .collect()
1647
0
}
1648
1649
// Based on the data type (either a struct or a list variant),
1650
// return the set of columns that result from unnesting
1651
// the input column.
1652
// For example, given a column with name "a",
1653
// - List(Element) returns ["a"] with data type Element
1654
// - Struct(field1, field2) returns ["a.field1","a.field2"]
1655
// For list data types, the `depth` argument specifies
1656
// the recursion level.
1657
0
pub fn get_unnested_columns(
1658
0
    col_name: &String,
1659
0
    data_type: &DataType,
1660
0
    depth: usize,
1661
0
) -> Result<Vec<(Column, Arc<Field>)>> {
1662
0
    let mut qualified_columns = Vec::with_capacity(1);
1663
0
1664
0
    match data_type {
1665
        DataType::List(_) | DataType::FixedSizeList(_, _) | DataType::LargeList(_) => {
1666
0
            let data_type = get_unnested_list_datatype_recursive(data_type, depth)?;
1667
0
            let new_field = Arc::new(Field::new(
1668
0
                col_name, data_type,
1669
0
                // Unnesting may produce NULLs even if the list is not null.
1670
0
                // For example: unnest([1], []) -> 1, null
1671
0
                true,
1672
0
            ));
1673
0
            let column = Column::from_name(col_name);
1674
0
            // let column = Column::from((None, &new_field));
1675
0
            qualified_columns.push((column, new_field));
1676
        }
1677
0
        DataType::Struct(fields) => {
1678
0
            qualified_columns.extend(fields.iter().map(|f| {
1679
0
                let new_name = format!("{}.{}", col_name, f.name());
1680
0
                let column = Column::from_name(&new_name);
1681
0
                let new_field = f.as_ref().clone().with_name(new_name);
1682
0
                // let column = Column::from((None, &f));
1683
0
                (column, Arc::new(new_field))
1684
0
            }))
1685
        }
1686
        _ => {
1687
0
            return internal_err!(
1688
0
                "trying to unnest on invalid data type {:?}",
1689
0
                data_type
1690
0
            );
1691
        }
1692
    };
1693
0
    Ok(qualified_columns)
1694
0
}
1695
1696
/// Create a [`LogicalPlan::Unnest`] plan with options
1697
/// This function receives a list of columns to be unnested
1698
/// because multiple unnests can be performed on the same column (e.g. unnesting with different depths).
1699
/// The new schema will contain post-unnest fields replacing the original field.
1700
///
1701
/// For example:
1702
/// Input schema as
1703
/// ```text
1704
/// +---------------------+-------------------+
1705
/// | col1                | col2              |
1706
/// +---------------------+-------------------+
1707
/// | Struct(INT64,INT32) | List(List(Int64)) |
1708
/// +---------------------+-------------------+
1709
/// ```
1710
///
1711
///
1712
///
1713
/// Then unnesting columns with:
1714
/// - (col1,Struct)
1715
/// - (col2,List(\[depth=1,depth=2\]))
1716
///
1717
/// will generate a new schema as
1718
/// ```text
1719
/// +---------+---------+---------------------+---------------------+
1720
/// | col1.c0 | col1.c1 | unnest_col2_depth_1 | unnest_col2_depth_2 |
1721
/// +---------+---------+---------------------+---------------------+
1722
/// | Int64   | Int32   | List(Int64)         |  Int64              |
1723
/// +---------+---------+---------------------+---------------------+
1724
/// ```
1725
0
pub fn unnest_with_options(
1726
0
    input: LogicalPlan,
1727
0
    columns_to_unnest: Vec<(Column, ColumnUnnestType)>,
1728
0
    options: UnnestOptions,
1729
0
) -> Result<LogicalPlan> {
1730
0
    let mut list_columns: Vec<(usize, ColumnUnnestList)> = vec![];
1731
0
    let mut struct_columns = vec![];
1732
0
    let indices_to_unnest = columns_to_unnest
1733
0
        .iter()
1734
0
        .map(|col_unnesting| {
1735
0
            Ok((
1736
0
                input.schema().index_of_column(&col_unnesting.0)?,
1737
0
                col_unnesting,
1738
            ))
1739
0
        })
1740
0
        .collect::<Result<HashMap<usize, &(Column, ColumnUnnestType)>>>()?;
1741
1742
0
    let input_schema = input.schema();
1743
0
1744
0
    let mut dependency_indices = vec![];
1745
    // Transform input schema into new schema
1746
    // Given this comprehensive example
1747
    //
1748
    // input schema:
1749
    // 1.col1_unnest_placeholder: list[list[int]],
1750
    // 2.col1: list[list[int]]
1751
    // 3.col2: list[int]
1752
    // with unnest on unnest(col1,depth=2), unnest(col1,depth=1) and unnest(col2,depth=1)
1753
    // output schema:
1754
    // 1.unnest_col1_depth_2: int
1755
    // 2.unnest_col1_depth_1: list[int]
1756
    // 3.col1: list[list[int]]
1757
    // 4.unnest_col2_depth_1: int
1758
    // That is, the placeholder column is replaced by its unnested variation(s);
1759
    // note the plural.
1760
0
    let fields = input_schema
1761
0
        .iter()
1762
0
        .enumerate()
1763
0
        .map(|(index, (original_qualifier, original_field))| {
1764
0
            match indices_to_unnest.get(&index) {
1765
0
                Some((column_to_unnest, unnest_type)) => {
1766
0
                    let mut inferred_unnest_type = unnest_type.clone();
1767
0
                    if let ColumnUnnestType::Inferred = unnest_type {
1768
0
                        inferred_unnest_type = infer_unnest_type(
1769
0
                            &column_to_unnest.name,
1770
0
                            original_field.data_type(),
1771
0
                        )?;
1772
0
                    }
1773
0
                    let transformed_columns: Vec<(Column, Arc<Field>)> =
1774
0
                        match inferred_unnest_type {
1775
                            ColumnUnnestType::Struct => {
1776
0
                                struct_columns.push(index);
1777
0
                                get_unnested_columns(
1778
0
                                    &column_to_unnest.name,
1779
0
                                    original_field.data_type(),
1780
0
                                    1,
1781
0
                                )?
1782
                            }
1783
0
                            ColumnUnnestType::List(unnest_lists) => {
1784
0
                                list_columns.extend(
1785
0
                                    unnest_lists
1786
0
                                        .iter()
1787
0
                                        .map(|ul| (index, ul.to_owned().clone())),
1788
0
                                );
1789
0
                                unnest_lists
1790
0
                                    .iter()
1791
0
                                    .map(
1792
0
                                        |ColumnUnnestList {
1793
                                             output_column,
1794
                                             depth,
1795
0
                                         }| {
1796
0
                                            get_unnested_columns(
1797
0
                                                &output_column.name,
1798
0
                                                original_field.data_type(),
1799
0
                                                *depth,
1800
0
                                            )
1801
0
                                        },
1802
0
                                    )
1803
0
                                    .collect::<Result<Vec<Vec<(Column, Arc<Field>)>>>>()?
1804
0
                                    .into_iter()
1805
0
                                    .flatten()
1806
0
                                    .collect::<Vec<_>>()
1807
                            }
1808
0
                            _ => return internal_err!("Invalid unnest type"),
1809
                        };
1810
                    // new columns dependent on the same original index
1811
0
                    dependency_indices
1812
0
                        .extend(std::iter::repeat(index).take(transformed_columns.len()));
1813
0
                    Ok(transformed_columns
1814
0
                        .iter()
1815
0
                        .map(|(col, data_type)| {
1816
0
                            (col.relation.to_owned(), data_type.to_owned())
1817
0
                        })
1818
0
                        .collect())
1819
                }
1820
                None => {
1821
0
                    dependency_indices.push(index);
1822
0
                    Ok(vec![(
1823
0
                        original_qualifier.cloned(),
1824
0
                        Arc::clone(original_field),
1825
0
                    )])
1826
                }
1827
            }
1828
0
        })
1829
0
        .collect::<Result<Vec<_>>>()?
1830
0
        .into_iter()
1831
0
        .flatten()
1832
0
        .collect::<Vec<_>>();
1833
0
1834
0
    let metadata = input_schema.metadata().clone();
1835
0
    let df_schema = DFSchema::new_with_metadata(fields, metadata)?;
1836
    // We can use the existing functional dependencies:
1837
0
    let deps = input_schema.functional_dependencies().clone();
1838
0
    let schema = Arc::new(df_schema.with_functional_dependencies(deps)?);
1839
1840
0
    Ok(LogicalPlan::Unnest(Unnest {
1841
0
        input: Arc::new(input),
1842
0
        exec_columns: columns_to_unnest,
1843
0
        list_type_columns: list_columns,
1844
0
        struct_type_columns: struct_columns,
1845
0
        dependency_indices,
1846
0
        schema,
1847
0
        options,
1848
0
    }))
1849
0
}
1850
1851
#[cfg(test)]
1852
mod tests {
1853
1854
    use super::*;
1855
    use crate::logical_plan::StringifiedPlan;
1856
    use crate::{col, expr, expr_fn::exists, in_subquery, lit, scalar_subquery};
1857
1858
    use datafusion_common::SchemaError;
1859
1860
    #[test]
1861
    fn plan_builder_simple() -> Result<()> {
1862
        let plan =
1863
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![0, 3]))?
1864
                .filter(col("state").eq(lit("CO")))?
1865
                .project(vec![col("id")])?
1866
                .build()?;
1867
1868
        let expected = "Projection: employee_csv.id\
1869
        \n  Filter: employee_csv.state = Utf8(\"CO\")\
1870
        \n    TableScan: employee_csv projection=[id, state]";
1871
1872
        assert_eq!(expected, format!("{plan}"));
1873
1874
        Ok(())
1875
    }
1876
1877
    #[test]
1878
    fn plan_builder_schema() {
1879
        let schema = employee_schema();
1880
        let projection = None;
1881
        let plan =
1882
            LogicalPlanBuilder::scan("employee_csv", table_source(&schema), projection)
1883
                .unwrap();
1884
        let expected = DFSchema::try_from_qualified_schema(
1885
            TableReference::bare("employee_csv"),
1886
            &schema,
1887
        )
1888
        .unwrap();
1889
        assert_eq!(&expected, plan.schema().as_ref());
1890
1891
        // Note scan of "EMPLOYEE_CSV" is treated as a SQL identifier
1892
        // (and thus normalized to "employee_csv") as well
1893
        let projection = None;
1894
        let plan =
1895
            LogicalPlanBuilder::scan("EMPLOYEE_CSV", table_source(&schema), projection)
1896
                .unwrap();
1897
        assert_eq!(&expected, plan.schema().as_ref());
1898
    }
1899
1900
    #[test]
1901
    fn plan_builder_empty_name() {
1902
        let schema = employee_schema();
1903
        let projection = None;
1904
        let err =
1905
            LogicalPlanBuilder::scan("", table_source(&schema), projection).unwrap_err();
1906
        assert_eq!(
1907
            err.strip_backtrace(),
1908
            "Error during planning: table_name cannot be empty"
1909
        );
1910
    }
1911
1912
    #[test]
1913
    fn plan_builder_sort() -> Result<()> {
1914
        let plan =
1915
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?
1916
                .sort(vec![
1917
                    expr::Sort::new(col("state"), true, true),
1918
                    expr::Sort::new(col("salary"), false, false),
1919
                ])?
1920
                .build()?;
1921
1922
        let expected = "Sort: employee_csv.state ASC NULLS FIRST, employee_csv.salary DESC NULLS LAST\
1923
        \n  TableScan: employee_csv projection=[state, salary]";
1924
1925
        assert_eq!(expected, format!("{plan}"));
1926
1927
        Ok(())
1928
    }
1929
1930
    #[test]
1931
    fn plan_builder_union() -> Result<()> {
1932
        let plan =
1933
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?;
1934
1935
        let plan = plan
1936
            .clone()
1937
            .union(plan.clone().build()?)?
1938
            .union(plan.clone().build()?)?
1939
            .union(plan.build()?)?
1940
            .build()?;
1941
1942
        let expected = "Union\
1943
        \n  Union\
1944
        \n    Union\
1945
        \n      TableScan: employee_csv projection=[state, salary]\
1946
        \n      TableScan: employee_csv projection=[state, salary]\
1947
        \n    TableScan: employee_csv projection=[state, salary]\
1948
        \n  TableScan: employee_csv projection=[state, salary]";
1949
1950
        assert_eq!(expected, format!("{plan}"));
1951
1952
        Ok(())
1953
    }
1954
1955
    #[test]
1956
    fn plan_builder_union_distinct() -> Result<()> {
1957
        let plan =
1958
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?;
1959
1960
        let plan = plan
1961
            .clone()
1962
            .union_distinct(plan.clone().build()?)?
1963
            .union_distinct(plan.clone().build()?)?
1964
            .union_distinct(plan.build()?)?
1965
            .build()?;
1966
1967
        let expected = "\
1968
        Distinct:\
1969
        \n  Union\
1970
        \n    Distinct:\
1971
        \n      Union\
1972
        \n        Distinct:\
1973
        \n          Union\
1974
        \n            TableScan: employee_csv projection=[state, salary]\
1975
        \n            TableScan: employee_csv projection=[state, salary]\
1976
        \n        TableScan: employee_csv projection=[state, salary]\
1977
        \n    TableScan: employee_csv projection=[state, salary]";
1978
1979
        assert_eq!(expected, format!("{plan}"));
1980
1981
        Ok(())
1982
    }
1983
1984
    #[test]
1985
    fn plan_builder_simple_distinct() -> Result<()> {
1986
        let plan =
1987
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![0, 3]))?
1988
                .filter(col("state").eq(lit("CO")))?
1989
                .project(vec![col("id")])?
1990
                .distinct()?
1991
                .build()?;
1992
1993
        let expected = "\
1994
        Distinct:\
1995
        \n  Projection: employee_csv.id\
1996
        \n    Filter: employee_csv.state = Utf8(\"CO\")\
1997
        \n      TableScan: employee_csv projection=[id, state]";
1998
1999
        assert_eq!(expected, format!("{plan}"));
2000
2001
        Ok(())
2002
    }
2003
2004
    #[test]
2005
    fn exists_subquery() -> Result<()> {
2006
        let foo = test_table_scan_with_name("foo")?;
2007
        let bar = test_table_scan_with_name("bar")?;
2008
2009
        let subquery = LogicalPlanBuilder::from(foo)
2010
            .project(vec![col("a")])?
2011
            .filter(col("a").eq(col("bar.a")))?
2012
            .build()?;
2013
2014
        let outer_query = LogicalPlanBuilder::from(bar)
2015
            .project(vec![col("a")])?
2016
            .filter(exists(Arc::new(subquery)))?
2017
            .build()?;
2018
2019
        let expected = "Filter: EXISTS (<subquery>)\
2020
        \n  Subquery:\
2021
        \n    Filter: foo.a = bar.a\
2022
        \n      Projection: foo.a\
2023
        \n        TableScan: foo\
2024
        \n  Projection: bar.a\
2025
        \n    TableScan: bar";
2026
        assert_eq!(expected, format!("{outer_query}"));
2027
2028
        Ok(())
2029
    }
2030
2031
    #[test]
2032
    fn filter_in_subquery() -> Result<()> {
2033
        let foo = test_table_scan_with_name("foo")?;
2034
        let bar = test_table_scan_with_name("bar")?;
2035
2036
        let subquery = LogicalPlanBuilder::from(foo)
2037
            .project(vec![col("a")])?
2038
            .filter(col("a").eq(col("bar.a")))?
2039
            .build()?;
2040
2041
        // SELECT a FROM bar WHERE a IN (SELECT a FROM foo WHERE a = bar.a)
2042
        let outer_query = LogicalPlanBuilder::from(bar)
2043
            .project(vec![col("a")])?
2044
            .filter(in_subquery(col("a"), Arc::new(subquery)))?
2045
            .build()?;
2046
2047
        let expected = "Filter: bar.a IN (<subquery>)\
2048
        \n  Subquery:\
2049
        \n    Filter: foo.a = bar.a\
2050
        \n      Projection: foo.a\
2051
        \n        TableScan: foo\
2052
        \n  Projection: bar.a\
2053
        \n    TableScan: bar";
2054
        assert_eq!(expected, format!("{outer_query}"));
2055
2056
        Ok(())
2057
    }
2058
2059
    #[test]
2060
    fn select_scalar_subquery() -> Result<()> {
2061
        let foo = test_table_scan_with_name("foo")?;
2062
        let bar = test_table_scan_with_name("bar")?;
2063
2064
        let subquery = LogicalPlanBuilder::from(foo)
2065
            .project(vec![col("b")])?
2066
            .filter(col("a").eq(col("bar.a")))?
2067
            .build()?;
2068
2069
        // SELECT (SELECT a FROM foo WHERE a = bar.a) FROM bar
2070
        let outer_query = LogicalPlanBuilder::from(bar)
2071
            .project(vec![scalar_subquery(Arc::new(subquery))])?
2072
            .build()?;
2073
2074
        let expected = "Projection: (<subquery>)\
2075
        \n  Subquery:\
2076
        \n    Filter: foo.a = bar.a\
2077
        \n      Projection: foo.b\
2078
        \n        TableScan: foo\
2079
        \n  TableScan: bar";
2080
        assert_eq!(expected, format!("{outer_query}"));
2081
2082
        Ok(())
2083
    }
2084
2085
    #[test]
2086
    fn projection_non_unique_names() -> Result<()> {
2087
        let plan = table_scan(
2088
            Some("employee_csv"),
2089
            &employee_schema(),
2090
            // project id and first_name by column index
2091
            Some(vec![0, 1]),
2092
        )?
2093
        // two columns with the same name => error
2094
        .project(vec![col("id"), col("first_name").alias("id")]);
2095
2096
        match plan {
2097
            Err(DataFusionError::SchemaError(
2098
                SchemaError::AmbiguousReference {
2099
                    field:
2100
                        Column {
2101
                            relation: Some(TableReference::Bare { table }),
2102
                            name,
2103
                        },
2104
                },
2105
                _,
2106
            )) => {
2107
                assert_eq!(*"employee_csv", *table);
2108
                assert_eq!("id", &name);
2109
                Ok(())
2110
            }
2111
            _ => plan_err!("Plan should have returned a DataFusionError::SchemaError"),
2112
        }
2113
    }
2114
2115
    fn employee_schema() -> Schema {
2116
        Schema::new(vec![
2117
            Field::new("id", DataType::Int32, false),
2118
            Field::new("first_name", DataType::Utf8, false),
2119
            Field::new("last_name", DataType::Utf8, false),
2120
            Field::new("state", DataType::Utf8, false),
2121
            Field::new("salary", DataType::Int32, false),
2122
        ])
2123
    }
2124
2125
    #[test]
2126
    fn stringified_plan() {
2127
        let stringified_plan =
2128
            StringifiedPlan::new(PlanType::InitialLogicalPlan, "...the plan...");
2129
        assert!(stringified_plan.should_display(true));
2130
        assert!(!stringified_plan.should_display(false)); // not in non verbose mode
2131
2132
        let stringified_plan =
2133
            StringifiedPlan::new(PlanType::FinalLogicalPlan, "...the plan...");
2134
        assert!(stringified_plan.should_display(true));
2135
        assert!(stringified_plan.should_display(false)); // display in non verbose mode too
2136
2137
        let stringified_plan =
2138
            StringifiedPlan::new(PlanType::InitialPhysicalPlan, "...the plan...");
2139
        assert!(stringified_plan.should_display(true));
2140
        assert!(!stringified_plan.should_display(false)); // not in non verbose mode
2141
2142
        let stringified_plan =
2143
            StringifiedPlan::new(PlanType::FinalPhysicalPlan, "...the plan...");
2144
        assert!(stringified_plan.should_display(true));
2145
        assert!(stringified_plan.should_display(false)); // display in non verbose mode
2146
2147
        let stringified_plan = StringifiedPlan::new(
2148
            PlanType::OptimizedLogicalPlan {
2149
                optimizer_name: "random opt pass".into(),
2150
            },
2151
            "...the plan...",
2152
        );
2153
        assert!(stringified_plan.should_display(true));
2154
        assert!(!stringified_plan.should_display(false));
2155
    }
2156
2157
    fn test_table_scan_with_name(name: &str) -> Result<LogicalPlan> {
2158
        let schema = Schema::new(vec![
2159
            Field::new("a", DataType::UInt32, false),
2160
            Field::new("b", DataType::UInt32, false),
2161
            Field::new("c", DataType::UInt32, false),
2162
        ]);
2163
        table_scan(Some(name), &schema, None)?.build()
2164
    }
2165
2166
    #[test]
2167
    fn plan_builder_intersect_different_num_columns_error() -> Result<()> {
2168
        let plan1 =
2169
            table_scan(TableReference::none(), &employee_schema(), Some(vec![3]))?;
2170
        let plan2 =
2171
            table_scan(TableReference::none(), &employee_schema(), Some(vec![3, 4]))?;
2172
2173
        let expected = "Error during planning: INTERSECT/EXCEPT query must have the same number of columns. \
2174
         Left is 1 and right is 2.";
2175
        let err_msg1 =
2176
            LogicalPlanBuilder::intersect(plan1.build()?, plan2.build()?, true)
2177
                .unwrap_err();
2178
2179
        assert_eq!(err_msg1.strip_backtrace(), expected);
2180
2181
        Ok(())
2182
    }
2183
2184
    #[test]
2185
    fn plan_builder_unnest() -> Result<()> {
2186
        // Cannot unnest on a scalar column
2187
        let err = nested_table_scan("test_table")?
2188
            .unnest_column("scalar")
2189
            .unwrap_err();
2190
        assert!(err
2191
            .to_string()
2192
            .starts_with("Internal error: trying to unnest on invalid data type UInt32"));
2193
2194
        // Unnesting the strings list.
2195
        let plan = nested_table_scan("test_table")?
2196
            .unnest_column("strings")?
2197
            .build()?;
2198
2199
        let expected = "\
2200
        Unnest: lists[test_table.strings|depth=1] structs[]\
2201
        \n  TableScan: test_table";
2202
        assert_eq!(expected, format!("{plan}"));
2203
2204
        // Check unnested field is a scalar
2205
        let field = plan.schema().field_with_name(None, "strings").unwrap();
2206
        assert_eq!(&DataType::Utf8, field.data_type());
2207
2208
        // Unnesting the singular struct column results in 2 new columns, one for each subfield
2209
        let plan = nested_table_scan("test_table")?
2210
            .unnest_column("struct_singular")?
2211
            .build()?;
2212
2213
        let expected = "\
2214
        Unnest: lists[] structs[test_table.struct_singular]\
2215
        \n  TableScan: test_table";
2216
        assert_eq!(expected, format!("{plan}"));
2217
2218
        for field_name in &["a", "b"] {
2219
            // Check unnested struct field is a scalar
2220
            let field = plan
2221
                .schema()
2222
                .field_with_name(None, &format!("struct_singular.{}", field_name))
2223
                .unwrap();
2224
            assert_eq!(&DataType::UInt32, field.data_type());
2225
        }
2226
2227
        // Unnesting multiple fields in separate plans
2228
        let plan = nested_table_scan("test_table")?
2229
            .unnest_column("strings")?
2230
            .unnest_column("structs")?
2231
            .unnest_column("struct_singular")?
2232
            .build()?;
2233
2234
        let expected = "\
2235
        Unnest: lists[] structs[test_table.struct_singular]\
2236
        \n  Unnest: lists[test_table.structs|depth=1] structs[]\
2237
        \n    Unnest: lists[test_table.strings|depth=1] structs[]\
2238
        \n      TableScan: test_table";
2239
        assert_eq!(expected, format!("{plan}"));
2240
2241
        // The unnested struct list field should still be a struct.
2242
        let field = plan.schema().field_with_name(None, "structs").unwrap();
2243
        assert!(matches!(field.data_type(), DataType::Struct(_)));
2244
2245
        // Unnesting multiple fields at the same time, using infer syntax
2246
        let cols = vec!["strings", "structs", "struct_singular"]
2247
            .into_iter()
2248
            .map(|c| c.into())
2249
            .collect();
2250
2251
        let plan = nested_table_scan("test_table")?
2252
            .unnest_columns_with_options(cols, UnnestOptions::default())?
2253
            .build()?;
2254
2255
        let expected = "\
2256
        Unnest: lists[test_table.strings|depth=1, test_table.structs|depth=1] structs[test_table.struct_singular]\
2257
        \n  TableScan: test_table";
2258
        assert_eq!(expected, format!("{plan}"));
2259
2260
        // Unnesting a missing column should fail.
2261
        let plan = nested_table_scan("test_table")?.unnest_column("missing");
2262
        assert!(plan.is_err());
2263
2264
        // Simultaneously unnesting a list (at different depths) and a struct column
2265
        let plan = nested_table_scan("test_table")?
2266
            .unnest_columns_recursive_with_options(
2267
                vec![
2268
                    (
2269
                        "stringss".into(),
2270
                        ColumnUnnestType::List(vec![
2271
                            ColumnUnnestList {
2272
                                output_column: Column::from_name("stringss_depth_1"),
2273
                                depth: 1,
2274
                            },
2275
                            ColumnUnnestList {
2276
                                output_column: Column::from_name("stringss_depth_2"),
2277
                                depth: 2,
2278
                            },
2279
                        ]),
2280
                    ),
2281
                    ("struct_singular".into(), ColumnUnnestType::Inferred),
2282
                ],
2283
                UnnestOptions::default(),
2284
            )?
2285
            .build()?;
2286
2287
        let expected = "\
2288
        Unnest: lists[test_table.stringss|depth=1, test_table.stringss|depth=2] structs[test_table.struct_singular]\
2289
        \n  TableScan: test_table";
2290
        assert_eq!(expected, format!("{plan}"));
2291
2292
        // Check that the output columns have the correct types
2293
        let field = plan
2294
            .schema()
2295
            .field_with_name(None, "stringss_depth_1")
2296
            .unwrap();
2297
        assert_eq!(
2298
            &DataType::new_list(DataType::Utf8, false),
2299
            field.data_type()
2300
        );
2301
        let field = plan
2302
            .schema()
2303
            .field_with_name(None, "stringss_depth_2")
2304
            .unwrap();
2305
        assert_eq!(&DataType::Utf8, field.data_type());
2306
        // unnesting struct is still correct
2307
        for field_name in &["a", "b"] {
2308
            let field = plan
2309
                .schema()
2310
                .field_with_name(None, &format!("struct_singular.{}", field_name))
2311
                .unwrap();
2312
            assert_eq!(&DataType::UInt32, field.data_type());
2313
        }
2314
2315
        Ok(())
2316
    }
2317
2318
    fn nested_table_scan(table_name: &str) -> Result<LogicalPlanBuilder> {
2319
        // Create a schema with a scalar field, a list of strings, a list of structs
2320
        // a singular struct, and a doubly nested list of strings
2321
        let struct_field_in_list = Field::new_struct(
2322
            "item",
2323
            vec![
2324
                Field::new("a", DataType::UInt32, false),
2325
                Field::new("b", DataType::UInt32, false),
2326
            ],
2327
            false,
2328
        );
2329
        let string_field = Field::new("item", DataType::Utf8, false);
2330
        let strings_field = Field::new_list("item", string_field.clone(), false);
2331
        let schema = Schema::new(vec![
2332
            Field::new("scalar", DataType::UInt32, false),
2333
            Field::new_list("strings", string_field, false),
2334
            Field::new_list("structs", struct_field_in_list, false),
2335
            Field::new(
2336
                "struct_singular",
2337
                DataType::Struct(Fields::from(vec![
2338
                    Field::new("a", DataType::UInt32, false),
2339
                    Field::new("b", DataType::UInt32, false),
2340
                ])),
2341
                false,
2342
            ),
2343
            Field::new_list("stringss", strings_field, false),
2344
        ]);
2345
2346
        table_scan(Some(table_name), &schema, None)
2347
    }
2348
2349
    #[test]
2350
    fn test_union_after_join() -> Result<()> {
2351
        let values = vec![vec![lit(1)]];
2352
2353
        let left = LogicalPlanBuilder::values(values.clone())?
2354
            .alias("left")?
2355
            .build()?;
2356
        let right = LogicalPlanBuilder::values(values)?
2357
            .alias("right")?
2358
            .build()?;
2359
2360
        let join = LogicalPlanBuilder::from(left).cross_join(right)?.build()?;
2361
2362
        let _ = LogicalPlanBuilder::from(join.clone())
2363
            .union(join)?
2364
            .build()?;
2365
2366
        Ok(())
2367
    }
2368
2369
    #[test]
2370
    fn test_change_redundant_column() -> Result<()> {
2371
        let t1_field_1 = Field::new("a", DataType::Int32, false);
2372
        let t2_field_1 = Field::new("a", DataType::Int32, false);
2373
        let t2_field_3 = Field::new("a", DataType::Int32, false);
2374
        let t1_field_2 = Field::new("b", DataType::Int32, false);
2375
        let t2_field_2 = Field::new("b", DataType::Int32, false);
2376
2377
        let field_vec = vec![t1_field_1, t2_field_1, t1_field_2, t2_field_2, t2_field_3];
2378
        let remove_redundant = change_redundant_column(&Fields::from(field_vec));
2379
2380
        assert_eq!(
2381
            remove_redundant,
2382
            vec![
2383
                Field::new("a", DataType::Int32, false),
2384
                Field::new("a:1", DataType::Int32, false),
2385
                Field::new("b", DataType::Int32, false),
2386
                Field::new("b:1", DataType::Int32, false),
2387
                Field::new("a:2", DataType::Int32, false),
2388
            ]
2389
        );
2390
        Ok(())
2391
    }
2392
2393
    #[test]
2394
    fn plan_builder_from_logical_plan() -> Result<()> {
2395
        let plan =
2396
            table_scan(Some("employee_csv"), &employee_schema(), Some(vec![3, 4]))?
2397
                .sort(vec![
2398
                    expr::Sort::new(col("state"), true, true),
2399
                    expr::Sort::new(col("salary"), false, false),
2400
                ])?
2401
                .build()?;
2402
2403
        let plan_expected = format!("{plan}");
2404
        let plan_builder: LogicalPlanBuilder = Arc::new(plan).into();
2405
        assert_eq!(plan_expected, format!("{}", plan_builder.plan));
2406
2407
        Ok(())
2408
    }
2409
}