Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/execution_plan.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::any::Any;
19
use std::fmt::Debug;
20
use std::sync::Arc;
21
22
use arrow::datatypes::SchemaRef;
23
use arrow::record_batch::RecordBatch;
24
use futures::stream::{StreamExt, TryStreamExt};
25
use tokio::task::JoinSet;
26
27
use datafusion_common::config::ConfigOptions;
28
pub use datafusion_common::hash_utils;
29
pub use datafusion_common::utils::project_schema;
30
use datafusion_common::{exec_err, Result};
31
pub use datafusion_common::{internal_err, ColumnStatistics, Statistics};
32
use datafusion_execution::TaskContext;
33
pub use datafusion_execution::{RecordBatchStream, SendableRecordBatchStream};
34
pub use datafusion_expr::{Accumulator, ColumnarValue};
35
pub use datafusion_physical_expr::window::WindowExpr;
36
pub use datafusion_physical_expr::{
37
    expressions, udf, Distribution, Partitioning, PhysicalExpr,
38
};
39
use datafusion_physical_expr::{EquivalenceProperties, LexOrdering, PhysicalSortExpr};
40
use datafusion_physical_expr_common::sort_expr::LexRequirement;
41
42
use crate::coalesce_partitions::CoalescePartitionsExec;
43
use crate::display::DisplayableExecutionPlan;
44
pub use crate::display::{DefaultDisplay, DisplayAs, DisplayFormatType, VerboseDisplay};
45
pub use crate::metrics::Metric;
46
use crate::metrics::MetricsSet;
47
pub use crate::ordering::InputOrderMode;
48
use crate::repartition::RepartitionExec;
49
use crate::sorts::sort_preserving_merge::SortPreservingMergeExec;
50
pub use crate::stream::EmptyRecordBatchStream;
51
use crate::stream::RecordBatchStreamAdapter;
52
53
/// Represent nodes in the DataFusion Physical Plan.
54
///
55
/// Calling [`execute`] produces an `async` [`SendableRecordBatchStream`] of
56
/// [`RecordBatch`] that incrementally computes a partition of the
57
/// `ExecutionPlan`'s output from its input. See [`Partitioning`] for more
58
/// details on partitioning.
59
///
60
/// Methods such as [`Self::schema`] and [`Self::properties`] communicate
61
/// properties of the output to the DataFusion optimizer, and methods such as
62
/// [`required_input_distribution`] and [`required_input_ordering`] express
63
/// requirements of the `ExecutionPlan` from its input.
64
///
65
/// [`ExecutionPlan`] can be displayed in a simplified form using the
66
/// return value from [`displayable`] in addition to the (normally
67
/// quite verbose) `Debug` output.
68
///
69
/// [`execute`]: ExecutionPlan::execute
70
/// [`required_input_distribution`]: ExecutionPlan::required_input_distribution
71
/// [`required_input_ordering`]: ExecutionPlan::required_input_ordering
72
pub trait ExecutionPlan: Debug + DisplayAs + Send + Sync {
73
    /// Short name for the ExecutionPlan, such as 'ParquetExec'.
74
    ///
75
    /// Implementation note: this method can just proxy to
76
    /// [`static_name`](ExecutionPlan::static_name) if no special action is
77
    /// needed. It doesn't provide a default implementation like that because
78
    /// this method doesn't require the `Sized` constraint to allow a wider
79
    /// range of use cases.
80
    fn name(&self) -> &str;
81
82
    /// Short name for the ExecutionPlan, such as 'ParquetExec'.
83
    /// Like [`name`](ExecutionPlan::name) but can be called without an instance.
84
1
    fn static_name() -> &'static str
85
1
    where
86
1
        Self: Sized,
87
1
    {
88
1
        let full_name = std::any::type_name::<Self>();
89
1
        let maybe_start_idx = full_name.rfind(':');
90
1
        match maybe_start_idx {
91
1
            Some(start_idx) => &full_name[start_idx + 1..],
92
0
            None => "UNKNOWN",
93
        }
94
1
    }
95
96
    /// Returns the execution plan as [`Any`] so that it can be
97
    /// downcast to a specific implementation.
98
    fn as_any(&self) -> &dyn Any;
99
100
    /// Get the schema for this execution plan
101
25.7k
    fn schema(&self) -> SchemaRef {
102
25.7k
        Arc::clone(self.properties().schema())
103
25.7k
    }
104
105
    /// Return properties of the output of the `ExecutionPlan`, such as output
106
    /// ordering(s), partitioning information etc.
107
    ///
108
    /// This information is available via methods on [`ExecutionPlanProperties`]
109
    /// trait, which is implemented for all `ExecutionPlan`s.
110
    fn properties(&self) -> &PlanProperties;
111
112
    /// Specifies the data distribution requirements for all the
113
    /// children for this `ExecutionPlan`. By default it's [`Distribution::UnspecifiedDistribution`] for each child.
114
0
    fn required_input_distribution(&self) -> Vec<Distribution> {
115
0
        vec![Distribution::UnspecifiedDistribution; self.children().len()]
116
0
    }
117
118
    /// Specifies the ordering required for all of the children of this
119
    /// `ExecutionPlan`.
120
    ///
121
    /// For each child, it's the local ordering requirement within
122
    /// each partition rather than the global ordering
123
    ///
124
    /// NOTE that checking `!is_empty()` does **not** check for a
125
    /// required input ordering. Instead, the correct check is that at
126
    /// least one entry must be `Some`
127
0
    fn required_input_ordering(&self) -> Vec<Option<LexRequirement>> {
128
0
        vec![None; self.children().len()]
129
0
    }
130
131
    /// Returns `false` if this `ExecutionPlan`'s implementation may reorder
132
    /// rows within or between partitions.
133
    ///
134
    /// For example, Projection, Filter, and Limit maintain the order
135
    /// of inputs -- they may transform values (Projection) or not
136
    /// produce the same number of rows that went in (Filter and
137
    /// Limit), but the rows that are produced keep their input order.
138
    ///
139
    /// DataFusion uses this metadata to apply certain optimizations
140
    /// such as automatically repartitioning correctly.
141
    ///
142
    /// The default implementation returns `false`
143
    ///
144
    /// WARNING: if you override this default, you *MUST* ensure that
145
    /// the `ExecutionPlan` maintains the ordering invariant or else
146
    /// DataFusion may produce incorrect results.
147
0
    fn maintains_input_order(&self) -> Vec<bool> {
148
0
        vec![false; self.children().len()]
149
0
    }
150
151
    /// Specifies whether the `ExecutionPlan` benefits from increased
152
    /// parallelization at its input for each child.
153
    ///
154
    /// If it returns `true`, the `ExecutionPlan` would benefit from partitioning
155
    /// its corresponding child (and thus from more parallelism). For
156
    /// `ExecutionPlan` that do very little work the overhead of extra
157
    /// parallelism may outweigh any benefits
158
    ///
159
    /// The default implementation returns `true` unless this `ExecutionPlan`
160
    /// has signalled it requires a single child input partition.
161
0
    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
162
0
        // By default try to maximize parallelism with more CPUs if
163
0
        // possible
164
0
        self.required_input_distribution()
165
0
            .into_iter()
166
0
            .map(|dist| !matches!(dist, Distribution::SinglePartition))
167
0
            .collect()
168
0
    }
169
170
    /// Get a list of children `ExecutionPlan`s that act as inputs to this plan.
171
    /// The returned list will be empty for leaf nodes such as scans, will contain
172
    /// a single value for unary nodes, or two values for binary nodes (such as
173
    /// joins).
174
    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>>;
175
176
    /// Returns a new `ExecutionPlan` where all existing children were replaced
177
    /// by the `children`, in order
178
    fn with_new_children(
179
        self: Arc<Self>,
180
        children: Vec<Arc<dyn ExecutionPlan>>,
181
    ) -> Result<Arc<dyn ExecutionPlan>>;
182
183
    /// If supported, attempt to increase the partitioning of this `ExecutionPlan` to
184
    /// produce `target_partitions` partitions.
185
    ///
186
    /// If the `ExecutionPlan` does not support changing its partitioning,
187
    /// returns `Ok(None)` (the default).
188
    ///
189
    /// If the `ExecutionPlan` can increase its partitioning, but not to the
190
    /// `target_partitions`, it may return an ExecutionPlan with fewer
191
    /// partitions. This might happen, for example, if each new partition would
192
    /// be too small to be efficiently processed individually.
193
    ///
194
    /// The DataFusion optimizer attempts to use as many threads as possible by
195
    /// repartitioning its inputs to match the target number of threads
196
    /// available (`target_partitions`). Some data sources, such as the built in
197
    /// CSV and Parquet readers, implement this method as they are able to read
198
    /// from their input files in parallel, regardless of how the source data is
199
    /// split amongst files.
200
0
    fn repartitioned(
201
0
        &self,
202
0
        _target_partitions: usize,
203
0
        _config: &ConfigOptions,
204
0
    ) -> Result<Option<Arc<dyn ExecutionPlan>>> {
205
0
        Ok(None)
206
0
    }
207
208
    /// Begin execution of `partition`, returning a [`Stream`] of
209
    /// [`RecordBatch`]es.
210
    ///
211
    /// # Notes
212
    ///
213
    /// The `execute` method itself is not `async` but it returns an `async`
214
    /// [`futures::stream::Stream`]. This `Stream` should incrementally compute
215
    /// the output, `RecordBatch` by `RecordBatch` (in a streaming fashion).
216
    /// Most `ExecutionPlan`s should not do any work before the first
217
    /// `RecordBatch` is requested from the stream.
218
    ///
219
    /// [`RecordBatchStreamAdapter`] can be used to convert an `async`
220
    /// [`Stream`] into a [`SendableRecordBatchStream`].
221
    ///
222
    /// Using `async` `Streams` allows for network I/O during execution and
223
    /// takes advantage of Rust's built in support for `async` continuations and
224
    /// crate ecosystem.
225
    ///
226
    /// [`Stream`]: futures::stream::Stream
227
    /// [`StreamExt`]: futures::stream::StreamExt
228
    /// [`TryStreamExt`]: futures::stream::TryStreamExt
229
    /// [`RecordBatchStreamAdapter`]: crate::stream::RecordBatchStreamAdapter
230
    ///
231
    /// # Error handling
232
    ///
233
    /// Any error that occurs during execution is sent as an `Err` in the output
234
    /// stream.
235
    ///
236
    /// `ExecutionPlan` implementations in DataFusion cancel additional work
237
    /// immediately once an error occurs. The rationale is that if the overall
238
    /// query will return an error,  any additional work such as continued
239
    /// polling of inputs will be wasted as it will be thrown away.
240
    ///
241
    /// # Cancellation / Aborting Execution
242
    ///
243
    /// The [`Stream`] that is returned must ensure that any allocated resources
244
    /// are freed when the stream itself is dropped. This is particularly
245
    /// important for [`spawn`]ed tasks or threads. Unless care is taken to
246
    /// "abort" such tasks, they may continue to consume resources even after
247
    /// the plan is dropped, generating intermediate results that are never
248
    /// used.
249
    /// Thus, [`spawn`] is disallowed; use [`SpawnedTask`] instead.
250
    ///
251
    /// For more details see [`SpawnedTask`], [`JoinSet`] and [`RecordBatchReceiverStreamBuilder`]
252
    /// for structures to help ensure all background tasks are cancelled.
253
    ///
254
    /// [`spawn`]: tokio::task::spawn
255
    /// [`JoinSet`]: tokio::task::JoinSet
256
    /// [`SpawnedTask`]: datafusion_common_runtime::SpawnedTask
257
    /// [`RecordBatchReceiverStreamBuilder`]: crate::stream::RecordBatchReceiverStreamBuilder
258
    ///
259
    /// # Implementation Examples
260
    ///
261
    /// While `async` `Stream`s have a non trivial learning curve, the
262
    /// [`futures`] crate provides [`StreamExt`] and [`TryStreamExt`]
263
    /// which help simplify many common operations.
264
    ///
265
    /// Here are some common patterns:
266
    ///
267
    /// ## Return Precomputed `RecordBatch`
268
    ///
269
    /// We can return a precomputed `RecordBatch` as a `Stream`:
270
    ///
271
    /// ```
272
    /// # use std::sync::Arc;
273
    /// # use arrow_array::RecordBatch;
274
    /// # use arrow_schema::SchemaRef;
275
    /// # use datafusion_common::Result;
276
    /// # use datafusion_execution::{SendableRecordBatchStream, TaskContext};
277
    /// # use datafusion_physical_plan::memory::MemoryStream;
278
    /// # use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
279
    /// struct MyPlan {
280
    ///     batch: RecordBatch,
281
    /// }
282
    ///
283
    /// impl MyPlan {
284
    ///     fn execute(
285
    ///         &self,
286
    ///         partition: usize,
287
    ///         context: Arc<TaskContext>
288
    ///     ) -> Result<SendableRecordBatchStream> {
289
    ///         // use functions from futures crate convert the batch into a stream
290
    ///         let fut = futures::future::ready(Ok(self.batch.clone()));
291
    ///         let stream = futures::stream::once(fut);
292
    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.batch.schema(), stream)))
293
    ///     }
294
    /// }
295
    /// ```
296
    ///
297
    /// ## Lazily (async) Compute `RecordBatch`
298
    ///
299
    /// We can also lazily compute a `RecordBatch` when the returned `Stream` is polled
300
    ///
301
    /// ```
302
    /// # use std::sync::Arc;
303
    /// # use arrow_array::RecordBatch;
304
    /// # use arrow_schema::SchemaRef;
305
    /// # use datafusion_common::Result;
306
    /// # use datafusion_execution::{SendableRecordBatchStream, TaskContext};
307
    /// # use datafusion_physical_plan::memory::MemoryStream;
308
    /// # use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
309
    /// struct MyPlan {
310
    ///     schema: SchemaRef,
311
    /// }
312
    ///
313
    /// /// Returns a single batch when the returned stream is polled
314
    /// async fn get_batch() -> Result<RecordBatch> {
315
    ///     todo!()
316
    /// }
317
    ///
318
    /// impl MyPlan {
319
    ///     fn execute(
320
    ///         &self,
321
    ///         partition: usize,
322
    ///         context: Arc<TaskContext>
323
    ///     ) -> Result<SendableRecordBatchStream> {
324
    ///         let fut = get_batch();
325
    ///         let stream = futures::stream::once(fut);
326
    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), stream)))
327
    ///     }
328
    /// }
329
    /// ```
330
    ///
331
    /// ## Lazily (async) create a Stream
332
    ///
333
    /// If you need to create the return `Stream` using an `async` function,
334
    /// you can do so by flattening the result:
335
    ///
336
    /// ```
337
    /// # use std::sync::Arc;
338
    /// # use arrow_array::RecordBatch;
339
    /// # use arrow_schema::SchemaRef;
340
    /// # use futures::TryStreamExt;
341
    /// # use datafusion_common::Result;
342
    /// # use datafusion_execution::{SendableRecordBatchStream, TaskContext};
343
    /// # use datafusion_physical_plan::memory::MemoryStream;
344
    /// # use datafusion_physical_plan::stream::RecordBatchStreamAdapter;
345
    /// struct MyPlan {
346
    ///     schema: SchemaRef,
347
    /// }
348
    ///
349
    /// /// async function that returns a stream
350
    /// async fn get_batch_stream() -> Result<SendableRecordBatchStream> {
351
    ///     todo!()
352
    /// }
353
    ///
354
    /// impl MyPlan {
355
    ///     fn execute(
356
    ///         &self,
357
    ///         partition: usize,
358
    ///         context: Arc<TaskContext>
359
    ///     ) -> Result<SendableRecordBatchStream> {
360
    ///         // A future that yields a stream
361
    ///         let fut = get_batch_stream();
362
    ///         // Use TryStreamExt::try_flatten to flatten the stream of streams
363
    ///         let stream = futures::stream::once(fut).try_flatten();
364
    ///         Ok(Box::pin(RecordBatchStreamAdapter::new(self.schema.clone(), stream)))
365
    ///     }
366
    /// }
367
    /// ```
368
    fn execute(
369
        &self,
370
        partition: usize,
371
        context: Arc<TaskContext>,
372
    ) -> Result<SendableRecordBatchStream>;
373
374
    /// Return a snapshot of the set of [`Metric`]s for this
375
    /// [`ExecutionPlan`]. If no `Metric`s are available, return None.
376
    ///
377
    /// While the values of the metrics in the returned
378
    /// [`MetricsSet`]s may change as execution progresses, the
379
    /// specific metrics will not.
380
    ///
381
    /// Once `self.execute()` has returned (technically the future is
382
    /// resolved) for all available partitions, the set of metrics
383
    /// should be complete. If this function is called prior to
384
    /// `execute()` new metrics may appear in subsequent calls.
385
0
    fn metrics(&self) -> Option<MetricsSet> {
386
0
        None
387
0
    }
388
389
    /// Returns statistics for this `ExecutionPlan` node. If statistics are not
390
    /// available, should return [`Statistics::new_unknown`] (the default), not
391
    /// an error.
392
    ///
393
    /// For TableScan executors, which support filter pushdown, special attention
394
    /// needs to be paid to whether the stats returned by this method are exact or not
395
0
    fn statistics(&self) -> Result<Statistics> {
396
0
        Ok(Statistics::new_unknown(&self.schema()))
397
0
    }
398
399
    /// Returns `true` if a limit can be safely pushed down through this
400
    /// `ExecutionPlan` node.
401
    ///
402
    /// If this method returns `true`, and the query plan contains a limit at
403
    /// the output of this node, DataFusion will push the limit to the input
404
    /// of this node.
405
0
    fn supports_limit_pushdown(&self) -> bool {
406
0
        false
407
0
    }
408
409
    /// Returns a fetching variant of this `ExecutionPlan` node, if it supports
410
    /// fetch limits. Returns `None` otherwise.
411
0
    fn with_fetch(&self, _limit: Option<usize>) -> Option<Arc<dyn ExecutionPlan>> {
412
0
        None
413
0
    }
414
415
    /// Gets the fetch count for the operator, `None` means there is no fetch.
416
0
    fn fetch(&self) -> Option<usize> {
417
0
        None
418
0
    }
419
}
420
421
/// Extension trait provides an easy API to fetch various properties of
422
/// [`ExecutionPlan`] objects based on [`ExecutionPlan::properties`].
423
pub trait ExecutionPlanProperties {
424
    /// Specifies how the output of this `ExecutionPlan` is split into
425
    /// partitions.
426
    fn output_partitioning(&self) -> &Partitioning;
427
428
    /// Specifies whether this plan generates an infinite stream of records.
429
    /// If the plan does not support pipelining, but its input(s) are
430
    /// infinite, returns [`ExecutionMode::PipelineBreaking`] to indicate this.
431
    fn execution_mode(&self) -> ExecutionMode;
432
433
    /// If the output of this `ExecutionPlan` within each partition is sorted,
434
    /// returns `Some(keys)` describing the ordering. A `None` return value
435
    /// indicates no assumptions should be made on the output ordering.
436
    ///
437
    /// For example, `SortExec` (obviously) produces sorted output as does
438
    /// `SortPreservingMergeStream`. Less obviously, `Projection` produces sorted
439
    /// output if its input is sorted as it does not reorder the input rows.
440
    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]>;
441
442
    /// Get the [`EquivalenceProperties`] within the plan.
443
    ///
444
    /// Equivalence properties tell DataFusion what columns are known to be
445
    /// equal, during various optimization passes. By default, this returns "no
446
    /// known equivalences" which is always correct, but may cause DataFusion to
447
    /// unnecessarily resort data.
448
    ///
449
    /// If this ExecutionPlan makes no changes to the schema of the rows flowing
450
    /// through it or how columns within each row relate to each other, it
451
    /// should return the equivalence properties of its input. For
452
    /// example, since `FilterExec` may remove rows from its input, but does not
453
    /// otherwise modify them, it preserves its input equivalence properties.
454
    /// However, since `ProjectionExec` may calculate derived expressions, it
455
    /// needs special handling.
456
    ///
457
    /// See also [`ExecutionPlan::maintains_input_order`] and [`Self::output_ordering`]
458
    /// for related concepts.
459
    fn equivalence_properties(&self) -> &EquivalenceProperties;
460
}
461
462
impl ExecutionPlanProperties for Arc<dyn ExecutionPlan> {
463
19.0k
    fn output_partitioning(&self) -> &Partitioning {
464
19.0k
        self.properties().output_partitioning()
465
19.0k
    }
466
467
5.27k
    fn execution_mode(&self) -> ExecutionMode {
468
5.27k
        self.properties().execution_mode()
469
5.27k
    }
470
471
2.55k
    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
472
2.55k
        self.properties().output_ordering()
473
2.55k
    }
474
475
4.20k
    fn equivalence_properties(&self) -> &EquivalenceProperties {
476
4.20k
        self.properties().equivalence_properties()
477
4.20k
    }
478
}
479
480
impl ExecutionPlanProperties for &dyn ExecutionPlan {
481
0
    fn output_partitioning(&self) -> &Partitioning {
482
0
        self.properties().output_partitioning()
483
0
    }
484
485
0
    fn execution_mode(&self) -> ExecutionMode {
486
0
        self.properties().execution_mode()
487
0
    }
488
489
0
    fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
490
0
        self.properties().output_ordering()
491
0
    }
492
493
0
    fn equivalence_properties(&self) -> &EquivalenceProperties {
494
0
        self.properties().equivalence_properties()
495
0
    }
496
}
497
498
/// Describes the execution mode of the result of calling
499
/// [`ExecutionPlan::execute`] with respect to its size and behavior.
500
///
501
/// The mode of the execution plan is determined by the mode of its input
502
/// execution plans and the details of the operator itself. For example, a
503
/// `FilterExec` operator will have the same execution mode as its input, but a
504
/// `SortExec` operator may have a different execution mode than its input,
505
/// depending on how the input stream is sorted.
506
///
507
/// There are three possible execution modes: `Bounded`, `Unbounded` and
508
/// `PipelineBreaking`.
509
#[derive(Clone, Copy, PartialEq, Debug)]
510
pub enum ExecutionMode {
511
    /// The stream is bounded / finite.
512
    ///
513
    /// In this case the stream will eventually return `None` to indicate that
514
    /// there are no more records to process.
515
    Bounded,
516
    /// The stream is unbounded / infinite.
517
    ///
518
    /// In this case, the stream will never be done (never return `None`),
519
    /// except in case of error.
520
    ///
521
    /// This mode is often used in "Streaming" use cases where data is
522
    /// incrementally processed as it arrives.
523
    ///
524
    /// Note that even though the operator generates an unbounded stream of
525
    /// results, it can execute with bounded memory and incrementally produces
526
    /// output.
527
    Unbounded,
528
    /// Some of the operator's input stream(s) are unbounded, but the operator
529
    /// cannot generate streaming results from these streaming inputs.
530
    ///
531
    /// In this case, the execution mode will be pipeline breaking, e.g. the
532
    /// operator requires unbounded memory to generate results. This
533
    /// information is used by the planner when performing sanity checks
534
    /// on plans processing unbounded data sources.
535
    PipelineBreaking,
536
}
537
538
impl ExecutionMode {
539
    /// Check whether the execution mode is unbounded or not.
540
1.42k
    pub fn is_unbounded(&self) -> bool {
541
1.42k
        matches!(self, ExecutionMode::Unbounded)
542
1.42k
    }
543
544
    /// Check whether the execution is pipeline friendly. If so, operator can
545
    /// execute safely.
546
0
    pub fn pipeline_friendly(&self) -> bool {
547
0
        matches!(self, ExecutionMode::Bounded | ExecutionMode::Unbounded)
548
0
    }
549
}
550
551
/// Conservatively "combines" execution modes of a given collection of operators.
552
1.15k
pub(crate) fn execution_mode_from_children<'a>(
553
1.15k
    children: impl IntoIterator<Item = &'a Arc<dyn ExecutionPlan>>,
554
1.15k
) -> ExecutionMode {
555
1.15k
    let mut result = ExecutionMode::Bounded;
556
2.30k
    for mode in 
children.into_iter().map(1.15k
|child| child.execution_mode()
)1.15k
{
557
2.30k
        match (mode, result) {
558
            (ExecutionMode::PipelineBreaking, _)
559
            | (_, ExecutionMode::PipelineBreaking) => {
560
                // If any of the modes is `PipelineBreaking`, so is the result:
561
0
                return ExecutionMode::PipelineBreaking;
562
            }
563
0
            (ExecutionMode::Unbounded, _) | (_, ExecutionMode::Unbounded) => {
564
0
                // Unbounded mode eats up bounded mode:
565
0
                result = ExecutionMode::Unbounded;
566
0
            }
567
2.30k
            (ExecutionMode::Bounded, ExecutionMode::Bounded) => {
568
2.30k
                // When both modes are bounded, so is the result:
569
2.30k
                result = ExecutionMode::Bounded;
570
2.30k
            }
571
        }
572
    }
573
1.15k
    result
574
1.15k
}
575
576
/// Stores certain, often expensive to compute, plan properties used in query
577
/// optimization.
578
///
579
/// These properties are stored in a single structure to permit this information to
580
/// be computed once and then those cached results used multiple times without
581
/// recomputation (aka a cache)
582
#[derive(Debug, Clone)]
583
pub struct PlanProperties {
584
    /// See [ExecutionPlanProperties::equivalence_properties]
585
    pub eq_properties: EquivalenceProperties,
586
    /// See [ExecutionPlanProperties::output_partitioning]
587
    pub partitioning: Partitioning,
588
    /// See [ExecutionPlanProperties::execution_mode]
589
    pub execution_mode: ExecutionMode,
590
    /// See [ExecutionPlanProperties::output_ordering]
591
    output_ordering: Option<LexOrdering>,
592
}
593
594
impl PlanProperties {
595
    /// Construct a new `PlanProperties` from the given equivalence properties, partitioning and execution mode.
596
4.01k
    pub fn new(
597
4.01k
        eq_properties: EquivalenceProperties,
598
4.01k
        partitioning: Partitioning,
599
4.01k
        execution_mode: ExecutionMode,
600
4.01k
    ) -> Self {
601
4.01k
        // Output ordering can be derived from `eq_properties`.
602
4.01k
        let output_ordering = eq_properties.output_ordering();
603
4.01k
        Self {
604
4.01k
            eq_properties,
605
4.01k
            partitioning,
606
4.01k
            execution_mode,
607
4.01k
            output_ordering,
608
4.01k
        }
609
4.01k
    }
610
611
    /// Overwrite output partitioning with its new value.
612
2
    pub fn with_partitioning(mut self, partitioning: Partitioning) -> Self {
613
2
        self.partitioning = partitioning;
614
2
        self
615
2
    }
616
617
    /// Overwrite the execution Mode with its new value.
618
0
    pub fn with_execution_mode(mut self, execution_mode: ExecutionMode) -> Self {
619
0
        self.execution_mode = execution_mode;
620
0
        self
621
0
    }
622
623
    /// Overwrite equivalence properties with its new value.
624
713
    pub fn with_eq_properties(mut self, eq_properties: EquivalenceProperties) -> Self {
625
713
        // Changing equivalence properties also changes output ordering, so
626
713
        // make sure to overwrite it:
627
713
        self.output_ordering = eq_properties.output_ordering();
628
713
        self.eq_properties = eq_properties;
629
713
        self
630
713
    }
631
632
4.27k
    pub fn equivalence_properties(&self) -> &EquivalenceProperties {
633
4.27k
        &self.eq_properties
634
4.27k
    }
635
636
19.0k
    pub fn output_partitioning(&self) -> &Partitioning {
637
19.0k
        &self.partitioning
638
19.0k
    }
639
640
2.55k
    pub fn output_ordering(&self) -> Option<&[PhysicalSortExpr]> {
641
2.55k
        self.output_ordering.as_deref()
642
2.55k
    }
643
644
5.27k
    pub fn execution_mode(&self) -> ExecutionMode {
645
5.27k
        self.execution_mode
646
5.27k
    }
647
648
    /// Get schema of the node.
649
25.7k
    fn schema(&self) -> &SchemaRef {
650
25.7k
        self.eq_properties.schema()
651
25.7k
    }
652
}
653
654
/// Indicate whether a data exchange is needed for the input of `plan`, which will be very helpful
655
/// especially for the distributed engine to judge whether it needs to deal with shuffling.
656
/// Currently there are 3 kinds of execution plans which need data exchange:
657
///     1. RepartitionExec for changing the partition number between two `ExecutionPlan`s
658
///     2. CoalescePartitionsExec for collapsing all of the partitions into one without ordering guarantee
659
///     3. SortPreservingMergeExec for collapsing all of the sorted partitions into one with ordering guarantee
660
0
pub fn need_data_exchange(plan: Arc<dyn ExecutionPlan>) -> bool {
661
0
    if let Some(repartition) = plan.as_any().downcast_ref::<RepartitionExec>() {
662
0
        !matches!(
663
0
            repartition.properties().output_partitioning(),
664
            Partitioning::RoundRobinBatch(_)
665
        )
666
0
    } else if let Some(coalesce) = plan.as_any().downcast_ref::<CoalescePartitionsExec>()
667
    {
668
0
        coalesce.input().output_partitioning().partition_count() > 1
669
0
    } else if let Some(sort_preserving_merge) =
670
0
        plan.as_any().downcast_ref::<SortPreservingMergeExec>()
671
    {
672
0
        sort_preserving_merge
673
0
            .input()
674
0
            .output_partitioning()
675
0
            .partition_count()
676
0
            > 1
677
    } else {
678
0
        false
679
    }
680
0
}
681
682
/// Returns a copy of this plan if we change any child according to the pointer comparison.
683
/// The size of `children` must be equal to the size of `ExecutionPlan::children()`.
684
4
pub fn with_new_children_if_necessary(
685
4
    plan: Arc<dyn ExecutionPlan>,
686
4
    children: Vec<Arc<dyn ExecutionPlan>>,
687
4
) -> Result<Arc<dyn ExecutionPlan>> {
688
4
    let old_children = plan.children();
689
4
    if children.len() != old_children.len() {
690
2
        internal_err!("Wrong number of children")
691
2
    } else if children.is_empty()
692
0
        || children
693
0
            .iter()
694
0
            .zip(old_children.iter())
695
0
            .any(|(c1, c2)| !Arc::ptr_eq(c1, c2))
696
    {
697
2
        plan.with_new_children(children)
698
    } else {
699
0
        Ok(plan)
700
    }
701
4
}
702
703
/// Return a [wrapper](DisplayableExecutionPlan) around an
704
/// [`ExecutionPlan`] which can be displayed in various easier to
705
/// understand ways.
706
5
pub fn displayable(plan: &dyn ExecutionPlan) -> DisplayableExecutionPlan<'_> {
707
5
    DisplayableExecutionPlan::new(plan)
708
5
}
709
710
/// Execute the [ExecutionPlan] and collect the results in memory
711
65
pub async fn collect(
712
65
    plan: Arc<dyn ExecutionPlan>,
713
65
    context: Arc<TaskContext>,
714
65
) -> Result<Vec<RecordBatch>> {
715
65
    let 
stream64
= execute_stream(plan, context)
?1
;
716
297
    
crate::common::collect(stream)64
.await
717
55
}
718
719
/// Execute the [ExecutionPlan] and return a single stream of `RecordBatch`es.
720
///
721
/// See [collect] to buffer the `RecordBatch`es in memory.
722
///
723
/// # Aborting Execution
724
///
725
/// Dropping the stream will abort the execution of the query, and free up
726
/// any allocated resources
727
66
pub fn execute_stream(
728
66
    plan: Arc<dyn ExecutionPlan>,
729
66
    context: Arc<TaskContext>,
730
66
) -> Result<SendableRecordBatchStream> {
731
66
    match plan.output_partitioning().partition_count() {
732
0
        0 => Ok(Box::pin(EmptyRecordBatchStream::new(plan.schema()))),
733
65
        1 => plan.execute(0, context),
734
1
        2.. => {
735
            // merge into a single partition
736
1
            let plan = CoalescePartitionsExec::new(Arc::clone(&plan));
737
1
            // CoalescePartitionsExec must produce a single partition
738
1
            assert_eq!(1, plan.properties().output_partitioning().partition_count());
739
1
            plan.execute(0, context)
740
        }
741
    }
742
66
}
743
744
/// Execute the [ExecutionPlan] and collect the results in memory
745
2
pub async fn collect_partitioned(
746
2
    plan: Arc<dyn ExecutionPlan>,
747
2
    context: Arc<TaskContext>,
748
2
) -> Result<Vec<Vec<RecordBatch>>> {
749
2
    let streams = execute_stream_partitioned(plan, context)
?0
;
750
751
2
    let mut join_set = JoinSet::new();
752
2
    // Execute the plan and collect the results into batches.
753
2
    streams.into_iter().enumerate().for_each(|(idx, stream)| {
754
2
        join_set.spawn(async move {
755
2
            let result: Result<Vec<RecordBatch>> = stream.try_collect().
await0
;
756
2
            (idx, result)
757
2
        });
758
2
    });
759
2
760
2
    let mut batches = vec![];
761
    // Note that currently this doesn't identify the thread that panicked
762
    //
763
    // TODO: Replace with [join_next_with_id](https://docs.rs/tokio/latest/tokio/task/struct.JoinSet.html#method.join_next_with_id
764
    // once it is stable
765
4
    while let Some(
result2
) = join_set.join_next().
await2
{
766
2
        match result {
767
2
            Ok((idx, res)) => batches.push((idx, res
?0
)),
768
0
            Err(e) => {
769
0
                if e.is_panic() {
770
0
                    std::panic::resume_unwind(e.into_panic());
771
                } else {
772
0
                    unreachable!();
773
                }
774
            }
775
        }
776
    }
777
778
2
    batches.sort_by_key(|(idx, _)| 
*idx0
);
779
2
    let batches = batches.into_iter().map(|(_, batch)| batch).collect();
780
2
781
2
    Ok(batches)
782
2
}
783
784
/// Execute the [ExecutionPlan] and return a vec with one stream per output
785
/// partition
786
///
787
/// # Aborting Execution
788
///
789
/// Dropping the stream will abort the execution of the query, and free up
790
/// any allocated resources
791
2
pub fn execute_stream_partitioned(
792
2
    plan: Arc<dyn ExecutionPlan>,
793
2
    context: Arc<TaskContext>,
794
2
) -> Result<Vec<SendableRecordBatchStream>> {
795
2
    let num_partitions = plan.output_partitioning().partition_count();
796
2
    let mut streams = Vec::with_capacity(num_partitions);
797
2
    for i in 0..num_partitions {
798
2
        streams.push(plan.execute(i, Arc::clone(&context))
?0
);
799
    }
800
2
    Ok(streams)
801
2
}
802
803
/// Executes an input stream and ensures that the resulting stream adheres to
804
/// the `not null` constraints specified in the `sink_schema`.
805
///
806
/// # Arguments
807
///
808
/// * `input` - An execution plan
809
/// * `sink_schema` - The schema to be applied to the output stream
810
/// * `partition` - The partition index to be executed
811
/// * `context` - The task context
812
///
813
/// # Returns
814
///
815
/// * `Result<SendableRecordBatchStream>` - A stream of `RecordBatch`es if successful
816
///
817
/// This function first executes the given input plan for the specified partition
818
/// and context. It then checks if there are any columns in the input that might
819
/// violate the `not null` constraints specified in the `sink_schema`. If there are
820
/// such columns, it wraps the resulting stream to enforce the `not null` constraints
821
/// by invoking the `check_not_null_contraits` function on each batch of the stream.
822
0
pub fn execute_input_stream(
823
0
    input: Arc<dyn ExecutionPlan>,
824
0
    sink_schema: SchemaRef,
825
0
    partition: usize,
826
0
    context: Arc<TaskContext>,
827
0
) -> Result<SendableRecordBatchStream> {
828
0
    let input_stream = input.execute(partition, context)?;
829
830
0
    debug_assert_eq!(sink_schema.fields().len(), input.schema().fields().len());
831
832
    // Find input columns that may violate the not null constraint.
833
0
    let risky_columns: Vec<_> = sink_schema
834
0
        .fields()
835
0
        .iter()
836
0
        .zip(input.schema().fields().iter())
837
0
        .enumerate()
838
0
        .filter_map(|(idx, (sink_field, input_field))| {
839
0
            (!sink_field.is_nullable() && input_field.is_nullable()).then_some(idx)
840
0
        })
841
0
        .collect();
842
0
843
0
    if risky_columns.is_empty() {
844
0
        Ok(input_stream)
845
    } else {
846
        // Check not null constraint on the input stream
847
0
        Ok(Box::pin(RecordBatchStreamAdapter::new(
848
0
            sink_schema,
849
0
            input_stream
850
0
                .map(move |batch| check_not_null_contraits(batch?, &risky_columns)),
851
0
        )))
852
    }
853
0
}
854
855
/// Checks a `RecordBatch` for `not null` constraints on specified columns.
856
///
857
/// # Arguments
858
///
859
/// * `batch` - The `RecordBatch` to be checked
860
/// * `column_indices` - A vector of column indices that should be checked for
861
///   `not null` constraints.
862
///
863
/// # Returns
864
///
865
/// * `Result<RecordBatch>` - The original `RecordBatch` if all constraints are met
866
///
867
/// This function iterates over the specified column indices and ensures that none
868
/// of the columns contain null values. If any column contains null values, an error
869
/// is returned.
870
0
pub fn check_not_null_contraits(
871
0
    batch: RecordBatch,
872
0
    column_indices: &Vec<usize>,
873
0
) -> Result<RecordBatch> {
874
0
    for &index in column_indices {
875
0
        if batch.num_columns() <= index {
876
0
            return exec_err!(
877
0
                "Invalid batch column count {} expected > {}",
878
0
                batch.num_columns(),
879
0
                index
880
0
            );
881
0
        }
882
0
883
0
        if batch.column(index).null_count() > 0 {
884
0
            return exec_err!(
885
0
                "Invalid batch column at '{}' has null but schema specifies non-nullable",
886
0
                index
887
0
            );
888
0
        }
889
    }
890
891
0
    Ok(batch)
892
0
}
893
894
/// Utility function yielding a string representation of the given [`ExecutionPlan`].
895
2
pub fn get_plan_string(plan: &Arc<dyn ExecutionPlan>) -> Vec<String> {
896
2
    let formatted = displayable(plan.as_ref()).indent(true).to_string();
897
2
    let actual: Vec<&str> = formatted.trim().lines().collect();
898
5
    actual.iter().map(|elem| elem.to_string()).collect()
899
2
}
900
901
#[cfg(test)]
mod tests {
    use super::*;
    use std::any::Any;
    use std::sync::Arc;

    use arrow_schema::{Schema, SchemaRef};

    use datafusion_common::{Result, Statistics};
    use datafusion_execution::{SendableRecordBatchStream, TaskContext};

    use crate::{DisplayAs, DisplayFormatType, ExecutionPlan};

    /// Stub plan that does not override `ExecutionPlan::static_name`; only
    /// `name()` is exercised by the tests below.
    #[derive(Debug)]
    pub struct EmptyExec;

    impl EmptyExec {
        /// The schema is accepted but not stored.
        pub fn new(_schema: SchemaRef) -> Self {
            Self
        }
    }

    impl DisplayAs for EmptyExec {
        fn fmt_as(
            &self,
            _t: DisplayFormatType,
            _f: &mut std::fmt::Formatter,
        ) -> std::fmt::Result {
            unimplemented!()
        }
    }

    impl ExecutionPlan for EmptyExec {
        fn name(&self) -> &'static str {
            Self::static_name()
        }

        fn as_any(&self) -> &dyn Any {
            self
        }

        fn properties(&self) -> &PlanProperties {
            unimplemented!()
        }

        fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
            vec![]
        }

        fn with_new_children(
            self: Arc<Self>,
            _: Vec<Arc<dyn ExecutionPlan>>,
        ) -> Result<Arc<dyn ExecutionPlan>> {
            unimplemented!()
        }

        fn execute(
            &self,
            _partition: usize,
            _context: Arc<TaskContext>,
        ) -> Result<SendableRecordBatchStream> {
            unimplemented!()
        }

        fn statistics(&self) -> Result<Statistics> {
            unimplemented!()
        }
    }

    /// Stub plan that overrides `static_name` with a custom value.
    #[derive(Debug)]
    pub struct RenamedEmptyExec;

    impl RenamedEmptyExec {
        /// The schema is accepted but not stored.
        pub fn new(_schema: SchemaRef) -> Self {
            Self
        }
    }

    impl DisplayAs for RenamedEmptyExec {
        fn fmt_as(
            &self,
            _t: DisplayFormatType,
            _f: &mut std::fmt::Formatter,
        ) -> std::fmt::Result {
            unimplemented!()
        }
    }

    impl ExecutionPlan for RenamedEmptyExec {
        fn name(&self) -> &'static str {
            Self::static_name()
        }

        fn static_name() -> &'static str
        where
            Self: Sized,
        {
            "MyRenamedEmptyExec"
        }

        fn as_any(&self) -> &dyn Any {
            self
        }

        fn properties(&self) -> &PlanProperties {
            unimplemented!()
        }

        fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
            vec![]
        }

        fn with_new_children(
            self: Arc<Self>,
            _: Vec<Arc<dyn ExecutionPlan>>,
        ) -> Result<Arc<dyn ExecutionPlan>> {
            unimplemented!()
        }

        fn execute(
            &self,
            _partition: usize,
            _context: Arc<TaskContext>,
        ) -> Result<SendableRecordBatchStream> {
            unimplemented!()
        }

        fn statistics(&self) -> Result<Statistics> {
            unimplemented!()
        }
    }

    /// `name()` reports the type name by default and the custom value when
    /// `static_name` is overridden.
    #[test]
    fn test_execution_plan_name() {
        let default_name_exec = EmptyExec::new(Arc::new(Schema::empty()));
        assert_eq!(default_name_exec.name(), "EmptyExec");

        let renamed_exec = RenamedEmptyExec::new(Arc::new(Schema::empty()));
        assert_eq!(renamed_exec.name(), "MyRenamedEmptyExec");
        assert_eq!(RenamedEmptyExec::static_name(), "MyRenamedEmptyExec");
    }

    /// A compilation test to ensure that the `ExecutionPlan::name()` method can
    /// be called from a trait object.
    /// Related ticket: https://github.com/apache/datafusion/pull/11047
    #[allow(dead_code)]
    fn use_execution_plan_as_trait_object(plan: &dyn ExecutionPlan) {
        let _ = plan.name();
    }
}
1053
1054
// pub mod test;