Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-plan/src/union.rs
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

// Some of these functions reference the Postgres documentation
// or implementation to ensure compatibility and are subject to
// the Postgres license.

//! The Union operator combines multiple inputs with the same schema

use std::borrow::Borrow;
use std::pin::Pin;
use std::task::{Context, Poll};
use std::{any::Any, sync::Arc};

use super::{
    execution_mode_from_children,
    metrics::{ExecutionPlanMetricsSet, MetricsSet},
    ColumnStatistics, DisplayAs, DisplayFormatType, ExecutionPlan,
    ExecutionPlanProperties, Partitioning, PlanProperties, RecordBatchStream,
    SendableRecordBatchStream, Statistics,
};
use crate::metrics::BaselineMetrics;
use crate::stream::ObservedStream;

use arrow::datatypes::{Field, Schema, SchemaRef};
use arrow::record_batch::RecordBatch;
use datafusion_common::stats::Precision;
use datafusion_common::{exec_err, internal_err, Result};
use datafusion_execution::TaskContext;
use datafusion_physical_expr::{calculate_union, EquivalenceProperties};

use futures::Stream;
use itertools::Itertools;
use log::{debug, trace, warn};
use tokio::macros::support::thread_rng_n;

/// `UnionExec`: `UNION ALL` execution plan.
///
/// `UnionExec` combines multiple inputs with the same schema by
/// concatenating the partitions.  It does not mix or copy data within
/// or across partitions. Thus if the input partitions are sorted, the
/// output partitions of the union are also sorted.
///
/// For example, given a `UnionExec` of two inputs, with `N`
/// partitions, and `M` partitions, there will be `N+M` output
/// partitions. The first `N` output partitions are from Input 1
/// partitions, and the next `M` output partitions are from Input 2.
///
/// ```text
///                       ▲       ▲           ▲         ▲
///                       │       │           │         │
///     Output            │  ...  │           │         │
///   Partitions          │0      │N-1        │ N       │N+M-1
///(passes through   ┌────┴───────┴───────────┴─────────┴───┐
/// the N+M input    │              UnionExec               │
///  partitions)     │                                      │
///                  └──────────────────────────────────────┘
///                                      ▲
///                                      │
///                                      │
///       Input           ┌────────┬─────┴────┬──────────┐
///     Partitions        │ ...    │          │     ...  │
///                    0  │        │ N-1      │ 0        │  M-1
///                  ┌────┴────────┴───┐  ┌───┴──────────┴───┐
///                  │                 │  │                  │
///                  │                 │  │                  │
///                  │                 │  │                  │
///                  │                 │  │                  │
///                  │                 │  │                  │
///                  │                 │  │                  │
///                  │Input 1          │  │Input 2           │
///                  └─────────────────┘  └──────────────────┘
/// ```
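///
/// A minimal usage sketch (hypothetical inputs): `input_a` and `input_b`
/// stand in for any two `Arc<dyn ExecutionPlan>` with matching schemas.
///
/// ```ignore
/// // The union exposes the concatenation of its children's partitions,
/// // so the output partition count is the sum of the input counts.
/// let union = UnionExec::new(vec![input_a, input_b]);
/// let n_out = union.properties().output_partitioning().partition_count();
/// ```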
#[derive(Debug)]
pub struct UnionExec {
    /// Input execution plans
    inputs: Vec<Arc<dyn ExecutionPlan>>,
    /// Execution metrics
    metrics: ExecutionPlanMetricsSet,
    /// Cache holding plan properties like equivalences, output partitioning etc.
    cache: PlanProperties,
}

impl UnionExec {
    /// Create a new UnionExec
    pub fn new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Self {
        let schema = union_schema(&inputs);
        // The schema of the inputs and the union schema is consistent when:
        // - They have the same number of fields, and
        // - Their fields have the same types at the same indices.
        // Here, we know that schemas are consistent and the call below can
        // not return an error.
        let cache = Self::compute_properties(&inputs, schema).unwrap();
        UnionExec {
            inputs,
            metrics: ExecutionPlanMetricsSet::new(),
            cache,
        }
    }

    /// Get inputs of the execution plan
    pub fn inputs(&self) -> &Vec<Arc<dyn ExecutionPlan>> {
        &self.inputs
    }

    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
    fn compute_properties(
        inputs: &[Arc<dyn ExecutionPlan>],
        schema: SchemaRef,
    ) -> Result<PlanProperties> {
        // Calculate equivalence properties:
        let children_eqps = inputs
            .iter()
            .map(|child| child.equivalence_properties().clone())
            .collect::<Vec<_>>();
        let eq_properties = calculate_union(children_eqps, schema)?;

        // Calculate output partitioning; i.e. sum output partitions of the inputs.
        let num_partitions = inputs
            .iter()
            .map(|plan| plan.output_partitioning().partition_count())
            .sum();
        let output_partitioning = Partitioning::UnknownPartitioning(num_partitions);

        // Determine execution mode:
        let mode = execution_mode_from_children(inputs.iter());

        Ok(PlanProperties::new(
            eq_properties,
            output_partitioning,
            mode,
        ))
    }
}

impl DisplayAs for UnionExec {
    fn fmt_as(
        &self,
        t: DisplayFormatType,
        f: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(f, "UnionExec")
            }
        }
    }
}

impl ExecutionPlan for UnionExec {
    fn name(&self) -> &'static str {
        "UnionExec"
    }

    /// Return a reference to Any that can be used for downcasting
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn properties(&self) -> &PlanProperties {
        &self.cache
    }

    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
        self.inputs.iter().collect()
    }

    fn maintains_input_order(&self) -> Vec<bool> {
        // If the Union has an output ordering, it maintains at least one
        // child's ordering (i.e. the meet).
        // For instance, assume that the first child is SortExpr('a','b','c'),
        // the second child is SortExpr('a','b') and the third child is
        // SortExpr('a','b'). The output ordering would be SortExpr('a','b'),
        // which is the "meet" of all input orderings. In this example, this
        // function will return vec![false, true, true], indicating that we
        // preserve the orderings for the 2nd and the 3rd children.
        if let Some(output_ordering) = self.properties().output_ordering() {
            self.inputs()
                .iter()
                .map(|child| {
                    if let Some(child_ordering) = child.output_ordering() {
                        output_ordering.len() == child_ordering.len()
                    } else {
                        false
                    }
                })
                .collect()
        } else {
            vec![false; self.inputs().len()]
        }
    }

    fn with_new_children(
        self: Arc<Self>,
        children: Vec<Arc<dyn ExecutionPlan>>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        Ok(Arc::new(UnionExec::new(children)))
    }

    fn execute(
        &self,
        mut partition: usize,
        context: Arc<TaskContext>,
    ) -> Result<SendableRecordBatchStream> {
        trace!("Start UnionExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
        // record the tiny amount of work done in this function so
        // elapsed_compute is reported as non zero
        let elapsed_compute = baseline_metrics.elapsed_compute().clone();
        let _timer = elapsed_compute.timer(); // record on drop

        // find partition to execute
        for input in self.inputs.iter() {
            // Check whether `partition` belongs to the current input
            if partition < input.output_partitioning().partition_count() {
                let stream = input.execute(partition, context)?;
                debug!("Found a Union partition to execute");
                return Ok(Box::pin(ObservedStream::new(stream, baseline_metrics)));
            } else {
                partition -= input.output_partitioning().partition_count();
            }
        }

        warn!("Error in Union: Partition {} not found", partition);

        exec_err!("Partition {partition} not found in Union")
    }

    fn metrics(&self) -> Option<MetricsSet> {
        Some(self.metrics.clone_inner())
    }

    fn statistics(&self) -> Result<Statistics> {
        let stats = self
            .inputs
            .iter()
            .map(|stat| stat.statistics())
            .collect::<Result<Vec<_>>>()?;

        Ok(stats
            .into_iter()
            .reduce(stats_union)
            .unwrap_or_else(|| Statistics::new_unknown(&self.schema())))
    }

    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
        vec![false; self.children().len()]
    }

    fn supports_limit_pushdown(&self) -> bool {
        true
    }
}

/// Combines multiple input streams by interleaving them.
///
/// This only works if all inputs have the same hash-partitioning.
///
/// # Data Flow
/// ```text
/// +---------+
/// |         |---+
/// | Input 1 |   |
/// |         |-------------+
/// +---------+   |         |
///               |         |         +---------+
///               +------------------>|         |
///                 +---------------->| Combine |-->
///                 | +-------------->|         |
///                 | |     |         +---------+
/// +---------+     | |     |
/// |         |-----+ |     |
/// | Input 2 |       |     |
/// |         |---------------+
/// +---------+       |     | |
///                   |     | |       +---------+
///                   |     +-------->|         |
///                   |       +------>| Combine |-->
///                   |         +---->|         |
///                   |         |     +---------+
/// +---------+       |         |
/// |         |-------+         |
/// | Input 3 |                 |
/// |         |-----------------+
/// +---------+
/// ```
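///
/// A hypothetical construction sketch; `plan_a`, `plan_b` and `plan_c` are
/// assumed to already share identical `Partitioning::Hash` output
/// partitioning (e.g. each hash-repartitioned on the same keys):
///
/// ```ignore
/// // `try_new` returns an error unless all children report the same
/// // hash partitioning (see `can_interleave` below).
/// let interleave = InterleaveExec::try_new(vec![plan_a, plan_b, plan_c])?;
/// ```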
#[derive(Debug)]
pub struct InterleaveExec {
    /// Input execution plans
    inputs: Vec<Arc<dyn ExecutionPlan>>,
    /// Execution metrics
    metrics: ExecutionPlanMetricsSet,
    /// Cache holding plan properties like equivalences, output partitioning etc.
    cache: PlanProperties,
}

impl InterleaveExec {
    /// Create a new InterleaveExec
    pub fn try_new(inputs: Vec<Arc<dyn ExecutionPlan>>) -> Result<Self> {
        if !can_interleave(inputs.iter()) {
            return internal_err!(
                "Not all InterleaveExec children have a consistent hash partitioning"
            );
        }
        let cache = Self::compute_properties(&inputs);
        Ok(InterleaveExec {
            inputs,
            metrics: ExecutionPlanMetricsSet::new(),
            cache,
        })
    }

    /// Get inputs of the execution plan
    pub fn inputs(&self) -> &Vec<Arc<dyn ExecutionPlan>> {
        &self.inputs
    }

    /// This function creates the cache object that stores the plan properties such as schema, equivalence properties, ordering, partitioning, etc.
    fn compute_properties(inputs: &[Arc<dyn ExecutionPlan>]) -> PlanProperties {
        let schema = union_schema(inputs);
        let eq_properties = EquivalenceProperties::new(schema);
        // Get output partitioning:
        let output_partitioning = inputs[0].output_partitioning().clone();
        // Determine execution mode:
        let mode = execution_mode_from_children(inputs.iter());

        PlanProperties::new(eq_properties, output_partitioning, mode)
    }
}

impl DisplayAs for InterleaveExec {
    fn fmt_as(
        &self,
        t: DisplayFormatType,
        f: &mut std::fmt::Formatter,
    ) -> std::fmt::Result {
        match t {
            DisplayFormatType::Default | DisplayFormatType::Verbose => {
                write!(f, "InterleaveExec")
            }
        }
    }
}

impl ExecutionPlan for InterleaveExec {
    fn name(&self) -> &'static str {
        "InterleaveExec"
    }

    /// Return a reference to Any that can be used for downcasting
    fn as_any(&self) -> &dyn Any {
        self
    }

    fn properties(&self) -> &PlanProperties {
        &self.cache
    }

    fn children(&self) -> Vec<&Arc<dyn ExecutionPlan>> {
        self.inputs.iter().collect()
    }

    fn maintains_input_order(&self) -> Vec<bool> {
        vec![false; self.inputs().len()]
    }

    fn with_new_children(
        self: Arc<Self>,
        children: Vec<Arc<dyn ExecutionPlan>>,
    ) -> Result<Arc<dyn ExecutionPlan>> {
        // New children are no longer interleavable, which might be a bug in an optimizer rewrite.
        if !can_interleave(children.iter()) {
            return internal_err!(
                "Can not create InterleaveExec: new children can not be interleaved"
            );
        }
        Ok(Arc::new(InterleaveExec::try_new(children)?))
    }

    fn execute(
        &self,
        partition: usize,
        context: Arc<TaskContext>,
    ) -> Result<SendableRecordBatchStream> {
        trace!("Start InterleaveExec::execute for partition {} of context session_id {} and task_id {:?}", partition, context.session_id(), context.task_id());
        let baseline_metrics = BaselineMetrics::new(&self.metrics, partition);
        // record the tiny amount of work done in this function so
        // elapsed_compute is reported as non zero
        let elapsed_compute = baseline_metrics.elapsed_compute().clone();
        let _timer = elapsed_compute.timer(); // record on drop

        let mut input_stream_vec = vec![];
        for input in self.inputs.iter() {
            if partition < input.output_partitioning().partition_count() {
                input_stream_vec.push(input.execute(partition, Arc::clone(&context))?);
            } else {
                // Could not find a partition to execute
                break;
            }
        }
        if input_stream_vec.len() == self.inputs.len() {
            let stream = Box::pin(CombinedRecordBatchStream::new(
                self.schema(),
                input_stream_vec,
            ));
            return Ok(Box::pin(ObservedStream::new(stream, baseline_metrics)));
        }

        warn!("Error in InterleaveExec: Partition {} not found", partition);

        exec_err!("Partition {partition} not found in InterleaveExec")
    }

    fn metrics(&self) -> Option<MetricsSet> {
        Some(self.metrics.clone_inner())
    }

    fn statistics(&self) -> Result<Statistics> {
        let stats = self
            .inputs
            .iter()
            .map(|stat| stat.statistics())
            .collect::<Result<Vec<_>>>()?;

        Ok(stats
            .into_iter()
            .reduce(stats_union)
            .unwrap_or_else(|| Statistics::new_unknown(&self.schema())))
    }

    fn benefits_from_input_partitioning(&self) -> Vec<bool> {
        vec![false; self.children().len()]
    }
}

/// The `InterleaveExec` is partition-aware: it requires all of its input
/// partitions to have the same `Hash` partitioning spec as the first input.
///
/// This might be too strict in cases where the input partitioning specs are
/// compatible but not exactly the same. For example, if one input has the
/// partitioning spec Hash('a','b','c') and another has the spec Hash('a'),
/// it would be safe to derive the output partitioning with the spec
/// Hash('a','b','c').
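///
/// A hedged usage sketch (both plan lists are hypothetical):
///
/// ```ignore
/// // All children hash-partitioned identically => interleavable.
/// assert!(can_interleave(hash_partitioned_plans.iter()));
/// // Round-robin or mixed partitionings (and empty input) are rejected.
/// assert!(!can_interleave(round_robin_plans.iter()));
/// ```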
pub fn can_interleave<T: Borrow<Arc<dyn ExecutionPlan>>>(
    mut inputs: impl Iterator<Item = T>,
) -> bool {
    let Some(first) = inputs.next() else {
        return false;
    };

    let reference = first.borrow().output_partitioning();
    matches!(reference, Partitioning::Hash(_, _))
        && inputs
            .map(|plan| plan.borrow().output_partitioning().clone())
            .all(|partition| partition == *reference)
}

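/// Computes the output schema of a union by merging the input schemas
/// positionally: for each field index, a nullable variant of the field is
/// preferred if any input declares it nullable, and the field's metadata is
/// extended with the corresponding field metadata of the second input (when
/// present).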
fn union_schema(inputs: &[Arc<dyn ExecutionPlan>]) -> SchemaRef {
    let fields: Vec<Field> = (0..inputs[0].schema().fields().len())
        .map(|i| {
            inputs
                .iter()
                .filter_map(|input| {
                    if input.schema().fields().len() > i {
                        let field = input.schema().field(i).clone();
                        let right_hand_metadata = inputs
                            .get(1)
                            .map(|right_input| {
                                right_input.schema().field(i).metadata().clone()
                            })
                            .unwrap_or_default();
                        let mut metadata = field.metadata().clone();
                        metadata.extend(right_hand_metadata);
                        Some(field.with_metadata(metadata))
                    } else {
                        None
                    }
                })
                .find_or_first(|f| f.is_nullable())
                .unwrap()
        })
        .collect();

    Arc::new(Schema::new_with_metadata(
        fields,
        inputs[0].schema().metadata().clone(),
    ))
}

/// CombinedRecordBatchStream can be used to combine a Vec of SendableRecordBatchStreams into one
struct CombinedRecordBatchStream {
    /// Schema wrapped by Arc
    schema: SchemaRef,
    /// Stream entries
    entries: Vec<SendableRecordBatchStream>,
}

impl CombinedRecordBatchStream {
    /// Create a CombinedRecordBatchStream
    pub fn new(schema: SchemaRef, entries: Vec<SendableRecordBatchStream>) -> Self {
        Self { schema, entries }
    }
}

impl RecordBatchStream for CombinedRecordBatchStream {
    fn schema(&self) -> SchemaRef {
        Arc::clone(&self.schema)
    }
}

impl Stream for CombinedRecordBatchStream {
    type Item = Result<RecordBatch>;

    fn poll_next(
        mut self: Pin<&mut Self>,
        cx: &mut Context<'_>,
    ) -> Poll<Option<Self::Item>> {
        use Poll::*;

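        // Pick a random starting entry so that, across repeated calls, no
        // single input stream is systematically polled before the others.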
        let start = thread_rng_n(self.entries.len() as u32) as usize;
        let mut idx = start;

        for _ in 0..self.entries.len() {
            let stream = self.entries.get_mut(idx).unwrap();

            match Pin::new(stream).poll_next(cx) {
                Ready(Some(val)) => return Ready(Some(val)),
                Ready(None) => {
                    // Remove the entry
                    self.entries.swap_remove(idx);

                    // Check if this was the last entry, if so the cursor needs
                    // to wrap
                    if idx == self.entries.len() {
                        idx = 0;
                    } else if idx < start && start <= self.entries.len() {
                        // The stream being swapped into the current index has
                        // already been polled, so skip it.
                        idx = idx.wrapping_add(1) % self.entries.len();
                    }
                }
                Pending => {
                    idx = idx.wrapping_add(1) % self.entries.len();
                }
            }
        }

        // If there are no entries left, the stream is complete.
        if self.entries.is_empty() {
            Ready(None)
        } else {
            Pending
        }
    }
}

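/// Merges two `ColumnStatistics` for a union: the distinct count is no
/// longer known (it becomes `Absent`), the min/max bounds widen to cover
/// both inputs, and the null counts add up.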
fn col_stats_union(
    mut left: ColumnStatistics,
    right: ColumnStatistics,
) -> ColumnStatistics {
    left.distinct_count = Precision::Absent;
    left.min_value = left.min_value.min(&right.min_value);
    left.max_value = left.max_value.max(&right.max_value);
    left.null_count = left.null_count.add(&right.null_count);

    left
}

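/// Merges two `Statistics` for a union: row counts and total byte sizes add
/// up, and column statistics are merged pairwise via `col_stats_union`.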
fn stats_union(mut left: Statistics, right: Statistics) -> Statistics {
    left.num_rows = left.num_rows.add(&right.num_rows);
    left.total_byte_size = left.total_byte_size.add(&right.total_byte_size);
    left.column_statistics = left
        .column_statistics
        .into_iter()
        .zip(right.column_statistics)
        .map(|(a, b)| col_stats_union(a, b))
        .collect::<Vec<_>>();
    left
}

#[cfg(test)]
mod tests {
    use super::*;
    use crate::collect;
    use crate::memory::MemoryExec;
    use crate::test;

    use arrow_schema::{DataType, SortOptions};
    use datafusion_common::ScalarValue;
    use datafusion_physical_expr::expressions::col;
    use datafusion_physical_expr::{PhysicalExpr, PhysicalSortExpr};

    // Generate a schema which consists of 7 columns (a, b, c, d, e, f, g)
    fn create_test_schema() -> Result<SchemaRef> {
        let a = Field::new("a", DataType::Int32, true);
        let b = Field::new("b", DataType::Int32, true);
        let c = Field::new("c", DataType::Int32, true);
        let d = Field::new("d", DataType::Int32, true);
        let e = Field::new("e", DataType::Int32, true);
        let f = Field::new("f", DataType::Int32, true);
        let g = Field::new("g", DataType::Int32, true);
        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f, g]));

        Ok(schema)
    }

    // Convert each tuple to PhysicalSortExpr
    fn convert_to_sort_exprs(
        in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
    ) -> Vec<PhysicalSortExpr> {
        in_data
            .iter()
            .map(|(expr, options)| PhysicalSortExpr {
                expr: Arc::clone(*expr),
                options: *options,
            })
            .collect::<Vec<_>>()
    }

    #[tokio::test]
    async fn test_union_partitions() -> Result<()> {
        let task_ctx = Arc::new(TaskContext::default());

        // Create inputs with different partitioning
        let csv = test::scan_partitioned(4);
        let csv2 = test::scan_partitioned(5);

        let union_exec = Arc::new(UnionExec::new(vec![csv, csv2]));

        // Should have 9 partitions and 9 output batches
        assert_eq!(
            union_exec
                .properties()
                .output_partitioning()
                .partition_count(),
            9
        );

        let result: Vec<RecordBatch> = collect(union_exec, task_ctx).await?;
        assert_eq!(result.len(), 9);

        Ok(())
    }

    #[tokio::test]
    async fn test_stats_union() {
        let left = Statistics {
            num_rows: Precision::Exact(5),
            total_byte_size: Precision::Exact(23),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Exact(5),
                    max_value: Precision::Exact(ScalarValue::Int64(Some(21))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                    null_count: Precision::Exact(0),
                },
                ColumnStatistics {
                    distinct_count: Precision::Exact(1),
                    max_value: Precision::Exact(ScalarValue::from("x")),
                    min_value: Precision::Exact(ScalarValue::from("a")),
                    null_count: Precision::Exact(3),
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::Float32(Some(1.1))),
                    min_value: Precision::Exact(ScalarValue::Float32(Some(0.1))),
                    null_count: Precision::Absent,
                },
            ],
        };

        let right = Statistics {
            num_rows: Precision::Exact(7),
            total_byte_size: Precision::Exact(29),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Exact(3),
                    max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(1))),
                    null_count: Precision::Exact(1),
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::from("c")),
                    min_value: Precision::Exact(ScalarValue::from("b")),
                    null_count: Precision::Absent,
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Absent,
                    min_value: Precision::Absent,
                    null_count: Precision::Absent,
                },
            ],
        };

        let result = stats_union(left, right);
        let expected = Statistics {
            num_rows: Precision::Exact(12),
            total_byte_size: Precision::Exact(52),
            column_statistics: vec![
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::Int64(Some(34))),
                    min_value: Precision::Exact(ScalarValue::Int64(Some(-4))),
                    null_count: Precision::Exact(1),
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Exact(ScalarValue::from("x")),
                    min_value: Precision::Exact(ScalarValue::from("a")),
                    null_count: Precision::Absent,
                },
                ColumnStatistics {
                    distinct_count: Precision::Absent,
                    max_value: Precision::Absent,
                    min_value: Precision::Absent,
                    null_count: Precision::Absent,
                },
            ],
        };

        assert_eq!(result, expected);
    }

    #[tokio::test]
    async fn test_union_equivalence_properties() -> Result<()> {
        let schema = create_test_schema()?;
        let col_a = &col("a", &schema)?;
        let col_b = &col("b", &schema)?;
        let col_c = &col("c", &schema)?;
        let col_d = &col("d", &schema)?;
        let col_e = &col("e", &schema)?;
        let col_f = &col("f", &schema)?;
        let options = SortOptions::default();
        let test_cases = [
            //-----------TEST CASE 1----------//
            (
                // First child orderings
                vec![
                    // [a ASC, b ASC, f ASC]
                    vec![(col_a, options), (col_b, options), (col_f, options)],
                ],
                // Second child orderings
                vec![
                    // [a ASC, b ASC, c ASC]
                    vec![(col_a, options), (col_b, options), (col_c, options)],
                    // [a ASC, b ASC, f ASC]
                    vec![(col_a, options), (col_b, options), (col_f, options)],
                ],
                // Union output orderings
                vec![
                    // [a ASC, b ASC, f ASC]
                    vec![(col_a, options), (col_b, options), (col_f, options)],
                ],
            ),
            //-----------TEST CASE 2----------//
            (
                // First child orderings
                vec![
                    // [a ASC, b ASC, f ASC]
                    vec![(col_a, options), (col_b, options), (col_f, options)],
                    // [d ASC]
                    vec![(col_d, options)],
                ],
                // Second child orderings
                vec![
                    // [a ASC, b ASC, c ASC]
                    vec![(col_a, options), (col_b, options), (col_c, options)],
                    // [e ASC]
                    vec![(col_e, options)],
                ],
                // Union output orderings
                vec![
                    // [a ASC, b ASC]
                    vec![(col_a, options), (col_b, options)],
                ],
            ),
        ];

        for (
            test_idx,
            (first_child_orderings, second_child_orderings, union_orderings),
        ) in test_cases.iter().enumerate()
        {
            let first_orderings = first_child_orderings
                .iter()
                .map(|ordering| convert_to_sort_exprs(ordering))
                .collect::<Vec<_>>();
            let second_orderings = second_child_orderings
                .iter()
                .map(|ordering| convert_to_sort_exprs(ordering))
                .collect::<Vec<_>>();
            let union_expected_orderings = union_orderings
                .iter()
                .map(|ordering| convert_to_sort_exprs(ordering))
                .collect::<Vec<_>>();
            let child1 = Arc::new(
                MemoryExec::try_new(&[], Arc::clone(&schema), None)?
                    .with_sort_information(first_orderings),
            );
            let child2 = Arc::new(
                MemoryExec::try_new(&[], Arc::clone(&schema), None)?
                    .with_sort_information(second_orderings),
            );

            let mut union_expected_eq = EquivalenceProperties::new(Arc::clone(&schema));
            union_expected_eq.add_new_orderings(union_expected_orderings);

            let union = UnionExec::new(vec![child1, child2]);
            let union_eq_properties = union.properties().equivalence_properties();
            let err_msg = format!(
                "Error in test id: {:?}, test case: {:?}",
                test_idx, test_cases[test_idx]
            );
            assert_eq_properties_same(union_eq_properties, &union_expected_eq, err_msg);
        }
        Ok(())
    }

    fn assert_eq_properties_same(
        lhs: &EquivalenceProperties,
        rhs: &EquivalenceProperties,
        err_msg: String,
    ) {
        // Check whether the orderings are the same.
        let lhs_orderings = lhs.oeq_class();
        let rhs_orderings = &rhs.oeq_class.orderings;
        assert_eq!(lhs_orderings.len(), rhs_orderings.len(), "{}", err_msg);
        for rhs_ordering in rhs_orderings {
            assert!(lhs_orderings.contains(rhs_ordering), "{}", err_msg);
        }
    }
}