Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/partitioning.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
//! [`Partitioning`] and [`Distribution`] for `ExecutionPlans`
19
20
use crate::{
21
    equivalence::ProjectionMapping, expressions::UnKnownColumn, physical_exprs_equal,
22
    EquivalenceProperties, PhysicalExpr,
23
};
24
use datafusion_physical_expr_common::physical_expr::format_physical_expr_list;
25
use std::fmt;
26
use std::fmt::Display;
27
use std::sync::Arc;
28
29
/// Output partitioning supported by [`ExecutionPlan`]s.
30
///
31
/// Calling [`ExecutionPlan::execute`] produce one or more independent streams of
32
/// [`RecordBatch`]es in parallel, referred to as partitions. The streams are Rust
33
/// `async` [`Stream`]s (a special kind of future). The number of output
34
/// partitions varies based on the input and the operation performed.
35
///
36
/// For example, an `ExecutionPlan` that has output partitioning of 3 will
37
/// produce 3 distinct output streams as the result of calling
38
/// `ExecutionPlan::execute(0)`, `ExecutionPlan::execute(1)`, and
39
/// `ExecutionPlan::execute(2)`, as shown below:
40
///
41
/// ```text
42
///                                                   ...         ...        ...
43
///               ...                                  ▲           ▲           ▲
44
///                                                    │           │           │
45
///                ▲                                   │           │           │
46
///                │                                   │           │           │
47
///                │                               ┌───┴────┐  ┌───┴────┐  ┌───┴────┐
48
///     ┌────────────────────┐                     │ Stream │  │ Stream │  │ Stream │
49
///     │   ExecutionPlan    │                     │  (0)   │  │  (1)   │  │  (2)   │
50
///     └────────────────────┘                     └────────┘  └────────┘  └────────┘
51
///                ▲                                   ▲           ▲           ▲
52
///                │                                   │           │           │
53
///     ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─                          │           │           │
54
///             Input        │                         │           │           │
55
///     └ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─                          │           │           │
56
///                ▲                               ┌ ─ ─ ─ ─   ┌ ─ ─ ─ ─   ┌ ─ ─ ─ ─
57
///                │                                 Input  │    Input  │    Input  │
58
///                │                               │ Stream    │ Stream    │ Stream
59
///                                                   (0)   │     (1)   │     (2)   │
60
///               ...                              └ ─ ▲ ─ ─   └ ─ ▲ ─ ─   └ ─ ▲ ─ ─
61
///                                                    │           │           │
62
///                                                    │           │           │
63
///                                                    │           │           │
64
///
65
/// ExecutionPlan with 1 input                      3 (async) streams, one for each
66
/// that has 3 partitions, which itself             output partition
67
/// has 3 output partitions
68
/// ```
69
///
70
/// It is common (but not required) that an `ExecutionPlan` has the same number
71
/// of input partitions as output partitions. However, some plans have different
72
/// numbers such as the `RepartitionExec` that redistributes batches from some
73
/// number of inputs to some number of outputs
74
///
75
/// ```text
76
///               ...                                     ...         ...        ...
77
///
78
///                                                        ▲           ▲           ▲
79
///                ▲                                       │           │           │
80
///                │                                       │           │           │
81
///       ┌────────┴───────────┐                           │           │           │
82
///       │  RepartitionExec   │                      ┌────┴───┐  ┌────┴───┐  ┌────┴───┐
83
///       └────────────────────┘                      │ Stream │  │ Stream │  │ Stream │
84
///                ▲                                  │  (0)   │  │  (1)   │  │  (2)   │
85
///                │                                  └────────┘  └────────┘  └────────┘
86
///                │                                       ▲           ▲           ▲
87
///                ...                                     │           │           │
88
///                                                        └──────────┐│┌──────────┘
89
///                                                                   │││
90
///                                                                   │││
91
/// RepartitionExec with 1 input
92
/// partition and 3 output partitions                 3 (async) streams, that internally
93
///                                                    pull from the same input stream
94
///                                                                  ...
95
/// ```
96
///
97
/// # Additional Examples
98
///
99
/// A simple `FileScanExec` might produce one output stream (partition) for each
100
/// file (note the actual DataFusion file scaners can read individual files in
101
/// parallel, potentially producing multiple partitions per file)
102
///
103
/// Plans such as `SortPreservingMerge` produce a single output stream
104
/// (1 output partition) by combining some number of input streams (input partitions)
105
///
106
/// Plans such as `FilterExec` produce the same number of output streams
107
/// (partitions) as input streams (partitions).
108
///
109
/// [`RecordBatch`]: arrow::record_batch::RecordBatch
110
/// [`ExecutionPlan::execute`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html#tymethod.execute
111
/// [`ExecutionPlan`]: https://docs.rs/datafusion/latest/datafusion/physical_plan/trait.ExecutionPlan.html
112
/// [`Stream`]: https://docs.rs/futures/latest/futures/stream/trait.Stream.html
113
#[derive(Debug, Clone)]
114
pub enum Partitioning {
115
    /// Allocate batches using a round-robin algorithm and the specified number of partitions
116
    RoundRobinBatch(usize),
117
    /// Allocate rows based on a hash of one of more expressions and the specified number of
118
    /// partitions
119
    Hash(Vec<Arc<dyn PhysicalExpr>>, usize),
120
    /// Unknown partitioning scheme with a known number of partitions
121
    UnknownPartitioning(usize),
122
}
123
124
impl fmt::Display for Partitioning {
125
3
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
126
3
        match self {
127
3
            Partitioning::RoundRobinBatch(size) => write!(f, "RoundRobinBatch({size})"),
128
0
            Partitioning::Hash(phy_exprs, size) => {
129
0
                let phy_exprs_str = phy_exprs
130
0
                    .iter()
131
0
                    .map(|e| format!("{e}"))
132
0
                    .collect::<Vec<String>>()
133
0
                    .join(", ");
134
0
                write!(f, "Hash([{phy_exprs_str}], {size})")
135
            }
136
0
            Partitioning::UnknownPartitioning(size) => {
137
0
                write!(f, "UnknownPartitioning({size})")
138
            }
139
        }
140
3
    }
141
}
142
impl Partitioning {
143
    /// Returns the number of partitions in this partitioning scheme
144
18.7k
    pub fn partition_count(&self) -> usize {
145
        use Partitioning::*;
146
18.7k
        match self {
147
18.7k
            RoundRobinBatch(
n46
) | Hash(_,
n6.96k
) | UnknownPartitioning(
n11.7k
) => *n,
148
18.7k
        }
149
18.7k
    }
150
151
    /// Returns true when the guarantees made by this [`Partitioning`] are sufficient to
152
    /// satisfy the partitioning scheme mandated by the `required` [`Distribution`].
153
0
    pub fn satisfy(
154
0
        &self,
155
0
        required: &Distribution,
156
0
        eq_properties: &EquivalenceProperties,
157
0
    ) -> bool {
158
0
        match required {
159
0
            Distribution::UnspecifiedDistribution => true,
160
0
            Distribution::SinglePartition if self.partition_count() == 1 => true,
161
            // When partition count is 1, hash requirement is satisfied.
162
0
            Distribution::HashPartitioned(_) if self.partition_count() == 1 => true,
163
0
            Distribution::HashPartitioned(required_exprs) => {
164
0
                match self {
165
                    // Here we do not check the partition count for hash partitioning and assumes the partition count
166
                    // and hash functions in the system are the same. In future if we plan to support storage partition-wise joins,
167
                    // then we need to have the partition count and hash functions validation.
168
0
                    Partitioning::Hash(partition_exprs, _) => {
169
0
                        let fast_match =
170
0
                            physical_exprs_equal(required_exprs, partition_exprs);
171
0
                        // If the required exprs do not match, need to leverage the eq_properties provided by the child
172
0
                        // and normalize both exprs based on the equivalent groups.
173
0
                        if !fast_match {
174
0
                            let eq_groups = eq_properties.eq_group();
175
0
                            if !eq_groups.is_empty() {
176
0
                                let normalized_required_exprs = required_exprs
177
0
                                    .iter()
178
0
                                    .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
179
0
                                    .collect::<Vec<_>>();
180
0
                                let normalized_partition_exprs = partition_exprs
181
0
                                    .iter()
182
0
                                    .map(|e| eq_groups.normalize_expr(Arc::clone(e)))
183
0
                                    .collect::<Vec<_>>();
184
0
                                return physical_exprs_equal(
185
0
                                    &normalized_required_exprs,
186
0
                                    &normalized_partition_exprs,
187
0
                                );
188
0
                            }
189
0
                        }
190
0
                        fast_match
191
                    }
192
0
                    _ => false,
193
                }
194
            }
195
0
            _ => false,
196
        }
197
0
    }
198
199
    /// Calculate the output partitioning after applying the given projection.
200
27
    pub fn project(
201
27
        &self,
202
27
        projection_mapping: &ProjectionMapping,
203
27
        input_eq_properties: &EquivalenceProperties,
204
27
    ) -> Self {
205
27
        if let Partitioning::Hash(
exprs, part0
) = self {
206
0
            let normalized_exprs = exprs
207
0
                .iter()
208
0
                .map(|expr| {
209
0
                    input_eq_properties
210
0
                        .project_expr(expr, projection_mapping)
211
0
                        .unwrap_or_else(|| {
212
0
                            Arc::new(UnKnownColumn::new(&expr.to_string()))
213
0
                        })
214
0
                })
215
0
                .collect();
216
0
            Partitioning::Hash(normalized_exprs, *part)
217
        } else {
218
27
            self.clone()
219
        }
220
27
    }
221
}
222
223
impl PartialEq for Partitioning {
224
0
    fn eq(&self, other: &Partitioning) -> bool {
225
0
        match (self, other) {
226
            (
227
0
                Partitioning::RoundRobinBatch(count1),
228
0
                Partitioning::RoundRobinBatch(count2),
229
0
            ) if count1 == count2 => true,
230
0
            (Partitioning::Hash(exprs1, count1), Partitioning::Hash(exprs2, count2))
231
0
                if physical_exprs_equal(exprs1, exprs2) && (count1 == count2) =>
232
0
            {
233
0
                true
234
            }
235
0
            _ => false,
236
        }
237
0
    }
238
}
239
240
/// How data is distributed amongst partitions. See [`Partitioning`] for more
241
/// details.
242
#[derive(Debug, Clone)]
243
pub enum Distribution {
244
    /// Unspecified distribution
245
    UnspecifiedDistribution,
246
    /// A single partition is required
247
    SinglePartition,
248
    /// Requires children to be distributed in such a way that the same
249
    /// values of the keys end up in the same partition
250
    HashPartitioned(Vec<Arc<dyn PhysicalExpr>>),
251
}
252
253
impl Distribution {
254
    /// Creates a `Partitioning` that satisfies this `Distribution`
255
0
    pub fn create_partitioning(self, partition_count: usize) -> Partitioning {
256
0
        match self {
257
            Distribution::UnspecifiedDistribution => {
258
0
                Partitioning::UnknownPartitioning(partition_count)
259
            }
260
0
            Distribution::SinglePartition => Partitioning::UnknownPartitioning(1),
261
0
            Distribution::HashPartitioned(expr) => {
262
0
                Partitioning::Hash(expr, partition_count)
263
            }
264
        }
265
0
    }
266
}
267
268
impl Display for Distribution {
269
0
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
270
0
        match self {
271
0
            Distribution::UnspecifiedDistribution => write!(f, "Unspecified"),
272
0
            Distribution::SinglePartition => write!(f, "SinglePartition"),
273
0
            Distribution::HashPartitioned(exprs) => {
274
0
                write!(f, "HashPartitioned[{}])", format_physical_expr_list(exprs))
275
            }
276
        }
277
0
    }
278
}
279
280
#[cfg(test)]
281
mod tests {
282
283
    use super::*;
284
    use crate::expressions::Column;
285
286
    use arrow::datatypes::{DataType, Field, Schema};
287
    use datafusion_common::Result;
288
289
    #[test]
290
    fn partitioning_satisfy_distribution() -> Result<()> {
291
        let schema = Arc::new(Schema::new(vec![
292
            Field::new("column_1", DataType::Int64, false),
293
            Field::new("column_2", DataType::Utf8, false),
294
        ]));
295
296
        let partition_exprs1: Vec<Arc<dyn PhysicalExpr>> = vec![
297
            Arc::new(Column::new_with_schema("column_1", &schema).unwrap()),
298
            Arc::new(Column::new_with_schema("column_2", &schema).unwrap()),
299
        ];
300
301
        let partition_exprs2: Vec<Arc<dyn PhysicalExpr>> = vec![
302
            Arc::new(Column::new_with_schema("column_2", &schema).unwrap()),
303
            Arc::new(Column::new_with_schema("column_1", &schema).unwrap()),
304
        ];
305
306
        let distribution_types = vec![
307
            Distribution::UnspecifiedDistribution,
308
            Distribution::SinglePartition,
309
            Distribution::HashPartitioned(partition_exprs1.clone()),
310
        ];
311
312
        let single_partition = Partitioning::UnknownPartitioning(1);
313
        let unspecified_partition = Partitioning::UnknownPartitioning(10);
314
        let round_robin_partition = Partitioning::RoundRobinBatch(10);
315
        let hash_partition1 = Partitioning::Hash(partition_exprs1, 10);
316
        let hash_partition2 = Partitioning::Hash(partition_exprs2, 10);
317
        let eq_properties = EquivalenceProperties::new(schema);
318
319
        for distribution in distribution_types {
320
            let result = (
321
                single_partition.satisfy(&distribution, &eq_properties),
322
                unspecified_partition.satisfy(&distribution, &eq_properties),
323
                round_robin_partition.satisfy(&distribution, &eq_properties),
324
                hash_partition1.satisfy(&distribution, &eq_properties),
325
                hash_partition2.satisfy(&distribution, &eq_properties),
326
            );
327
328
            match distribution {
329
                Distribution::UnspecifiedDistribution => {
330
                    assert_eq!(result, (true, true, true, true, true))
331
                }
332
                Distribution::SinglePartition => {
333
                    assert_eq!(result, (true, false, false, false, false))
334
                }
335
                Distribution::HashPartitioned(_) => {
336
                    assert_eq!(result, (true, false, false, true, false))
337
                }
338
            }
339
        }
340
341
        Ok(())
342
    }
343
}