Coverage Report

Created: 2024-10-13 08:39

/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/equivalence/mod.rs
Line
Count
Source (jump to first uncovered line)
1
// Licensed to the Apache Software Foundation (ASF) under one
2
// or more contributor license agreements.  See the NOTICE file
3
// distributed with this work for additional information
4
// regarding copyright ownership.  The ASF licenses this file
5
// to you under the Apache License, Version 2.0 (the
6
// "License"); you may not use this file except in compliance
7
// with the License.  You may obtain a copy of the License at
8
//
9
//   http://www.apache.org/licenses/LICENSE-2.0
10
//
11
// Unless required by applicable law or agreed to in writing,
12
// software distributed under the License is distributed on an
13
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14
// KIND, either express or implied.  See the License for the
15
// specific language governing permissions and limitations
16
// under the License.
17
18
use std::sync::Arc;
19
20
use crate::expressions::Column;
21
use crate::{LexRequirement, PhysicalExpr, PhysicalSortRequirement};
22
23
use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode};
24
25
mod class;
26
mod ordering;
27
mod projection;
28
mod properties;
29
30
pub use class::{ConstExpr, EquivalenceClass, EquivalenceGroup};
31
pub use ordering::OrderingEquivalenceClass;
32
pub use projection::ProjectionMapping;
33
pub use properties::{
34
    calculate_union, join_equivalence_properties, EquivalenceProperties,
35
};
36
37
/// This function constructs a duplicate-free `LexOrderingReq` by filtering out
38
/// duplicate entries that have same physical expression inside. For example,
39
/// `vec![a Some(ASC), a Some(DESC)]` collapses to `vec![a Some(ASC)]`.
40
///
41
/// It will also filter out entries that are ordered if the next entry is;
42
/// for instance, `vec![floor(a) Some(ASC), a Some(ASC)]` will be collapsed to
43
/// `vec![a Some(ASC)]`.
44
2.64k
pub fn collapse_lex_req(input: LexRequirement) -> LexRequirement {
45
2.64k
    let mut output = Vec::<PhysicalSortRequirement>::new();
46
8.62k
    for item in input {   [region count: item = 5.97k]
47
6.52k
        if !output.iter().any(|req| req.expr.eq(&item.expr)) {   [region counts: 5.97k, 5.97k]
48
5.93k
            output.push(item);
49
5.93k
        }
40
50
    }
51
2.64k
    LexRequirement::new(output)
52
2.64k
}
53
54
/// Adds the `offset` value to `Column` indices inside `expr`. This function is
55
/// generally used during the update of the right table schema in join operations.
56
302
pub fn add_offset_to_expr(
57
302
    expr: Arc<dyn PhysicalExpr>,
58
302
    offset: usize,
59
302
) -> Arc<dyn PhysicalExpr> {
60
302
    expr.transform_down(|e| match e.as_any().downcast_ref::<Column>() {
61
302
        Some(col) => Ok(Transformed::yes(Arc::new(Column::new(
62
302
            col.name(),
63
302
            offset + col.index(),
64
302
        )))),
65
0
        None => Ok(Transformed::no(e)),
66
302
    })
67
302
    .data()
68
302
    .unwrap()
69
302
    // Note that we can safely unwrap here since our transform always returns
70
302
    // an `Ok` value.
71
302
}
72
73
#[cfg(test)]
74
mod tests {
75
    use super::*;
76
    use crate::expressions::col;
77
    use crate::PhysicalSortExpr;
78
79
    use arrow::compute::{lexsort_to_indices, SortColumn};
80
    use arrow::datatypes::{DataType, Field, Schema};
81
    use arrow_array::{ArrayRef, Float64Array, RecordBatch, UInt32Array};
82
    use arrow_schema::{SchemaRef, SortOptions};
83
    use datafusion_common::{plan_datafusion_err, Result};
84
85
    use itertools::izip;
86
    use rand::rngs::StdRng;
87
    use rand::seq::SliceRandom;
88
    use rand::{Rng, SeedableRng};
89
90
    pub fn output_schema(
91
        mapping: &ProjectionMapping,
92
        input_schema: &Arc<Schema>,
93
    ) -> Result<SchemaRef> {
94
        // Calculate output schema
95
        let fields: Result<Vec<Field>> = mapping
96
            .iter()
97
            .map(|(source, target)| {
98
                let name = target
99
                    .as_any()
100
                    .downcast_ref::<Column>()
101
                    .ok_or_else(|| plan_datafusion_err!("Expects to have column"))?
102
                    .name();
103
                let field = Field::new(
104
                    name,
105
                    source.data_type(input_schema)?,
106
                    source.nullable(input_schema)?,
107
                );
108
109
                Ok(field)
110
            })
111
            .collect();
112
113
        let output_schema = Arc::new(Schema::new_with_metadata(
114
            fields?,
115
            input_schema.metadata().clone(),
116
        ));
117
118
        Ok(output_schema)
119
    }
120
121
    // Generate a schema which consists of 8 columns (a, b, c, d, e, f, g, h)
122
    pub fn create_test_schema() -> Result<SchemaRef> {
123
        let a = Field::new("a", DataType::Int32, true);
124
        let b = Field::new("b", DataType::Int32, true);
125
        let c = Field::new("c", DataType::Int32, true);
126
        let d = Field::new("d", DataType::Int32, true);
127
        let e = Field::new("e", DataType::Int32, true);
128
        let f = Field::new("f", DataType::Int32, true);
129
        let g = Field::new("g", DataType::Int32, true);
130
        let h = Field::new("h", DataType::Int32, true);
131
        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f, g, h]));
132
133
        Ok(schema)
134
    }
135
136
    /// Construct a schema with following properties
137
    /// Schema satisfies following orderings:
138
    /// [a ASC], [d ASC, b ASC], [e DESC, f ASC, g ASC]
139
    /// and
140
    /// Column [a=c] (e.g they are aliases).
141
    pub fn create_test_params() -> Result<(SchemaRef, EquivalenceProperties)> {
142
        let test_schema = create_test_schema()?;
143
        let col_a = &col("a", &test_schema)?;
144
        let col_b = &col("b", &test_schema)?;
145
        let col_c = &col("c", &test_schema)?;
146
        let col_d = &col("d", &test_schema)?;
147
        let col_e = &col("e", &test_schema)?;
148
        let col_f = &col("f", &test_schema)?;
149
        let col_g = &col("g", &test_schema)?;
150
        let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
151
        eq_properties.add_equal_conditions(col_a, col_c)?;
152
153
        let option_asc = SortOptions {
154
            descending: false,
155
            nulls_first: false,
156
        };
157
        let option_desc = SortOptions {
158
            descending: true,
159
            nulls_first: true,
160
        };
161
        let orderings = vec![
162
            // [a ASC]
163
            vec![(col_a, option_asc)],
164
            // [d ASC, b ASC]
165
            vec![(col_d, option_asc), (col_b, option_asc)],
166
            // [e DESC, f ASC, g ASC]
167
            vec![
168
                (col_e, option_desc),
169
                (col_f, option_asc),
170
                (col_g, option_asc),
171
            ],
172
        ];
173
        let orderings = convert_to_orderings(&orderings);
174
        eq_properties.add_new_orderings(orderings);
175
        Ok((test_schema, eq_properties))
176
    }
177
178
    // Generate a schema which consists of 6 columns (a, b, c, d, e, f)
179
    fn create_test_schema_2() -> Result<SchemaRef> {
180
        let a = Field::new("a", DataType::Float64, true);
181
        let b = Field::new("b", DataType::Float64, true);
182
        let c = Field::new("c", DataType::Float64, true);
183
        let d = Field::new("d", DataType::Float64, true);
184
        let e = Field::new("e", DataType::Float64, true);
185
        let f = Field::new("f", DataType::Float64, true);
186
        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f]));
187
188
        Ok(schema)
189
    }
190
191
    /// Construct a schema with random ordering
192
    /// among column a, b, c, d
193
    /// where
194
    /// Column [a=f] (e.g they are aliases).
195
    /// Column e is constant.
196
    pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperties)> {
197
        let test_schema = create_test_schema_2()?;
198
        let col_a = &col("a", &test_schema)?;
199
        let col_b = &col("b", &test_schema)?;
200
        let col_c = &col("c", &test_schema)?;
201
        let col_d = &col("d", &test_schema)?;
202
        let col_e = &col("e", &test_schema)?;
203
        let col_f = &col("f", &test_schema)?;
204
        let col_exprs = [col_a, col_b, col_c, col_d, col_e, col_f];
205
206
        let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
207
        // Define a and f are aliases
208
        eq_properties.add_equal_conditions(col_a, col_f)?;
209
        // Column e has constant value.
210
        eq_properties = eq_properties.with_constants([ConstExpr::from(col_e)]);
211
212
        // Randomly order columns for sorting
213
        let mut rng = StdRng::seed_from_u64(seed);
214
        let mut remaining_exprs = col_exprs[0..4].to_vec(); // only a, b, c, d are sorted
215
216
        let options_asc = SortOptions {
217
            descending: false,
218
            nulls_first: false,
219
        };
220
221
        while !remaining_exprs.is_empty() {
222
            let n_sort_expr = rng.gen_range(0..remaining_exprs.len() + 1);
223
            remaining_exprs.shuffle(&mut rng);
224
225
            let ordering = remaining_exprs
226
                .drain(0..n_sort_expr)
227
                .map(|expr| PhysicalSortExpr {
228
                    expr: Arc::clone(expr),
229
                    options: options_asc,
230
                })
231
                .collect();
232
233
            eq_properties.add_new_orderings([ordering]);
234
        }
235
236
        Ok((test_schema, eq_properties))
237
    }
238
239
    // Convert each tuple to PhysicalSortRequirement
240
    pub fn convert_to_sort_reqs(
241
        in_data: &[(&Arc<dyn PhysicalExpr>, Option<SortOptions>)],
242
    ) -> LexRequirement {
243
        in_data
244
            .iter()
245
            .map(|(expr, options)| {
246
                PhysicalSortRequirement::new(Arc::clone(*expr), *options)
247
            })
248
            .collect()
249
    }
250
251
    // Convert each tuple to PhysicalSortExpr
252
    pub fn convert_to_sort_exprs(
253
        in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
254
    ) -> Vec<PhysicalSortExpr> {
255
        in_data
256
            .iter()
257
            .map(|(expr, options)| PhysicalSortExpr {
258
                expr: Arc::clone(*expr),
259
                options: *options,
260
            })
261
            .collect()
262
    }
263
264
    // Convert each inner tuple to PhysicalSortExpr
265
    pub fn convert_to_orderings(
266
        orderings: &[Vec<(&Arc<dyn PhysicalExpr>, SortOptions)>],
267
    ) -> Vec<Vec<PhysicalSortExpr>> {
268
        orderings
269
            .iter()
270
            .map(|sort_exprs| convert_to_sort_exprs(sort_exprs))
271
            .collect()
272
    }
273
274
    // Convert each tuple to PhysicalSortExpr
275
    pub fn convert_to_sort_exprs_owned(
276
        in_data: &[(Arc<dyn PhysicalExpr>, SortOptions)],
277
    ) -> Vec<PhysicalSortExpr> {
278
        in_data
279
            .iter()
280
            .map(|(expr, options)| PhysicalSortExpr {
281
                expr: Arc::clone(expr),
282
                options: *options,
283
            })
284
            .collect()
285
    }
286
287
    // Convert each inner tuple to PhysicalSortExpr
288
    pub fn convert_to_orderings_owned(
289
        orderings: &[Vec<(Arc<dyn PhysicalExpr>, SortOptions)>],
290
    ) -> Vec<Vec<PhysicalSortExpr>> {
291
        orderings
292
            .iter()
293
            .map(|sort_exprs| convert_to_sort_exprs_owned(sort_exprs))
294
            .collect()
295
    }
296
297
    // Apply projection to the input_data, return projected equivalence properties and record batch
298
    pub fn apply_projection(
299
        proj_exprs: Vec<(Arc<dyn PhysicalExpr>, String)>,
300
        input_data: &RecordBatch,
301
        input_eq_properties: &EquivalenceProperties,
302
    ) -> Result<(RecordBatch, EquivalenceProperties)> {
303
        let input_schema = input_data.schema();
304
        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;
305
306
        let output_schema = output_schema(&projection_mapping, &input_schema)?;
307
        let num_rows = input_data.num_rows();
308
        // Apply projection to the input record batch.
309
        let projected_values = projection_mapping
310
            .iter()
311
            .map(|(source, _target)| source.evaluate(input_data)?.into_array(num_rows))
312
            .collect::<Result<Vec<_>>>()?;
313
        let projected_batch = if projected_values.is_empty() {
314
            RecordBatch::new_empty(Arc::clone(&output_schema))
315
        } else {
316
            RecordBatch::try_new(Arc::clone(&output_schema), projected_values)?
317
        };
318
319
        let projected_eq =
320
            input_eq_properties.project(&projection_mapping, output_schema);
321
        Ok((projected_batch, projected_eq))
322
    }
323
324
    #[test]
325
    fn add_equal_conditions_test() -> Result<()> {
326
        let schema = Arc::new(Schema::new(vec![
327
            Field::new("a", DataType::Int64, true),
328
            Field::new("b", DataType::Int64, true),
329
            Field::new("c", DataType::Int64, true),
330
            Field::new("x", DataType::Int64, true),
331
            Field::new("y", DataType::Int64, true),
332
        ]));
333
334
        let mut eq_properties = EquivalenceProperties::new(schema);
335
        let col_a_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
336
        let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
337
        let col_c_expr = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
338
        let col_x_expr = Arc::new(Column::new("x", 3)) as Arc<dyn PhysicalExpr>;
339
        let col_y_expr = Arc::new(Column::new("y", 4)) as Arc<dyn PhysicalExpr>;
340
341
        // a and b are aliases
342
        eq_properties.add_equal_conditions(&col_a_expr, &col_b_expr)?;
343
        assert_eq!(eq_properties.eq_group().len(), 1);
344
345
        // This new entry is redundant, size shouldn't increase
346
        eq_properties.add_equal_conditions(&col_b_expr, &col_a_expr)?;
347
        assert_eq!(eq_properties.eq_group().len(), 1);
348
        let eq_groups = &eq_properties.eq_group().classes[0];
349
        assert_eq!(eq_groups.len(), 2);
350
        assert!(eq_groups.contains(&col_a_expr));
351
        assert!(eq_groups.contains(&col_b_expr));
352
353
        // b and c are aliases. Existing equivalence class should expand,
354
        // however there shouldn't be any new equivalence class
355
        eq_properties.add_equal_conditions(&col_b_expr, &col_c_expr)?;
356
        assert_eq!(eq_properties.eq_group().len(), 1);
357
        let eq_groups = &eq_properties.eq_group().classes[0];
358
        assert_eq!(eq_groups.len(), 3);
359
        assert!(eq_groups.contains(&col_a_expr));
360
        assert!(eq_groups.contains(&col_b_expr));
361
        assert!(eq_groups.contains(&col_c_expr));
362
363
        // This is a new set of equality. Hence equivalent class count should be 2.
364
        eq_properties.add_equal_conditions(&col_x_expr, &col_y_expr)?;
365
        assert_eq!(eq_properties.eq_group().len(), 2);
366
367
        // This equality bridges distinct equality sets.
368
        // Hence equivalent class count should decrease from 2 to 1.
369
        eq_properties.add_equal_conditions(&col_x_expr, &col_a_expr)?;
370
        assert_eq!(eq_properties.eq_group().len(), 1);
371
        let eq_groups = &eq_properties.eq_group().classes[0];
372
        assert_eq!(eq_groups.len(), 5);
373
        assert!(eq_groups.contains(&col_a_expr));
374
        assert!(eq_groups.contains(&col_b_expr));
375
        assert!(eq_groups.contains(&col_c_expr));
376
        assert!(eq_groups.contains(&col_x_expr));
377
        assert!(eq_groups.contains(&col_y_expr));
378
379
        Ok(())
380
    }
381
382
    /// Checks if the table (RecordBatch) remains unchanged when sorted according to the provided `required_ordering`.
383
    ///
384
    /// The function works by adding a unique column of ascending integers to the original table. This column ensures
385
    /// that rows that are otherwise indistinguishable (e.g., if they have the same values in all other columns) can
386
    /// still be differentiated. When sorting the extended table, the unique column acts as a tie-breaker to produce
387
    /// deterministic sorting results.
388
    ///
389
    /// If the table remains the same after sorting with the added unique column, it indicates that the table was
390
    /// already sorted according to `required_ordering` to begin with.
391
    pub fn is_table_same_after_sort(
392
        mut required_ordering: Vec<PhysicalSortExpr>,
393
        batch: RecordBatch,
394
    ) -> Result<bool> {
395
        // Clone the original schema and columns
396
        let original_schema = batch.schema();
397
        let mut columns = batch.columns().to_vec();
398
399
        // Create a new unique column
400
        let n_row = batch.num_rows();
401
        let vals: Vec<usize> = (0..n_row).collect::<Vec<_>>();
402
        let vals: Vec<f64> = vals.into_iter().map(|val| val as f64).collect();
403
        let unique_col = Arc::new(Float64Array::from_iter_values(vals)) as ArrayRef;
404
        columns.push(Arc::clone(&unique_col));
405
406
        // Create a new schema with the added unique column
407
        let unique_col_name = "unique";
408
        let unique_field =
409
            Arc::new(Field::new(unique_col_name, DataType::Float64, false));
410
        let fields: Vec<_> = original_schema
411
            .fields()
412
            .iter()
413
            .cloned()
414
            .chain(std::iter::once(unique_field))
415
            .collect();
416
        let schema = Arc::new(Schema::new(fields));
417
418
        // Create a new batch with the added column
419
        let new_batch = RecordBatch::try_new(Arc::clone(&schema), columns)?;
420
421
        // Add the unique column to the required ordering to ensure deterministic results
422
        required_ordering.push(PhysicalSortExpr {
423
            expr: Arc::new(Column::new(unique_col_name, original_schema.fields().len())),
424
            options: Default::default(),
425
        });
426
427
        // Convert the required ordering to a list of SortColumn
428
        let sort_columns = required_ordering
429
            .iter()
430
            .map(|order_expr| {
431
                let expr_result = order_expr.expr.evaluate(&new_batch)?;
432
                let values = expr_result.into_array(new_batch.num_rows())?;
433
                Ok(SortColumn {
434
                    values,
435
                    options: Some(order_expr.options),
436
                })
437
            })
438
            .collect::<Result<Vec<_>>>()?;
439
440
        // Check if the indices after sorting match the initial ordering
441
        let sorted_indices = lexsort_to_indices(&sort_columns, None)?;
442
        let original_indices = UInt32Array::from_iter_values(0..n_row as u32);
443
444
        Ok(sorted_indices == original_indices)
445
    }
446
447
    // If we already generated a random result for one of the
448
    // expressions in the equivalence classes. For other expressions in the same
449
    // equivalence class use same result. This util gets already calculated result, when available.
450
    fn get_representative_arr(
451
        eq_group: &EquivalenceClass,
452
        existing_vec: &[Option<ArrayRef>],
453
        schema: SchemaRef,
454
    ) -> Option<ArrayRef> {
455
        for expr in eq_group.iter() {
456
            let col = expr.as_any().downcast_ref::<Column>().unwrap();
457
            let (idx, _field) = schema.column_with_name(col.name()).unwrap();
458
            if let Some(res) = &existing_vec[idx] {
459
                return Some(Arc::clone(res));
460
            }
461
        }
462
        None
463
    }
464
465
    // Generate a table that satisfies the given equivalence properties; i.e.
466
    // equivalences, ordering equivalences, and constants.
467
    pub fn generate_table_for_eq_properties(
468
        eq_properties: &EquivalenceProperties,
469
        n_elem: usize,
470
        n_distinct: usize,
471
    ) -> Result<RecordBatch> {
472
        let mut rng = StdRng::seed_from_u64(23);
473
474
        let schema = eq_properties.schema();
475
        let mut schema_vec = vec![None; schema.fields.len()];
476
477
        // Utility closure to generate random array
478
        let mut generate_random_array = |num_elems: usize, max_val: usize| -> ArrayRef {
479
            let values: Vec<f64> = (0..num_elems)
480
                .map(|_| rng.gen_range(0..max_val) as f64 / 2.0)
481
                .collect();
482
            Arc::new(Float64Array::from_iter_values(values))
483
        };
484
485
        // Fill constant columns
486
        for constant in &eq_properties.constants {
487
            let col = constant.expr().as_any().downcast_ref::<Column>().unwrap();
488
            let (idx, _field) = schema.column_with_name(col.name()).unwrap();
489
            let arr = Arc::new(Float64Array::from_iter_values(vec![0 as f64; n_elem]))
490
                as ArrayRef;
491
            schema_vec[idx] = Some(arr);
492
        }
493
494
        // Fill columns based on ordering equivalences
495
        for ordering in eq_properties.oeq_class.iter() {
496
            let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering
497
                .iter()
498
                .map(|PhysicalSortExpr { expr, options }| {
499
                    let col = expr.as_any().downcast_ref::<Column>().unwrap();
500
                    let (idx, _field) = schema.column_with_name(col.name()).unwrap();
501
                    let arr = generate_random_array(n_elem, n_distinct);
502
                    (
503
                        SortColumn {
504
                            values: arr,
505
                            options: Some(*options),
506
                        },
507
                        idx,
508
                    )
509
                })
510
                .unzip();
511
512
            let sort_arrs = arrow::compute::lexsort(&sort_columns, None)?;
513
            for (idx, arr) in izip!(indices, sort_arrs) {
514
                schema_vec[idx] = Some(arr);
515
            }
516
        }
517
518
        // Fill columns based on equivalence groups
519
        for eq_group in eq_properties.eq_group.iter() {
520
            let representative_array =
521
                get_representative_arr(eq_group, &schema_vec, Arc::clone(schema))
522
                    .unwrap_or_else(|| generate_random_array(n_elem, n_distinct));
523
524
            for expr in eq_group.iter() {
525
                let col = expr.as_any().downcast_ref::<Column>().unwrap();
526
                let (idx, _field) = schema.column_with_name(col.name()).unwrap();
527
                schema_vec[idx] = Some(Arc::clone(&representative_array));
528
            }
529
        }
530
531
        let res: Vec<_> = schema_vec
532
            .into_iter()
533
            .zip(schema.fields.iter())
534
            .map(|(elem, field)| {
535
                (
536
                    field.name(),
537
                    // Generate random values for columns that do not occur in any of the groups (equivalence, ordering equivalence, constants)
538
                    elem.unwrap_or_else(|| generate_random_array(n_elem, n_distinct)),
539
                )
540
            })
541
            .collect();
542
543
        Ok(RecordBatch::try_from_iter(res)?)
544
    }
545
}