/Users/andrewlamb/Software/datafusion/datafusion/physical-expr/src/equivalence/mod.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | use std::sync::Arc; |
19 | | |
20 | | use crate::expressions::Column; |
21 | | use crate::{LexRequirement, PhysicalExpr, PhysicalSortRequirement}; |
22 | | |
23 | | use datafusion_common::tree_node::{Transformed, TransformedResult, TreeNode}; |
24 | | |
25 | | mod class; |
26 | | mod ordering; |
27 | | mod projection; |
28 | | mod properties; |
29 | | |
30 | | pub use class::{ConstExpr, EquivalenceClass, EquivalenceGroup}; |
31 | | pub use ordering::OrderingEquivalenceClass; |
32 | | pub use projection::ProjectionMapping; |
33 | | pub use properties::{ |
34 | | calculate_union, join_equivalence_properties, EquivalenceProperties, |
35 | | }; |
36 | | |
37 | | /// This function constructs a duplicate-free `LexOrderingReq` by filtering out |
38 | | /// duplicate entries that have same physical expression inside. For example, |
39 | | /// `vec![a Some(ASC), a Some(DESC)]` collapses to `vec![a Some(ASC)]`. |
40 | | /// |
41 | | /// It will also filter out entries that are ordered if the next entry is; |
42 | | /// for instance, `vec![floor(a) Some(ASC), a Some(ASC)]` will be collapsed to |
43 | | /// `vec![a Some(ASC)]`. |
44 | 2.64k | pub fn collapse_lex_req(input: LexRequirement) -> LexRequirement { |
45 | 2.64k | let mut output = Vec::<PhysicalSortRequirement>::new(); |
46 | 8.62k | for item5.97k in input { |
47 | 6.52k | if !output.iter().any(5.97k |req| req.expr.eq(&item.expr))5.97k { |
48 | 5.93k | output.push(item); |
49 | 5.93k | }40 |
50 | | } |
51 | 2.64k | LexRequirement::new(output) |
52 | 2.64k | } |
53 | | |
54 | | /// Adds the `offset` value to `Column` indices inside `expr`. This function is |
55 | | /// generally used during the update of the right table schema in join operations. |
56 | 302 | pub fn add_offset_to_expr( |
57 | 302 | expr: Arc<dyn PhysicalExpr>, |
58 | 302 | offset: usize, |
59 | 302 | ) -> Arc<dyn PhysicalExpr> { |
60 | 302 | expr.transform_down(|e| match e.as_any().downcast_ref::<Column>() { |
61 | 302 | Some(col) => Ok(Transformed::yes(Arc::new(Column::new( |
62 | 302 | col.name(), |
63 | 302 | offset + col.index(), |
64 | 302 | )))), |
65 | 0 | None => Ok(Transformed::no(e)), |
66 | 302 | }) |
67 | 302 | .data() |
68 | 302 | .unwrap() |
69 | 302 | // Note that we can safely unwrap here since our transform always returns |
70 | 302 | // an `Ok` value. |
71 | 302 | } |
72 | | |
#[cfg(test)]
mod tests {
    use super::*;
    use crate::expressions::col;
    use crate::PhysicalSortExpr;

    use arrow::compute::{lexsort_to_indices, SortColumn};
    use arrow::datatypes::{DataType, Field, Schema};
    use arrow_array::{ArrayRef, Float64Array, RecordBatch, UInt32Array};
    use arrow_schema::{SchemaRef, SortOptions};
    use datafusion_common::{plan_datafusion_err, Result};

    use itertools::izip;
    use rand::rngs::StdRng;
    use rand::seq::SliceRandom;
    use rand::{Rng, SeedableRng};

    /// Computes the output schema that results from applying `mapping` to
    /// `input_schema`. Each projection target must be a `Column`; the
    /// resulting fields inherit data type and nullability from the source
    /// expressions, and the input schema's metadata is carried over.
    pub fn output_schema(
        mapping: &ProjectionMapping,
        input_schema: &Arc<Schema>,
    ) -> Result<SchemaRef> {
        // Calculate output schema
        let fields: Result<Vec<Field>> = mapping
            .iter()
            .map(|(source, target)| {
                let name = target
                    .as_any()
                    .downcast_ref::<Column>()
                    .ok_or_else(|| plan_datafusion_err!("Expects to have column"))?
                    .name();
                let field = Field::new(
                    name,
                    source.data_type(input_schema)?,
                    source.nullable(input_schema)?,
                );

                Ok(field)
            })
            .collect();

        let output_schema = Arc::new(Schema::new_with_metadata(
            fields?,
            input_schema.metadata().clone(),
        ));

        Ok(output_schema)
    }

    // Generate a schema which consists of 8 columns (a, b, c, d, e, f, g, h),
    // all nullable Int32.
    pub fn create_test_schema() -> Result<SchemaRef> {
        let a = Field::new("a", DataType::Int32, true);
        let b = Field::new("b", DataType::Int32, true);
        let c = Field::new("c", DataType::Int32, true);
        let d = Field::new("d", DataType::Int32, true);
        let e = Field::new("e", DataType::Int32, true);
        let f = Field::new("f", DataType::Int32, true);
        let g = Field::new("g", DataType::Int32, true);
        let h = Field::new("h", DataType::Int32, true);
        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f, g, h]));

        Ok(schema)
    }

    /// Construct a schema with following properties
    /// Schema satisfies following orderings:
    /// [a ASC], [d ASC, b ASC], [e DESC, f ASC, g ASC]
    /// and
    /// Column [a=c] (e.g they are aliases).
    pub fn create_test_params() -> Result<(SchemaRef, EquivalenceProperties)> {
        let test_schema = create_test_schema()?;
        let col_a = &col("a", &test_schema)?;
        let col_b = &col("b", &test_schema)?;
        let col_c = &col("c", &test_schema)?;
        let col_d = &col("d", &test_schema)?;
        let col_e = &col("e", &test_schema)?;
        let col_f = &col("f", &test_schema)?;
        let col_g = &col("g", &test_schema)?;
        let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
        // Register the a = c alias before adding orderings.
        eq_properties.add_equal_conditions(col_a, col_c)?;

        let option_asc = SortOptions {
            descending: false,
            nulls_first: false,
        };
        let option_desc = SortOptions {
            descending: true,
            nulls_first: true,
        };
        let orderings = vec![
            // [a ASC]
            vec![(col_a, option_asc)],
            // [d ASC, b ASC]
            vec![(col_d, option_asc), (col_b, option_asc)],
            // [e DESC, f ASC, g ASC]
            vec![
                (col_e, option_desc),
                (col_f, option_asc),
                (col_g, option_asc),
            ],
        ];
        let orderings = convert_to_orderings(&orderings);
        eq_properties.add_new_orderings(orderings);
        Ok((test_schema, eq_properties))
    }

    // Generate a schema which consists of 6 columns (a, b, c, d, e, f),
    // all nullable Float64.
    fn create_test_schema_2() -> Result<SchemaRef> {
        let a = Field::new("a", DataType::Float64, true);
        let b = Field::new("b", DataType::Float64, true);
        let c = Field::new("c", DataType::Float64, true);
        let d = Field::new("d", DataType::Float64, true);
        let e = Field::new("e", DataType::Float64, true);
        let f = Field::new("f", DataType::Float64, true);
        let schema = Arc::new(Schema::new(vec![a, b, c, d, e, f]));

        Ok(schema)
    }

    /// Construct a schema with random ordering
    /// among column a, b, c, d
    /// where
    /// Column [a=f] (e.g they are aliases).
    /// Column e is constant.
    ///
    /// The `seed` makes the generated orderings reproducible across runs.
    pub fn create_random_schema(seed: u64) -> Result<(SchemaRef, EquivalenceProperties)> {
        let test_schema = create_test_schema_2()?;
        let col_a = &col("a", &test_schema)?;
        let col_b = &col("b", &test_schema)?;
        let col_c = &col("c", &test_schema)?;
        let col_d = &col("d", &test_schema)?;
        let col_e = &col("e", &test_schema)?;
        let col_f = &col("f", &test_schema)?;
        let col_exprs = [col_a, col_b, col_c, col_d, col_e, col_f];

        let mut eq_properties = EquivalenceProperties::new(Arc::clone(&test_schema));
        // Define a and f are aliases
        eq_properties.add_equal_conditions(col_a, col_f)?;
        // Column e has constant value.
        eq_properties = eq_properties.with_constants([ConstExpr::from(col_e)]);

        // Randomly order columns for sorting
        let mut rng = StdRng::seed_from_u64(seed);
        let mut remaining_exprs = col_exprs[0..4].to_vec(); // only a, b, c, d are sorted

        let options_asc = SortOptions {
            descending: false,
            nulls_first: false,
        };

        // Repeatedly draw random-length prefixes from the remaining columns
        // until all of a, b, c, d belong to some ordering. A draw of length
        // zero is possible, in which case an empty ordering is added.
        while !remaining_exprs.is_empty() {
            let n_sort_expr = rng.gen_range(0..remaining_exprs.len() + 1);
            remaining_exprs.shuffle(&mut rng);

            let ordering = remaining_exprs
                .drain(0..n_sort_expr)
                .map(|expr| PhysicalSortExpr {
                    expr: Arc::clone(expr),
                    options: options_asc,
                })
                .collect();

            eq_properties.add_new_orderings([ordering]);
        }

        Ok((test_schema, eq_properties))
    }

    // Convert each tuple to PhysicalSortRequirement
    pub fn convert_to_sort_reqs(
        in_data: &[(&Arc<dyn PhysicalExpr>, Option<SortOptions>)],
    ) -> LexRequirement {
        in_data
            .iter()
            .map(|(expr, options)| {
                PhysicalSortRequirement::new(Arc::clone(*expr), *options)
            })
            .collect()
    }

    // Convert each tuple to PhysicalSortExpr
    pub fn convert_to_sort_exprs(
        in_data: &[(&Arc<dyn PhysicalExpr>, SortOptions)],
    ) -> Vec<PhysicalSortExpr> {
        in_data
            .iter()
            .map(|(expr, options)| PhysicalSortExpr {
                expr: Arc::clone(*expr),
                options: *options,
            })
            .collect()
    }

    // Convert each inner tuple to PhysicalSortExpr
    pub fn convert_to_orderings(
        orderings: &[Vec<(&Arc<dyn PhysicalExpr>, SortOptions)>],
    ) -> Vec<Vec<PhysicalSortExpr>> {
        orderings
            .iter()
            .map(|sort_exprs| convert_to_sort_exprs(sort_exprs))
            .collect()
    }

    // Convert each tuple to PhysicalSortExpr
    // (variant of `convert_to_sort_exprs` taking owned `Arc`s instead of
    // references to `Arc`s)
    pub fn convert_to_sort_exprs_owned(
        in_data: &[(Arc<dyn PhysicalExpr>, SortOptions)],
    ) -> Vec<PhysicalSortExpr> {
        in_data
            .iter()
            .map(|(expr, options)| PhysicalSortExpr {
                expr: Arc::clone(expr),
                options: *options,
            })
            .collect()
    }

    // Convert each inner tuple to PhysicalSortExpr
    // (owned-`Arc` variant of `convert_to_orderings`)
    pub fn convert_to_orderings_owned(
        orderings: &[Vec<(Arc<dyn PhysicalExpr>, SortOptions)>],
    ) -> Vec<Vec<PhysicalSortExpr>> {
        orderings
            .iter()
            .map(|sort_exprs| convert_to_sort_exprs_owned(sort_exprs))
            .collect()
    }

    // Apply projection to the input_data, return projected equivalence properties and record batch
    pub fn apply_projection(
        proj_exprs: Vec<(Arc<dyn PhysicalExpr>, String)>,
        input_data: &RecordBatch,
        input_eq_properties: &EquivalenceProperties,
    ) -> Result<(RecordBatch, EquivalenceProperties)> {
        let input_schema = input_data.schema();
        let projection_mapping = ProjectionMapping::try_new(&proj_exprs, &input_schema)?;

        let output_schema = output_schema(&projection_mapping, &input_schema)?;
        let num_rows = input_data.num_rows();
        // Apply projection to the input record batch.
        let projected_values = projection_mapping
            .iter()
            .map(|(source, _target)| source.evaluate(input_data)?.into_array(num_rows))
            .collect::<Result<Vec<_>>>()?;
        // `RecordBatch::try_new` rejects zero columns, so an empty projection
        // needs the dedicated empty-batch constructor.
        let projected_batch = if projected_values.is_empty() {
            RecordBatch::new_empty(Arc::clone(&output_schema))
        } else {
            RecordBatch::try_new(Arc::clone(&output_schema), projected_values)?
        };

        let projected_eq =
            input_eq_properties.project(&projection_mapping, output_schema);
        Ok((projected_batch, projected_eq))
    }

    #[test]
    fn add_equal_conditions_test() -> Result<()> {
        let schema = Arc::new(Schema::new(vec![
            Field::new("a", DataType::Int64, true),
            Field::new("b", DataType::Int64, true),
            Field::new("c", DataType::Int64, true),
            Field::new("x", DataType::Int64, true),
            Field::new("y", DataType::Int64, true),
        ]));

        let mut eq_properties = EquivalenceProperties::new(schema);
        let col_a_expr = Arc::new(Column::new("a", 0)) as Arc<dyn PhysicalExpr>;
        let col_b_expr = Arc::new(Column::new("b", 1)) as Arc<dyn PhysicalExpr>;
        let col_c_expr = Arc::new(Column::new("c", 2)) as Arc<dyn PhysicalExpr>;
        let col_x_expr = Arc::new(Column::new("x", 3)) as Arc<dyn PhysicalExpr>;
        let col_y_expr = Arc::new(Column::new("y", 4)) as Arc<dyn PhysicalExpr>;

        // a and b are aliases
        eq_properties.add_equal_conditions(&col_a_expr, &col_b_expr)?;
        assert_eq!(eq_properties.eq_group().len(), 1);

        // This new entry is redundant, size shouldn't increase
        eq_properties.add_equal_conditions(&col_b_expr, &col_a_expr)?;
        assert_eq!(eq_properties.eq_group().len(), 1);
        let eq_groups = &eq_properties.eq_group().classes[0];
        assert_eq!(eq_groups.len(), 2);
        assert!(eq_groups.contains(&col_a_expr));
        assert!(eq_groups.contains(&col_b_expr));

        // b and c are aliases. Existing equivalence class should expand,
        // however there shouldn't be any new equivalence class
        eq_properties.add_equal_conditions(&col_b_expr, &col_c_expr)?;
        assert_eq!(eq_properties.eq_group().len(), 1);
        let eq_groups = &eq_properties.eq_group().classes[0];
        assert_eq!(eq_groups.len(), 3);
        assert!(eq_groups.contains(&col_a_expr));
        assert!(eq_groups.contains(&col_b_expr));
        assert!(eq_groups.contains(&col_c_expr));

        // This is a new set of equality. Hence equivalent class count should be 2.
        eq_properties.add_equal_conditions(&col_x_expr, &col_y_expr)?;
        assert_eq!(eq_properties.eq_group().len(), 2);

        // This equality bridges distinct equality sets.
        // Hence equivalent class count should decrease from 2 to 1.
        eq_properties.add_equal_conditions(&col_x_expr, &col_a_expr)?;
        assert_eq!(eq_properties.eq_group().len(), 1);
        let eq_groups = &eq_properties.eq_group().classes[0];
        assert_eq!(eq_groups.len(), 5);
        assert!(eq_groups.contains(&col_a_expr));
        assert!(eq_groups.contains(&col_b_expr));
        assert!(eq_groups.contains(&col_c_expr));
        assert!(eq_groups.contains(&col_x_expr));
        assert!(eq_groups.contains(&col_y_expr));

        Ok(())
    }

    /// Checks if the table (RecordBatch) remains unchanged when sorted according to the provided `required_ordering`.
    ///
    /// The function works by adding a unique column of ascending integers to the original table. This column ensures
    /// that rows that are otherwise indistinguishable (e.g., if they have the same values in all other columns) can
    /// still be differentiated. When sorting the extended table, the unique column acts as a tie-breaker to produce
    /// deterministic sorting results.
    ///
    /// If the table remains the same after sorting with the added unique column, it indicates that the table was
    /// already sorted according to `required_ordering` to begin with.
    pub fn is_table_same_after_sort(
        mut required_ordering: Vec<PhysicalSortExpr>,
        batch: RecordBatch,
    ) -> Result<bool> {
        // Clone the original schema and columns
        let original_schema = batch.schema();
        let mut columns = batch.columns().to_vec();

        // Create a new unique column
        let n_row = batch.num_rows();
        let vals: Vec<usize> = (0..n_row).collect::<Vec<_>>();
        let vals: Vec<f64> = vals.into_iter().map(|val| val as f64).collect();
        let unique_col = Arc::new(Float64Array::from_iter_values(vals)) as ArrayRef;
        columns.push(Arc::clone(&unique_col));

        // Create a new schema with the added unique column
        let unique_col_name = "unique";
        let unique_field =
            Arc::new(Field::new(unique_col_name, DataType::Float64, false));
        let fields: Vec<_> = original_schema
            .fields()
            .iter()
            .cloned()
            .chain(std::iter::once(unique_field))
            .collect();
        let schema = Arc::new(Schema::new(fields));

        // Create a new batch with the added column
        let new_batch = RecordBatch::try_new(Arc::clone(&schema), columns)?;

        // Add the unique column to the required ordering to ensure deterministic results
        required_ordering.push(PhysicalSortExpr {
            expr: Arc::new(Column::new(unique_col_name, original_schema.fields().len())),
            options: Default::default(),
        });

        // Convert the required ordering to a list of SortColumn
        let sort_columns = required_ordering
            .iter()
            .map(|order_expr| {
                let expr_result = order_expr.expr.evaluate(&new_batch)?;
                let values = expr_result.into_array(new_batch.num_rows())?;
                Ok(SortColumn {
                    values,
                    options: Some(order_expr.options),
                })
            })
            .collect::<Result<Vec<_>>>()?;

        // Check if the indices after sorting match the initial ordering
        let sorted_indices = lexsort_to_indices(&sort_columns, None)?;
        let original_indices = UInt32Array::from_iter_values(0..n_row as u32);

        Ok(sorted_indices == original_indices)
    }

    // If we already generated a random result for one of the
    // expressions in the equivalence classes. For other expressions in the same
    // equivalence class use same result. This util gets already calculated result, when available.
    // Returns `None` when no member of `eq_group` has a generated array yet.
    fn get_representative_arr(
        eq_group: &EquivalenceClass,
        existing_vec: &[Option<ArrayRef>],
        schema: SchemaRef,
    ) -> Option<ArrayRef> {
        for expr in eq_group.iter() {
            let col = expr.as_any().downcast_ref::<Column>().unwrap();
            let (idx, _field) = schema.column_with_name(col.name()).unwrap();
            if let Some(res) = &existing_vec[idx] {
                return Some(Arc::clone(res));
            }
        }
        None
    }

    // Generate a table that satisfies the given equivalence properties; i.e.
    // equivalences, ordering equivalences, and constants.
    //
    // `n_elem` is the number of rows; `n_distinct` bounds the number of
    // distinct values per generated column. Uses a fixed RNG seed so the
    // generated table is deterministic.
    pub fn generate_table_for_eq_properties(
        eq_properties: &EquivalenceProperties,
        n_elem: usize,
        n_distinct: usize,
    ) -> Result<RecordBatch> {
        let mut rng = StdRng::seed_from_u64(23);

        let schema = eq_properties.schema();
        // One slot per schema column; filled in by the passes below.
        let mut schema_vec = vec![None; schema.fields.len()];

        // Utility closure to generate random array
        let mut generate_random_array = |num_elems: usize, max_val: usize| -> ArrayRef {
            let values: Vec<f64> = (0..num_elems)
                .map(|_| rng.gen_range(0..max_val) as f64 / 2.0)
                .collect();
            Arc::new(Float64Array::from_iter_values(values))
        };

        // Fill constant columns
        for constant in &eq_properties.constants {
            let col = constant.expr().as_any().downcast_ref::<Column>().unwrap();
            let (idx, _field) = schema.column_with_name(col.name()).unwrap();
            let arr = Arc::new(Float64Array::from_iter_values(vec![0 as f64; n_elem]))
                as ArrayRef;
            schema_vec[idx] = Some(arr);
        }

        // Fill columns based on ordering equivalences: generate random data
        // for each column in the ordering, then lexsort the group so the
        // ordering actually holds in the produced table.
        for ordering in eq_properties.oeq_class.iter() {
            let (sort_columns, indices): (Vec<_>, Vec<_>) = ordering
                .iter()
                .map(|PhysicalSortExpr { expr, options }| {
                    let col = expr.as_any().downcast_ref::<Column>().unwrap();
                    let (idx, _field) = schema.column_with_name(col.name()).unwrap();
                    let arr = generate_random_array(n_elem, n_distinct);
                    (
                        SortColumn {
                            values: arr,
                            options: Some(*options),
                        },
                        idx,
                    )
                })
                .unzip();

            let sort_arrs = arrow::compute::lexsort(&sort_columns, None)?;
            for (idx, arr) in izip!(indices, sort_arrs) {
                schema_vec[idx] = Some(arr);
            }
        }

        // Fill columns based on equivalence groups: every member of a class
        // shares the same array so the equivalences hold in the data.
        for eq_group in eq_properties.eq_group.iter() {
            let representative_array =
                get_representative_arr(eq_group, &schema_vec, Arc::clone(schema))
                    .unwrap_or_else(|| generate_random_array(n_elem, n_distinct));

            for expr in eq_group.iter() {
                let col = expr.as_any().downcast_ref::<Column>().unwrap();
                let (idx, _field) = schema.column_with_name(col.name()).unwrap();
                schema_vec[idx] = Some(Arc::clone(&representative_array));
            }
        }

        let res: Vec<_> = schema_vec
            .into_iter()
            .zip(schema.fields.iter())
            .map(|(elem, field)| {
                (
                    field.name(),
                    // Generate random values for columns that do not occur in any of the groups (equivalence, ordering equivalence, constants)
                    elem.unwrap_or_else(|| generate_random_array(n_elem, n_distinct)),
                )
            })
            .collect();

        Ok(RecordBatch::try_from_iter(res)?)
    }
}