/Users/andrewlamb/Software/datafusion/datafusion/common/src/functional_dependencies.rs
Line | Count | Source (jump to first uncovered line) |
1 | | // Licensed to the Apache Software Foundation (ASF) under one |
2 | | // or more contributor license agreements. See the NOTICE file |
3 | | // distributed with this work for additional information |
4 | | // regarding copyright ownership. The ASF licenses this file |
5 | | // to you under the Apache License, Version 2.0 (the |
6 | | // "License"); you may not use this file except in compliance |
7 | | // with the License. You may obtain a copy of the License at |
8 | | // |
9 | | // http://www.apache.org/licenses/LICENSE-2.0 |
10 | | // |
11 | | // Unless required by applicable law or agreed to in writing, |
12 | | // software distributed under the License is distributed on an |
13 | | // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
14 | | // KIND, either express or implied. See the License for the |
15 | | // specific language governing permissions and limitations |
16 | | // under the License. |
17 | | |
18 | | //! FunctionalDependencies keeps track of functional dependencies |
19 | | //! inside DFSchema. |
20 | | |
21 | | use std::collections::HashSet; |
22 | | use std::fmt::{Display, Formatter}; |
23 | | use std::ops::Deref; |
24 | | use std::vec::IntoIter; |
25 | | |
26 | | use crate::error::_plan_err; |
27 | | use crate::utils::{merge_and_order_indices, set_difference}; |
28 | | use crate::{DFSchema, DFSchemaRef, DataFusionError, JoinType, Result}; |
29 | | |
30 | | use sqlparser::ast::TableConstraint; |
31 | | |
/// A single table-level constraint, expressed in terms of column indices.
#[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)]
pub enum Constraint {
    /// The columns at these indices together form a (possibly composite)
    /// primary key: they are jointly unique and not nullable.
    PrimaryKey(Vec<usize>),
    /// The columns at these indices together form a (possibly composite)
    /// unique key.
    Unique(Vec<usize>),
}
41 | | |
42 | | /// This object encapsulates a list of functional constraints: |
43 | | #[derive(Debug, Clone, PartialEq, Eq, PartialOrd, Hash)] |
44 | | pub struct Constraints { |
45 | | inner: Vec<Constraint>, |
46 | | } |
47 | | |
48 | | impl Constraints { |
49 | | /// Create empty constraints |
50 | 0 | pub fn empty() -> Self { |
51 | 0 | Constraints::new_unverified(vec![]) |
52 | 0 | } |
53 | | |
54 | | /// Create a new `Constraints` object from the given `constraints`. |
55 | | /// Users should use the `empty` or `new_from_table_constraints` functions |
56 | | /// for constructing `Constraints`. This constructor is for internal |
57 | | /// purposes only and does not check whether the argument is valid. The user |
58 | | /// is responsible for supplying a valid vector of `Constraint` objects. |
59 | 0 | pub fn new_unverified(constraints: Vec<Constraint>) -> Self { |
60 | 0 | Self { inner: constraints } |
61 | 0 | } |
62 | | |
63 | | /// Convert each `TableConstraint` to corresponding `Constraint` |
64 | 0 | pub fn new_from_table_constraints( |
65 | 0 | constraints: &[TableConstraint], |
66 | 0 | df_schema: &DFSchemaRef, |
67 | 0 | ) -> Result<Self> { |
68 | 0 | let constraints = constraints |
69 | 0 | .iter() |
70 | 0 | .map(|c: &TableConstraint| match c { |
71 | 0 | TableConstraint::Unique { name, columns, .. } => { |
72 | 0 | let field_names = df_schema.field_names(); |
73 | | // Get unique constraint indices in the schema: |
74 | 0 | let indices = columns |
75 | 0 | .iter() |
76 | 0 | .map(|u| { |
77 | 0 | let idx = field_names |
78 | 0 | .iter() |
79 | 0 | .position(|item| *item == u.value) |
80 | 0 | .ok_or_else(|| { |
81 | 0 | let name = name |
82 | 0 | .as_ref() |
83 | 0 | .map(|name| format!("with name '{name}' ")) |
84 | 0 | .unwrap_or("".to_string()); |
85 | 0 | DataFusionError::Execution( |
86 | 0 | format!("Column for unique constraint {}not found in schema: {}", name,u.value) |
87 | 0 | ) |
88 | 0 | })?; |
89 | 0 | Ok(idx) |
90 | 0 | }) |
91 | 0 | .collect::<Result<Vec<_>>>()?; |
92 | 0 | Ok(Constraint::Unique(indices)) |
93 | | } |
94 | 0 | TableConstraint::PrimaryKey { columns, .. } => { |
95 | 0 | let field_names = df_schema.field_names(); |
96 | | // Get primary key indices in the schema: |
97 | 0 | let indices = columns |
98 | 0 | .iter() |
99 | 0 | .map(|pk| { |
100 | 0 | let idx = field_names |
101 | 0 | .iter() |
102 | 0 | .position(|item| *item == pk.value) |
103 | 0 | .ok_or_else(|| { |
104 | 0 | DataFusionError::Execution(format!( |
105 | 0 | "Column for primary key not found in schema: {}", |
106 | 0 | pk.value |
107 | 0 | )) |
108 | 0 | })?; |
109 | 0 | Ok(idx) |
110 | 0 | }) |
111 | 0 | .collect::<Result<Vec<_>>>()?; |
112 | 0 | Ok(Constraint::PrimaryKey(indices)) |
113 | | } |
114 | | TableConstraint::ForeignKey { .. } => { |
115 | 0 | _plan_err!("Foreign key constraints are not currently supported") |
116 | | } |
117 | | TableConstraint::Check { .. } => { |
118 | 0 | _plan_err!("Check constraints are not currently supported") |
119 | | } |
120 | | TableConstraint::Index { .. } => { |
121 | 0 | _plan_err!("Indexes are not currently supported") |
122 | | } |
123 | | TableConstraint::FulltextOrSpatial { .. } => { |
124 | 0 | _plan_err!("Indexes are not currently supported") |
125 | | } |
126 | 0 | }) |
127 | 0 | .collect::<Result<Vec<_>>>()?; |
128 | 0 | Ok(Constraints::new_unverified(constraints)) |
129 | 0 | } |
130 | | |
131 | | /// Check whether constraints is empty |
132 | 0 | pub fn is_empty(&self) -> bool { |
133 | 0 | self.inner.is_empty() |
134 | 0 | } |
135 | | } |
136 | | |
137 | | impl IntoIterator for Constraints { |
138 | | type Item = Constraint; |
139 | | type IntoIter = IntoIter<Constraint>; |
140 | | |
141 | 0 | fn into_iter(self) -> Self::IntoIter { |
142 | 0 | self.inner.into_iter() |
143 | 0 | } |
144 | | } |
145 | | |
146 | | impl Display for Constraints { |
147 | 0 | fn fmt(&self, f: &mut Formatter<'_>) -> std::fmt::Result { |
148 | 0 | let pk: Vec<String> = self.inner.iter().map(|c| format!("{:?}", c)).collect(); |
149 | 0 | let pk = pk.join(", "); |
150 | 0 | if !pk.is_empty() { |
151 | 0 | write!(f, " constraints=[{pk}]") |
152 | | } else { |
153 | 0 | write!(f, "") |
154 | | } |
155 | 0 | } |
156 | | } |
157 | | |
158 | | impl Deref for Constraints { |
159 | | type Target = [Constraint]; |
160 | | |
161 | 0 | fn deref(&self) -> &Self::Target { |
162 | 0 | self.inner.as_slice() |
163 | 0 | } |
164 | | } |
165 | | |
166 | | /// This object defines a functional dependence in the schema. A functional |
167 | | /// dependence defines a relationship between determinant keys and dependent |
168 | | /// columns. A determinant key is a column, or a set of columns, whose value |
169 | | /// uniquely determines values of some other (dependent) columns. If two rows |
170 | | /// have the same determinant key, dependent columns in these rows are |
171 | | /// necessarily the same. If the determinant key is unique, the set of |
172 | | /// dependent columns is equal to the entire schema and the determinant key can |
173 | | /// serve as a primary key. Note that a primary key may "downgrade" into a |
174 | | /// determinant key due to an operation such as a join, and this object is |
175 | | /// used to track dependence relationships in such cases. For more information |
176 | | /// on functional dependencies, see: |
177 | | /// <https://www.scaler.com/topics/dbms/functional-dependency-in-dbms/> |
178 | | #[derive(Debug, Clone, PartialEq, Eq)] |
179 | | pub struct FunctionalDependence { |
180 | | // Column indices of the (possibly composite) determinant key: |
181 | | pub source_indices: Vec<usize>, |
182 | | // Column indices of dependent column(s): |
183 | | pub target_indices: Vec<usize>, |
184 | | /// Flag indicating whether one of the `source_indices` can receive NULL values. |
185 | | /// For a data source, if the constraint in question is `Constraint::Unique`, |
186 | | /// this flag is `true`. If the constraint in question is `Constraint::PrimaryKey`, |
187 | | /// this flag is `false`. |
188 | | /// Note that as the schema changes between different stages in a plan, |
189 | | /// such as after LEFT JOIN or RIGHT JOIN operations, this property may |
190 | | /// change. |
191 | | pub nullable: bool, |
192 | | // The functional dependency mode: |
193 | | pub mode: Dependency, |
194 | | } |
195 | | |
/// The mode of a functional dependency.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum Dependency {
    /// A determinant key may occur only once.
    Single,
    /// A determinant key may occur multiple times (in multiple rows).
    Multi,
}
202 | | |
203 | | impl FunctionalDependence { |
204 | | // Creates a new functional dependence. |
205 | 0 | pub fn new( |
206 | 0 | source_indices: Vec<usize>, |
207 | 0 | target_indices: Vec<usize>, |
208 | 0 | nullable: bool, |
209 | 0 | ) -> Self { |
210 | 0 | Self { |
211 | 0 | source_indices, |
212 | 0 | target_indices, |
213 | 0 | nullable, |
214 | 0 | // Start with the least restrictive mode by default: |
215 | 0 | mode: Dependency::Multi, |
216 | 0 | } |
217 | 0 | } |
218 | | |
219 | 0 | pub fn with_mode(mut self, mode: Dependency) -> Self { |
220 | 0 | self.mode = mode; |
221 | 0 | self |
222 | 0 | } |
223 | | } |
224 | | |
225 | | /// This object encapsulates all functional dependencies in a given relation. |
226 | | #[derive(Debug, Clone, PartialEq, Eq)] |
227 | | pub struct FunctionalDependencies { |
228 | | deps: Vec<FunctionalDependence>, |
229 | | } |
230 | | |
231 | | impl FunctionalDependencies { |
232 | | /// Creates an empty `FunctionalDependencies` object. |
233 | 0 | pub fn empty() -> Self { |
234 | 0 | Self { deps: vec![] } |
235 | 0 | } |
236 | | |
237 | | /// Creates a new `FunctionalDependencies` object from a vector of |
238 | | /// `FunctionalDependence` objects. |
239 | 0 | pub fn new(dependencies: Vec<FunctionalDependence>) -> Self { |
240 | 0 | Self { deps: dependencies } |
241 | 0 | } |
242 | | |
243 | | /// Creates a new `FunctionalDependencies` object from the given constraints. |
244 | 0 | pub fn new_from_constraints( |
245 | 0 | constraints: Option<&Constraints>, |
246 | 0 | n_field: usize, |
247 | 0 | ) -> Self { |
248 | 0 | if let Some(Constraints { inner: constraints }) = constraints { |
249 | | // Construct dependency objects based on each individual constraint: |
250 | 0 | let dependencies = constraints |
251 | 0 | .iter() |
252 | 0 | .map(|constraint| { |
253 | | // All the field indices are associated with the whole table |
254 | | // since we are dealing with table level constraints: |
255 | 0 | let dependency = match constraint { |
256 | 0 | Constraint::PrimaryKey(indices) => FunctionalDependence::new( |
257 | 0 | indices.to_vec(), |
258 | 0 | (0..n_field).collect::<Vec<_>>(), |
259 | 0 | false, |
260 | 0 | ), |
261 | 0 | Constraint::Unique(indices) => FunctionalDependence::new( |
262 | 0 | indices.to_vec(), |
263 | 0 | (0..n_field).collect::<Vec<_>>(), |
264 | 0 | true, |
265 | 0 | ), |
266 | | }; |
267 | | // As primary keys are guaranteed to be unique, set the |
268 | | // functional dependency mode to `Dependency::Single`: |
269 | 0 | dependency.with_mode(Dependency::Single) |
270 | 0 | }) |
271 | 0 | .collect::<Vec<_>>(); |
272 | 0 | Self::new(dependencies) |
273 | | } else { |
274 | | // There is no constraint, return an empty object: |
275 | 0 | Self::empty() |
276 | | } |
277 | 0 | } |
278 | | |
279 | 0 | pub fn with_dependency(mut self, mode: Dependency) -> Self { |
280 | 0 | self.deps.iter_mut().for_each(|item| item.mode = mode); |
281 | 0 | self |
282 | 0 | } |
283 | | |
284 | | /// Merges the given functional dependencies with these. |
285 | 0 | pub fn extend(&mut self, other: FunctionalDependencies) { |
286 | 0 | self.deps.extend(other.deps); |
287 | 0 | } |
288 | | |
289 | | /// Sanity checks if functional dependencies are valid. For example, if |
290 | | /// there are 10 fields, we cannot receive any index further than 9. |
291 | 0 | pub fn is_valid(&self, n_field: usize) -> bool { |
292 | 0 | self.deps.iter().all( |
293 | 0 | |FunctionalDependence { |
294 | | source_indices, |
295 | | target_indices, |
296 | | .. |
297 | 0 | }| { |
298 | 0 | source_indices |
299 | 0 | .iter() |
300 | 0 | .max() |
301 | 0 | .map(|&max_index| max_index < n_field) |
302 | 0 | .unwrap_or(true) |
303 | 0 | && target_indices |
304 | 0 | .iter() |
305 | 0 | .max() |
306 | 0 | .map(|&max_index| max_index < n_field) |
307 | 0 | .unwrap_or(true) |
308 | 0 | }, |
309 | 0 | ) |
310 | 0 | } |
311 | | |
312 | | /// Adds the `offset` value to `source_indices` and `target_indices` for |
313 | | /// each functional dependency. |
314 | 0 | pub fn add_offset(&mut self, offset: usize) { |
315 | 0 | self.deps.iter_mut().for_each( |
316 | 0 | |FunctionalDependence { |
317 | | source_indices, |
318 | | target_indices, |
319 | | .. |
320 | 0 | }| { |
321 | 0 | *source_indices = add_offset_to_vec(source_indices, offset); |
322 | 0 | *target_indices = add_offset_to_vec(target_indices, offset); |
323 | 0 | }, |
324 | 0 | ) |
325 | 0 | } |
326 | | |
327 | | /// Updates `source_indices` and `target_indices` of each functional |
328 | | /// dependence using the index mapping given in `proj_indices`. |
329 | | /// |
330 | | /// Assume that `proj_indices` is \[2, 5, 8\] and we have a functional |
331 | | /// dependence \[5\] (`source_indices`) -> \[5, 8\] (`target_indices`). |
332 | | /// In the updated schema, fields at indices \[2, 5, 8\] will transform |
333 | | /// to \[0, 1, 2\]. Therefore, the resulting functional dependence will |
334 | | /// be \[1\] -> \[1, 2\]. |
335 | 0 | pub fn project_functional_dependencies( |
336 | 0 | &self, |
337 | 0 | proj_indices: &[usize], |
338 | 0 | // The argument `n_out` denotes the schema field length, which is needed |
339 | 0 | // to correctly associate a `Single`-mode dependence with the whole table. |
340 | 0 | n_out: usize, |
341 | 0 | ) -> FunctionalDependencies { |
342 | 0 | let mut projected_func_dependencies = vec![]; |
343 | | for FunctionalDependence { |
344 | 0 | source_indices, |
345 | 0 | target_indices, |
346 | 0 | nullable, |
347 | 0 | mode, |
348 | 0 | } in &self.deps |
349 | | { |
350 | 0 | let new_source_indices = |
351 | 0 | update_elements_with_matching_indices(source_indices, proj_indices); |
352 | 0 | let new_target_indices = if *mode == Dependency::Single { |
353 | | // Associate with all of the fields in the schema: |
354 | 0 | (0..n_out).collect() |
355 | | } else { |
356 | | // Update associations according to projection: |
357 | 0 | update_elements_with_matching_indices(target_indices, proj_indices) |
358 | | }; |
359 | | // All of the composite indices should still be valid after projection; |
360 | | // otherwise, functional dependency cannot be propagated. |
361 | 0 | if new_source_indices.len() == source_indices.len() { |
362 | 0 | let new_func_dependence = FunctionalDependence::new( |
363 | 0 | new_source_indices, |
364 | 0 | new_target_indices, |
365 | 0 | *nullable, |
366 | 0 | ) |
367 | 0 | .with_mode(*mode); |
368 | 0 | projected_func_dependencies.push(new_func_dependence); |
369 | 0 | } |
370 | | } |
371 | 0 | FunctionalDependencies::new(projected_func_dependencies) |
372 | 0 | } |
373 | | |
374 | | /// This function joins this set of functional dependencies with the `other` |
375 | | /// according to the given `join_type`. |
376 | 0 | pub fn join( |
377 | 0 | &self, |
378 | 0 | other: &FunctionalDependencies, |
379 | 0 | join_type: &JoinType, |
380 | 0 | left_cols_len: usize, |
381 | 0 | ) -> FunctionalDependencies { |
382 | 0 | // Get mutable copies of left and right side dependencies: |
383 | 0 | let mut right_func_dependencies = other.clone(); |
384 | 0 | let mut left_func_dependencies = self.clone(); |
385 | 0 |
|
386 | 0 | match join_type { |
387 | | JoinType::Inner | JoinType::Left | JoinType::Right => { |
388 | | // Add offset to right schema: |
389 | 0 | right_func_dependencies.add_offset(left_cols_len); |
390 | 0 |
|
391 | 0 | // Result may have multiple values, update the dependency mode: |
392 | 0 | left_func_dependencies = |
393 | 0 | left_func_dependencies.with_dependency(Dependency::Multi); |
394 | 0 | right_func_dependencies = |
395 | 0 | right_func_dependencies.with_dependency(Dependency::Multi); |
396 | 0 |
|
397 | 0 | if *join_type == JoinType::Left { |
398 | 0 | // Downgrade the right side, since it may have additional NULL values: |
399 | 0 | right_func_dependencies.downgrade_dependencies(); |
400 | 0 | } else if *join_type == JoinType::Right { |
401 | 0 | // Downgrade the left side, since it may have additional NULL values: |
402 | 0 | left_func_dependencies.downgrade_dependencies(); |
403 | 0 | } |
404 | | // Combine left and right functional dependencies: |
405 | 0 | left_func_dependencies.extend(right_func_dependencies); |
406 | 0 | left_func_dependencies |
407 | | } |
408 | | JoinType::LeftSemi | JoinType::LeftAnti => { |
409 | | // These joins preserve functional dependencies of the left side: |
410 | 0 | left_func_dependencies |
411 | | } |
412 | | JoinType::RightSemi | JoinType::RightAnti => { |
413 | | // These joins preserve functional dependencies of the right side: |
414 | 0 | right_func_dependencies |
415 | | } |
416 | | JoinType::Full => { |
417 | | // All of the functional dependencies are lost in a FULL join: |
418 | 0 | FunctionalDependencies::empty() |
419 | | } |
420 | | } |
421 | 0 | } |
422 | | |
423 | | /// This function downgrades a functional dependency when nullability becomes |
424 | | /// a possibility: |
425 | | /// - If the dependency in question is UNIQUE (i.e. nullable), a new null value |
426 | | /// invalidates the dependency. |
427 | | /// - If the dependency in question is PRIMARY KEY (i.e. not nullable), a new |
428 | | /// null value turns it into UNIQUE mode. |
429 | 0 | fn downgrade_dependencies(&mut self) { |
430 | 0 | // Delete nullable dependencies, since they are no longer valid: |
431 | 0 | self.deps.retain(|item| !item.nullable); |
432 | 0 | self.deps.iter_mut().for_each(|item| item.nullable = true); |
433 | 0 | } |
434 | | |
435 | | /// This function ensures that functional dependencies involving uniquely |
436 | | /// occurring determinant keys cover their entire table in terms of |
437 | | /// dependent columns. |
438 | 0 | pub fn extend_target_indices(&mut self, n_out: usize) { |
439 | 0 | self.deps.iter_mut().for_each( |
440 | 0 | |FunctionalDependence { |
441 | | mode, |
442 | | target_indices, |
443 | | .. |
444 | 0 | }| { |
445 | 0 | // If unique, cover the whole table: |
446 | 0 | if *mode == Dependency::Single { |
447 | 0 | *target_indices = (0..n_out).collect::<Vec<_>>(); |
448 | 0 | } |
449 | 0 | }, |
450 | 0 | ) |
451 | 0 | } |
452 | | } |
453 | | |
454 | | impl Deref for FunctionalDependencies { |
455 | | type Target = [FunctionalDependence]; |
456 | | |
457 | 0 | fn deref(&self) -> &Self::Target { |
458 | 0 | self.deps.as_slice() |
459 | 0 | } |
460 | | } |
461 | | |
462 | | /// Calculates functional dependencies for aggregate output, when there is a GROUP BY expression. |
463 | 0 | pub fn aggregate_functional_dependencies( |
464 | 0 | aggr_input_schema: &DFSchema, |
465 | 0 | group_by_expr_names: &[String], |
466 | 0 | aggr_schema: &DFSchema, |
467 | 0 | ) -> FunctionalDependencies { |
468 | 0 | let mut aggregate_func_dependencies = vec![]; |
469 | 0 | let aggr_input_fields = aggr_input_schema.field_names(); |
470 | 0 | let aggr_fields = aggr_schema.fields(); |
471 | 0 | // Association covers the whole table: |
472 | 0 | let target_indices = (0..aggr_schema.fields().len()).collect::<Vec<_>>(); |
473 | 0 | // Get functional dependencies of the schema: |
474 | 0 | let func_dependencies = aggr_input_schema.functional_dependencies(); |
475 | | for FunctionalDependence { |
476 | 0 | source_indices, |
477 | 0 | nullable, |
478 | 0 | mode, |
479 | | .. |
480 | 0 | } in &func_dependencies.deps |
481 | | { |
482 | | // Keep source indices in a `HashSet` to prevent duplicate entries: |
483 | 0 | let mut new_source_indices = vec![]; |
484 | 0 | let mut new_source_field_names = vec![]; |
485 | 0 | let source_field_names = source_indices |
486 | 0 | .iter() |
487 | 0 | .map(|&idx| &aggr_input_fields[idx]) |
488 | 0 | .collect::<Vec<_>>(); |
489 | | |
490 | 0 | for (idx, group_by_expr_name) in group_by_expr_names.iter().enumerate() { |
491 | | // When one of the input determinant expressions matches with |
492 | | // the GROUP BY expression, add the index of the GROUP BY |
493 | | // expression as a new determinant key: |
494 | 0 | if source_field_names.contains(&group_by_expr_name) { |
495 | 0 | new_source_indices.push(idx); |
496 | 0 | new_source_field_names.push(group_by_expr_name.clone()); |
497 | 0 | } |
498 | | } |
499 | 0 | let existing_target_indices = |
500 | 0 | get_target_functional_dependencies(aggr_input_schema, group_by_expr_names); |
501 | 0 | let new_target_indices = get_target_functional_dependencies( |
502 | 0 | aggr_input_schema, |
503 | 0 | &new_source_field_names, |
504 | 0 | ); |
505 | 0 | let mode = if existing_target_indices == new_target_indices |
506 | 0 | && new_target_indices.is_some() |
507 | | { |
508 | | // If dependency covers all GROUP BY expressions, mode will be `Single`: |
509 | 0 | Dependency::Single |
510 | | } else { |
511 | | // Otherwise, existing mode is preserved: |
512 | 0 | *mode |
513 | | }; |
514 | | // All of the composite indices occur in the GROUP BY expression: |
515 | 0 | if new_source_indices.len() == source_indices.len() { |
516 | 0 | aggregate_func_dependencies.push( |
517 | 0 | FunctionalDependence::new( |
518 | 0 | new_source_indices, |
519 | 0 | target_indices.clone(), |
520 | 0 | *nullable, |
521 | 0 | ) |
522 | 0 | .with_mode(mode), |
523 | 0 | ); |
524 | 0 | } |
525 | | } |
526 | | |
527 | | // When we have a GROUP BY key, we can guarantee uniqueness after |
528 | | // aggregation: |
529 | 0 | if !group_by_expr_names.is_empty() { |
530 | 0 | let count = group_by_expr_names.len(); |
531 | 0 | let source_indices = (0..count).collect::<Vec<_>>(); |
532 | 0 | let nullable = source_indices |
533 | 0 | .iter() |
534 | 0 | .any(|idx| aggr_fields[*idx].is_nullable()); |
535 | 0 | // If GROUP BY expressions do not already act as a determinant: |
536 | 0 | if !aggregate_func_dependencies.iter().any(|item| { |
537 | 0 | // If `item.source_indices` is a subset of GROUP BY expressions, we shouldn't add |
538 | 0 | // them since `item.source_indices` defines this relation already. |
539 | 0 |
|
540 | 0 | // The following simple comparison is working well because |
541 | 0 | // GROUP BY expressions come here as a prefix. |
542 | 0 | item.source_indices.iter().all(|idx| idx < &count) |
543 | 0 | }) { |
544 | 0 | // Add a new functional dependency associated with the whole table: |
545 | 0 | // Use nullable property of the GROUP BY expression: |
546 | 0 | aggregate_func_dependencies.push( |
547 | 0 | // Use nullable property of the GROUP BY expression: |
548 | 0 | FunctionalDependence::new(source_indices, target_indices, nullable) |
549 | 0 | .with_mode(Dependency::Single), |
550 | 0 | ); |
551 | 0 | } |
552 | 0 | } |
553 | 0 | FunctionalDependencies::new(aggregate_func_dependencies) |
554 | 0 | } |
555 | | |
556 | | /// Returns target indices, for the determinant keys that are inside |
557 | | /// group by expressions. |
558 | 0 | pub fn get_target_functional_dependencies( |
559 | 0 | schema: &DFSchema, |
560 | 0 | group_by_expr_names: &[String], |
561 | 0 | ) -> Option<Vec<usize>> { |
562 | 0 | let mut combined_target_indices = HashSet::new(); |
563 | 0 | let dependencies = schema.functional_dependencies(); |
564 | 0 | let field_names = schema.field_names(); |
565 | | for FunctionalDependence { |
566 | 0 | source_indices, |
567 | 0 | target_indices, |
568 | | .. |
569 | 0 | } in &dependencies.deps |
570 | | { |
571 | 0 | let source_key_names = source_indices |
572 | 0 | .iter() |
573 | 0 | .map(|id_key_idx| &field_names[*id_key_idx]) |
574 | 0 | .collect::<Vec<_>>(); |
575 | 0 | // If the GROUP BY expression contains a determinant key, we can use |
576 | 0 | // the associated fields after aggregation even if they are not part |
577 | 0 | // of the GROUP BY expression. |
578 | 0 | if source_key_names |
579 | 0 | .iter() |
580 | 0 | .all(|source_key_name| group_by_expr_names.contains(source_key_name)) |
581 | 0 | { |
582 | 0 | combined_target_indices.extend(target_indices.iter()); |
583 | 0 | } |
584 | | } |
585 | 0 | (!combined_target_indices.is_empty()).then_some({ |
586 | 0 | let mut result = combined_target_indices.into_iter().collect::<Vec<_>>(); |
587 | 0 | result.sort(); |
588 | 0 | result |
589 | 0 | }) |
590 | 0 | } |
591 | | |
592 | | /// Returns indices for the minimal subset of GROUP BY expressions that are |
593 | | /// functionally equivalent to the original set of GROUP BY expressions. |
594 | 0 | pub fn get_required_group_by_exprs_indices( |
595 | 0 | schema: &DFSchema, |
596 | 0 | group_by_expr_names: &[String], |
597 | 0 | ) -> Option<Vec<usize>> { |
598 | 0 | let dependencies = schema.functional_dependencies(); |
599 | 0 | let field_names = schema.field_names(); |
600 | 0 | let mut groupby_expr_indices = group_by_expr_names |
601 | 0 | .iter() |
602 | 0 | .map(|group_by_expr_name| { |
603 | 0 | field_names |
604 | 0 | .iter() |
605 | 0 | .position(|field_name| field_name == group_by_expr_name) |
606 | 0 | }) |
607 | 0 | .collect::<Option<Vec<_>>>()?; |
608 | | |
609 | 0 | groupby_expr_indices.sort(); |
610 | | for FunctionalDependence { |
611 | 0 | source_indices, |
612 | 0 | target_indices, |
613 | | .. |
614 | 0 | } in &dependencies.deps |
615 | | { |
616 | 0 | if source_indices |
617 | 0 | .iter() |
618 | 0 | .all(|source_idx| groupby_expr_indices.contains(source_idx)) |
619 | 0 | { |
620 | 0 | // If all source indices are among GROUP BY expression indices, we |
621 | 0 | // can remove target indices from GROUP BY expression indices and |
622 | 0 | // use source indices instead. |
623 | 0 | groupby_expr_indices = set_difference(&groupby_expr_indices, target_indices); |
624 | 0 | groupby_expr_indices = |
625 | 0 | merge_and_order_indices(groupby_expr_indices, source_indices); |
626 | 0 | } |
627 | | } |
628 | 0 | groupby_expr_indices |
629 | 0 | .iter() |
630 | 0 | .map(|idx| { |
631 | 0 | group_by_expr_names |
632 | 0 | .iter() |
633 | 0 | .position(|name| &field_names[*idx] == name) |
634 | 0 | }) |
635 | 0 | .collect() |
636 | 0 | } |
637 | | |
/// Maps every value in `entries` to the position of its first occurrence in
/// `proj_indices`. Values that do not occur in `proj_indices` are dropped, so
/// the result may be shorter than `entries`.
fn update_elements_with_matching_indices(
    entries: &[usize],
    proj_indices: &[usize],
) -> Vec<usize> {
    let mut remapped = Vec::new();
    for entry in entries {
        let new_position = proj_indices.iter().position(|candidate| candidate == entry);
        if let Some(new_position) = new_position {
            remapped.push(new_position);
        }
    }
    remapped
}
649 | | |
/// Returns a new vector holding each entry of `in_data` increased by
/// `offset`, in the same order.
fn add_offset_to_vec<T: Copy + std::ops::Add<Output = T>>(
    in_data: &[T],
    offset: T,
) -> Vec<T> {
    let mut shifted = Vec::with_capacity(in_data.len());
    for item in in_data {
        shifted.push(*item + offset);
    }
    shifted
}
657 | | |
#[cfg(test)]
mod tests {
    use super::*;

    /// Iterating over `Constraints` yields the wrapped constraints in order.
    #[test]
    fn constraints_iter() {
        let expected = [
            Constraint::PrimaryKey(vec![10]),
            Constraint::Unique(vec![20]),
        ];
        let constraints = Constraints::new_unverified(expected.to_vec());
        let mut remaining = constraints.iter();
        for constraint in &expected {
            assert_eq!(remaining.next(), Some(constraint));
        }
        assert_eq!(remaining.next(), None);
    }

    /// Projecting fields \[1, 2\] re-indexes a dependence \[1\] -> \[0, 1, 2\]
    /// into \[0\] -> \[0, 1\].
    #[test]
    fn test_get_updated_id_keys() {
        let dependence = FunctionalDependence::new(vec![1], vec![0, 1, 2], true);
        let fund_dependencies = FunctionalDependencies::new(vec![dependence]);

        let projected = fund_dependencies.project_functional_dependencies(&[1, 2], 2);

        let expected_dependence = FunctionalDependence::new(vec![0], vec![0, 1], true);
        let expected = FunctionalDependencies::new(vec![expected_dependence]);
        assert_eq!(projected, expected);
    }
}