From 6b2db4c674698af7831f0c92fe026d12258b8d96 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 1 Dec 2023 08:59:15 -0500 Subject: [PATCH 1/3] Minor: Improve PruningPredicate documentation --- .../core/src/physical_optimizer/pruning.rs | 55 +++++++++++++------ 1 file changed, 37 insertions(+), 18 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index de508327fade..d3214e4a585d 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -66,43 +66,57 @@ use log::trace; /// min_values("X") -> None /// ``` pub trait PruningStatistics { - /// return the minimum values for the named column, if known. - /// Note: the returned array must contain `num_containers()` rows + /// Return the minimum values for the named column, if known. + /// + /// If the minimum value for a particular container is not known, the + /// returned array should have `null` in that row. If the minimum value is + /// not know for any row, return `None`. + /// + /// Note: the returned array must contain [`Self::num_containers`] rows fn min_values(&self, column: &Column) -> Option; - /// return the maximum values for the named column, if known. - /// Note: the returned array must contain `num_containers()` rows. + /// Return the maximum values for the named column, if known. + /// + /// see [`Self::min_values`] for when to return `None` and null values. + /// + /// Note: the returned array must contain [`Self::num_containers`] rows fn max_values(&self, column: &Column) -> Option; - /// return the number of containers (e.g. row groups) being - /// pruned with these statistics + /// Return the number of containers (e.g. row groups) being + /// pruned with these statistics (the number of rows in each returned array) fn num_containers(&self) -> usize; - /// return the number of null values for the named column as an + /// Return the number of null values for the named column as an /// `Option`. /// + /// see [`Self::min_values`] for when to return `None` and null values. + /// /// Note: the returned array must contain `num_containers()` rows. fn null_counts(&self, column: &Column) -> Option; } -/// Evaluates filter expressions on statistics, rather than the actual data. If -/// no rows could possibly pass the filter entire containers can be "pruned" -/// (skipped), without reading any actual data, leading to significant +/// Evaluates filter expressions on statistics such as min/max values and null +/// counts, attempting to prove a "container" (e.g. Parquet Row Group) can be +/// skipped without reading the actual data, potentially leading to significant /// performance improvements. /// -/// [`PruningPredicate`]s are used to prune (avoid scanning) Parquet Row Groups +/// For example, [`PruningPredicate`]s are used to prune Parquet Row Groups /// based on the min/max values found in the Parquet metadata. If the /// `PruningPredicate` can guarantee that no rows in the Row Group match the /// filter, the entire Row Group is skipped during query execution. /// -/// Note that this API is designed to be general, as it works: +/// The `PruningPredicate` API is general, allowing it to be used for pruning +/// other types of containers (e.g. files) based on statistics that may be +/// known from external catalogs (e.g. Delta Lake) or other sources. Thus it +/// supports: /// /// 1. Arbitrary expressions expressions (including user defined functions) /// -/// 2. Anything that implements the [`PruningStatistics`] trait, not just -/// Parquet metadata, allowing it to be used by other systems to prune entities -/// (e.g. entire files) if the statistics are known via some other source, such -/// as a catalog. +/// 2. Vectorized evaluation (provide more than one set of statistics at a time) +/// so it is suitable for pruning 1000s of containers. +/// +/// 3. Anything that implements the [`PruningStatistics`] trait, not just +/// Parquet metadata. /// /// # Example /// @@ -122,6 +136,7 @@ pub trait PruningStatistics { /// B: true (rows might match x = 5) /// C: true (rows might match x = 5) /// ``` +/// /// See [`PruningPredicate::try_new`] and [`PruningPredicate::prune`] for more information. #[derive(Debug, Clone)] pub struct PruningPredicate { @@ -251,8 +266,12 @@ fn is_always_true(expr: &Arc) -> bool { .unwrap_or_default() } -/// Records for which columns statistics are necessary to evaluate a -/// pruning predicate. +/// Describes which columns statistics are necessary to evaluate a +/// [`PruningPredicate`]. +/// +/// This structure permits reading and creating the minimum number statistics, +/// which is important since statistics may be non trivial to read (e.g. large +/// strings or when there are 1000s of columns. /// /// Handles creating references to the min/max statistics /// for columns as well as recording which statistics are needed From 0f7c5d688a0060df5289311785f41a9c4d1e1ff9 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Fri, 1 Dec 2023 09:02:57 -0500 Subject: [PATCH 2/3] tweaks --- datafusion/core/src/physical_optimizer/pruning.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index d3214e4a585d..e33942dfeb62 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -77,7 +77,7 @@ pub trait PruningStatistics { /// Return the maximum values for the named column, if known. /// - /// see [`Self::min_values`] for when to return `None` and null values. + /// See [`Self::min_values`] for when to return `None` and null values. /// /// Note: the returned array must contain [`Self::num_containers`] rows fn max_values(&self, column: &Column) -> Option; @@ -89,9 +89,9 @@ pub trait PruningStatistics { /// Return the number of null values for the named column as an /// `Option`. /// - /// see [`Self::min_values`] for when to return `None` and null values. + /// See [`Self::min_values`] for when to return `None` and null values. /// - /// Note: the returned array must contain `num_containers()` rows. + /// Note: the returned array must contain [`Self::num_containers`] rows fn null_counts(&self, column: &Column) -> Option; } From e381b3bb68fc4c0d24116c86c8e0aaa31725f070 Mon Sep 17 00:00:00 2001 From: Andrew Lamb Date: Sun, 3 Dec 2023 07:21:12 -0500 Subject: [PATCH 3/3] Apply suggestions from code review Co-authored-by: Liang-Chi Hsieh --- datafusion/core/src/physical_optimizer/pruning.rs | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/datafusion/core/src/physical_optimizer/pruning.rs b/datafusion/core/src/physical_optimizer/pruning.rs index e33942dfeb62..b2ba7596db8d 100644 --- a/datafusion/core/src/physical_optimizer/pruning.rs +++ b/datafusion/core/src/physical_optimizer/pruning.rs @@ -70,7 +70,7 @@ pub trait PruningStatistics { /// /// If the minimum value for a particular container is not known, the /// returned array should have `null` in that row. If the minimum value is - /// not know for any row, return `None`. + /// not known for any row, return `None`. /// /// Note: the returned array must contain [`Self::num_containers`] rows fn min_values(&self, column: &Column) -> Option; @@ -106,7 +106,7 @@ pub trait PruningStatistics { /// filter, the entire Row Group is skipped during query execution. /// /// The `PruningPredicate` API is general, allowing it to be used for pruning -/// other types of containers (e.g. files) based on statistics that may be +/// other types of containers (e.g. files) based on statistics that may be /// known from external catalogs (e.g. Delta Lake) or other sources. Thus it /// supports: /// @@ -271,7 +271,7 @@ fn is_always_true(expr: &Arc) -> bool { /// /// This structure permits reading and creating the minimum number statistics, /// which is important since statistics may be non trivial to read (e.g. large -/// strings or when there are 1000s of columns. +/// strings or when there are 1000s of columns). /// /// Handles creating references to the min/max statistics /// for columns as well as recording which statistics are needed