From 2a16704db7af0045d465cda39b90d1a17e68dbe8 Mon Sep 17 00:00:00 2001
From: Andrew Lamb
Date: Fri, 16 Aug 2024 13:35:01 -0400
Subject: [PATCH] Improve documentation about `ParquetExec` / Parquet predicate pushdown (#11994)

* Minor: improve ParquetExec docs

* typo

* clippy

* fix whitespace so rustdoc does not treat as tests

* Apply suggestions from code review

Co-authored-by: Oleks V

* expound upon column rewriting in the context of schema evolution

---------

Co-authored-by: Oleks V
---
 datafusion/common/src/tree_node.rs            |   3 +
 .../datasource/physical_plan/parquet/mod.rs   |  60 ++++--
 .../physical_plan/parquet/row_filter.rs       | 194 ++++++++++++++----
 3 files changed, 192 insertions(+), 65 deletions(-)

diff --git a/datafusion/common/src/tree_node.rs b/datafusion/common/src/tree_node.rs
index bcf4d7664acc..88300e3edd0e 100644
--- a/datafusion/common/src/tree_node.rs
+++ b/datafusion/common/src/tree_node.rs
@@ -486,6 +486,9 @@ pub trait TreeNodeVisitor<'n>: Sized {
 /// A [Visitor](https://en.wikipedia.org/wiki/Visitor_pattern) for recursively
 /// rewriting [`TreeNode`]s via [`TreeNode::rewrite`].
 ///
+/// For example, you can implement this trait on a struct that needs to track
+/// state while rewriting `Expr` or `LogicalPlan` nodes.
+///
 /// See [`TreeNode`] for more details on available APIs
 ///
 /// When passed to [`TreeNode::rewrite`], [`TreeNodeRewriter::f_down`] and
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
index 72aabefba595..cb026522cfa8 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/mod.rs
@@ -116,13 +116,12 @@ pub use writer::plan_to_parquet;
 ///
 /// Supports the following optimizations:
 ///
-/// * Concurrent reads: Can read from one or more files in parallel as multiple
+/// * Concurrent reads: reads from one or more files in parallel as multiple
 /// partitions, including concurrently reading multiple row groups from a single
 /// file.
 ///
-/// * Predicate push down: skips row groups and pages based on
-/// min/max/null_counts in the row group metadata, the page index and bloom
-/// filters.
+/// * Predicate push down: skips row groups, pages, and rows based on metadata
+/// and late materialization. See "Predicate Pushdown" below.
 ///
 /// * Projection pushdown: reads and decodes only the columns required.
 ///
@@ -132,9 +131,8 @@ pub use writer::plan_to_parquet;
 /// coalesce I/O operations, etc. See [`ParquetFileReaderFactory`] for more
 /// details.
 ///
-/// * Schema adapters: read parquet files with different schemas into a unified
-/// table schema. This can be used to implement "schema evolution". See
-/// [`SchemaAdapterFactory`] for more details.
+/// * Schema evolution: read parquet files with different schemas into a unified
+/// table schema. See [`SchemaAdapterFactory`] for more details.
 ///
 /// * metadata_size_hint: controls the number of bytes read from the end of the
 /// file in the initial I/O when the default [`ParquetFileReaderFactory`]. If a
 /// * User provided [`ParquetAccessPlan`]s to skip row groups and/or pages
 /// based on external information.
 /// See "Implementing External Indexes" below
 ///
+/// # Predicate Pushdown
+///
+/// `ParquetExec` uses the provided [`PhysicalExpr`] predicate as a filter to
+/// skip reading unnecessary data and improve query performance using several techniques:
+///
+/// * Row group pruning: skips entire row groups based on min/max statistics
+/// found in [`ParquetMetaData`] and any Bloom filters that are present.
+///
+/// * Page pruning: skips individual pages within a ColumnChunk using the
+/// [Parquet PageIndex], if present.
+///
+/// * Row filtering: skips rows within a page using a form of late
+/// materialization. When possible, predicates are applied by the parquet
+/// decoder *during* decode (see [`ArrowPredicate`] and [`RowFilter`] for more
+/// details). This is only enabled if `ParquetScanOptions::pushdown_filters` is set to true.
+///
+/// Note: If the predicate can not be used to accelerate the scan, it is ignored
+/// (no error is raised on predicate evaluation errors).
+///
+/// [`ArrowPredicate`]: parquet::arrow::arrow_reader::ArrowPredicate
+/// [`RowFilter`]: parquet::arrow::arrow_reader::RowFilter
+/// [Parquet PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
+///
 /// # Implementing External Indexes
 ///
 /// It is possible to restrict the row groups and selections within those row
@@ -199,10 +220,11 @@ pub use writer::plan_to_parquet;
 /// applying predicates to metadata. The plan and projections are used to
 /// determine what pages must be read.
 ///
-/// * Step 4: The stream begins reading data, fetching the required pages
-/// and incrementally decoding them.
+/// * Step 4: The stream begins reading data, fetching the required parquet
+/// pages, incrementally decoding them, and applying any row filters (see
+/// [`Self::with_pushdown_filters`]).
 ///
-/// * Step 5: As each [`RecordBatch]` is read, it may be adapted by a
+/// * Step 5: As each [`RecordBatch`] is read, it may be adapted by a
 /// [`SchemaAdapter`] to match the table schema. By default missing columns are
 /// filled with nulls, but this can be customized via [`SchemaAdapterFactory`].
 ///
@@ -268,13 +290,10 @@ impl ParquetExecBuilder {
         }
     }
 
-    /// Set the predicate for the scan.
-    ///
-    /// The ParquetExec uses this predicate to filter row groups and data pages
-    /// using the Parquet statistics and bloom filters.
+    /// Set the filter predicate when reading.
     ///
-    /// If the predicate can not be used to prune the scan, it is ignored (no
-    /// error is raised).
+    /// See the "Predicate Pushdown" section of the [`ParquetExec`] documentation
+    /// for more details.
     pub fn with_predicate(mut self, predicate: Arc<dyn PhysicalExpr>) -> Self {
         self.predicate = Some(predicate);
         self
@@ -291,7 +310,7 @@ impl ParquetExecBuilder {
         self
     }
 
-    /// Set the table parquet options that control how the ParquetExec reads.
+    /// Set the options for controlling how the ParquetExec reads parquet files.
     ///
     /// See also [`Self::new_with_options`]
     pub fn with_table_parquet_options(
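
// A minimal, self-contained sketch of the row group pruning idea described in
// the "Predicate Pushdown" section above. `RowGroupStats` and the equality-only
// predicate are illustrative stand-ins, not DataFusion's statistics or
// `PhysicalExpr` types: a row group is skipped only when its min/max range
// proves the predicate cannot match any row in it.
struct RowGroupStats {
    min: i64,
    max: i64,
}

/// Returns the indices of row groups that might contain `value` for a
/// hypothetical `col = value` predicate; all other row groups are skipped
/// without being read.
fn prune_row_groups(stats: &[RowGroupStats], value: i64) -> Vec<usize> {
    stats
        .iter()
        .enumerate()
        .filter(|(_, s)| s.min <= value && value <= s.max)
        .map(|(i, _)| i)
        .collect()
}

fn main() {
    let stats = vec![
        RowGroupStats { min: 1, max: 9 },   // cannot contain 25 -> skipped
        RowGroupStats { min: 10, max: 20 }, // cannot contain 25 -> skipped
        RowGroupStats { min: 21, max: 30 }, // may contain 25 -> must be read
    ];
    assert_eq!(prune_row_groups(&stats, 25), vec![2]);
}
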
@@ -480,11 +499,8 @@ impl ParquetExec {
         self
     }
 
-    /// If true, any filter [`Expr`]s on the scan will converted to a
-    /// [`RowFilter`](parquet::arrow::arrow_reader::RowFilter) in the
-    /// `ParquetRecordBatchStream`. These filters are applied by the
-    /// parquet decoder to skip unecessairly decoding other columns
-    /// which would not pass the predicate. Defaults to false
+    /// If true, the predicate will be used during the parquet scan.
+    /// Defaults to false
     ///
     /// [`Expr`]: datafusion_expr::Expr
     pub fn with_pushdown_filters(mut self, pushdown_filters: bool) -> Self {
diff --git a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
index 9de132169389..23fdadc2cdee 100644
--- a/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
+++ b/datafusion/core/src/datasource/physical_plan/parquet/row_filter.rs
@@ -15,6 +15,50 @@
 // specific language governing permissions and limitations
 // under the License.
 
+//! Utilities to push down DataFusion filter predicates (any DataFusion
+//! `PhysicalExpr` that evaluates to a [`BooleanArray`]) to the parquet decoder
+//! level in `arrow-rs`.
+//!
+//! DataFusion will use a `ParquetRecordBatchStream` to read data from parquet
+//! into [`RecordBatch`]es.
+//!
+//! The `ParquetRecordBatchStream` takes an optional `RowFilter` which is itself
+//! a Vec of `Box<dyn ArrowPredicate>`. During decoding, the predicates are
+//! evaluated in order, to generate a mask which is used to avoid decoding rows
+//! in projected columns which do not pass the filter, which can significantly
+//! reduce the amount of compute required for decoding and thus improve query
+//! performance.
+//!
+//! Since the predicates are applied serially in the order defined in the
+//! `RowFilter`, the optimal ordering depends on the exact filters. The best
+//! filters to execute first have two properties:
+//!
+//! 1. They are relatively inexpensive to evaluate (e.g. they read
+//!    column chunks which are relatively small)
+//!
+//! 2. They filter many (contiguous) rows, reducing the amount of decoding
+//!    required for subsequent filters and projected columns
+//!
+//! If requested, this code will reorder the filters based on heuristics to try
+//! to reduce the evaluation cost.
+//!
+//! The basic algorithm for constructing the `RowFilter` is as follows:
+//!
+//! 1. Break conjunctions into separate predicates. An expression
+//!    like `a = 1 AND (b = 2 AND c = 3)` would be
+//!    separated into the expressions `a = 1`, `b = 2`, and `c = 3`.
+//! 2. Determine whether each predicate can be evaluated as an `ArrowPredicate`.
+//! 3. Determine, for each predicate, the total compressed size of all
+//!    columns required to evaluate the predicate.
+//! 4. Determine, for each predicate, whether all columns required to
+//!    evaluate the expression are sorted.
+//! 5. Re-order the predicates by total size (from step 3).
+//! 6. Partition the predicates according to whether they are sorted (from step 4)
+//! 7. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`.
+//! 8. Build the `RowFilter` with the sorted predicates followed by
+//!    the unsorted predicates. Within each partition, predicates are
+//!    still sorted by size.
+
 use std::collections::BTreeSet;
 use std::sync::Arc;
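
// A minimal sketch of the reordering heuristic described in the module
// documentation above (steps 5, 6 and 8): sort candidate predicates by the
// estimated bytes they must read, then place the "sorted column" candidates
// ahead of the rest. `Candidate` is an illustrative stand-in for the real
// `FilterCandidate`, which tracks the same two properties.
struct Candidate {
    name: &'static str,
    required_bytes: usize,
    columns_sorted: bool,
}

fn order_candidates(mut candidates: Vec<Candidate>) -> Vec<Candidate> {
    // Step 5: cheapest (smallest column chunks) first
    candidates.sort_by_key(|c| c.required_bytes);
    // Steps 6 and 8: sorted-column candidates first, unsorted after; the sort
    // above is stable, so size order is preserved within each partition
    let (sorted, unsorted): (Vec<_>, Vec<_>) =
        candidates.into_iter().partition(|c| c.columns_sorted);
    sorted.into_iter().chain(unsorted).collect()
}

fn main() {
    let ordered = order_candidates(vec![
        Candidate { name: "b = 2", required_bytes: 800, columns_sorted: false },
        Candidate { name: "a = 1", required_bytes: 100, columns_sorted: false },
        Candidate { name: "c = 3", required_bytes: 400, columns_sorted: true },
    ]);
    let names: Vec<_> = ordered.iter().map(|c| c.name).collect();
    assert_eq!(names, vec!["c = 3", "a = 1", "b = 2"]);
}
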
@@ -40,41 +84,24 @@ use crate::physical_plan::metrics;
 
 use super::ParquetFileMetrics;
 
-/// This module contains utilities for enabling the pushdown of DataFusion filter predicates (which
-/// can be any DataFusion `Expr` that evaluates to a `BooleanArray`) to the parquet decoder level in `arrow-rs`.
-/// DataFusion will use a `ParquetRecordBatchStream` to read data from parquet into arrow `RecordBatch`es.
-/// When constructing the `ParquetRecordBatchStream` you can provide a `RowFilter` which is itself just a vector
-/// of `Box<dyn ArrowPredicate>`. During decoding, the predicates are evaluated to generate a mask which is used
-/// to avoid decoding rows in projected columns which are not selected which can significantly reduce the amount
-/// of compute required for decoding.
+/// A "compiled" predicate passed to `ParquetRecordBatchStream` to perform
+/// row-level filtering during parquet decoding.
 ///
-/// Since the predicates are applied serially in the order defined in the `RowFilter`, the optimal ordering
-/// will depend on the exact filters. The best filters to execute first have two properties:
-/// 1. The are relatively inexpensive to evaluate (e.g. they read column chunks which are relatively small)
-/// 2. They filter a lot of rows, reducing the amount of decoding required for subsequent filters and projected columns
+/// See the module level documentation for more information.
 ///
-/// Given the metadata exposed by parquet, the selectivity of filters is not easy to estimate so the heuristics we use here primarily
-/// focus on the evaluation cost.
+/// Implements the `ArrowPredicate` trait used by the parquet decoder
 ///
-/// The basic algorithm for constructing the `RowFilter` is as follows
-/// 1. Recursively break conjunctions into separate predicates. An expression like `a = 1 AND (b = 2 AND c = 3)` would be
-/// separated into the expressions `a = 1`, `b = 2`, and `c = 3`.
-/// 2. Determine whether each predicate is suitable as an `ArrowPredicate`. As long as the predicate does not reference any projected columns
-/// or columns with non-primitive types, then it is considered suitable.
-/// 3. Determine, for each predicate, the total compressed size of all columns required to evaluate the predicate.
-/// 4. Determine, for each predicate, whether all columns required to evaluate the expression are sorted.
-/// 5. Re-order the predicate by total size (from step 3).
-/// 6. Partition the predicates according to whether they are sorted (from step 4)
-/// 7. "Compile" each predicate `Expr` to a `DatafusionArrowPredicate`.
-/// 8. Build the `RowFilter` with the sorted predicates followed by the unsorted predicates. Within each partition
-/// the predicates will still be sorted by size.
-
-/// A predicate which can be passed to `ParquetRecordBatchStream` to perform row-level
-/// filtering during parquet decoding.
+/// An expression can be evaluated as a `DatafusionArrowPredicate` if it:
+/// * Does not reference any projected columns
+/// * Does not reference columns with non-primitive types (e.g. structs / lists)
 #[derive(Debug)]
 pub(crate) struct DatafusionArrowPredicate {
+    /// the filter expression
     physical_expr: Arc<dyn PhysicalExpr>,
+    /// Path to the columns in the parquet schema required to evaluate the
+    /// expression
     projection_mask: ProjectionMask,
+    /// Columns required to evaluate the expression in the arrow schema
     projection: Vec<usize>,
     /// how many rows were filtered out by this predicate
     rows_filtered: metrics::Count,
@@ -85,6 +112,7 @@ pub(crate) struct DatafusionArrowPredicate {
 }
 
 impl DatafusionArrowPredicate {
+    /// Create a new `DatafusionArrowPredicate` from a `FilterCandidate`
    pub fn try_new(
        candidate: FilterCandidate,
        schema: &Schema,
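
// A standalone sketch of the row filtering ("late materialization") idea that
// `DatafusionArrowPredicate` implements: a predicate on one column produces a
// boolean mask, and only the selected rows of other columns are materialized.
// This assumes the `arrow` crate as a dependency and works on in-memory arrays;
// the real predicate implements `ArrowPredicate` and is invoked by the parquet
// decoder on each batch as it is decoded.
use arrow::array::{Array, BooleanArray, Int64Array, StringArray};
use arrow::compute::filter;
use arrow::error::ArrowError;

fn main() -> Result<(), ArrowError> {
    // "Filter column": cheap to decode, used to build the selection mask
    let ids = Int64Array::from(vec![1, 2, 3, 4]);
    // "Projected column": only materialized for rows that pass the filter
    let names = StringArray::from(vec!["a", "b", "c", "d"]);

    // Evaluate a predicate like `id > 2` to produce a boolean mask
    let mask: BooleanArray = ids.iter().map(|v| v.map(|id| id > 2)).collect();

    // Apply the mask; only the last two rows are kept
    let selected = filter(&names, &mask)?;
    assert_eq!(selected.len(), 2);
    Ok(())
}
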
@@ -152,9 +180,12 @@ impl ArrowPredicate for DatafusionArrowPredicate {
     }
 }
 
-/// A candidate expression for creating a `RowFilter` contains the
-/// expression as well as data to estimate the cost of evaluating
-/// the resulting expression.
+/// A candidate expression for creating a `RowFilter`.
+///
+/// Each candidate contains the expression as well as data to estimate the cost
+/// of evaluating the resulting expression.
+///
+/// See the module level documentation for more information.
 pub(crate) struct FilterCandidate {
     expr: Arc<dyn PhysicalExpr>,
     required_bytes: usize,
@@ -162,19 +193,55 @@ pub(crate) struct FilterCandidate {
     projection: Vec<usize>,
 }
 
-/// Helper to build a `FilterCandidate`. This will do several things
-/// 1. Determine the columns required to evaluate the expression
-/// 2. Calculate data required to estimate the cost of evaluating the filter
-/// 3. Rewrite column expressions in the predicate which reference columns not in the particular file schema.
-/// This is relevant in the case where we have determined the table schema by merging all individual file schemas
-/// and any given file may or may not contain all columns in the merged schema. If a particular column is not present
-/// we replace the column expression with a literal expression that produces a null value.
+/// Helper to build a `FilterCandidate`.
+///
+/// This will do several things
+/// 1. Determine the columns required to evaluate the expression
+/// 2. Calculate data required to estimate the cost of evaluating the filter
+/// 3. Rewrite column expressions in the predicate which reference columns not
+///    in the particular file schema.
+///
+/// # Schema Rewrite
+///
+/// When parquet files are read in the context of "schema evolution" there are
+/// potentially two schemas:
+///
+/// 1. The table schema (the columns of the table that the parquet file is part of)
+/// 2. The file schema (the columns actually in the parquet file)
+///
+/// There are times when the table schema contains columns that are not in the
+/// file schema, such as when new columns have been added in new parquet files
+/// but old files do not have the columns.
+///
+/// When a file is missing a column from the table schema, the value of the
+/// missing column is filled in with `NULL` via a `SchemaAdapter`.
+///
+/// When a predicate is pushed down to the parquet reader, the predicate is
+/// evaluated in the context of the file schema. If the predicate references a
+/// column that is in the table schema but not in the file schema, the column
+/// reference must be rewritten to a literal expression that represents the
+/// `NULL` value that would be produced by the `SchemaAdapter`.
+///
+/// For example, if:
+/// * The table schema is `id, name, address`
+/// * The file schema is `id, name` (missing the `address` column)
+/// * The predicate is `address = 'foo'`
+///
+/// When evaluating the predicate as a filter on the parquet file, the predicate
+/// must be rewritten to `NULL = 'foo'` as the `address` column will be filled
+/// in with `NULL` values during the rest of the evaluation.
 struct FilterCandidateBuilder<'a> {
     expr: Arc<dyn PhysicalExpr>,
+    /// The schema of this parquet file
     file_schema: &'a Schema,
+    /// The schema of the table (merged schema) -- columns may be in a different
+    /// order than in the file and have columns that are not in the file schema
     table_schema: &'a Schema,
     required_column_indices: BTreeSet<usize>,
+    /// Does the expression require any non-primitive columns (like structs)?
     non_primitive_columns: bool,
+    /// Does the expression reference any columns that are in the table
+    /// schema but not in the file schema?
     projected_columns: bool,
 }
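
// A minimal sketch of the column rewrite described in "Schema Rewrite" above,
// using a toy expression type rather than DataFusion's `PhysicalExpr`: a column
// reference that exists in the table schema but not in the file schema is
// replaced with a NULL literal, so `address = 'foo'` becomes `NULL = 'foo'`
// for files written before the `address` column was added.
#[derive(Debug, PartialEq)]
enum Expr {
    Column(String),
    Literal(Option<String>),
    Eq(Box<Expr>, Box<Expr>),
}

fn rewrite_missing_columns(expr: Expr, file_schema: &[&str]) -> Expr {
    match expr {
        Expr::Column(name) if !file_schema.contains(&name.as_str()) => {
            // The file does not contain this column; the SchemaAdapter would
            // fill it with NULLs, so the reference becomes a NULL literal
            Expr::Literal(None)
        }
        Expr::Eq(l, r) => Expr::Eq(
            Box::new(rewrite_missing_columns(*l, file_schema)),
            Box::new(rewrite_missing_columns(*r, file_schema)),
        ),
        other => other,
    }
}

fn main() {
    // Table schema: id, name, address; file schema: id, name
    let file_schema = ["id", "name"];
    let predicate = Expr::Eq(
        Box::new(Expr::Column("address".to_string())),
        Box::new(Expr::Literal(Some("foo".to_string()))),
    );
    assert_eq!(
        rewrite_missing_columns(predicate, &file_schema),
        Expr::Eq(
            Box::new(Expr::Literal(None)),
            Box::new(Expr::Literal(Some("foo".to_string()))),
        )
    );
}
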
@@ -194,6 +261,13 @@ impl<'a> FilterCandidateBuilder<'a> {
         }
     }
 
+    /// Attempt to build a `FilterCandidate` from the expression
+    ///
+    /// # Return values
+    ///
+    /// * `Ok(Some(candidate))` if the expression can be used as an `ArrowPredicate`
+    /// * `Ok(None)` if the expression cannot be used as an `ArrowPredicate`
+    /// * `Err(e)` if an error occurs while building the candidate
     pub fn build(
         mut self,
         metadata: &ParquetMetaData,
@@ -217,9 +291,13 @@ impl<'a> FilterCandidateBuilder<'a> {
     }
 }
 
+/// Implements the `TreeNodeRewriter` trait for `FilterCandidateBuilder`, which
+/// walks the expression tree and rewrites it in preparation for becoming a
+/// `FilterCandidate`.
 impl<'a> TreeNodeRewriter for FilterCandidateBuilder<'a> {
     type Node = Arc<dyn PhysicalExpr>;
 
+    /// Called before visiting each child
     fn f_down(
         &mut self,
         node: Arc<dyn PhysicalExpr>,
@@ -243,13 +321,19 @@ impl<'a> TreeNodeRewriter for FilterCandidateBuilder<'a> {
         Ok(Transformed::no(node))
     }
 
+    /// After visiting all children, rewrite column references to nulls if
+    /// they are not in the file schema
    fn f_up(
        &mut self,
        expr: Arc<dyn PhysicalExpr>,
    ) -> Result<Transformed<Arc<dyn PhysicalExpr>>> {
+        // if the expression is a column, is it in the file schema?
         if let Some(column) = expr.as_any().downcast_ref::<Column>() {
             if self.file_schema.field_with_name(column.name()).is_err() {
-                // the column expr must be in the table schema
+                // Replace the column reference with a NULL (using the type from the table schema)
+                // e.g. `column = 'foo'` is rewritten to `NULL = 'foo'`
+                //
+                // See comments on `FilterCandidateBuilder` for more information
                 return match self.table_schema.field_with_name(column.name()) {
                     Ok(field) => {
                         // return the null value corresponding to the data type
@@ -294,9 +378,11 @@ fn remap_projection(src: &[usize]) -> Vec<usize> {
     projection
 }
 
-/// Calculate the total compressed size of all `Column's required for
-/// predicate `Expr`. This should represent the total amount of file IO
-/// required to evaluate the predicate.
+/// Calculate the total compressed size of all `Column`'s required for
+/// predicate `Expr`.
+///
+/// This value represents the total amount of IO required to evaluate the
+/// predicate.
 fn size_of_columns(
     columns: &BTreeSet<usize>,
     metadata: &ParquetMetaData,
@@ -312,8 +398,10 @@ fn size_of_columns(
     Ok(total_size)
 }
 
-/// For a given set of `Column`s required for predicate `Expr` determine whether all
-/// columns are sorted. Sorted columns may be queried more efficiently in the presence of
+/// For a given set of `Column`s required for predicate `Expr` determine whether
+/// all columns are sorted.
+///
+/// Sorted columns may be queried more efficiently in the presence of
 /// a PageIndex.
 fn columns_sorted(
     _columns: &BTreeSet<usize>,
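
// A minimal sketch of the conjunction splitting performed before building the
// `RowFilter` (step 1 of the algorithm in the module documentation, and the
// `split_conjunction` call in `build_row_filter` below), again using a toy
// expression type instead of DataFusion's `PhysicalExpr`.
#[derive(Debug, PartialEq, Clone)]
enum Expr {
    And(Box<Expr>, Box<Expr>),
    Predicate(&'static str), // stand-in for any non-AND expression, e.g. `a = 1`
}

/// Recursively flattens nested ANDs: `a = 1 AND (b = 2 AND c = 3)`
/// becomes [`a = 1`, `b = 2`, `c = 3`]
fn split_conjunction(expr: &Expr, out: &mut Vec<Expr>) {
    match expr {
        Expr::And(left, right) => {
            split_conjunction(left, out);
            split_conjunction(right, out);
        }
        other => out.push(other.clone()),
    }
}

fn main() {
    // a = 1 AND (b = 2 AND c = 3)
    let expr = Expr::And(
        Box::new(Expr::Predicate("a = 1")),
        Box::new(Expr::And(
            Box::new(Expr::Predicate("b = 2")),
            Box::new(Expr::Predicate("c = 3")),
        )),
    );
    let mut conjuncts = vec![];
    split_conjunction(&expr, &mut conjuncts);
    assert_eq!(
        conjuncts,
        vec![
            Expr::Predicate("a = 1"),
            Expr::Predicate("b = 2"),
            Expr::Predicate("c = 3"),
        ]
    );
}
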
@@ -323,7 +411,20 @@ fn columns_sorted(
     Ok(false)
 }
 
-/// Build a [`RowFilter`] from the given predicate `Expr`
+/// Build a [`RowFilter`] from the given predicate `Expr` if possible
+///
+/// # Returns
+/// * `Ok(Some(row_filter))` if the expression can be used as a `RowFilter`
+/// * `Ok(None)` if the expression cannot be used as a `RowFilter`
+/// * `Err(e)` if an error occurs while building the filter
+///
+/// Note that the returned `RowFilter` may not contain all conjuncts in the
+/// original expression. This is because some conjuncts cannot be evaluated as
+/// an `ArrowPredicate` and are ignored.
+///
+/// For example, if the expression is `a = 1 AND b = 2 AND c = 3` and `b = 2`
+/// can not be evaluated for some reason, the returned `RowFilter` will contain
+/// `a = 1` and `c = 3`.
 pub fn build_row_filter(
     expr: &Arc<dyn PhysicalExpr>,
     file_schema: &Schema,
@@ -336,8 +437,11 @@ pub fn build_row_filter(
     let rows_filtered = &file_metrics.pushdown_rows_filtered;
     let time = &file_metrics.pushdown_eval_time;
 
+    // Split into conjuncts:
+    // `a = 1 AND b = 2 AND c = 3` -> [`a = 1`, `b = 2`, `c = 3`]
     let predicates = split_conjunction(expr);
 
+    // Determine which conjuncts can be evaluated as ArrowPredicates, if any
     let mut candidates: Vec<FilterCandidate> = predicates
         .into_iter()
         .flat_map(|expr| {
@@ -347,9 +451,11 @@ pub fn build_row_filter(
         })
         .collect();
 
+    // no candidates
     if candidates.is_empty() {
         Ok(None)
     } else if reorder_predicates {
+        // attempt to reorder the predicates by size and whether they are sorted
         candidates.sort_by_key(|c| c.required_bytes);
 
         let (indexed_candidates, other_candidates): (Vec<_>, Vec<_>) =
@@ -385,6 +491,8 @@ pub fn build_row_filter(
 
         Ok(Some(RowFilter::new(filters)))
     } else {
+        // otherwise evaluate the predicates in the order they appeared in the
+        // original expression
         let mut filters: Vec<Box<dyn ArrowPredicate>> = vec![];
         for candidate in candidates {
             let filter = DatafusionArrowPredicate::try_new(