diff --git a/datafusion-examples/examples/parquet_index.rs b/datafusion-examples/examples/parquet_index.rs
index a56865aa80a05..646732539f104 100644
--- a/datafusion-examples/examples/parquet_index.rs
+++ b/datafusion-examples/examples/parquet_index.rs
@@ -113,6 +113,7 @@ async fn main() -> Result<()> {
         .await?
         .show()
         .await?;
+    println!("Files pruned: {}", provider.index().last_num_pruned());
 
     // Run a query that uses the index to prune files.
     //
@@ -221,8 +222,12 @@ impl TableProvider for IndexTableProvider {
             // that always evaluates to true we can pass to the index
            .unwrap_or_else(|| datafusion_physical_expr::expressions::lit(true));
 
-        // Use the index to find the files that might have data that matches the predicate.
+        // Use the index to find the files that might have data that matches the
+        // predicate. Any file that cannot have data that matches the predicate
+        // will not be returned.
         let files = self.index.get_files(predicate.clone())?;
+
+        // Transform to the format needed to pass to ParquetExec
         // Create one file group per file (default to scanning them all in parallel)
         let file_groups = files
             .into_iter()
@@ -534,8 +539,8 @@ impl ParquetMetadataIndexBuilder {
         assert_eq!(value_column_mins.null_count(), 0);
         assert_eq!(value_column_maxes.null_count(), 0);
 
-        // compute the total row count, and overall min and max of the "value"
-        // column in this file
+        // The statistics above are one per row group, so we need to compute the
+        // overall file row count, and overall min and max.
         let row_count = row_counts
            .iter()
            .flatten() // skip nulls (should be none)
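
Note on the last hunk: the builder holds one statistic per row group and folds them into a single file-level entry (summed row count, overall min and max). The following is a minimal, self-contained sketch of that fold in plain Rust; the RowGroupStats and FileStats structs and the file_stats helper are hypothetical illustrations, not the example's actual Arrow-based code.

// Hypothetical per-row-group statistics for one Parquet file.
struct RowGroupStats {
    row_count: u64,
    value_min: i32,
    value_max: i32,
}

// Hypothetical file-level statistics derived from all row groups.
struct FileStats {
    row_count: u64,
    value_min: i32,
    value_max: i32,
}

// Fold per-row-group statistics into one file-level entry:
// row counts are summed, min/max are taken over all row groups.
fn file_stats(row_groups: &[RowGroupStats]) -> Option<FileStats> {
    let first = row_groups.first()?;
    Some(row_groups.iter().skip(1).fold(
        FileStats {
            row_count: first.row_count,
            value_min: first.value_min,
            value_max: first.value_max,
        },
        |acc, rg| FileStats {
            row_count: acc.row_count + rg.row_count,
            value_min: acc.value_min.min(rg.value_min),
            value_max: acc.value_max.max(rg.value_max),
        },
    ))
}

fn main() {
    // Two row groups in one file: 100 rows spanning [1, 50] and 200 rows spanning [25, 90].
    let row_groups = vec![
        RowGroupStats { row_count: 100, value_min: 1, value_max: 50 },
        RowGroupStats { row_count: 200, value_min: 25, value_max: 90 },
    ];
    let stats = file_stats(&row_groups).expect("at least one row group");
    assert_eq!(stats.row_count, 300);
    assert_eq!(stats.value_min, 1);
    assert_eq!(stats.value_max, 90);
    println!("rows={} min={} max={}", stats.row_count, stats.value_min, stats.value_max);
}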